In [15]:
import pandas as pd
import os, pickle
from IPython.display import display
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

def tokenize(x):
    return RegexpTokenizer(r'\w+').tokenize(x.lower())

def removeStopwords(x):
    with open("stopWords_es.txt") as f:
        text = f.read()
        prohibitedWords = text.split("\n")
        return [word for word in x if not word in prohibitedWords]

def stemming(x):
    stemmer = SnowballStemmer(language="spanish")
    return ' '.join([stemmer.stem(word) for word in x])

def generateDF(path):
    df = pd.DataFrame({"name": [], "path": [], "content":[]})
    files = os.listdir(path)
    for file in files:
        file_path = os.path.join(path, file)
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding="ISO-8859-1") as f:
                df = df.append({"name": file, "path": file_path, "content": f.read()}, ignore_index=True)
        elif os.path.isdir(file_path):
            df = df.append(generateDF(file_path))

    return df

df = pd.DataFrame({"name": [], "path": [], "content":[]})
df = df.append(generateDF(os.path.join(os.getcwd(), "elMundo")))
df = df.append(generateDF(os.path.join(os.getcwd(), "elPais")))
df = df.append(generateDF(os.path.join(os.getcwd(), "20minutos")))
df['tokens'] = df['content'].map(tokenize)
df['tokens'] = df['tokens'].map(removeStopwords)
df['lemma'] = df['tokens'].map(stemming)

df["medio"] = df["path"].apply(lambda x: x.split("/")[len(x.split("/"))-3])
df["categoria"] = df["path"].apply(lambda x: x.split("/")[len(x.split("/"))-2])
df["etiquettas"] = df["content"].apply(lambda x: x.split("\n")[0])

display(df)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df['lemma'])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df['vector'] = denselist
csv = df[['name','path', 'medio', 'categoria', 'etiquettas', 'vector']]
csv.to_csv("vectores.csv", index=False)
pickle.dump(vectorizer, open("vectorizer.file", 'wb'))
display(csv)


Unnamed: 0,name,path,content,tokens,lemma,medio,categoria,etiquettas
0,.DS_Store,/Users/joelplambeck/Documents/sistemas-intelig...,���Bud1����������������������������������...,"[bud1, avsrnlong, dsdb, à]",bud1 avsrnlong dsdb à,sistemas-inteligentes_codigo,elMundo,���Bud1����������������������������������...
0,salud.2021-12-07.001.txt,/Users/joelplambeck/Documents/sistemas-intelig...,"Ciencia y Salud, Vacunas, Covid 19, Coronaviru...","[ciencia, salud, vacunas, covid, 19, coronavir...",cienci salud vacun cov 19 coronavirus ocupaciã...,elMundo,salud,"Ciencia y Salud, Vacunas, Covid 19, Coronavirus"
1,salud.2021-12-06.004.txt,/Users/joelplambeck/Documents/sistemas-intelig...,"Coronavirus, Ciencia y Salud, Covid 19, Vacuna...","[coronavirus, ciencia, salud, covid, 19, vacun...",coronavirus cienci salud cov 19 vacun variant ...,elMundo,salud,"Coronavirus, Ciencia y Salud, Covid 19, Vacuna..."
2,salud.2021-12-07.002.txt,/Users/joelplambeck/Documents/sistemas-intelig...,Ciencia y Salud\n\nDemostrado: sÃ­ es posible ...,"[ciencia, salud, demostrado, sã, evitar, peso,...",cienci salud demostr sã evit pes nac 30 embara...,elMundo,salud,Ciencia y Salud
3,salud.2021-12-07.003.txt,/Users/joelplambeck/Documents/sistemas-intelig...,Ciencia y Salud\n\nAsÃ­ serÃ¡ un menÃº del fut...,"[ciencia, salud, asã, serã, menãº, futuro, hue...",cienci salud asã serã menãº futur huev sabor p...,elMundo,salud,Ciencia y Salud
...,...,...,...,...,...,...,...,...
30,ciencia.2022-01-08.004.txt,/Users/joelplambeck/Documents/sistemas-intelig...,"CocaÃ­na, Drogas, AzÃºcar\n\nQuÃ© tienen en co...","[cocaã, na, drogas, azãºcar, quã, comãºn, azãº...",cocaã na drog azãºcar quã comãºn azãºcar cocaã...,20minutos,ciencia,"CocaÃ­na, Drogas, AzÃºcar"
31,ciencia.2022-01-11.006.txt,/Users/joelplambeck/Documents/sistemas-intelig...,"AstronomÃ­a, Sol\n\nCientÃ­ficos observan por ...","[astronomã, sol, cientã, ficos, observan, cã³m...",astronomã sol cientã fic observ cã³mo exoplane...,20minutos,ciencia,"AstronomÃ­a, Sol"
32,ciencia.2022-01-10.003.txt,/Users/joelplambeck/Documents/sistemas-intelig...,"Software Libre, ArqueologÃ­a, Rueda de prensa,...","[software, libre, arqueologã, rueda, prensa, a...",softwar libr arqueologã rued prens agu cã diz ...,20minutos,ciencia,"Software Libre, ArqueologÃ­a, Rueda de prensa,..."
33,ciencia.2022-01-07.009.txt,/Users/joelplambeck/Documents/sistemas-intelig...,"EE UU, NASA, Astronautas, AeronÃ¡utica, Vivo\n...","[ee, uu, nasa, astronautas, aeronã, utica, viv...",ee uu nas astronaut aeronã utic viv nas lanz t...,20minutos,ciencia,"EE UU, NASA, Astronautas, AeronÃ¡utica, Vivo"


Unnamed: 0,name,path,medio,categoria,etiquettas,vector
0,.DS_Store,/Users/joelplambeck/Documents/sistemas-intelig...,sistemas-inteligentes_codigo,elMundo,���Bud1����������������������������������...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
0,salud.2021-12-07.001.txt,/Users/joelplambeck/Documents/sistemas-intelig...,elMundo,salud,"Ciencia y Salud, Vacunas, Covid 19, Coronavirus","[0.0, 0.05671864040784504, 0.0, 0.0, 0.0, 0.0,..."
1,salud.2021-12-06.004.txt,/Users/joelplambeck/Documents/sistemas-intelig...,elMundo,salud,"Coronavirus, Ciencia y Salud, Covid 19, Vacuna...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,salud.2021-12-07.002.txt,/Users/joelplambeck/Documents/sistemas-intelig...,elMundo,salud,Ciencia y Salud,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,salud.2021-12-07.003.txt,/Users/joelplambeck/Documents/sistemas-intelig...,elMundo,salud,Ciencia y Salud,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...
30,ciencia.2022-01-08.004.txt,/Users/joelplambeck/Documents/sistemas-intelig...,20minutos,ciencia,"CocaÃ­na, Drogas, AzÃºcar","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
31,ciencia.2022-01-11.006.txt,/Users/joelplambeck/Documents/sistemas-intelig...,20minutos,ciencia,"AstronomÃ­a, Sol","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
32,ciencia.2022-01-10.003.txt,/Users/joelplambeck/Documents/sistemas-intelig...,20minutos,ciencia,"Software Libre, ArqueologÃ­a, Rueda de prensa,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
33,ciencia.2022-01-07.009.txt,/Users/joelplambeck/Documents/sistemas-intelig...,20minutos,ciencia,"EE UU, NASA, Astronautas, AeronÃ¡utica, Vivo","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [10]:
path = "/Users/joelplambeck/Documents/sistemas-inteligentes_codigo/elPais/ciencia/ciencia.2021-03-01.001.txt"
parts = path.split("/")
print(parts[len(parts)-2])

ciencia
