In [19]:
import pandas as pd
import os, pickle
from IPython.display import display
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

def tokenize(x):
    return RegexpTokenizer(r'\w+').tokenize(x.lower())

def removeStopwords(x):
    with open("stopWords_es.txt") as f:
        text = f.read()
        prohibitedWords = text.split("\n")
        return [word for word in x if not word in prohibitedWords]

def stemming(x):
    stemmer = SnowballStemmer(language="spanish")
    return ' '.join([stemmer.stem(word) for word in x])

def generateDF(path):
    df = pd.DataFrame({"name": [], "path": [], "content":[]})
    files = os.listdir(path)
    for file in files:
        if file.startswith("."): continue
        file_path = os.path.join(path, file)
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding="ISO-8859-1") as f:
                df = df.append({"name": file, "path": file_path, "content": f.read()}, ignore_index=True)
        elif os.path.isdir(file_path):
            df = df.append(generateDF(file_path))

    return df

def splitall(path):
    allparts = []
    while 1:
        parts = os.path.split(path)
        if parts[0] == path:  # sentinel for absolute paths
            allparts.insert(0, parts[0])
            break
        elif parts[1] == path: # sentinel for relative paths
            allparts.insert(0, parts[1])
            break
        else:
            path = parts[0]
            allparts.insert(0, parts[1])
    return allparts

df = pd.DataFrame({"name": [], "path": [], "content":[]})
df = df.append(generateDF(os.path.join(os.getcwd(), "elMundo")))
df = df.append(generateDF(os.path.join(os.getcwd(), "elPais")))
df = df.append(generateDF(os.path.join(os.getcwd(), "20minutos")))
df['tokens'] = df['content'].map(tokenize)
df['tokens'] = df['tokens'].map(removeStopwords)
df['lemma'] = df['tokens'].map(stemming)

df["medio"] = df["path"].apply(lambda x: splitall(x)[len(splitall(x))-3])
df["categoria"] = df["path"].apply(lambda x: splitall(x)[len(splitall(x))-2])
df["etiquettas"] = df["content"].apply(lambda x: x.split("\n")[0])

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df['lemma'])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df['vector'] = denselist
csv = df[['name','path', 'medio', 'categoria', 'etiquettas', 'vector']]
csv.to_csv("vectores.csv", index=False)
pickle.dump(vectorizer, open("vectorizer.file", 'wb'))
display(csv)


Unnamed: 0,name,path,content,tokens,lemma,medio,categoria,etiquettas
0,elMundo.salud.2022-01-17.004.txt,/Users/joelplambeck/Documents/sistemas-intelig...,"Ciencia y Salud, Covid 19, Coronavirus, Varian...","[ciencia, salud, covid, 19, coronavirus, varia...",cienci salud cov 19 coronavirus variant ã micr...,elMundo,salud,"Ciencia y Salud, Covid 19, Coronavirus, Varian..."
1,elMundo.salud.2022-01-21.006.txt,/Users/joelplambeck/Documents/sistemas-intelig...,"Coronavirus, Estados Unidos, ciencia, Covid 19...","[coronavirus, unidos, ciencia, covid, 19, mari...",coronavirus unid cienci cov 19 marihuan cannab...,elMundo,salud,"Coronavirus, Estados Unidos, ciencia, Covid 19..."
2,elMundo.salud.2022-01-20.003.txt,/Users/joelplambeck/Documents/sistemas-intelig...,"HBPR, Coronavirus, Covid 19\n\nQuÃ© quiere dec...","[hbpr, coronavirus, covid, 19, quã, realmente,...",hbpr coronavirus cov 19 quã realment conviv vi...,elMundo,salud,"HBPR, Coronavirus, Covid 19"
3,elMundo.salud.2022-01-20.002.txt,/Users/joelplambeck/Documents/sistemas-intelig...,"Coronavirus, Covid 19, Ciencia y Salud, Varian...","[coronavirus, covid, 19, ciencia, salud, varia...",coronavirus cov 19 cienci salud variant ã micr...,elMundo,salud,"Coronavirus, Covid 19, Ciencia y Salud, Varian..."
4,elMundo.salud.2022-01-21.005.txt,/Users/joelplambeck/Documents/sistemas-intelig...,"Coronavirus, Covid 19, CardiologÃ­a\n\nEl daÃ±...","[coronavirus, covid, 19, cardiologã, daã, card...",coronavirus cov 19 cardiologã daã cardã aco pa...,elMundo,salud,"Coronavirus, Covid 19, CardiologÃ­a"
...,...,...,...,...,...,...,...,...
30,20minutos.ciencia.2022-01-18.003.txt,/Users/joelplambeck/Documents/sistemas-intelig...,"NASA, Asteroides\n\nHorario y cÃ³mo ver el ast...","[nasa, asteroides, horario, cã³mo, asteroide, ...",nas asteroid horari cã³mo asteroid potencial p...,20minutos,ciencia,"NASA, Asteroides"
31,20minutos.ciencia.2022-01-18.002.txt,/Users/joelplambeck/Documents/sistemas-intelig...,"NASA, La Tierra, Asteroides\n\nQuÃ© posibilida...","[nasa, tierra, asteroides, quã, posibilidades,...",nas tierr asteroid quã posibil asteroid potenc...,20minutos,ciencia,"NASA, La Tierra, Asteroides"
32,20minutos.ciencia.2022-01-19.005.txt,/Users/joelplambeck/Documents/sistemas-intelig...,"UPV, Gripe, Coronavirus, Covid-19, Virus respi...","[upv, gripe, coronavirus, covid, 19, virus, re...",upv grip coronavirus cov 19 virus respiratori ...,20minutos,ciencia,"UPV, Gripe, Coronavirus, Covid-19, Virus respi..."
33,20minutos.ciencia.2022-01-18.001.txt,/Users/joelplambeck/Documents/sistemas-intelig...,"EE UU, Vacunas, Placebo, SÃ­ntomas, Covid-19, ...","[ee, uu, vacunas, placebo, sã, ntomas, covid, ...",ee uu vacun placeb sã ntom cov 19 pfiz efect n...,20minutos,ciencia,"EE UU, Vacunas, Placebo, SÃ­ntomas, Covid-19, ..."


Unnamed: 0,name,path,medio,categoria,etiquettas,vector
0,elMundo.salud.2022-01-17.004.txt,/Users/joelplambeck/Documents/sistemas-intelig...,elMundo,salud,"Ciencia y Salud, Covid 19, Coronavirus, Varian...","[0.0, 0.06477782528561343, 0.0, 0.0, 0.0, 0.0,..."
1,elMundo.salud.2022-01-21.006.txt,/Users/joelplambeck/Documents/sistemas-intelig...,elMundo,salud,"Coronavirus, Estados Unidos, ciencia, Covid 19...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,elMundo.salud.2022-01-20.003.txt,/Users/joelplambeck/Documents/sistemas-intelig...,elMundo,salud,"HBPR, Coronavirus, Covid 19","[0.0, 0.01580797794592745, 0.0, 0.0, 0.0, 0.0,..."
3,elMundo.salud.2022-01-20.002.txt,/Users/joelplambeck/Documents/sistemas-intelig...,elMundo,salud,"Coronavirus, Covid 19, Ciencia y Salud, Varian...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,elMundo.salud.2022-01-21.005.txt,/Users/joelplambeck/Documents/sistemas-intelig...,elMundo,salud,"Coronavirus, Covid 19, CardiologÃ­a","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...
30,20minutos.ciencia.2022-01-18.003.txt,/Users/joelplambeck/Documents/sistemas-intelig...,20minutos,ciencia,"NASA, Asteroides","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
31,20minutos.ciencia.2022-01-18.002.txt,/Users/joelplambeck/Documents/sistemas-intelig...,20minutos,ciencia,"NASA, La Tierra, Asteroides","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
32,20minutos.ciencia.2022-01-19.005.txt,/Users/joelplambeck/Documents/sistemas-intelig...,20minutos,ciencia,"UPV, Gripe, Coronavirus, Covid-19, Virus respi...","[0.0, 0.025808406542510466, 0.0, 0.0, 0.0, 0.0..."
33,20minutos.ciencia.2022-01-18.001.txt,/Users/joelplambeck/Documents/sistemas-intelig...,20minutos,ciencia,"EE UU, Vacunas, Placebo, SÃ­ntomas, Covid-19, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [29]:
import os, sys
def splitall(path):
    allparts = []
    while 1:
        parts = os.path.split(path)
        if parts[0] == path:  # sentinel for absolute paths
            allparts.insert(0, parts[0])
            break
        elif parts[1] == path: # sentinel for relative paths
            allparts.insert(0, parts[1])
            break
        else:
            path = parts[0]
            allparts.insert(0, parts[1])
    return allparts
path = "/Users/joelplambeck/Documents/sistemas-inteligentes_codigo/elPais/ciencia/ciencia.2021-03-01.001.txt"
parts = path.split("/")
print(parts[len(parts)-2])
print(splitall(path))

ciencia
['/', 'Users', 'joelplambeck', 'Documents', 'sistemas-inteligentes_codigo', 'elPais', 'ciencia', 'ciencia.2021-03-01.001.txt']
