In [1]:
import numpy as np
import pandas as pd
import unicodedata
from nltk.corpus import stopwords

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer

In [2]:
# Base data directory and the train/test CSV paths derived from it.
ruta_data = "../data"
ruta_train = f"{ruta_data}/train.csv"
ruta_test = f"{ruta_data}/test.csv"

In [3]:
def strip_accents(text):
    """Return ``text`` with diacritics removed, as an ASCII-only ``str``.

    The text is decomposed with Unicode NFD normalization and then encoded
    to ASCII ignoring errors, so combining accents (and any other character
    that has no ASCII representation) are dropped.

    Parameters
    ----------
    text : str or bytes
        Text to clean. ``bytes`` input is decoded as UTF-8 first.

    Returns
    -------
    str
        The ASCII-only version of ``text``.

    Raises
    ------
    UnicodeDecodeError
        If ``text`` is ``bytes`` that are not valid UTF-8.
    TypeError
        If ``text`` is neither ``str`` nor ``bytes``.
    """
    # The former Python 2 shim (``unicode(text, 'utf-8')``) never runs on
    # Python 3, which left byte strings undecoded; decode them explicitly.
    if isinstance(text, (bytes, bytearray)):
        text = text.decode("utf-8")

    decomposed = unicodedata.normalize("NFD", text)
    # After the ASCII round-trip only plain ASCII characters remain.
    return decomposed.encode("ascii", "ignore").decode("ascii")

In [4]:
# Spanish stop words from NLTK, extended with their accent-stripped
# variants so the final set contains both spellings of every word.
stop_words_sp = set(stopwords.words('spanish'))
aux = {strip_accents(word) for word in stop_words_sp}
stop_words_sp = stop_words_sp | aux

In [5]:
# Load the training-descriptions CSV from the data directory.
ruta_descr_train = f"{ruta_data}/descripciones_train_limpias.csv"
descr_train = pd.read_csv(ruta_descr_train)

In [6]:
# Replace every missing value with an empty string
# (presumably so the text vectorizer only receives strings -- TODO confirm).
descr_train = descr_train.fillna(value="")

In [7]:
# TF-IDF vectorizer configuration:
#   max_df=0.95      -> ignore terms present in more than 95% of the documents
#   min_df=2         -> ignore terms present in fewer than 2 documents
#   max_features=300 -> keep at most 300 terms in the vocabulary
# scikit-learn documents ``stop_words`` as 'english', a list or None, and
# recent releases reject a set during parameter validation, so the stop-word
# set is passed as a list (sorted to give it a stable order).
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    stop_words=sorted(stop_words_sp),
    max_features=300,
)

In [8]:
# Fit the vectorizer on the "descripcion" column and build the
# document-term TF-IDF matrix in a single pass.
descripciones = descr_train["descripcion"]
term_doc_m = tfidf_vectorizer.fit_transform(descripciones)

In [28]:
# Truncated SVD with 100 components. The default solver is randomized, so a
# fixed random_state is set to make the generated features reproducible
# across kernel restarts.
svd = TruncatedSVD(n_components=100, random_state=42)

In [29]:
# Fit the SVD on the TF-IDF matrix and get the reduced representation of
# every document in one call.
mat_conceptos = svd.fit_transform(X=term_doc_m)

In [32]:
# Wrap the reduced matrix in a DataFrame with columns lsa0..lsaN-1. N is read
# from the matrix itself instead of being hard-coded, so the column names
# always match the number of components actually produced.
n_conceptos = mat_conceptos.shape[1]
conceptos_train = pd.DataFrame(
    mat_conceptos, columns=[f"lsa{i}" for i in range(n_conceptos)]
)

In [35]:
# Write the LSA features to CSV without the index column.
# NOTE(review): the file name ends in "_500", which does not obviously match
# the sizes used in this notebook -- confirm the intended name.
ruta_conceptos_train = f"{ruta_data}/features/TFIDF_descripciones_train_500.csv"
conceptos_train.to_csv(ruta_conceptos_train, index=False)

---

In [9]:
# Dense copy of the TF-IDF matrix as a DataFrame with columns tf0..tfN-1.
# N is taken from the matrix because max_features is only an upper bound on
# the vocabulary size; a hard-coded count raises if fewer terms survive.
# toarray() yields a plain ndarray rather than the discouraged np.matrix
# returned by todense().
n_terminos = term_doc_m.shape[1]
tfidf_vect_train = pd.DataFrame(
    term_doc_m.toarray(), columns=[f"tf{i}" for i in range(n_terminos)]
)

In [10]:
# Write the raw TF-IDF features to CSV without the index column.
ruta_tfidf_train = f"{ruta_data}/features/tfidf_vect_train_300.csv"
tfidf_vect_train.to_csv(ruta_tfidf_train, index=False)