In [69]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.impute import KNNImputer
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

# seed
np.random.seed(0)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hugo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hugo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [70]:
train = pd.read_csv('ml-desafio/train.csv', sep=",")
x_test = pd.read_csv('ml-desafio/x_test.csv', sep=",")

print(train.columns)
print(x_test.columns)
print("")

print(train.isna().sum())
print(x_test.isna().sum())

Index(['title', 'category'], dtype='object')
Index(['Id', 'title', 'Predicted'], dtype='object')

title       0
category    0
dtype: int64
Id               0
title            0
Predicted    71461
dtype: int64


In [71]:
tokenizer = TfidfVectorizer().build_tokenizer()#Return a function that splits a string into a sequence of tokens considering unicode characters
stemmer = SnowballStemmer("spanish") 
lemmatizer = WordNetLemmatizer()

##stop-words: el vectorizador ya las remueve pero para mostrar las palabras más frecuente tiene sentido activarlo
remove_stops_here = False

def pre_processing(text):
    results = []
    for token in tokenizer(text):
        clean_token = token.lower().strip().strip('-').strip('_')
        if remove_stops_here and (clean_token in stopwords.words('spanish')):
            continue
        #token_pro = stemmer.stem(clean_token) #podemos probar stemming en vez de lematizacion
        token_pro = lemmatizer.lemmatize(clean_token) 
        if len(token_pro) > 2 and not token_pro[0].isdigit(): #elimina palabra largo menor a 2
            results.append(token_pro)
    return results

train["new_title"] = train["title"].apply(pre_processing)
print(train["new_title"])

lr = LogisticRegression()
#lr.fit(train["title"], train["category"])

0         [bikini, malla, lunares, colores, colección, v...
1                          [kit, bujias, ngk, ford, bp5efs]
2         [campera, rompeviento, adidas, original, hard,...
3                     [vulk, danner, sblk, sg91, polarized]
4                                 [calza, frizada, termica]
                                ...                        
107186    [memoria, ddr4, kingston, cl15, hyperx, fury, ...
107187                [mallas, talles, grandes, importadas]
107188        [kit, bujias, originales, bmw, e53, m62, b46]
107189    [reloj, casio, unisex, retro, vintage, metal, ...
107190    [reloj, montreal, mujer, ml369, tienda, oficia...
Name: new_title, Length: 107191, dtype: object


In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

spanish_stopwords = stopwords.words('spanish')

max_features = 10000
max_df=0.8
min_df=2

### NUMBER OF N GRAMS TO CONSIDER
ngram_max = 1

#vectorizer = TfidfVectorizer(stop_words=spanish_stopwords,tokenizer=pre_processing,min_df=min_df, max_df=max_df, max_features=max_features,
 #                             binary=False, use_idf=True, smooth_idf=True, norm=None, ngram_range=(1, ngram_max))

vectorizer = CountVectorizer(stop_words=spanish_stopwords,tokenizer=pre_processing,min_df=min_df, max_df=max_df, max_features=max_features,
                              binary=True, ngram_range=(1, ngram_max))
#CountVectorizer
#TfidfVectorizer

vectorizer.fit(train['title'])
x_train_binary = vectorizer.transform(train['title'])

print(x_train_binary.shape)

(107191, 10000)


In [73]:
X = x_train_binary
Y = train["category"]

lr = LogisticRegression()
lr.fit(X, Y)

X_test = vectorizer.transform(x_test['title'])
print(lr.score(X, Y))
Y_predicted = lr.predict(X_test)

print(Y_predicted)
print(x_test["Id"])

0.9877974829976397
['NECKLACES' 'MICROPHONES' 'SMARTWATCHES' ... 'COMPUTER_PROCESSORS'
 'MOTORCYCLE_HELMETS' 'SPARK_PLUGS']
0            0
1            1
2            2
3            3
4            4
         ...  
71456    71456
71457    71457
71458    71458
71459    71459
71460    71460
Name: Id, Length: 71461, dtype: int64


In [75]:
print(x_test["Id"], type(x_test))
y = pd.DataFrame(data=Y_predicted, columns=["Predicted"])
print(y)
y_predicted = pd.concat([x_test["Id"], y], axis=1)
print(y_predicted)
y_predicted.to_csv("sample_submission.csv", index=False)

0            0
1            1
2            2
3            3
4            4
         ...  
71456    71456
71457    71457
71458    71458
71459    71459
71460    71460
Name: Id, Length: 71461, dtype: int64 <class 'pandas.core.frame.DataFrame'>
                 Predicted
0                NECKLACES
1              MICROPHONES
2             SMARTWATCHES
3       MOTORCYCLE_HELMETS
4                    PANTS
...                    ...
71456     DIECAST_VEHICLES
71457   RAM_MEMORY_MODULES
71458  COMPUTER_PROCESSORS
71459   MOTORCYCLE_HELMETS
71460          SPARK_PLUGS

[71461 rows x 1 columns]
          Id            Predicted
0          0            NECKLACES
1          1          MICROPHONES
2          2         SMARTWATCHES
3          3   MOTORCYCLE_HELMETS
4          4                PANTS
...      ...                  ...
71456  71456     DIECAST_VEHICLES
71457  71457   RAM_MEMORY_MODULES
71458  71458  COMPUTER_PROCESSORS
71459  71459   MOTORCYCLE_HELMETS
71460  71460          SPARK_PLUGS

