# Documentation

# Imports

In [1]:
import marvin_redeneural_engine
from marvin_python_toolbox.common.data import MarvinData
import sys 
!{sys.executable} -m pip install nltk 
!{sys.executable} -m pip install sklearn

You should consider upgrading via the '/home/celesde/.virtualenvs/redeneural-engine-env/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/celesde/.virtualenvs/redeneural-engine-env/bin/python -m pip install --upgrade pip' command.[0m


# Acquisitor and Cleaning

In [2]:
from marvin_python_toolbox.common.data import MarvinData
import pandas as pd
import nltk
from nltk.corpus import stopwords

products_data = pd.read_csv(MarvinData.download_file("https://s3.amazonaws.com/automl-example/produtos.csv"), delimiter=';', encoding='utf-8')
# concatenando as colunas nome e descricao
products_data['informacao'] = products_data['nome'] + products_data['descricao']
# excluindo linhas com valor de informacao ou categoria NaN
products_data.dropna(subset=['informacao', 'categoria'], inplace=True)
products_data.drop(columns=['nome', 'descricao'], inplace=True)

stop_words=set(stopwords.words("portuguese"))
# transforma a string em caixa baixa e remove stopwords
products_data['sem_stopwords'] = products_data['informacao'].str.lower().apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

tokenizer = nltk.RegexpTokenizer(r"\w+")
products_data['tokens'] = products_data['sem_stopwords'].apply(tokenizer.tokenize) # aplica o regex tokenizer
products_data.drop(columns=['sem_stopwords','informacao'],inplace=True) # Exclui as colunas antigas

products_data["strings"]= products_data["tokens"].str.join(" ") # reunindo cada elemento da lista
products_data.head()
marvin_initial_dataset = products_data


# Traning Preparator

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

X_train,X_test,y_train,y_test = train_test_split( # Separação dos dados para teste e treino
    marvin_initial_dataset["strings"], 
    marvin_initial_dataset["categoria"], 
    test_size = 0.2, 
    random_state = 10
)

# Converte as strings para uma matriz de contagem dos tokens
vect = TfidfVectorizer()
vect.fit(marvin_initial_dataset["strings"])

marvin_dataset = { 
    "X_train": vect.fit_transform(X_train), # extrai o vocabulário dos dados 
    "X_test": vect.transform(X_test),  # transforma os dados de acordo com o vocabulário
    "y_train": y_train,
    "y_test" : y_test,
    "vect": vect
}

# Trainer

In [4]:
# Uma rede neural Perceptron Multicamadas
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(70, ), random_state=1, verbose=True)
clf.fit(marvin_dataset["X_train"], marvin_dataset["y_train"]) # Treino do classificador

marvin_model = {
    "clf" : clf,
    "vect": marvin_dataset["vect"]
}


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =      2249174     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38741D+00    |proj g|=  5.36022D-02


 This problem is unconstrained.



At iterate    1    f=  1.38674D+00    |proj g|=  1.24442D-01

At iterate    2    f=  1.34109D+00    |proj g|=  3.04135D-02

At iterate    3    f=  1.30997D+00    |proj g|=  2.98266D-02

At iterate    4    f=  1.23044D+00    |proj g|=  1.01605D-01

At iterate    5    f=  1.05492D+00    |proj g|=  1.85251D-01

At iterate    6    f=  5.35893D-01    |proj g|=  3.29356D-01

At iterate    7    f=  4.53995D-01    |proj g|=  2.98525D-01

At iterate    8    f=  3.65029D-01    |proj g|=  8.47277D-02

At iterate    9    f=  3.34068D-01    |proj g|=  9.19654D-02

At iterate   10    f=  2.91214D-01    |proj g|=  9.74550D-02

At iterate   11    f=  2.50417D-01    |proj g|=  6.39296D-02

At iterate   12    f=  1.91825D-01    |proj g|=  3.40539D-02

At iterate   13    f=  1.44194D-01    |proj g|=  2.22013D-02

At iterate   14    f=  9.15443D-02    |proj g|=  2.46002D-02

At iterate   15    f=  4.18777D-02    |proj g|=  1.30147D-02

At iterate   16    f=  3.17063D-02    |proj g|=  5.30210D-02

At iter

# Metrics Evaluator

In [5]:
from sklearn.metrics import accuracy_score
from sklearn import metrics
# Multinomial NB
y_prediction = marvin_model["clf"].predict(marvin_dataset["X_test"])
accuracy = accuracy_score(y_prediction, marvin_dataset["y_test"])

print(accuracy)
marvin_metrics = accuracy

0.9845890410958904


In [6]:
input_message = ["Figura Transformers Prime War Deluxe - E9687 - Hasbro"]

# Prediction Preparator

In [7]:
print(input_message)
input_message = marvin_model["vect"].transform(input_message)

['Figura Transformers Prime War Deluxe - E9687 - Hasbro']


# Predictor

In [8]:
final_prediction = marvin_model["clf"].predict(input_message)[0]
print("Predicted value: " + final_prediction)

Predicted value: brinquedo
