In [41]:
from os import makedirs, path

import joblib
import nltk
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split


from scripts.linear_classifiers_functions import Tokenizer, Embedder


In [33]:
!pip freeze > requirements.txt

In [21]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jmart130\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jmart130\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jmart130\AppData\Roaming\nltk_data...


True

In [26]:
# file to generate all files
OUTPUT_DIR = "./model/"

## TRAINING
# training file
TRAINING_INPUT_FILE = "./training_data.csv"
# text sequences columns to read
TRAINING_INPUT_COLUMN = "text"
# column to predict
TRAINING_LABEL_COLUMN = "label"

In [16]:

## PREDICTION
# path to prediction file
TESTING_INPUT_FILE = "./training_data.csv"
# column  with text sequences to predict
TESTING_INPUT_COLUMN = "paragraph"
# embedding mode 
#   'ohe':  One Hot Encoding
#   'tf':   Term Frecuency
#   'tfidf: TF-IDF
TESTING_LABEL_COLUMN = "label"

EMBEDDING_METHODS = [
    'ohe',
    'tf',
    'tfidf'  
    ]

In [27]:
#loading data
df_original = pd.read_csv(TRAINING_INPUT_FILE)
categories = list(np.unique(df_original[TRAINING_LABEL_COLUMN]))

# ETL
df = df_original.copy()
tokenizer = Tokenizer()
tokenizer.fit(list(df[TRAINING_INPUT_COLUMN]),
              min_token_aparison=5,
              stopwords= list(nltk.corpus.stopwords.words('english'))
)
# save the tokenizer 
if(not path.exists(OUTPUT_DIR)):
  makedirs(OUTPUT_DIR)
  
tokenizer.save("{}/tokenizer.pickle".format(OUTPUT_DIR))


In [29]:
embedders = {}
embedder = Embedder(tokenizer.VOCAB)

df["tokenized"] = tokenizer.transform(list(df[TRAINING_INPUT_COLUMN]))

for method in EMBEDDING_METHODS:
  print(method)
  embedders[method] = embedder
  embedders[method].fit(list(df["tokenized"]),mode=method)
  df[method] = list(embedders[method].transform(list(df["tokenized"])))
  # save the embedders
  embedders[method].save("{}/embedder_{}.pickle".format(OUTPUT_DIR,method))
  


ohe
tf
tfidf


In [40]:
train = df
classifiers = {}
mode = 'tfidf'

model_name = "RFC"

classifiers[model_name] = RandomForestClassifier(
    random_state=0)

classifiers[model_name].fit(
    X= list(train[mode]),
    y= list(train[TRAINING_LABEL_COLUMN])
)

In [43]:
model_name = "KNN"

classifiers[model_name] = KNeighborsClassifier(
    n_neighbors=5)


classifiers[model_name].fit(
    X= list(train[mode]),
    y= list(train[TRAINING_LABEL_COLUMN])
)

In [44]:
for model in classifiers:
  joblib.dump(classifiers[model],"{}/{}_{}.joblib".format(OUTPUT_DIR,model,method),compress=9)