## Functions: Models and cleaning data

### Prepare and clean the data


Function to vectorize the corpus

Two types of vectorization process:

     - vectorizer1: matrix counting words. 

     - vectorizer2: word-count matrix reweighted by inverse document frequency (TF-IDF), which down-weights words that appear in many documents.

Clean data

    - tildes: Takes a string and replaces each accented vowel with the same vowel without the accent.
    - preprocessor: Removes HTML tags, lowercases the text, replaces non-word characters with spaces and appends any emoticons found to the end of the text.
    - tokenize: Splits the text on whitespace.


In [6]:
## Basics
import pandas as pd
import re
import os
from joblib import dump, load
#import text_cleaning  # Module in directory. 
## Vectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords 
from nltk.tokenize import sent_tokenize, word_tokenize
## Pipelines/Model Evaluation
from sklearn.model_selection import GridSearchCV #
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
## Classifiers
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import sent_tokenize, word_tokenize

In [8]:
# Spanish stop-word list from NLTK; offered as one option for
# "vect__stop_words" in the parameter grids below.
from nltk.corpus import stopwords 
stop_es= stopwords.words("spanish")

# vectorizer1: raw token counts, with lowercasing enabled.
vectorizer1 = CountVectorizer(lowercase=True, preprocessor=None)
# vectorizer2: TF-IDF weighted features; accents are kept and the text is
# NOT lowercased (strip_accents=None, lowercase=False).
vectorizer2 = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

def myTokenizer(sentence):
    """Tokenize *sentence* with NLTK and drop single non-alphabetic tokens.

    A token is kept when it is longer than one character or when its first
    character is alphabetic, so lone punctuation marks and lone digits are
    discarded.

    Args:
        sentence: Text to tokenize.

    Returns:
        List of the retained tokens, in their original order.
    """
    return [
        token
        for token in word_tokenize(sentence)
        if len(token) > 1 or token[0].isalpha()
    ]


# Shared Snowball stemmer configured for Spanish; used by tokenize_snowball.
snowballStemmer = SnowballStemmer('spanish')

def tokenize_snowball(text):
    """Split *text* on whitespace and stem every piece.

    Args:
        text: Text to tokenize.

    Returns:
        List with the stem of each whitespace-separated word, produced by
        the module-level ``snowballStemmer``.
    """
    stems = []
    for piece in text.split():
        stems.append(snowballStemmer.stem(piece))
    return stems


# Translation table mapping acute-accented vowels (both cases) to the plain
# vowel. Built once so every call is a single pass over the text.
_TILDES_TABLE = str.maketrans('áéíóúÁÉÍÓÚ', 'aeiouAEIOU')


def tildes(texto):
    """Remove acute accents (tildes) from Spanish vowels, e.g. 'é' -> 'e'.

    Both lower-case and upper-case accented vowels are handled. Every other
    character, including 'ñ' and 'ü', is left unchanged.

    Args:
        texto: Input text.

    Returns:
        A copy of *texto* with accented vowels replaced by plain vowels.
    """
    return texto.translate(_TILDES_TABLE)

def preprocessor(text):
    """Normalize raw text while preserving emoticons.

    Steps:
      1. Remove HTML-like tags (``<...>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` and ``:P``.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons (with the ``-`` nose removed) after a
         single separating space.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text. When no emoticon is found the result still ends
        with the separating space.
    """
    # Raw strings keep the regex escapes (\), \(, \W) valid; in a plain
    # string literal they are invalid escape sequences.
    text = re.sub(r"<[^>]*>", "", text)
    emoticons = re.findall(r"(?::|;|=)(?:-)?(?:\)|\(|D|P)", text)
    cleaned = re.sub(r"[\W]+", " ", text.lower())
    return cleaned + " " + " ".join(emoticons).replace("-", "")

def tokenize(text):
    """Split *text* into tokens on runs of whitespace.

    Args:
        text: Text to split.

    Returns:
        List of whitespace-separated tokens (empty for blank text).
    """
    tokens = text.split()
    return tokens

## Models 

I create a pipeline for each classification model, using different hyperparameters.
The parameter grid has multiple sets of parameters for the vectorizer and the ML model.
The aim is to compare the results obtained with different parameter configurations.


### Naive Bayes

In [9]:

# Naive Bayes pipeline: count vectorizer followed by a Bernoulli Naive Bayes
# classifier.
pip1 = Pipeline([("vect", vectorizer1),
                # CountVectorizer : turns each text into token counts
                ("clf", BernoulliNB(alpha=.01))])
                # BernoulliNB : Naive Bayes for binary (present/absent) features

# Hyperparameter grid searched for this pipeline
param_grid_nb = [{  "vect__ngram_range": [(1,1),(1,2)],
                    # Sizes of the contiguous word sequences used as features
                    # (1,1) : unigrams only
                    # (1,2) : unigrams and bigrams
                    "vect__stop_words": [stop_es, None],
                    # Words removed from the tokens
                    # stop_es : NLTK Spanish stop-word list
                    # None : no words removed
                    # NOTE(review): stop_es is not stemmed, so it may not match
                    # the tokens produced by tokenize_snowball - confirm.
                    "vect__tokenizer": [None,myTokenizer,tokenize_snowball],
                    # None : keep the vectorizer's default tokenization
                    # myTokenizer : use the function myTokenizer
                    # tokenize_snowball : use the function tokenize_snowball
                    "vect__binary":[True], 
                    # Non-zero counts are set to 1:
                    # the token either occurs in the document or it does not
                    "clf__fit_prior":[True,False]
                    # True : learn class prior probabilities from the data
                    # False : use a uniform prior
                }
                ]

# Grid search over the combinations above, scored by accuracy
nby1 = GridSearchCV(pip1, 
                    # Pipeline (vectorizer + classifier) to tune
                    param_grid_nb , 
                    # Parameter combinations to compare
                    scoring = "accuracy", 
                    # Metric used to rank the candidates
                    cv = 3, 
                    # Number of folds for cross validation
                    verbose = 1, 
                    # Print progress information while fitting
                    n_jobs = 4) # Number of jobs run in parallel



### Logit 

In [None]:

# Logistic regression pipeline: TF-IDF vectorizer followed by the classifier.
pip2 = Pipeline([("vect", vectorizer2),
                # TfidfVectorizer : turns each text into TF-IDF features
                ("clf", LogisticRegression(random_state=42))])
                # LogisticRegression : linear classifier

# Hyperparameter grids searched for this pipeline
param_grid_log = [{ ### First group: TF-IDF features, l2 penalty only

                    ## Parameters of TfidfVectorizer
                    "vect__ngram_range": [(1,1),(1,2)],
                    # Sizes of the contiguous word sequences used as features
                    # (1,1) : unigrams only
                    # (1,2) : unigrams and bigrams
                    "vect__stop_words": [stop_es, None],
                    # Words removed from the tokens
                    # stop_es : NLTK Spanish stop-word list
                    # None : no words removed
                    "vect__tokenizer": [None,myTokenizer,
                                  tokenize_snowball],
                    # None : keep the vectorizer's default tokenization
                    # myTokenizer : use the function myTokenizer
                    # tokenize_snowball : use the function tokenize_snowball

                    ## Hyperparameters of LogisticRegression
                    "clf__penalty": ["l2"],
                    # l2 : norm used in the penalization
                    "clf__C": [8.0, 12.0]
                    # Inverse of the regularization strength:
                    # smaller values mean stronger regularization
                },
                    ### Second group: raw term frequencies (no IDF, no
                    ### normalization), l1 and l2 penalties

                    ## Parameters of TfidfVectorizer
                {   "vect__ngram_range": [(1,1),(1,2)],
                    # Unigrams only, or unigrams and bigrams
                    "vect__stop_words": [stop_es, None],
                    # stop_es : NLTK Spanish stop-word list
                    # None : no words removed
                    "vect__tokenizer": [None,myTokenizer,
                                  tokenize_snowball],
                    # None : keep the vectorizer's default tokenization
                    # myTokenizer : use the function myTokenizer
                    # tokenize_snowball : use the function tokenize_snowball
                    "vect__use_idf": [False],
                    # False : disable inverse-document-frequency reweighting
                    # (the default is True)
                    "vect__norm": [None],
                    # Do not normalize the rows of the resulting matrix

                    ## Hyperparameters of LogisticRegression
                    "clf__solver": ["liblinear"],
                    # The default lbfgs solver does not support the l1
                    # penalty, so every l1 candidate would fail to fit;
                    # liblinear supports both l1 and l2.
                    "clf__penalty": ["l1","l2"],
                    # l1 / l2 : norm used in the penalization
                    "clf__C": [1.0, 10.0]}
                    # Inverse of the regularization strength:
                    # smaller values mean stronger regularization
             ]

# Grid search over the combinations above.
# NOTE(review): this is the only grid scored by recall (the others use
# accuracy); recall with default settings assumes binary labels - confirm.
log1 = GridSearchCV(pip2,
                    # Pipeline (vectorizer + classifier) to tune
                    param_grid_log,
                    # Parameter combinations to compare
                    scoring="recall",
                    # True positives / (true positives + false negatives)
                    cv=3,
                    # Number of folds for cross validation
                    verbose=1,
                    # Print progress information while fitting
                    n_jobs=4)  # Number of jobs run in parallel



### SVC

In [13]:
## SVC
# Support vector classifier pipeline: TF-IDF vectorizer followed by SVC.
pip3 = Pipeline([(  "vect", vectorizer2),
                    # TfidfVectorizer : turns each text into TF-IDF features
                    ("clf", SVC(random_state=42))])
                    # SVC : Support Vector Classification

# Vectorizer options shared by both kernel-specific grids below
vect_grid_svc = {   "vect__ngram_range": [(1,1),(1,2)],
                    # Sizes of the contiguous word sequences used as features
                    # (1,1) : unigrams only
                    # (1,2) : unigrams and bigrams
                    "vect__stop_words": [stop_es, None],
                    # Words removed from the tokens
                    # stop_es : NLTK Spanish stop-word list
                    # None : no words removed
                    "vect__tokenizer": [None,myTokenizer,
                                  tokenize_snowball]}
                    # None : keep the vectorizer's default tokenization
                    # myTokenizer : use the function myTokenizer
                    # tokenize_snowball : use the function tokenize_snowball

# One grid per kernel: gamma is a kernel coefficient that the linear kernel
# does not use, so crossing it with "linear" only repeats identical fits.
param_grid_svc = [{ **vect_grid_svc,
                    ## Linear kernel
                    "clf__kernel": ["linear"],
                    "clf__C": [.1, 1.0, 10.0]},
                    # Inverse of the regularization strength:
                    # smaller values mean stronger regularization
                  { **vect_grid_svc,
                    ## Radial basis function kernel
                    "clf__kernel": ["rbf"],
                    "clf__C": [.1, 1.0, 10.0],
                    # Inverse of the regularization strength
                    "clf__gamma": [.1, 1.0, 10.0,'auto']}]
                    # Kernel coefficient for rbf
                    # auto : 1/n_features

# Grid search over the combinations above, scored by accuracy
svc=GridSearchCV(pip3,
                # Pipeline (vectorizer + classifier) to tune
                param_grid_svc,
                # Parameter combinations to compare
                scoring="accuracy",
                # Metric used to rank the candidates
                cv=3,
                # Number of folds for cross validation
                verbose=1,
                # Print progress information while fitting
                n_jobs=4)  # Number of jobs run in parallel



### Random Forest

In [14]:
## Random Forest
# Random forest pipeline: count vectorizer followed by the classifier.
pip4 = Pipeline([(  "vect", vectorizer1),
                    # CountVectorizer : turns each text into token counts
                    ("clf", RandomForestClassifier(random_state=0))])
                    # Random Forest model

# Hyperparameter grid searched for this pipeline
param_grid_rf = [{"vect__ngram_range": [(1,1),(1,2)],
                    # Sizes of the contiguous word sequences used as features
                    # (1,1) : unigrams only
                    # (1,2) : unigrams and bigrams
                    "vect__stop_words": [stop_es, None],
                    # Words removed from the tokens
                    # stop_es : NLTK Spanish stop-word list
                    # None : no words removed
                    "vect__tokenizer": [None,myTokenizer,
                                  tokenize_snowball],
                    # None : keep the vectorizer's default tokenization
                    # myTokenizer : use the function myTokenizer
                    # tokenize_snowball : use the function tokenize_snowball

                    # Model hyperparameters
                    "clf__max_depth": [None,5,10,20],
                    # Maximum depth of each tree (None : no limit)
                    "clf__max_features": [None,'log2','sqrt'],
                    # The number of features to consider when looking for the best split
                    "clf__n_estimators": [10,20,30,100]}]
                    # Number of trees in the forest

# Grid search over the combinations above, scored by accuracy
rf1 = GridSearchCV(pip4,
                    # Pipeline (vectorizer + classifier) to tune
                    param_grid_rf,
                    # Parameter combinations to compare
                    scoring="accuracy",
                    # Metric used to rank the candidates
                    cv=3, 
                    # Number of folds for cross validation
                    verbose=1,
                    # Print progress information while fitting
                    n_jobs=4)# Number of jobs run in parallel

### K-Nearest Neighbors

In [None]:

# K-nearest-neighbors pipeline: TF-IDF vectorizer followed by the classifier.
pip5 = Pipeline([("vect", vectorizer2),
                    # TfidfVectorizer : turns each text into TF-IDF features
                    ("clf", KNeighborsClassifier())])
                    # KNN model

# Hyperparameter grid searched for this pipeline
param_grid_KKN = [{"vect__ngram_range": [(1,1),(1,2)],
                    # Sizes of the contiguous word sequences used as features
                    # (1,1) : unigrams only
                    # (1,2) : unigrams and bigrams
                    "vect__stop_words": [stop_es, None],
                    # Words removed from the tokens
                    # stop_es : NLTK Spanish stop-word list
                    # None : no words removed
                    "vect__tokenizer": [None,myTokenizer,
                                  tokenize_snowball],
                    # None : keep the vectorizer's default tokenization
                    # myTokenizer : use the function myTokenizer
                    # tokenize_snowball : use the function tokenize_snowball

                    ## Model Hyperparameters
                    "clf__n_neighbors": [3,5,7,10],
                    # Number of neighbors
                    "clf__weights": ['uniform','distance'],
                    # Weight function
                    # uniform : equal weight for each neighbor
                    # distance : closer neighbors weigh more
                    "clf__p": [1,2]}]
                    # Power parameter of the Minkowski metric
                    # 1 : Manhattan distance (L1)
                    # 2 : Euclidean distance (L2)

# Grid search over the combinations above, scored by accuracy
KKN = GridSearchCV(pip5, 
                    # Pipeline (vectorizer + classifier) to tune
                    param_grid_KKN,
                    # Parameter combinations to compare
                    scoring="accuracy",
                    # Metric used to rank the candidates
                    cv=3, 
                    # Number of folds for cross validation
                    verbose=1,
                    # Print progress information while fitting
                    n_jobs=4) # Number of jobs run in parallel

### Complement Bayes

In [None]:

# Complement Naive Bayes pipeline: count vectorizer followed by the classifier.
pip6 = Pipeline([("vect", vectorizer1),
                    # CountVectorizer : turns each text into token counts
                    ("clf", ComplementNB(alpha=.01))])
                    # ComplementNB model

# Hyperparameter grid searched for this pipeline
param_grid_cnb = [{"vect__ngram_range": [(1,1),(1,2)],
                    # Sizes of the contiguous word sequences used as features
                    # (1,1) : unigrams only
                    # (1,2) : unigrams and bigrams
                    "vect__stop_words": [stop_es, None],
                    # Words removed from the tokens
                    # stop_es : NLTK Spanish stop-word list
                    # None : no words removed
                    "vect__tokenizer": [None,myTokenizer,tokenize_snowball],
                    # None : keep the vectorizer's default tokenization
                    # myTokenizer : use the function myTokenizer
                    # tokenize_snowball : use the function tokenize_snowball
                   "vect__binary":[True],
                    # Non-zero counts are set to 1:
                    # the token either occurs in the document or it does not

                    ## Model Hyperparameters
                    "clf__fit_prior":[True,False],
                    # Whether class priors are learned; per the scikit-learn
                    # docs this only matters in the edge case of a single
                    # class in the training data.
                    "clf__norm":[True,False]}
                    # Whether a second normalization of the weights is performed
             ]

# Grid search over the combinations above, scored by accuracy
cnb = GridSearchCV(pip6, 
                    # Pipeline (vectorizer + classifier) to tune
                    param_grid_cnb,
                    # Parameter combinations to compare
                    scoring="accuracy",
                    # Metric used to rank the candidates
                    cv=3, 
                    # Number of folds for cross validation
                    verbose=1,
                    # Print progress information while fitting
                    n_jobs=4)# Number of jobs run in parallel



### Linear SVC

In [18]:
# Linear SVC pipeline: TF-IDF vectorizer followed by the classifier.
# random_state is fixed, like for the other classifiers in this notebook,
# so repeated runs give the same result.
pip7 = Pipeline([("vect", vectorizer2),
                    # TfidfVectorizer : turns each text into TF-IDF features
                    ("clf", LinearSVC(random_state=42))])
                    # Linear SVC model

# Options shared by every group of the grid below
base_grid_Lsvc = {  "vect__ngram_range": [(1,1),(1,2)],
                    # Sizes of the contiguous word sequences used as features
                    # (1,1) : unigrams only
                    # (1,2) : unigrams and bigrams
                    "vect__stop_words": [stop_es, None],
                    # Words removed from the tokens
                    # stop_es : NLTK Spanish stop-word list
                    # None : no words removed
                    "vect__tokenizer": [None,myTokenizer,
                                  tokenize_snowball],
                    # None : keep the vectorizer's default tokenization
                    # myTokenizer : use the function myTokenizer
                    # tokenize_snowball : use the function tokenize_snowball
                    "clf__C": [.1, 1.0, 10.0]}
                    # Inverse of the regularization strength:
                    # smaller values mean stronger regularization

# Three non-overlapping groups. Each effective configuration is fitted once:
# the pair (l2, dual=False) is no longer listed twice, and penalty/dual are
# not varied under crammer_singer, which ignores both options.
param_grid_Lsvc = [{**base_grid_Lsvc,
                    ## One-vs-rest with l1 penalty (primal formulation only)
                    "clf__multi_class":["ovr"],
                    "clf__penalty":["l1"],
                    "clf__dual":[False]},

                   {**base_grid_Lsvc,
                    ## One-vs-rest with l2 penalty
                    "clf__multi_class":["ovr"],
                    "clf__penalty":["l2"],
                    "clf__dual":[True,False]},
                    # True : dual optimization
                    # False : primal optimization

                   {**base_grid_Lsvc,
                    ## Crammer-Singer: joint objective over all classes
                    "clf__multi_class":["crammer_singer"]}]

# Grid search over the combinations above, scored by accuracy
Lsvc=GridSearchCV(  pip7,
                    # Pipeline (vectorizer + classifier) to tune
                    param_grid_Lsvc,
                    # Parameter combinations to compare
                    scoring="accuracy",
                    # Metric used to rank the candidates
                    cv=3,
                    # Number of folds for cross validation
                    verbose=1,
                    # Print progress information while fitting
                    n_jobs=4)  # Number of jobs run in parallel



### Wrap up estimation

We write two functions to estimate all the models at once with the same data.

- estimation
This function takes the nby1, svc, rf1, KKN and Lsvc grid objects in a list and fits each of them to the data.
Then it stores the results in a dataframe.

- estimation 2
This function takes the nby1 and Lsvc grid objects in a list and fits each of them to the data.
Then it stores the results in a dataframe.

#### Pendings

Get the fpr, tpr, and thresholds and the number of fits by model


In [22]:
# Results table filled by estimation() / estimation2(): one row per model
# with the country label, the model name and the test-set accuracy.
scores=pd.DataFrame(columns=['Country','model','accuracy',])

# TODO: also record, per model, the ROC data (fpr, tpr, thresholds from
# roc_curve / metrics.auc) and the number of fits, i.e.
# model.n_splits_ * len(model.cv_results_["mean_fit_time"]); an earlier
# attempt using a separate `fits` DataFrame did not work.
# TODO: accept the models as arguments of the public functions.


def _fit_and_score(models, names, X_train_texto, y_train, X_test_texto, y_test, Country):
    '''
    Fit each grid search, print its confusion matrix and store its accuracy.

    Row i of the module-level `scores` frame is written for the i-th model,
    so rows left by a previous call are overwritten.
    NOTE(review): results for several countries therefore do not accumulate
    in `scores`; copy the frame between calls if that is needed - confirm
    this is the intended behaviour.
    '''
    for row, (name, model) in enumerate(zip(names, models)):
        model.fit(X_train_texto, y_train)
        pred = model.predict(X_test_texto)
        print(metrics.confusion_matrix(y_test, pred))
        accuracy = metrics.accuracy_score(y_test, pred)
        scores.loc[row] = [Country, name, accuracy]


def estimation(X_train_texto, y_train, X_test_texto, y_test, Country):
    '''
    Fits the Naive Bayes, SVC, Random Forest, K-nearest-neighbors and
    Linear SVC grid searches (nby1, svc, rf1, KKN, Lsvc) on the training
    data and evaluates each one on the test data.

    For every model the confusion matrix is printed and a row
    [Country, model name, accuracy] is written to `scores` (rows 0-4).

    input= training texts and labels, test texts and labels, country label
    output= None (results are stored in `scores`)
    '''
    models=[nby1,svc,rf1,KKN,Lsvc]
    names=["NaiveBayes","SVC","RandomF","KNeigbhbor","Linear_SVC"]
    _fit_and_score(models, names, X_train_texto, y_train,
                   X_test_texto, y_test, Country)


def estimation2(X_train_texto, y_train, X_test_texto, y_test, Country):
    '''
    Reduced version of estimation: fits only the Naive Bayes and Linear SVC
    grid searches (nby1, Lsvc).

    For every model the confusion matrix is printed and a row
    [Country, model name, accuracy] is written to `scores` (rows 0-1).

    input= training texts and labels, test texts and labels, country label
    output= None (results are stored in `scores`)
    '''
    models=[nby1,Lsvc]
    names=["nby1","Linear_SVC"]
    _fit_and_score(models, names, X_train_texto, y_train,
                   X_test_texto, y_test, Country)

