# Task 01


Create a benchmark analysis with different algorithms and feature extractors:

Dataset: Fetch 20 Newsgroups (same as in class work)
Algorithms: Multinomial Naïve Bayes, Logistic Regression, Support Vector Machines, Decision Trees
Feature Extractors: CountVectorizer, Word2Vec, Doc2Vec and so on

Benchmark all the possible above configurations and choose the best algorithm and feature extractor amongst all configurations and put it in a .txt or .doc file in a tabular format.


In [1]:
# from pprint import pprint
from time import time
# import logging
import warnings
import re
import pandas as pd
import numpy as np
from sklearn import utils
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import gensim
from gensim.models import Word2Vec
from gensim.models import Doc2Vec
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from multiprocessing import Pool
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
# To see all the content in the column
pd.set_option('max_colwidth', 1000)
warnings.filterwarnings(action='ignore')

In [2]:
# Load some categories from the training set
categories = [
              'comp.graphics',
              'talk.politics.guns'
             ]

In [3]:
print("Loading 20 newsgroups dataset for categories:")
print(categories)

Loading 20 newsgroups dataset for categories:
['comp.graphics', 'talk.politics.guns']


### Fetch documents for these 2 categories

In [4]:
data = fetch_20newsgroups(subset='train', categories=categories)
print(f"{len(data.filenames)} documents")
print(f"{len(data.target_names)} categories")
print()

1130 documents
2 categories



### Checking content

In [5]:
print(data.target)

[1 0 0 ... 0 1 0]


In [6]:
data.data[0]



### Define a pipeline combining a text feature extractor with a simple classifier

### Find the best parameters for both the feature extraction and the classifier

# Defining Models:

### Word2Vec


In [7]:
# Defining Training and testing
X=[]
for sentence in data.data:
    sentence_tokenised = word_tokenize(sentence)
    X.append(sentence_tokenised)
y=data.target
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split (data.data, data.target , test_size=0.2)
# print(len(X_train), len(y_train), len(X_test),len(y_test))

In [8]:
X_train[1]

'From: cescript@mtu.edu (Charles Scripter)\nSubject: Re: Some more about gun control...\nNntp-Posting-Host: fishlab3.fsh.mtu.edu\nOrganization: Help, my server\'s fallen, and can\'t get up (MTU)\nX-Newsreader: TIN [version 1.1 PL8]\nLines: 185\n\nIn article <C5Bu9M.2K7@ulowell.ulowell.edu>\njrutledg@cs.ulowell.edu (John Lawrence Rutledge) wrote:\n\n> In article <1q96tpINNpcn@gap.caltech.edu> arc@cco.caltech.edu\n> (Aaron Ray Clements) writes:\n> >The Second Amendment is a guarantee of the right to bear arms.  Clearly\n> >and unequivocally, without infringement.\n\n> Unfortunately the Second Amendment is not as clear as you state.  If last \n> part of it is taken along, it follows what you have said.  The problem\n> I have is with the first part of the single sentence which makes up the\n> amendment.  The Second Amendment is:\n\n> \tA well regulated militia, being necessary to the security \n                         ^^^^^^^ Militia\n\n> \tof a free state, the right of the people to keep

In [9]:
y_train[1]

1

In [10]:
%time
# Train the word2vec model
model = Word2Vec(sentences = X_train, 
                 vector_size  = 300, 
                 sg = 0, # sg=0: cbow;  sg=1: skipgram
                 window = 5, 
                 min_count = 2, 
                 epochs = 30, 
                 workers = Pool()._processes)

CPU times: total: 0 ns
Wall time: 0 ns


In [11]:
# Generate aggregated sentence vectors based on the word vectors for each word in the sentence
words = set(model.wv.index_to_key )
X_train_vect = np.array([np.array([model.wv[i] for i in ls if i in words])  
                         for ls in X_train])
X_test_vect = np.array([np.array([model.wv[i] for i in ls if i in words])   
                        for ls in X_test])

In [12]:
# print(len(X_train_vect), len(X_test_vect))
print(X_train_vect[:2], X_test_vect[:2])

[array([[ 1.4822696 ,  0.5172229 ,  0.00353569, ...,  0.9546994 ,
         -1.560929  ,  0.08340311],
        [-0.43393812,  0.06147608,  0.4284799 , ...,  0.23191051,
          0.40643778, -0.13344742],
        [-0.20166253, -0.26063353,  0.2960698 , ..., -0.04605021,
          0.15982087,  0.0581563 ],
        ...,
        [ 0.20323786, -0.05370469,  0.17035607, ...,  0.21469137,
          0.01777811,  0.34219688],
        [-0.6061727 , -0.20583513,  0.6784299 , ...,  0.34874967,
          0.01030018, -0.42454785],
        [-0.54098654,  0.15488732,  0.0456658 , ..., -0.15958712,
         -0.15108152,  0.25465977]], dtype=float32)
 array([[ 1.4822696 ,  0.5172229 ,  0.00353569, ...,  0.9546994 ,
         -1.560929  ,  0.08340311],
        [-0.43393812,  0.06147608,  0.4284799 , ...,  0.23191051,
          0.40643778, -0.13344742],
        [-0.20166253, -0.26063353,  0.2960698 , ..., -0.04605021,
          0.15982087,  0.0581563 ],
        ...,
        [-0.73644614, -2.009004  ,  0.08

In [13]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [14]:
models= {
#          'nb':MultinomialNB(),
         'lrWord2Vec': LogisticRegression(),
#          'rf': RandomForestClassifier( random_state=49), 
         'dtcWord2Vec': DecisionTreeClassifier(),
#          'svc': SVC(),
#          'sgdword2Vec': SGDClassifier(tol=1e-3),
        }

In [15]:
word2vec_parameter_grid = {
                           'lrWord2Vec':{
                                          "lrWord2Vec__penalty": [ 'none', 'l2', ], # 'none', 'elasticnet'
                                          "lrWord2Vec__C": [0.001, 0.01, 0.1, 0.5, 1, 1.5],
                                          # "lr__solver": ['newton-cg', 'lbfgs', 'liblinear'], #'sag', 'saga'],
                                          "lrWord2Vec__max_iter": [500],
                                          "lrWord2Vec__multi_class": ['auto'],
                                          "lrWord2Vec__n_jobs": [-1],
                                          # "lr__l1_ratio": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
                        },
                           'dt': {
                                           "dt__min_samples_split":[2,4],
                            
                        },
#                         'rf': {
#                                # "rf__criterion": [ 'gini', 'entropy', 'log_loss'],
#                                # "rf__n_estimators": [20, 50, 100],
#                                "rf__max_depth": [None, 2, 4, 6],
#                                # "rf__min_samples_split": [2,5, 10, 15, 20, 30, 50],
#                               },
#                         'nb': {  
#                         },
#                         'svc': {
#                                 "svc__C": [ 0, 0.5],
#                                 # "svc__kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
#                                 # "svc__gamma": ['scale', 'auto'],
#                                 #"svc__shrinking": [True, False],
#                                }
                          'dtcWord2Vec': {
                                           "dtcWord2Vec__min_samples_split":[2,4],
                            
                        },
#                         'sgdword2Vec': {
#                                 'sgdword2Vec__max_iter': (20,),
#                                 'sgdword2Vec__alpha': (0.00001, 0.000001),
#                                 'sgd_word2Vec__penalty': ('l2', 'elasticnet'),
#                         },
                        }

In [16]:
%%time
results_dict={}
cv = 2            # Cross validation

comparison_matrix = pd.DataFrame(columns=['Model', 
                                          'Pipeline', 
                                          'Best_Estimator',
                                          'Best_Accuracy'])

print("[+] Processing {} models. Please, wait...".format(len(models)))

for model in models:
    print("\nAlgorithm being processed: {}".format(model))
    pipeline = Pipeline([(model, models[model])])  
    grid_search_pipe = GridSearchCV(pipeline, 
                                param_grid=word2vec_parameter_grid[model], 
                                cv=cv,
                                n_jobs=-1, 
                                verbose=1)
    grid_search_pipe.fit(X_train_vect_avg, y_train)

    print("Best score: %0.3f \n" % grid_search_pipe.best_score_)
#     print("Best Parameters: {} \n".format( grid_search_pipe.best_estimator_.get_params()))
    best_parameters = grid_search_pipe.best_estimator_.get_params()
    #print("Best estimator: {} \n".format( grid_search_pipe.best_estimator_))
    series_aux =pd.Series(data=[model,
                                models[model],
                                grid_search_pipe.best_estimator_,
                                grid_search_pipe.best_score_],
                          index=comparison_matrix.columns)
    

        
    comparison_matrix = comparison_matrix.append(series_aux, 
                                                 ignore_index=True)

print("[+] Finish Processing\n") 

[+] Processing 2 models. Please, wait...

Algorithm being processed: lrWord2Vec
Fitting 2 folds for each of 12 candidates, totalling 24 fits
Best score: 0.820 


Algorithm being processed: dtcWord2Vec
Fitting 2 folds for each of 2 candidates, totalling 4 fits
Best score: 0.710 

[+] Finish Processing

CPU times: total: 1.17 s
Wall time: 5.92 s


In [17]:
# comparison_matrix 

## Doc2Vec

In [18]:
from gensim.models import doc2vec

def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the complaint narrative.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.TaggedDocument(v.split(), [label]))
    return labeled

In [19]:
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
all_data = X_train + X_test

In [20]:
X_train[1]

TaggedDocument(words=['From:', 'cescript@mtu.edu', '(Charles', 'Scripter)', 'Subject:', 'Re:', 'Some', 'more', 'about', 'gun', 'control...', 'Nntp-Posting-Host:', 'fishlab3.fsh.mtu.edu', 'Organization:', 'Help,', 'my', "server's", 'fallen,', 'and', "can't", 'get', 'up', '(MTU)', 'X-Newsreader:', 'TIN', '[version', '1.1', 'PL8]', 'Lines:', '185', 'In', 'article', '<C5Bu9M.2K7@ulowell.ulowell.edu>', 'jrutledg@cs.ulowell.edu', '(John', 'Lawrence', 'Rutledge)', 'wrote:', '>', 'In', 'article', '<1q96tpINNpcn@gap.caltech.edu>', 'arc@cco.caltech.edu', '>', '(Aaron', 'Ray', 'Clements)', 'writes:', '>', '>The', 'Second', 'Amendment', 'is', 'a', 'guarantee', 'of', 'the', 'right', 'to', 'bear', 'arms.', 'Clearly', '>', '>and', 'unequivocally,', 'without', 'infringement.', '>', 'Unfortunately', 'the', 'Second', 'Amendment', 'is', 'not', 'as', 'clear', 'as', 'you', 'state.', 'If', 'last', '>', 'part', 'of', 'it', 'is', 'taken', 'along,', 'it', 'follows', 'what', 'you', 'have', 'said.', 'The', 'prob

In [21]:
len(all_data)

1130

In [22]:
all_data[:2]

[TaggedDocument(words=['From:', 'chuck@eng.umd.edu', '(Chuck', 'Harris', '-', 'WA3UQV)', 'Subject:', 'Re:', 'CNN', 'for', 'sale', 'Organization:', 'University', 'of', 'Maryland,', 'Department', 'of', 'Electrical', 'Engineering', 'Lines:', '11', 'Distribution:', 'usa', 'NNTP-Posting-Host:', 'bree.eng.umd.edu', 'In', 'article', '<C5soMx.HMD@boi.hp.com>', 'kde@boi.hp.com', '(Keith', 'Emmen)', 'writes:', '>If', 'anyone', 'is', 'keeping', 'a', 'list', 'of', 'the', 'potential', 'contributors,', '>you', 'can', 'put', 'me', 'down', 'for', '$1000.00', 'under', 'the', 'conditions', 'above', 'Seems', 'to', 'me', 'folks,', 'that', 'if', 'you', 'are', 'so', 'interested', 'in', 'acquiring', 'CNN,', 'just', 'buy', 'your', '$1000', 'worth', 'of', 'stock', 'today.', "It's", 'being', 'traded', 'everyday.', 'After', 'you', 'own', 'your', 'piece,', 'we', 'can', 'work', 'on', 'the', 'proxy', 'votes', 'later.', "It's", 'probably', 'even', 'a', 'good', 'investment.', 'Chuck', 'Harris', '-', 'WA3UQV', 'chuck@

### Training the model

We'll instantiate a Doc2Vec model-Distributed Bag of Words (DBOW). In the Word2Vec architecture, the two algorithm names are “continuous bag of words” (cbow) and “skip-gram” (sg); in the Doc2Vec architecture, the corresponding algorithms are “distributed bag of words” (dbow) and “distributed memory” (dm).

### DBOW

DBOW is the Doc2Vec model analogous to Skip-gram model in Word2Vec. The paragraph vectors are obtained by training a neural network on the task of predicting a probability distribution of words in a paragraph given a randomly-sampled word from the paragraph.

Training a Doc2Vec model is rather straight forward in Gensim, we initialize the model and train for 30 epochs:

dm =0 means ‘distributed bag of words’ (DBOW), set min_count=2 means ignoring all words with total frequency lower than this, size=100 is dimensionality of the generated feature vectors, alpha=0.025 is the initial alpha rate, learning rate will linearly drop to min_alpha as training progresses. And then we build a vocabulary.

In [23]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])

100%|██████████████████████████████████████████████████████████████████████████████████████| 1130/1130 [00:00<?, ?it/s]


In [24]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|█████████████████████████████████████████████████████████████████████████| 1130/1130 [00:00<00:00, 1134138.20it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 1130/1130 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 1130/1130 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 1130/1130 [00:00<?, ?it/s]
100%|█████████████████████████████████████████████████████████████████████████| 1130/1130 [00:00<00:00, 1130621.07it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 1130/1130 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 1130/1130 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 1130/1130 [00:00<?, ?it/s]
100%|███████████████████████████████████

CPU times: total: 17.8 s
Wall time: 6.65 s





In [25]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors

In [26]:
train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test')

In [27]:
models= {
#          'nb':MultinomialNB(),
         'lrDoc2Vec': LogisticRegression(),
#          'rf': RandomForestClassifier( random_state=49), 
         'dtcDoc2Vec': DecisionTreeClassifier(),
#          'svc': SVC(),
#          'sgdword2Vec': SGDClassifier(tol=1e-3),
        }

In [28]:
doc2vec_parameter_grid = {
                           'lrDoc2Vec':{
                                          "lrDoc2Vec__penalty": [ 'none', 'l2', ], # 'none', 'elasticnet'
                                          "lrDoc2Vec__C": [0.001, 0.01, 0.1, 0.5, 1, 1.5],
                                          # "lr__solver": ['newton-cg', 'lbfgs', 'liblinear'], #'sag', 'saga'],
                                          "lrDoc2Vec__max_iter": [500],
                                          "lrDoc2Vec__multi_class": ['auto'],
                                          "lrDoc2Vec__n_jobs": [-1],
                                          # "lr__l1_ratio": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
                        },
                           'dtDoc2Vec': {
                                           "dtDoc2Vec__min_samples_split":[2,4],
                            
                        },
#                         'rf': {
#                                # "rf__criterion": [ 'gini', 'entropy', 'log_loss'],
#                                # "rf__n_estimators": [20, 50, 100],
#                                "rf__max_depth": [None, 2, 4, 6],
#                                # "rf__min_samples_split": [2,5, 10, 15, 20, 30, 50],
#                               },
#                         'nb': {  
#                         },
#                         'svc': {
#                                 "svc__C": [ 0, 0.5],
#                                 # "svc__kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
#                                 # "svc__gamma": ['scale', 'auto'],
#                                 #"svc__shrinking": [True, False],
#                                }
                          'dtcDoc2Vec': {
                                           "dtcDoc2Vec__min_samples_split":[2,4],
                            
                        },
#                         'sgdword2Vec': {
#                                 'sgdword2Vec__max_iter': (20,),
#                                 'sgdword2Vec__alpha': (0.00001, 0.000001),
#                                 'sgd_word2Vec__penalty': ('l2', 'elasticnet'),
#                         },
                        }

In [29]:
%%time
results_dict={}
cv = 2            # Cross validation

# comparison_matrix = pd.DataFrame(columns=['Model', 
#                                           'Pipeline', 
#                                           'Best_Estimator',
#                                           'Best_Accuracy'])

print("[+] Processing {} models. Please, wait...".format(len(models)))

for model in models:
    print("\nAlgorithm being processed: {}".format(model))
    pipeline = Pipeline([(model, models[model])])  
    grid_search_pipe = GridSearchCV(pipeline, 
                                param_grid=doc2vec_parameter_grid[model], 
                                cv=cv,
                                n_jobs=-1, 
                                verbose=1)
    grid_search_pipe.fit(X_train_vect_avg, y_train)

    print("Best score: %0.3f \n" % grid_search_pipe.best_score_)
#     print("Best Parameters: {} \n".format( grid_search_pipe.best_estimator_.get_params()))
    best_parameters = grid_search_pipe.best_estimator_.get_params()
    #print("Best estimator: {} \n".format( grid_search_pipe.best_estimator_))
    series_aux =pd.Series(data=[model,
                                models[model],
                                grid_search_pipe.best_estimator_,
                                grid_search_pipe.best_score_],
                          index=comparison_matrix.columns)
    

        
    comparison_matrix = comparison_matrix.append(series_aux, 
                                                 ignore_index=True)

print("[+] Finish Processing\n") 

[+] Processing 2 models. Please, wait...

Algorithm being processed: lrDoc2Vec
Fitting 2 folds for each of 12 candidates, totalling 24 fits
Best score: 0.820 


Algorithm being processed: dtcDoc2Vec
Fitting 2 folds for each of 2 candidates, totalling 4 fits
Best score: 0.708 

[+] Finish Processing

CPU times: total: 750 ms
Wall time: 2.53 s


In [30]:
comparison_matrix 

Unnamed: 0,Model,Pipeline,Best_Estimator,Best_Accuracy
0,lrWord2Vec,LogisticRegression(),"(LogisticRegression(C=0.001, max_iter=500, n_jobs=-1, penalty='none'))",0.81969
1,dtcWord2Vec,DecisionTreeClassifier(),(DecisionTreeClassifier(min_samples_split=4)),0.710177
2,lrDoc2Vec,LogisticRegression(),"(LogisticRegression(C=0.001, max_iter=500, n_jobs=-1, penalty='none'))",0.81969
3,dtcDoc2Vec,DecisionTreeClassifier(),(DecisionTreeClassifier()),0.707965


### CountVectorizer and TfidfTransformer

In [31]:
models= {
         'nb':MultinomialNB(),
         'lr': LogisticRegression(),
#          'rf': RandomForestClassifier( random_state=49), 
         'dt': DecisionTreeClassifier(),
#          'svc': SVC(),
         'sgd': SGDClassifier(tol=1e-3),
        }

feature_extractor_pipelines = {
                                "Pipeline_CountVectorizer":[('vect', CountVectorizer())],
                                "Pipeline_TFIDF": [('vect', CountVectorizer()),
                                                   ('tfidf', TfidfTransformer())],                                         
                              }

# "master_parameter_grid" dictionary stores the parameters fr all the models that support Pipeline
master_parameter_grid = {
                        'vect': {
                                 'vect__max_df': (0.5, 0.75, 1.0),
                                 'vect__max_features': (None, 5000, 10000, 50000),
                                 'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams   
                                },
                        'tfidf': {
                                  'tfidf__use_idf': (True, False),
                                  'tfidf__norm': ('l1', 'l2'),
                                },
                        'lr':{
                              "lr__penalty": [ 'none', 'l2', ], # 'none', 'elasticnet'
                              # "lr__C": [0.001, 0.01, 0.1, 0.5, 1, 1.5],
                              # "lr__solver": ['newton-cg', 'lbfgs', 'liblinear'], #'sag', 'saga'],
                              "lr__max_iter": [500],
                              "lr__multi_class": ['auto'],
                              "lr__n_jobs": [-1],
                              # "lr__l1_ratio": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
                        },
#                         'rf': {
#                                # "rf__criterion": [ 'gini', 'entropy', 'log_loss'],
#                                # "rf__n_estimators": [20, 50, 100],
#                                "rf__max_depth": [None, 2, 4, 6],
#                                # "rf__min_samples_split": [2,5, 10, 15, 20, 30, 50],
#                               },
                        'nb': { 
                                "nb__alpha":[0,0,5,1]
                        },
#                         'svc': {
#                                 "svc__C": [ 0, 0.5],
#                                 # "svc__kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
#                                 # "svc__gamma": ['scale', 'auto'],
#                                 #"svc__shrinking": [True, False],
#                                }
#                         'dt': {
#                                "dt__min_samples_split":[2,4],
                            
#                         },
#                         'sgd': {
#                                 'clf__max_iter': (20,),
#                                 'clf__alpha': (0.00001, 0.000001),
#                                 'clf__penalty': ('l2', 'elasticnet'),
#                         },
                        }

In [32]:
%%time

results_dict= {}  # Dictionary of results
# cv = 2            # Cross validation

print("[+] Processing {} models with {} feature extractors strategies per model. Total = {}.\nPlease, wait...\n".
                        format(len(models), len(feature_extractor_pipelines), len(models)*len(feature_extractor_pipelines)))

for model in models:
    dict_aux={}
    
    for feature_extractor_pipe in feature_extractor_pipelines.values():
        steps = feature_extractor_pipe + [(model, models[model])]       # adds the classifier model to the current pipeline 
        pipeline = Pipeline(steps)                                      # of extractors
        print("Pipeline: {}".format(pipeline))
        
        # parameters ={} --> Dictionay of parameters that will be built in the next for loop. It contains the 
        # extractors parameters + classifier parameters defined in the "master_parameter_grid" dictionary 
        parameters ={}       
                             
        for step in steps:    
            # "step[0]" is the name to identify the pipeline steps and is stored as the key in "master_parameter_grid 
            if step[0] in master_parameter_grid:  
                # the final parameter grid will be built according to the steps in the current pipeline
                parameters = {**parameters, **master_parameter_grid[step[0]] }  # ** operator merges two dictionaries 
        grid_search_pipe = GridSearchCV(pipeline, 
                                        param_grid=parameters, 
                                        cv=cv,
                                        n_jobs=-1, 
                                        verbose=1)
        grid_search_pipe.fit(data.data, data.target)
        print("Best score: %0.3f \n" % grid_search_pipe.best_score_)
        # print("Best Parameters: {} \n".format( grid_search.best_estimator_.get_params()))
        # print("Best estimator: {} \n".format( grid_search_pipe.best_estimator_))

        dict_aux[pipeline] = [grid_search_pipe.best_score_, grid_search_pipe.best_estimator_]
    
    results_dict[model] = dict_aux
    print("Model {} finished processing {} feature extractors".format(model, len(feature_extractor_pipelines)))

[+] Processing 4 models with 2 feature extractors strategies per model. Total = 8.
Please, wait...

Pipeline: Pipeline(steps=[('vect', CountVectorizer()), ('nb', MultinomialNB())])
Fitting 2 folds for each of 96 candidates, totalling 192 fits
Best score: 0.997 

Pipeline: Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('nb', MultinomialNB())])
Fitting 2 folds for each of 384 candidates, totalling 768 fits
Best score: 0.994 

Model nb finished processing 2 feature extractors
Pipeline: Pipeline(steps=[('vect', CountVectorizer()), ('lr', LogisticRegression())])
Fitting 2 folds for each of 48 candidates, totalling 96 fits
Best score: 0.984 

Pipeline: Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('lr', LogisticRegression())])
Fitting 2 folds for each of 192 candidates, totalling 384 fits
Best score: 0.990 

Model lr finished processing 2 feature extractors
Pipeline: Pipeline(steps=[('vect', CountVec

In [33]:
for dict in results_dict:
    for pipe in results_dict[dict]:
        series_aux =pd.Series(data=[                        # ['Model', 'Pipeline', 'Best_Estimator','Best_Accuracy']
                                    dict, 
                                    pipe, 
                                    results_dict[dict][pipe][1], 
                                    results_dict[dict][pipe][0]
                                   ],  
                              index=comparison_matrix.columns)       
        comparison_matrix = comparison_matrix.append(series_aux, 
                                                     ignore_index=True)
print("[+] Finish Processing") 

[+] Finish Processing


In [34]:
comparison_matrix.sort_values(['Best_Accuracy'], 
                              ascending=False, 
                              ignore_index=True,
                              inplace=True)
comparison_matrix

Unnamed: 0,Model,Pipeline,Best_Estimator,Best_Accuracy
0,nb,"(CountVectorizer(), MultinomialNB())","(CountVectorizer(max_df=0.5, max_features=10000), MultinomialNB(alpha=1))",0.997345
1,nb,"(CountVectorizer(), TfidfTransformer(), MultinomialNB())","(CountVectorizer(max_df=0.75, max_features=10000, ngram_range=(1, 2)), TfidfTransformer(norm='l1', use_idf=False), MultinomialNB(alpha=0))",0.993805
2,lr,"(CountVectorizer(), TfidfTransformer(), LogisticRegression())","(CountVectorizer(max_df=0.5, max_features=10000), TfidfTransformer(), LogisticRegression(max_iter=500, n_jobs=-1, penalty='none'))",0.990265
3,sgd,"(CountVectorizer(), TfidfTransformer(), SGDClassifier())","(CountVectorizer(max_df=0.5), TfidfTransformer(), SGDClassifier())",0.990265
4,lr,"(CountVectorizer(), LogisticRegression())","(CountVectorizer(max_df=0.5, max_features=10000), LogisticRegression(max_iter=500, n_jobs=-1, penalty='none'))",0.984071
5,sgd,"(CountVectorizer(), SGDClassifier())","(CountVectorizer(max_df=0.75, ngram_range=(1, 2)), SGDClassifier())",0.980531
6,dt,"(CountVectorizer(), DecisionTreeClassifier())","(CountVectorizer(max_df=0.5, max_features=10000, ngram_range=(1, 2)), DecisionTreeClassifier())",0.957522
7,dt,"(CountVectorizer(), TfidfTransformer(), DecisionTreeClassifier())","(CountVectorizer(max_df=0.5, max_features=10000), TfidfTransformer(norm='l1'), DecisionTreeClassifier())",0.953097
8,lrWord2Vec,LogisticRegression(),"(LogisticRegression(C=0.001, max_iter=500, n_jobs=-1, penalty='none'))",0.81969
9,lrDoc2Vec,LogisticRegression(),"(LogisticRegression(C=0.001, max_iter=500, n_jobs=-1, penalty='none'))",0.81969


In [35]:
comparison_matrix.to_csv('Jose_Lira_Task01_Text_Classification.txt', index=False)