In [None]:
import pandas as pd
# Load the train/test splits. The files have no header row (header=None), so
# pandas labels the columns 0, 1, ...; the grid-search cell below fits on
# column 0 as the documents and column 1 as the labels.
train = pd.read_csv("../data/imbd/train.txt", header=None)
test = pd.read_csv("../data/imbd/test.txt", header=None)

# Preview only the first rows instead of rendering the whole frame.
train.head()

In [None]:
from pprint import pprint
from time import time
import logging
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,confusion_matrix

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

# #############################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    # An integer min_df of 1 keeps every term that occurs in at least one
    # document, which is the same vocabulary an integer 0 selects; newer
    # scikit-learn releases reject integer 0 during parameter validation.
    'vect__min_df': [1],
    # 'vect__max_features': (None, 5000, 10000, 100000),
    # 'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': [False],
    # 'tfidf__norm': ('l1', 'l2'),
    'clf__C': [0.01, 0.1],
    # 'clf__penalty': ('l2', 'l1'),
    # 'clf__max_iter': (1, 2),
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, cv=2, n_jobs=-1,
                               verbose=5, refit=True,
                               return_train_score=True)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameter set:")
    pprint(parameters)
    t0 = time()
    # Column 0 holds the documents and column 1 the labels.
    grid_search.fit(train[0], train[1])
    print("done in %0.3fs" % (time() - t0))
    print()

    print("scores!")
    means = grid_search.cv_results_['mean_test_score']
    stds = grid_search.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

    print("Best score:")
    # Report the spread of the *best* candidate. The loop variable `std`
    # would otherwise still hold the value of whichever candidate was
    # listed last, which is not necessarily the winner.
    best_std = stds[grid_search.best_index_]
    print("%0.3f (+/-%0.03f)" % (grid_search.best_score_, best_std * 2))
    print("with parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    print(grid_search)
    
    

In [None]:
import matplotlib.pyplot as plt
import numpy as np
#import itertools
#from sklearn.metrics import plot_confusion_matrix

def plot_confusion_matrix(cm, classes, normalize=False,
                          title='Confusion matrix', cmap=None):
    """Draw a confusion matrix as an annotated heat map on the current axes.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix with true labels on rows and predictions on columns.
    classes : sequence
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        When True, every row is divided by its sum so cells show the
        fraction of each true class; rows with no samples stay at zero.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap, optional
        Colormap for the heat map; ``plt.cm.Blues`` when omitted.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the matrix was drawn on.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Guard against classes that never occur in the true labels.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    ax = plt.gca()
    image = ax.imshow(cm, interpolation='nearest',
                      cmap=plt.cm.Blues if cmap is None else cmap)
    ax.figure.colorbar(image, ax=ax)

    ticks = np.arange(len(classes))
    ax.set_xticks(ticks)
    ax.set_xticklabels(classes, rotation=45, ha='right')
    ax.set_yticks(ticks)
    ax.set_yticklabels(classes)
    ax.set_title(title)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')

    # Switch the annotation colour on dark cells so the numbers stay legible.
    cell_format = '.2f' if normalize else 'd'
    threshold = cm.max() / 2.0 if cm.size else 0
    for row, col in np.ndindex(cm.shape):
        ax.text(col, row, format(cm[row, col], cell_format),
                ha='center', va='center',
                color='white' if cm[row, col] > threshold else 'black')
    ax.figure.tight_layout()
    return ax


# Evaluate on the held-out split that matches the training data: column 0
# holds the documents and column 1 the labels, exactly as used for fitting.
# (Scoring against a different corpus would compare unrelated label sets.)
y_test = test[1]
y_pred = grid_search.predict(test[0])

# Fix the label order so matrix rows/columns line up with the tick labels.
class_names = list(grid_search.classes_)
cnf_matrix = confusion_matrix(y_test, y_pred, labels=class_names)

# Note: for single-label classification the micro-averaged precision, recall
# and F1 are numerically equal to the accuracy.
print('Accuracy Score : ' + str(accuracy_score(y_test,y_pred)))
print('Precision Score : ' + str(precision_score(y_test,y_pred,average='micro')))
print('Recall Score : ' + str(recall_score(y_test,y_pred,average='micro')))
print('F1 Score : ' + str(f1_score(y_test,y_pred,average='micro')))
print('Confusion Matrix : \n' + str(cnf_matrix))

np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show()

In [None]:
def pre_process(self,stop_words,min_df,max_features,ngram_upper_bound):
    """Vectorize the "text" column of ``self.train`` / ``self.test`` as TF-IDF.

    The count vectorizer and the TF-IDF weighting are fitted on the training
    texts only and then applied to the test texts, so that vocabulary,
    ``min_df`` filtering and IDF weights are not influenced by the test set.

    Parameters
    ----------
    stop_words : passed through to ``CountVectorizer(stop_words=...)``.
    min_df : passed through to ``CountVectorizer(min_df=...)``.
    max_features : int or None
        Vocabulary size cap. ``None`` or any value <= 0 means "no cap".
    ngram_upper_bound : int
        Upper n-gram length; n-grams of length 1..ngram_upper_bound are used.

    Side effects
    ------------
    Sets ``self.X_train`` and ``self.X_test`` to dense arrays whose rows
    follow the row order of ``self.train`` and ``self.test`` respectively.
    """
    # CountVectorizer treats max_features=None as "no limit", so a single
    # constructor call covers both the capped and the uncapped case.
    feature_cap = max_features if max_features is not None and max_features > 0 else None
    vectorizer = CountVectorizer(ngram_range=(1, ngram_upper_bound),
                                 min_df=min_df,
                                 max_features=feature_cap,
                                 stop_words=stop_words)
    tfidf_transformer = TfidfTransformer()

    # Learn vocabulary and IDF weights from the training split only ...
    train_counts = vectorizer.fit_transform(self.train["text"])
    self.X_train = tfidf_transformer.fit_transform(train_counts).toarray()

    # ... and reuse the fitted transformers unchanged on the test split.
    test_counts = vectorizer.transform(self.test["text"])
    self.X_test = tfidf_transformer.transform(test_counts).toarray()

In [16]:
import model
from sklearn.svm import LinearSVC
# Baseline run with a linear SVM wrapped in the project-local model.Classifier.
# NOTE(review): the meaning of the first argument (1) is defined in the
# `model` module, which is not visible here — presumably a dataset selector;
# TODO confirm.
c = model.Classifier(1,LinearSVC())
# Per the recorded output below, this runs a grid search and then prints
# evaluation metrics on a test set.
c.baseline_fit()

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'tfidf__use_idf': [False], 'vect__min_df': [0]}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    9.9s remaining:   14.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   11.0s finished


done in 16.235s

scores!
0.900 (+/-0.004) for {'tfidf__use_idf': False, 'vect__min_df': 0}
Best score:
0.900 (+/-0.004)
with parameters set:
	tfidf__use_idf: False
	vect__min_df: 0

Evaluation on test set:

Accuracy Score : 0.8251460435475305
Precision Score : 0.8251460435475305
Recall Score : 0.8251460435475305
F1 Score : 0.8251460435475305
Confusion Matrix : 
[[239   2   0   0   2   1   0   0   0   0   0   2   1   8   5  24   3   7
    1  24]
 [  3 299  16   7   6  19   3   3   0   6   1   5  10   0   3   2   1   2
    1   2]
 [  2  18 275  38  13  15   4   2   0   6   0   1   4   0   4   1   0   0
    7   4]
 [  1  10  23 276  26   4  15   4   1   0   0   3  26   1   1   0   1   0
    0   0]
 [  0   6   5  17 321   0   8   2   1   3   1   0  13   2   0   0   3   0
    2   1]
 [  1  39  37   5   4 285   4   0   1   0   1   2   8   1   3   1   1   1
    0   1]
 [  1   2   1   9   8   0 346   7   1   1   1   1   6   3   2   0   0   0
    0   1]
 [  0   3   1   1   2   0  12 349   9   3

In [17]:
from sklearn.tree import DecisionTreeClassifier
# Builds the decision-tree wrapper without fitting it.
# NOTE(review): a later cell repeats this exact construction and also calls
# baseline_fit(), so this cell looks redundant — confirm before removing.
ccc = model.Classifier(1,DecisionTreeClassifier())

In [10]:
from sklearn.ensemble import RandomForestClassifier
# Baseline run with a random forest (default hyper-parameters) wrapped in the
# project-local model.Classifier.
# NOTE(review): the first argument (1) is interpreted by the `model` module,
# which is not visible here — presumably a dataset selector; TODO confirm.
c3 = model.Classifier(1,RandomForestClassifier())
# Per the recorded output below, this runs a grid search and then prints
# evaluation metrics on a test set.
c3.baseline_fit()

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'tfidf__use_idf': [False], 'vect__min_df': [0]}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   12.4s remaining:   18.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   12.6s finished


done in 19.680s

scores!
0.657 (+/-0.032) for {'tfidf__use_idf': False, 'vect__min_df': 0}
Best score:
0.657 (+/-0.032)
with parameters set:
	tfidf__use_idf: False
	vect__min_df: 0

Evaluation on test set:

Accuracy Score : 0.5594795539033457
Precision Score : 0.5594795539033457
Recall Score : 0.5594795539033457
F1 Score : 0.5594795539033457
Confusion Matrix : 
[[194   7   3   4  10   2   1   3   1   4   5   3   1   5   5  47   7   4
    2  11]
 [  4 201  33  18  27  39  12   5   5   6   2   2  15   2   9   3   0   3
    3   0]
 [  6  59 198  45  17  20   8   5   4   5   0   5   7   1   8   1   2   1
    2   0]
 [  3  48  64 172  32  15  10   6   3   0   2   2  19   4   4   3   2   0
    3   0]
 [  5  38  31  49 182  11  23   8   3   6   4   4  12   2   1   3   1   0
    2   0]
 [  5  69  59  21  16 184   3   4   7   4   2   2   8   4   4   0   1   0
    1   1]
 [  4  18  14  18  17   7 287   9   1   1   2   1   4   3   2   0   2   0
    0   0]
 [ 10  18  20  16  15   9  31 209  23   7

In [20]:
from sklearn.ensemble import AdaBoostClassifier
# Baseline run with AdaBoost (default hyper-parameters). Unlike the other
# baseline cells, this one passes 0 as the first argument.
# NOTE(review): what 0 vs. 1 selects is defined in the `model` module, which
# is not visible here — TODO confirm.
c4 = model.Classifier(0,AdaBoostClassifier())
# NOTE(review): the recorded run of this cell reports accuracy 0.0 and then
# stops with "ValueError: Mix of label input types (string and number)"
# during test-set evaluation — presumably the predicted and true labels use
# different encodings for this configuration; verify inside model.Classifier.
c4.baseline_fit()

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'tfidf__use_idf': [False], 'vect__min_df': [0]}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   39.4s remaining:   59.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   39.6s finished


done in 66.577s

scores!
0.353 (+/-0.009) for {'tfidf__use_idf': False, 'vect__min_df': 0}
Best score:
0.353 (+/-0.009)
with parameters set:
	tfidf__use_idf: False
	vect__min_df: 0

Evaluation on test set:

Accuracy Score : 0.0


ValueError: Mix of label input types (string and number)

In [19]:
from sklearn.tree import DecisionTreeClassifier
# Baseline run with a decision tree (default hyper-parameters) wrapped in the
# project-local model.Classifier.
# NOTE(review): the first argument (1) is interpreted by the `model` module,
# which is not visible here — presumably a dataset selector; TODO confirm.
ccc = model.Classifier(1,DecisionTreeClassifier())
# Per the recorded output below, this runs a grid search and then prints
# evaluation metrics on a test set.
ccc.baseline_fit()

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'tfidf__use_idf': [False], 'vect__min_df': [0]}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   25.3s remaining:   37.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   25.8s finished


done in 48.606s

scores!
0.640 (+/-0.012) for {'tfidf__use_idf': False, 'vect__min_df': 0}
Best score:
0.640 (+/-0.012)
with parameters set:
	tfidf__use_idf: False
	vect__min_df: 0

Evaluation on test set:

Accuracy Score : 0.5528412108337759
Precision Score : 0.5528412108337759
Recall Score : 0.5528412108337759
F1 Score : 0.5528412108337759
Confusion Matrix : 
[[136   5   3   2   4   3   3   7   6   8   4   1   7  14   6  25  12   5
   12  56]
 [  4 178  16  23  23  37  20   4   4   7   3   7  20  10  15   8   5   4
    0   1]
 [  1  28 235  34  14  27   5   4   3   7   2   4  11   1   4   1   4   3
    4   2]
 [  2  25  29 158  25  15  13  13   4   7   3   5  51  16   6   5   3   3
    5   4]
 [  1  24  10  23 212   6  22  13   5   3   1   2  29  12  10   4   4   0
    4   0]
 [  1  33  54  18  16 192   3  12   5   7   2   9  11  11   6   1   4   1
    6   3]
 [  3   4   6  24  17   2 268  10   6   5   6   4  11   7   4   3   7   2
    1   0]
 [  3   6   4   8   1  10  20 214  25   4