In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
# from pandas_ml import ConfusionMatrix
from sklearn.model_selection import GridSearchCV

## Parameters used for grid search
Both pipelines — the one for the pre-processed data and the one for the original data — are searched for their best parameters over this parameter grid.

In [12]:
# Hyper-parameter grid shared by both grid searches in this notebook.
# Keys use the "<pipeline step>__<parameter>" form, so each entry is routed to
# the 'vect', 'tfidf' or 'clf' step of the pipeline being tuned.
# The full cross product is 3 * 2 * 2 * 5 * 3 * 2 * 3 = 1080 candidates.
parameters = {}

# 'vect' step: document-frequency cut-off and unigram vs. unigram+bigram features.
parameters['vect__max_df'] = (0.25, 0.5, 0.75)
parameters['vect__ngram_range'] = ((1, 1), (1, 2))

# 'tfidf' step: row normalisation applied after TF-IDF weighting.
parameters['tfidf__norm'] = ('l1', 'l2')

# 'clf' step: regularisation strength, penalty type and loss function.
parameters['clf__alpha'] = (1e-6, 1e-5, 1e-4, 1e-3, 1e-2)
parameters['clf__penalty'] = ('l1', 'l2', 'elasticnet')
parameters['clf__loss'] = ('hinge', 'log')
# NOTE(review): l1_ratio presumably only affects the 'elasticnet' penalty, so
# the 'l1'/'l2' candidates would be repeated three times — confirm against the
# SGDClassifier docs before trimming the grid.
parameters['clf__l1_ratio'] = (0.1, 0.15, 0.2)

# Use the pre-processed data


In [2]:
# Load the first dataset into `data`.
# Despite the .tsv extension the file is parsed with pandas' default comma
# delimiter, which is spelled out here so the choice is visible.
data = pd.read_csv(
    'dataset/drop_dup.tsv',
    sep=',',
)

In [30]:
# Shuffle the rows with a fixed seed.  The previous unseeded
# np.random.permutation produced a different row order on every run, so the
# train/test partition and every reported score changed between runs even
# though random_state=42 is used elsewhere.  As before, the rows keep their
# original index labels; only their order changes.
data = data.sample(frac=1, random_state=42)

In [31]:
# Model input is the `new_phrase` text column; the target is the `Sentiment` column.
x, y = data['new_phrase'], data['Sentiment']

In [32]:
# Hold out 20% of the rows for the final evaluation; random_state pins the
# splitter's own shuffling.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=42,
)

In [35]:
# Three-stage text classifier: token counts -> TF-IDF weighting -> linear model
# trained with SGD.  The step names ('vect', 'tfidf', 'clf') are the prefixes
# used by the keys of the `parameters` grid.
pipeline = Pipeline(
    steps=[
        # Tokens are runs of two or more word characters.
        ('vect', CountVectorizer(token_pattern=r'(?u)[\wA-Za-z][\wA-Za-z]+')),
        ('tfidf', TfidfTransformer()),
        # loss and l1_ratio are only starting values: both are also listed in
        # the `parameters` grid.  verbose=1 prints per-epoch progress.
        (
            'clf',
            SGDClassifier(
                loss='log',
                l1_ratio=0.15,
                max_iter=10,
                random_state=42,
                verbose=1,
            ),
        ),
    ]
)

In [38]:
# Exhaustive search over `parameters` with 5-fold cross-validation;
# n_jobs=-1 lets the fits run in parallel.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [39]:
# Run the search on the training split only (5400 fits, about 86 minutes in the
# recorded run).  The call is the cell's last expression, so the fitted search
# object is echoed in the output.
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 1080 candidates, totalling 5400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   34.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 19.5min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 30.0min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 41.6min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 57.0min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 70.5min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 82.0min
[Parallel(n_jobs=-1)]: Done 5400 out of 5400 | elapsed: 86.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


-- Epoch 1
Norm: 77.24, NNZs: 15876, Bias: -1.870356, T: 76512, Avg. loss: 0.135034
Total training time: 0.02 seconds.
-- Epoch 2
Norm: 64.56, NNZs: 14561, Bias: -1.581750, T: 153024, Avg. loss: 0.082272
Total training time: 0.04 seconds.
-- Epoch 3
Norm: 59.44, NNZs: 14493, Bias: -1.408847, T: 229536, Avg. loss: 0.074071
Total training time: 0.06 seconds.
-- Epoch 4
Norm: 56.54, NNZs: 14657, Bias: -1.390110, T: 306048, Avg. loss: 0.070316
Total training time: 0.08 seconds.
-- Epoch 5
Norm: 54.68, NNZs: 14734, Bias: -1.372738, T: 382560, Avg. loss: 0.068083
Total training time: 0.09 seconds.
-- Epoch 6
Norm: 53.56, NNZs: 14836, Bias: -1.336795, T: 459072, Avg. loss: 0.066614
Total training time: 0.11 seconds.
-- Epoch 7
Norm: 52.73, NNZs: 14901, Bias: -1.299349, T: 535584, Avg. loss: 0.065655
Total training time: 0.13 seconds.
-- Epoch 8
Norm: 52.04, NNZs: 15069, Bias: -1.283320, T: 612096, Avg. loss: 0.064779
Total training time: 0.15 seconds.
-- Epoch 9
Norm: 51.52, NNZs: 15156, Bias

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.1s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [40]:
# Report the best cross-validation score, then the winning value of every
# hyper-parameter that was part of the search grid (alphabetical order).
best_parameters = grid_search.best_estimator_.get_params()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.620
Best parameters set:
	clf__alpha: 1e-05
	clf__l1_ratio: 0.1
	clf__loss: 'hinge'
	clf__penalty: 'elasticnet'
	tfidf__norm: 'l2'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)


In [41]:
# Keep only the best pipeline found by the search and use it to label the
# held-out phrases.
# `grid_search` is deliberately rebound to that pipeline because the final
# cell dumps `grid_search` to disk.  The getattr fallback makes this cell safe
# to re-run: on a second execution `grid_search` is already the pipeline and
# has no `best_estimator_` attribute, which used to raise AttributeError.
grid_search = getattr(grid_search, 'best_estimator_', grid_search)
y_pred = grid_search.predict(X_test)

In [42]:
# Display names for the five sentiment classes.
# NOTE(review): assumes the sorted Sentiment codes run from most negative to
# most positive — confirm against the dataset description.
my_tags = ['neg', 'somehow neg', 'neutral', 'somehow pos', 'pos']

# Overall accuracy: share of held-out phrases whose prediction matches the label.
print("Accuracy = " + str(np.mean(y_pred == y_test)))

# Per-class precision / recall / F1 on the held-out split.
report = metrics.classification_report(y_test, y_pred, target_names=my_tags)
print(report)

Accuracy = 0.6259605834073919
              precision    recall  f1-score   support

         pos       0.48      0.25      0.32       806
 somehow pos       0.51      0.35      0.41      3209
     neutral       0.68      0.86      0.76     10087
 somehow neg       0.55      0.42      0.48      3952
         neg       0.50      0.30      0.38      1075

    accuracy                           0.63     19129
   macro avg       0.54      0.44      0.47     19129
weighted avg       0.60      0.63      0.60     19129



# Use the original data

In [3]:
# Load the original tab-separated training file.  This rebinds `data`, so the
# frame loaded in the previous section is no longer reachable under that name.
data = pd.read_csv(
    'dataset/train.tsv',
    sep='\t',
)

In [4]:
# Shuffle the rows with a fixed seed.  The previous unseeded
# np.random.permutation produced a different row order on every run, so the
# train/test partition and every reported score changed between runs even
# though random_state=42 is used elsewhere.  As before, the rows keep their
# original index labels; only their order changes.
data = data.sample(frac=1, random_state=42)

In [7]:
# Model input is the raw `Phrase` text column; the target is the `Sentiment` column.
x2, y2 = data['Phrase'], data['Sentiment']

In [8]:
# Hold out 20% of the rows for the final evaluation; random_state pins the
# splitter's own shuffling.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    x2,
    y2,
    test_size=0.20,
    random_state=42,
)

In [9]:
# Same three-stage classifier as used for the first dataset: token counts ->
# TF-IDF weighting -> linear model trained with SGD.  The step names are the
# prefixes used by the keys of the `parameters` grid.
pipeline2 = Pipeline(
    steps=[
        # Tokens are runs of two or more word characters.
        ('vect', CountVectorizer(token_pattern=r'(?u)[\wA-Za-z][\wA-Za-z]+')),
        ('tfidf', TfidfTransformer()),
        # loss and l1_ratio are only starting values: both are also listed in
        # the `parameters` grid.  verbose=1 prints per-epoch progress.
        (
            'clf',
            SGDClassifier(
                loss='log',
                l1_ratio=0.15,
                max_iter=10,
                random_state=42,
                verbose=1,
            ),
        ),
    ]
)

In [13]:
# pipeline2 is searched for its best parameters over the `parameters` grid
# defined at the start of the notebook (5-fold cross-validation, parallel fits).
grid_search2 = GridSearchCV(
    estimator=pipeline2,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [14]:
# Run the search on the training split only (5400 fits, about 123 minutes in
# the recorded run).  The call is the cell's last expression, so the fitted
# search object is echoed in the output.
grid_search2.fit(X=X_train2, y=y_train2)

Fitting 5 folds for each of 1080 candidates, totalling 5400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 11.4min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 20.1min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 32.5min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 50.9min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 65.2min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 80.0min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 96.7min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 115.2min
[Parallel(n_jobs=-1)]: Done 5400 out of 5400 | elapsed: 123.4min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


-- Epoch 1
Norm: 301.36, NNZs: 93706, Bias: -6.003200, T: 124848, Avg. loss: 0.248928
Total training time: 0.03 seconds.
-- Epoch 2
Norm: 245.25, NNZs: 93706, Bias: -5.908272, T: 249696, Avg. loss: 0.109648
Total training time: 0.07 seconds.
-- Epoch 3
Norm: 229.21, NNZs: 93706, Bias: -5.574096, T: 374544, Avg. loss: 0.087679
Total training time: 0.10 seconds.
-- Epoch 4
Norm: 222.81, NNZs: 93706, Bias: -5.479595, T: 499392, Avg. loss: 0.079450
Total training time: 0.14 seconds.
-- Epoch 5
Norm: 218.92, NNZs: 93706, Bias: -5.578784, T: 624240, Avg. loss: 0.075548
Total training time: 0.17 seconds.
-- Epoch 6
Norm: 216.73, NNZs: 93706, Bias: -5.464679, T: 749088, Avg. loss: 0.073026
Total training time: 0.20 seconds.
-- Epoch 7
Norm: 215.52, NNZs: 93706, Bias: -5.328200, T: 873936, Avg. loss: 0.071367
Total training time: 0.25 seconds.
-- Epoch 8
Norm: 213.96, NNZs: 93706, Bias: -5.473246, T: 998784, Avg. loss: 0.070153
Total training time: 0.29 seconds.
-- Epoch 9
Norm: 213.21, NNZs: 9

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.8s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [15]:
# Report the best cross-validation score, then the winning value of every
# hyper-parameter that was part of the search grid (alphabetical order).
best_parameters2 = grid_search2.best_estimator_.get_params()

print("Best score: %0.3f" % grid_search2.best_score_)
print("Best parameters set:")
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters2[param_name]))

Best score: 0.648
Best parameters set:
	clf__alpha: 1e-06
	clf__l1_ratio: 0.1
	clf__loss: 'log'
	clf__penalty: 'l2'
	tfidf__norm: 'l2'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)


In [16]:
# Keep only the best pipeline found by the search and use it to label the
# held-out phrases.
# `grid_search2` is deliberately rebound to that pipeline because the final
# cell dumps `grid_search2` to disk.  The getattr fallback makes this cell safe
# to re-run: on a second execution `grid_search2` is already the pipeline and
# has no `best_estimator_` attribute, which used to raise AttributeError.
grid_search2 = getattr(grid_search2, 'best_estimator_', grid_search2)
y_pred2 = grid_search2.predict(X_test2)

In [17]:
# Display names for the five sentiment classes.
# NOTE(review): assumes the sorted Sentiment codes run from most negative to
# most positive — confirm against the dataset description.
my_tags = ['neg', 'somehow neg', 'neutral', 'somehow pos', 'pos']

# Overall accuracy: share of held-out phrases whose prediction matches the label.
print("Accuracy = " + str(np.mean(y_pred2 == y_test2)))

# Per-class precision / recall / F1 on the held-out split.
report2 = metrics.classification_report(y_test2, y_pred2, target_names=my_tags)
print(report2)

Accuracy = 0.6554850698449315
              precision    recall  f1-score   support

         pos       0.51      0.38      0.44      1428
 somehow pos       0.55      0.48      0.51      5435
     neutral       0.72      0.82      0.77     15989
 somehow neg       0.58      0.53      0.55      6523
         neg       0.55      0.43      0.48      1837

    accuracy                           0.66     31212
   macro avg       0.58      0.53      0.55     31212
weighted avg       0.64      0.66      0.65     31212



In [None]:
# Persist both tuned pipelines to disk.  By this point `grid_search` and
# `grid_search2` have been rebound to the best estimators, not the search objects.
#
# `sklearn.externals.joblib` is deprecated and absent from newer scikit-learn
# releases, so prefer the standalone `joblib` package and fall back to the
# vendored copy only where the standalone one is not installed.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(grid_search, 'grid_sgd_dup.pkl')
joblib.dump(grid_search2, 'grid_sgd_ori.pkl')