<img src='https://image.slidesharecdn.com/workshop-151015094137-lva1-app6892/95/-2-638.jpg?cb=1444902704'>

In [4]:
import pandas as pd
import numpy as np

n = ['id', 'date','name','text','typr','rep','rtw','faw','stcount','foll','frien','listcount']
data_positive = pd.read_csv('data/positive.csv', sep=';',error_bad_lines=False, names=n, usecols=['text'])
data_negative = pd.read_csv('data/negative.csv', sep=';',error_bad_lines=False, names=n, usecols=['text'])


sample_size = min(data_positive.shape[0], data_negative.shape[0])
raw_data = np.concatenate((data_positive['text'].values[:sample_size], 
                           data_negative['text'].values[:sample_size]), axis=0) 

labels = [1]*sample_size + [0]*sample_size

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1, 1e-1, 1e-2]
}

In [6]:
x_train, x_test, y_train, y_test = train_test_split(raw_data, labels, test_size=0.33, random_state=1)

In [9]:
%%time
from sklearn.metrics import classification_report
clf = GridSearchCV(text_clf, tuned_parameters, cv=10)
clf.fit(x_train, y_train)

print(classification_report(y_test, clf.predict(x_test), digits=4))

              precision    recall  f1-score   support

           0     0.7287    0.8297    0.7759     36815
           1     0.8038    0.6931    0.7443     37055

   micro avg     0.7612    0.7612    0.7612     73870
   macro avg     0.7662    0.7614    0.7601     73870
weighted avg     0.7664    0.7612    0.7601     73870

Wall time: 57min 55s
