In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn import metrics
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

## Import data and label columns

In [2]:
colnames = ['label', 'id', 'date', 'query', 'user', 'text']
df_train = pd.read_csv('data/training.1600000.processed.noemoticon.csv',
                      header=None, names=colnames, encoding='windows-1252')
df_test = pd.read_csv('data/testdata.manual.2009.06.14.csv',
                      header=None, names=colnames, encoding='windows-1252')

## Check data is loaded properly

In [3]:
print(df_train.shape)
df_train.head()
df_train['label'].value_counts() # should have only two classes

(1600000, 6)


4    800000
0    800000
Name: label, dtype: int64

## Bag of Words using `CountVectorizer()`

`CountVectorizer()`
Convert a collection of text documents to a matrix of token counts. This implementation produces a sparse representation of the counts using scipy.sparse.csr_matrix.

In [4]:
bow_vector = CountVectorizer()
train_bow = bow_vector.fit_transform(df_train.text)
print(train_bow.shape)

test_bow = bow_vector.transform(df_test.text)
mnb_classifier = MultinomialNB().fit(train_bow, df_train['label'])
prediction = mnb_classifier.predict(test_bow)

print(prediction)

(1600000, 685256)
[0 4 4 4 0 4 0 4 4 4 4 0 4 4 0 0 0 4 0 0 4 4 0 0 0 4 4 4 4 4 4 4 4 0 4 0 0
 0 4 0 4 0 0 0 0 0 4 4 4 0 0 4 4 4 0 4 4 4 4 0 4 0 4 4 0 0 4 4 0 4 4 4 0 4
 4 0 4 0 4 0 4 0 4 4 4 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 4 4 4 0 0 0 4 4 4 4 4
 4 4 0 4 4 4 4 4 4 4 4 4 0 4 4 4 4 4 4 4 4 4 4 4 4 4 4 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 4 0 4 4 0 4 4 4 0 0 0 0 0 4 0 0 0 0 4 4 4 4 0 0 0 4 4 0 0 0 4 0
 0 4 4 4 0 4 4 4 4 0 4 4 4 4 4 4 4 4 4 4 4 4 4 0 4 0 0 0 0 0 4 4 0 0 0 0 0
 0 0 0 0 4 0 0 0 0 4 4 4 0 0 4 4 4 4 4 0 4 4 4 4 4 4 4 4 0 0 0 4 4 0 0 4 0
 4 4 4 4 0 4 0 4 0 4 4 4 0 0 4 4 4 0 4 4 4 0 0 0 0 4 4 0 4 4 4 4 4 0 0 4 0
 0 0 0 0 4 0 0 4 0 0 4 4 4 4 4 4 0 4 0 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 0 4 0
 0 0 4 0 4 0 4 0 0 0 0 4 4 4 4 4 4 0 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 0 4 0 0 4 0 0 4 4 0 0 0 0 0 0 4 0 0 4 0 0 0 0 0 0 4 0 0 4 4 4 0 0 0 0 0
 0 0 0 0 4 0 4 4 4 4 0 0 4 0 0 0 4 4 4 0 4 4 0 4 4 4 4 4 4 4 4 4 4 0 4 4 4
 4 4 4 4 4 4 0 4 4 0 4 0 0 4 0 4 0 4 4 4 0 0 4 4 0 4 0 4 0 0 0 4 4 4 4 0 0
 0 0 4 

## Evaluation

In [5]:
print(metrics.classification_report(df_test['label'], prediction))
print(accuracy_score(df_test['label'], prediction))

             precision    recall  f1-score   support

          0       0.68      0.82      0.74       177
          2       0.00      0.00      0.00       139
          4       0.51      0.79      0.62       182

avg / total       0.43      0.58      0.49       498

0.5823293172690763


  'precision', 'predicted', average, warn_for)
