In [1]:
import json
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import numpy as np

In [2]:
# loading the dataset
with open('../data/dataset.json', 'r') as f:
    dataset = json.load(f)
    positives = dataset['positives']
    negatives = dataset['negatives']

# Shuffling the data
a = [(i, 0) for i in negatives]
b = [(i, 1) for i in positives]
combined = a + b
np.random.shuffle(combined)
shuffled = list(zip(*combined))
text_X = shuffled[0]
labels = shuffled[1]


In [3]:
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(text_X)

X_train, X_test, y_train, y_test = model_selection.train_test_split(text_X, labels, test_size=0.2, random_state=42)
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_train) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_test) 

### Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import datetime

train_start = datetime.datetime.now()
logreg = LogisticRegression()
logreg.fit(xtrain_tfidf_ngram_chars, y_train)
train_end = datetime.datetime.now()
print("Training time: ", (train_end - train_start))



Training time:  0:01:44.285093


In [6]:
y_pred = logreg.predict(xvalid_tfidf_ngram_chars)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(xvalid_tfidf_ngram_chars, y_test)))

Accuracy of logistic regression classifier on test set: 0.88


In [7]:
from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[258746  28785]
 [ 41944 255866]]


In [None]:
The result is telling us that we have 258746+255866 correct predictions and 28785+41944  incorrect predictions.

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

### Random Forest Classifier

In [None]:
clf = ensemble.RandomForestClassifier()
clf.fit(xtrain_tfidf_ngram_chars, y_train)
accuracy = clf.score(xvalid_tfidf_ngram_chars, y_test)
print("RF, WordLevel TF-IDF: ", accuracy)

RF, WordLevel TF-IDF:  0.8980850478609904