In [1]:
import nltk
from nltk.classify.scikitlearn import SklearnClassifier  # wrapper to include the scikit-learn algorithms within the nltk classifier
from sklearn.naive_bayes import GaussianNB
from nltk.corpus import stopwords
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import classification_report

In [2]:
training_size = 1600000
data_folder = 'trainingandtestdata'
train_file = './'+data_folder+'/'+'training.csv'
test_file = './' + data_folder+'/'+'testing.csv'
wordbag_dest = './' + data_folder + '/wordbag.csv'
positive_wordbag_dest = './' + data_folder + '/positive_wordbag.csv'
negative_wordbag_dest = './' + data_folder + '/negative_wordbag.csv'

In [3]:
# NEW START
import re
def preprocess_text(text):
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL', text)
    text = re.sub('@[^\s]+','USER', text)
    text = text.lower().replace("ё", "е")
    text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', text)
    text = re.sub(' +',' ', text)
    return text.strip()

df = pd.read_csv(train_file)
del df['Id']
del df['Date']
del df['Query']
del df['Username']
df_pos = df[df['Y']==0]
df_neg = df[df['Y']==4]

import numpy as np
import pandas as pd
sample_size = 100000
raw_data = np.concatenate((df_pos['text'].values[:sample_size], 
                           df_neg['text'].values[:sample_size]), axis=0) 
labels = [1]*sample_size + [0]*sample_size

In [4]:
#Get rid of all weird punctuation and extra lines

# preprocess_text('Hey! there.. I am Karan Tanwar')
# preprocess_text(raw_data[0])
for i in range(2*sample_size):
    raw_data[i] = preprocess_text(raw_data[i])

In [53]:
vectorizer = TfidfVectorizer(ngram_range = (1,2), max_df = 0.85, max_features = 1000)
vectorizer.fit(raw_data)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.85, max_features=1000,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [54]:
# summarize
# print(vectorizer.vocabulary_)
# print(vectorizer.idf_)

# encode document
# vector = vectorizer.transform([raw_data[2]])
# print(vector.shape)
# for i in range(vector.shape[1]):
#     if(vector.toarray()[0][i]!=0):
#         print('SEE! dont loose hope ', vector.toarray()[0][i])

In [55]:
vector = vectorizer.transform(raw_data)
# print(len(labels))
# print(len(vector.toarray()))
# vector

In [56]:
clf = GaussianNB()
clf.fit(vector.toarray(), labels)

GaussianNB(priors=None, var_smoothing=1e-09)

In [57]:
query = 'This is sad :('
# query = raw_data[sample_size]
clf.predict(vectorizer.transform([preprocess_text(query)]).toarray())[0]

1

In [58]:
df1 = pd.read_csv(test_file)
del df1['Id']
del df1['Date']
del df1['Query']
del df1['Username']
df_pos_test = df1[df1['Y']==0]
df_neg_test = df1[df1['Y']==4]

test_size = df_pos_test['text'].size + df_neg_test['text'].size

In [59]:
df1

Unnamed: 0,Y,text
0,4,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,Reading my kindle2... Love it... Lee childs i...
2,4,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,@kenburbary You'll love your Kindle2. I've had...
4,4,@mikefish Fair enough. But i have the Kindle2...
...,...,...
493,2,Ask Programming: LaTeX or InDesign?: submitted...
494,0,"On that note, I hate Word. I hate Pages. I hat..."
495,4,Ahhh... back in a *real* text editing environm...
496,0,"Trouble in Iran, I see. Hmm. Iran. Iran so far..."


In [60]:
raw_data_test = np.concatenate((df_pos_test['text'].values[:df_pos_test['text'].size], 
                           df_neg_test['text'].values[:df_neg_test['text'].size]), axis=0) 
labels_test = [1]*df_pos_test['text'].size + [0]*df_neg_test['text'].size

In [61]:
len(raw_data_test)

359

In [62]:
count_total = 0
count_true = 0
count_false = 0
for i in range(test_size):
    out = clf.predict(vectorizer.transform([preprocess_text(raw_data_test[i])]).toarray())[0]
    if(out==labels_test[i]):
        count_true += 1
print('Accuracy: ', count_true/test_size)

Accuracy:  0.7520891364902507


FOR TfidfVectorizer(ngram_range = (1,2), max_df = 0.85, max_features = 1000) | sample_size = 100000
Accuracy:  0.7520891364902507

In [63]:
import cPickle
with open('tfidf_classifier.pkl', 'wb') as fid:
    cPickle.dump(clf, fid)

ModuleNotFoundError: No module named 'cPickle'