In [1]:
# Stretch the notebook container to the full browser width.
# HTML comes from IPython.display, the documented public module for display
# classes, rather than the internal IPython.core.display path.
from IPython.display import HTML
# `float:center` was dropped from the rule: `center` is not a valid value for
# the CSS `float` property, so browsers ignored that declaration.
HTML("<style>.container { width:100% !important; }</style>")

In [2]:
from __future__ import division, print_function

import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords

from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import decomposition
from sklearn.pipeline import Pipeline



%matplotlib inline
# NOTE: an orphaned `):` followed by an indented
# `sns.set_context('notebook', font_scale=1.5)` used to sit here. It was the
# tail of a statement whose head is missing and made the whole cell a
# SyntaxError. The unconditional set_context('notebook') call below would have
# replaced its setting anyway, so the fragment was removed.
sns.set_context('notebook')
sns.set_style('ticks')

# Tokens to ignore on top of NLTK's English stop words: bare punctuation marks
# and contraction fragments of "I've" / "I'll" (straight and curly apostrophes).
punctuation = ['.',',',':','!',';','-','?','"',"'",'(',')','—']
other = ['ive','ve', "i've", "i'v", 'i’ll', 'i’ve', 'i’v']  # 'deb','hideb','don','didn','twaittry','doesn','thank','heydeb',
# Combined stop-word list handed to the CountVectorizer instances below.
mystops = stopwords.words('english') + punctuation + other


load the classified sentences & sanity-check the category labels

In [4]:
# Read the classified sentences and discard every row that contains a missing
# value, in one chained expression.
comments_with_sentences_100_classified = (
    pd.read_csv('comments_with_sentences_100_classified_4classes.csv')
    .dropna()
)

In [6]:
# Distinct labels present in the `category` column.
comments_with_sentences_100_classified['category'].unique()

array(['other', 'substitution', 'omission', 'addition'], dtype=object)

## process test & train data

### define data sets

In [9]:
# Train/test split: the first N_TRAIN rows train the model, the remaining rows
# evaluate it.
#
# Every slice is positional (.iloc). The previous X_test used the label-based
# `.loc[5000:, ...]`, which (a) started at label 5000 instead of position
# 50000, overlapping the training rows, and (b) did not line up row-for-row
# with target_test, which was sliced positionally from 50000.
N_TRAIN = 50000

# The three "actionable" categories are collapsed into a single 'helpful'
# class, leaving a binary helpful-vs-other problem.
HELPFUL_CATEGORIES = ['addition', 'substitution', 'omission']

X_train = comments_with_sentences_100_classified['sentence_tokens_stemmed'].iloc[:N_TRAIN]
# replace() returns a new Series, so the `category` column of the source frame
# is never modified through a slice and this cell can be re-run safely.
target_train = (comments_with_sentences_100_classified['category']
                .iloc[:N_TRAIN]
                .replace(HELPFUL_CATEGORIES, 'helpful'))

X_test = comments_with_sentences_100_classified['sentence_tokens_stemmed'].iloc[N_TRAIN:]
target_test = (comments_with_sentences_100_classified['category']
               .iloc[N_TRAIN:]
               .replace(HELPFUL_CATEGORIES, 'helpful'))

## pipeline

In [12]:
# One estimator chaining: n-gram counts -> tf-idf weighting -> multinomial NB.
# max_df / min_df are passed as integers, i.e. as absolute document counts.
classifier_pipe0 = Pipeline(steps=[
    (
        'vectorizer',
        CountVectorizer(
            ngram_range=(1, 3),
            stop_words=mystops,
            max_df=50,
            min_df=5,
        ),
    ),
    (
        'transformer',
        TfidfTransformer(use_idf=True),
    ),
    (
        'classifier',
        MultinomialNB(),
    ),
])

## word relevancy - counter, tf-idf

In [13]:
# Stand-alone count vectorizer and tf-idf transformer, configured like the
# first two pipeline stages, so the intermediate matrices can be inspected.
transformer = TfidfTransformer(use_idf=True)
vectorizer = CountVectorizer(
    min_df=5,
    max_df=50,
    stop_words=mystops,
    ngram_range=(1, 3),
)

# Stage 1: learn the vocabulary on the training sentences and count n-grams.
X_train_counts = vectorizer.fit_transform(X_train)
X_train_counts.shape

# Stage 2: learn idf weights on those counts and re-weight them.
X_train_tf = transformer.fit_transform(X_train_counts)
X_train_tf.shape

(50000, 9665)

To get a rough sense of the term frequencies, sum the tf-idf weight of a single term (here 'suggest') over all training sentences.

In [None]:
# Total tf-idf weight of one term across all training sentences.
term = 'suggest'
term_index = vectorizer.vocabulary_.get(term)

# Two fixes over `X_train_tf.toarray()[:, idx].sum()`:
#  * the column is sliced straight from the sparse matrix, so the full dense
#    (n_sentences x n_terms) array is never materialised;
#  * a term missing from the fitted vocabulary yields None, and indexing with
#    None adds an axis instead of selecting a column, so the old expression
#    silently returned the sum of the whole matrix. Report 0.0 instead.
X_train_tf[:, term_index].sum() if term_index is not None else 0.0

## classifier

train

In [18]:
# Fit multinomial naive Bayes on the tf-idf features of the training sentences
# against the collapsed helpful/other labels.
#
# This cell previously called MultinomialNB().fit(X_trans, target) and failed
# with "ValueError: Input X must be non-negative". X_trans is the PCA
# projection (computed in the PCA section), which contains negative values that
# MultinomialNB rejects. `target` is not defined anywhere in this notebook; the
# training labels are `target_train`.
classifier = MultinomialNB().fit(X_train_tf, target_train)

# Alternative on the raw counts:
# classifier = BernoulliNB().fit(X_train_counts, target_train)

test

In [None]:
# Push the test sentences through the vectorizer and transformer that were
# fitted on the training data. Only transform() is called here, so neither
# object is re-fitted on the test set.
X_test_counts = vectorizer.transform(X_test)
X_test_tf = transformer.transform(X_test_counts)

In [None]:
# Label every test sentence with the fitted classifier.
predicted = classifier.predict(X_test_tf)

# Number of predictions that are not 'other', then the number that are exactly
# 'helpful'.
print(np.count_nonzero(predicted != 'other'))
print(np.count_nonzero(predicted == 'helpful'))

check

In [None]:
# Accuracy: share of test sentences whose predicted label equals the true one.
(predicted == target_test).mean()

In [None]:
# Show the first 3000 test sentences that were NOT labelled 'other', together
# with the label they received.
for sentence, label in zip(X_test[:3000], predicted[:3000]):
    if label == 'other':
        continue
    print('%r => %s' % (sentence, label))


re-fit with test and train data 

In [None]:
# Replace `classifier` with a new MultinomialNB fitted on the test-set tf-idf
# features.
# NOTE(review): the labels passed here are `predicted` (the previous model's
# own output), not `target_test`, and the training matrix X_train_tf is not
# part of this fit. That does not match the "test and train data" description
# of this step — confirm the intent before relying on this model.
classifier = MultinomialNB().fit(X_test_tf, predicted)
# predicted = classifier.predict(X_test_tf)

## PCA 

In [14]:
# Project the tf-idf training matrix onto its first 15 principal components.
#
# The sparse matrix is densified once and the same array is reused for fit and
# transform. The previous version called toarray() twice, building two full
# dense copies of the (n_sentences x n_terms) matrix.
X_train_dense = X_train_tf.toarray()

pca = decomposition.PCA(n_components=15)
X_trans = pca.fit(X_train_dense).transform(X_train_dense)

# Release the large dense copy; only the 15-column projection is needed below.
del X_train_dense

In [16]:
# Shape of the PCA projection; the second dimension should equal n_components (15).
X_trans.shape

(50000, 15)

In [17]:
# Shape of the tf-idf matrix that was fed into PCA, for comparison.
X_train_tf.shape

(50000, 9665)