In [1]:
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; float:center}</style>")

In [13]:
from __future__ import division, print_function

import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords

from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer



%matplotlib inline

sns.set_context('notebook')
sns.set_style('ticks')

punctuation = ['.',',',':','!',';','-','?','"',"'",'(',')','—']   
other = ['ive','ve', "i've", "i'v", 'i’ll', 'i’ve', 'i’v']  # 'deb','hideb','don','didn','twaittry','doesn','thank','heydeb',
mystops = stopwords.words('english') + punctuation + other


import & sanity check

In [3]:
comments_classified = pd.read_csv('comments_classified_SK_filtered2000_additional.csv',index_col=0)
comments_with_sentences = pd.read_csv('comments_with_sentences_100.csv',index_col=0)
comments_only = pd.read_csv('comments_only_100.csv',index_col=0)

comments_with_sentences_all = pd.read_csv('comments_with_sentences_all.csv',index_col=0)
comments_only_all = pd.read_csv('comments_only_all.csv',index_col=0)

In [4]:
len(comments_only_all.title.unique())

983

'sentence contains' rules:  
**substitution / subtraction** 
* suggest
* I use
* instead of
* do differently 
* left out 
* leave out
* omitted
* 

**suggestion / addition** 
* i add
* i include
* will add
* suggestion
* 






In [5]:
comments_only.columns

Index(['child_id', 'children', 'commentID', 'comment_time', 'recipenumber',
       'title', 'url', 'usercomment', 'username', 'usersite',
       'usercomment_lower', 'tokens', 'tokens_stemmed'],
      dtype='object')

## process test & train data

### define data sets

In [6]:
comments_with_sentences = comments_with_sentences.dropna()

In [7]:
comments_classified.category.unique()
# comments_classified.category.replace('try','other',inplace=True)
# comments_classified.category.replace('addition','suggestion',inplace=True)
# comments_classified.category.replace('subtraction','substitution',inplace=True)
# comments_classified.category.replace('related','other',inplace=True)
# comments_classified.category.replace('question','other',inplace=True)

array(['other', 'love', 'substitution', 'suggestion'], dtype=object)

In [9]:
# encode classes labels as ints
# though this probably doesn't matter for this? 
le = LabelEncoder()
comments_classified.loc[:,'category_label'] = le.fit_transform(comments_classified.category)

list(le.classes_)

['love', 'other', 'substitution', 'suggestion']

In [10]:
X_train = comments_classified.tokens_stemmed #\.as_matrix()
target = comments_classified.category #.as_matrix()

X_test = comments_only.loc[:, 'tokens_stemmed']
# X_test = comments_with_sentences.loc[:, 'tokens_stemmed']

# X_train = comments_classified[comments_classified.category != 'other'].sentence #\.as_matrix()
# target = comments_classified[comments_classified.category != 'other'].category_label #.as_matrix()

## word relevancy - counter, tf-idf

In [34]:
vectorizer = CountVectorizer(ngram_range=(1,3), stop_words=mystops, min_df=1)

X_train_counts = vectorizer.fit_transform(X_train)
X_train_counts.shape

transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)

X_train_tf = transformer.transform(X_train_counts)
X_train_tf.shape

(1999, 22330)

just to try to get some sense of the frequencies

In [41]:
X_train_counts.toarray()[:,vectorizer.vocabulary_.get('left out')].sum()

160582

## classifier

train

In [None]:
classifier = MultinomialNB().fit(X_train_tf, target)

test

In [None]:
X_test_counts = vectorizer.transform(X_test)
X_test_tf = transformer.transform(X_test_counts)

In [None]:
predicted = classifier.predict(X_test_tf)

In [None]:
for doc, category in zip(X_test, predicted):
    if category == 'addition':
#     if category != 'other':
        print('%r => %s' % (doc, category))

# for doc, category in zip(X_test[:100], predicted[:100]):
#     print('%r => %s' % (doc, category))

In [None]:
classifier = MultinomialNB().fit(X_test_tf, predicted)

In [None]:
comments_classified.head()

## PCA 

In [None]:
from sklearn import decomposition

In [None]:
pca = decomposition.PCA(n_components=10)

pca.fit(X_train_tf)
X_trans = pca.transform(X_train_tf)

In [None]:
def encode_target(df, target_column):
    """Add column to df with integers for the target.

    Args
    ----
    df -- pandas DataFrame.
    target_column -- column to map to int, producing
                     new Target column.

    Returns
    -------
    df_mod -- modified DataFrame.
    targets -- list of target names.
    """
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod["Target"] = df_mod[target_column].replace(map_to_int)

    return (df_mod, targets)


In [None]:
x,y = encode_target(comments_classified, 'category')

In [None]:
comments_classified.loc[comments_classified.sentence.str.contains('instead of') == True, 'category'] = 'substitution'

# comments_classified

## Decision tree classifier

In [None]:
from sklearn.cross_validation import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [None]:
tree_classifier = DecisionTreeClassifier(max_depth=3, max_features=10)