In [1]:
from IPython.core.display import HTML
# Inject a CSS rule that stretches the notebook's .container element to the full page width.
HTML("<style>.container { width:100% !important; float:center}</style>")

In [2]:
from __future__ import division, print_function

import os
import glob
import math
import re
from collections import Counter
import datetime
from itertools import groupby

import pandas as pd
import numpy as np
import sklearn as skl
# import scipy as sp
# import scipy.stats as sps
import matplotlib.pyplot as plt
import seaborn as sns
import wordcloud as wc


import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
# from nltk import Text

import time

# Single Porter stemmer instance, used by tokenize_stem below.
porter = PorterStemmer()

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Seaborn defaults: 'notebook' scaling context with the 'ticks' axes style.
sns.set_context('notebook')
sns.set_style('ticks')

### set styles, stopwords, define functions

In [3]:
# Fetch NLTK's English stop-word list once and derive every collection from it.
english_stopwords = stopwords.words('english')

# Punctuation tokens treated as stop words.
punctuation = ['.',',',':','!',';','-','?','"',"'",'(',')','—']
# Contraction fragments left behind by tokenizing/stemming "I've", "I'll", ...
other = ['ive','ve', "i've", "i'v", 'i’ll', 'i’ve', 'i’v']

# List form: passed to CountVectorizer(stop_words=...) and used by the tokenizers.
mystops = english_stopwords + punctuation + other

# Set forms give constant-time membership tests.
stops = set(english_stopwords)
stops_punc = set(english_stopwords + punctuation)
mystops_set = set(mystops)

In [4]:
def tokenize_sentences(comment):
    """Split a comment into sentences.

    The text is lowercased and its newlines are replaced (via
    ``remove_newlines``) before being handed to ``nltk.sent_tokenize``,
    whose result is returned unchanged.
    """
    lowered = comment.lower()
    flattened = remove_newlines(lowered)
    return nltk.sent_tokenize(flattened)
    
    
# def separate_sentences(frame, identifier, paragraph, how='merge'):
#     sentences = pd.concat([pd.Series(row[identifier], tokenize_sentences((row[paragraph]))) for _, row in frame.iterrows()]).reset_index()
#     sentences.columns = ['sentence', identifier]
    
#     if how == 'merge':
#         return sentences.merge(frame, left_on=identifier, right_on=identifier, how='outer', sort=False, suffixes=('_l','_r')) 
#     elif how == 'nomerge': 
#         return sentences
#     else: 
#         return sentences.merge(frame, left_on=identifier, right_on=identifier, how='outer', sort=False, suffixes=('_l','_r')) 

# slightly faster version
def separate_sentences(frame, identifier, paragraph, how='merge'):
    """Explode the ``paragraph`` column of ``frame`` into one row per sentence.

    Args
    ----
    frame -- pandas DataFrame holding the text.
    identifier -- column whose values label each source row.
    paragraph -- column containing the text to split into sentences.
    how -- 'nomerge' returns only the (sentence, identifier) pairs; any
           other value (including the default 'merge') outer-merges those
           pairs back onto ``frame`` by ``identifier``.

    Returns
    -------
    DataFrame with a 'sentence' column and the ``identifier`` column, plus
    the columns of ``frame`` unless ``how == 'nomerge'``.
    """
    # One list of sentences per row; DataFrame pads shorter lists, stack() drops the padding.
    per_row_sentences = (tokenize_sentences(row[paragraph]) for _, row in frame.iterrows())
    wide = pd.DataFrame(per_row_sentences, index=frame[identifier])
    long_form = wide.stack().reset_index()

    # After reset_index the stacked values live in the column labelled 0.
    sentences = long_form[[0, identifier]]
    sentences.columns = ['sentence', identifier]

    if how == 'nomerge':
        return sentences
    return sentences.merge(frame, left_on=identifier, right_on=identifier, how='outer', sort=False, suffixes=('_l','_r'))


def make_lowercase(comment):
    """Return ``comment`` lowercased, with newlines replaced via ``remove_newlines``."""
    lowered = comment.lower()
    return remove_newlines(lowered)


# def tokenize_aslist(comment):
#     comment = remove_newlines(comment)
#     if comment == []:
#         return None
#     else: 
#         return [word for word in word_tokenize(remove_newlines(comment).lower()) if word not in mystops]

    
def tokenize(comment):
    """Tokenize a comment and drop stop words, returning a space-joined string.

    The text has its newlines replaced and is lowercased before
    ``word_tokenize`` runs. Tokens found in ``mystops_set`` are removed.

    Returns
    -------
    The remaining tokens joined by single spaces, or None when the text
    produces no tokens at all (same convention as ``tokenize_stem``).
    """
    tokens = word_tokenize(remove_newlines(comment).lower())
    # The original compared the *string* to [], which is never true; test the token list instead.
    if not tokens:
        return None
    # Set membership instead of scanning the stop-word list for every token.
    return ' '.join(word for word in tokens if word not in mystops_set)

def ngram(comment, n=None):
    """Return the word n-grams of a comment as a list of tuples.

    The text has its newlines replaced, is lowercased and is split on
    whitespace before the n-grams are built.

    Args
    ----
    comment -- text to process.
    n -- n-gram size. When omitted, the module-level ``gram`` variable is
         used, which keeps existing ``.apply(ngram)`` calls working.

    Returns
    -------
    List of n-gram tuples, or None when the text contains no words
    (same convention as ``tokenize_stem``).
    """
    size = gram if n is None else n
    words = remove_newlines(comment).lower().split()
    # The original compared the *string* to [], which is never true; test the word list instead.
    if not words:
        return None
    # The original filtered each n-gram tuple against the stop-word strings; a tuple
    # never equals a string, so that filter removed nothing and is omitted here.
    return list(ngrams(words, size))

    
# def tokenize_stem_aslist(comment):
#     tokens = word_tokenize(remove_newlines(comment).lower())
#     if tokens == []:
#         return None
#     else: 
#         return[porter.stem(word) for word in tokens if word not in mystops]
    
def tokenize_stem(comment):
    """Tokenize a comment, drop stop words and Porter-stem what remains.

    Returns the stems joined by single spaces, or None when the text
    produces no tokens.
    """
    words = word_tokenize(remove_newlines(comment).lower())
    if not words:
        return None

    stems = []
    for word in words:
        if word in mystops:
            continue
        stems.append(porter.stem(word))
    return ' '.join(stems)


def remove_newlines(comment):
    """Return ``comment`` with every newline character replaced by one space."""
    newline_pattern = re.compile(r"\n")
    return newline_pattern.sub(" ", comment)



def preprocess_comments_data(frame):
    """Split comments into sentences and add token, stem and n-gram columns.

    Args
    ----
    frame -- pandas DataFrame with 'usercomment' and 'username' columns.
             A 'commentID' column holding the row index is written onto
             the frame that is passed in.

    Returns
    -------
    frame -- copy of the rows that have comment text, with missing
             usernames replaced by 'anon'.
    frame2 -- one row per sentence, merged with the comment columns and
              extended with lowercase, token, stemmed-token, bigram and
              trigram columns; rows containing any missing value are dropped.
    """
    def _sentence_ngrams(size):
        # Whitespace-split word n-grams of the lowercased sentence, with an explicit
        # size so nothing depends on a module-level `gram` variable.
        return lambda sentence: list(ngrams(remove_newlines(sentence).lower().split(), size))

    # make sure commentIDs are unique ( = row identity)
    frame.loc[:,'commentID'] = frame.index

    # remove any row with no comment text; copy so the edits below hit a real frame, not a slice
    frame = frame.loc[pd.notnull(frame['usercomment']),:].copy()

    # replace NaN usernames with 'anon' (plain assignment; inplace fillna on a .loc slice may not stick)
    frame['username'] = frame['username'].fillna('anon')

    # one row per sentence, merged back onto the comment-level columns
    frame2 = separate_sentences(frame, 'commentID','usercomment',how='merge')
    frame2 = frame2.dropna()

    # tokenize data
    frame2.loc[:,'usercomment'] = frame2.loc[:,'usercomment'].apply(remove_newlines)
    frame2.loc[:,'usercomment_lower'] = frame2.loc[:,'usercomment'].apply(make_lowercase)

    frame2.loc[:,'tokens'] = frame2.loc[:,'usercomment'].apply(tokenize)
    frame2.loc[:,'tokens_stemmed'] = frame2.loc[:,'usercomment'].apply(tokenize_stem)

    frame2.loc[:,'sentence_tokens'] = frame2.loc[:,'sentence'].apply(tokenize)
    frame2.loc[:,'sentence_tokens_stemmed'] = frame2.loc[:,'sentence'].apply(tokenize_stem)

    # n-grams belong on frame2; the original wrote them to the global comments_classified
    frame2.loc[:,'sentence_bigrams'] = frame2.loc[:,'sentence'].apply(_sentence_ngrams(2))
    frame2.loc[:,'sentence_trigrams'] = frame2.loc[:,'sentence'].apply(_sentence_ngrams(3))

    # the tokenizers return None for empty text; drop those rows
    frame2 = frame2.dropna()

    return frame, frame2


import & sanity check

In [None]:
# Load the scraped comments, using the first CSV column as the index.
# NOTE(review): absolute, user-specific path — this only runs on the original author's machine.
comments = pd.read_csv('/Users/kateliea/Documents/Insight/project/webscrapers/comments_smittenkitchen_100.csv', index_col=0)

In [None]:
# Sanity check: show the column labels of the loaded comments frame.
comments.columns

In [None]:
# Load the classified sentences (path relative to the working directory), first column as index.
comments_classified = pd.read_csv('comments_classified_SK_filtered2000.csv', index_col=0)

In [5]:
# NOTE: comments_only is only assigned by the preprocess_comments_data(...) cell further down,
# so this cell raises NameError when the notebook is run top to bottom.
comments_only.shape

NameError: name 'comments_only' is not defined

## make new dataframe with sentences separated, tokenize everything

In [None]:
# comments_only: comment-level rows; comments_with_sentences: one row per sentence with token columns.
comments_only, comments_with_sentences = preprocess_comments_data(comments)

In [None]:
# Persist both frames; separate statements so the cell does not echo a (None, None) tuple.
comments_with_sentences.to_csv('comments_with_sentences.csv')
comments_only.to_csv('comments_only.csv')

In [None]:
# ngram() reads the module-level `gram`, so the loop variable itself selects the n-gram size.
for gram, ngram_column in ((2, 'sentence_bigrams'), (3, 'sentence_trigrams')):
    comments_classified.loc[:, ngram_column] = comments_classified.loc[:,'sentence'].apply(ngram)

# Token and stemmed-token strings for each classified sentence.
sentence_text = comments_classified.loc[:,'sentence']
comments_classified.loc[:,'sentence_tokens'] = sentence_text.apply(tokenize)
comments_classified.loc[:,'sentence_tokens_stemmed'] = sentence_text.apply(tokenize_stem)

# Drop rows with any missing value; the drop() call is given an empty label list.
comments_classified.dropna(inplace=True)
comments_classified.drop([],axis=0, inplace=True)

# Written back to the same file name the classified comments were loaded from.
comments_classified.to_csv('comments_classified_SK_filtered2000.csv')

In [None]:
# Preview the first rows of the classified sentences with their new feature columns.
comments_classified.head()

## define test, train data

In [None]:
# Save a second copy of the classified frame under the "_additional" file name.
comments_classified.to_csv('comments_classified_SK_filtered2000_additional.csv')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the category -> integer mapping, then store the encoded labels alongside the text.
le = LabelEncoder()
le.fit(comments_classified.category)
comments_classified.loc[:,'category_label'] = le.transform(comments_classified.category)

In [None]:
# List the distinct category names present in the classified data.
comments_classified.category.unique()
# Disabled experiment: collapsing the finer categories into broader ones.
# comments_classified.category.replace('try','other',inplace=True)
# comments_classified.category.replace('addition','suggestion',inplace=True)
# comments_classified.category.replace('subtraction','substitution',inplace=True)
# comments_classified.category.replace('related','other',inplace=True)
# comments_classified.category.replace('question','other',inplace=True)

In [None]:
# Train only on sentences whose category is something other than 'other'.
labelled = comments_classified[comments_classified.category != 'other']
X_train = labelled.sentence
target = labelled.category_label

In [None]:
# Label-based .loc slice: sentences from index label 2000 onward serve as the test input.
X_test = comments_with_sentences.loc[2000:, 'sentence']

In [None]:
# Category names known to the encoder; a name's position is its integer label.
list(le.classes_)

In [None]:
# Spot check: map the integer labels 2, 1, 0 back to their category names.
le.inverse_transform([2,1,0])

## word relevancy - counter, tf-idf

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [None]:
# Count bigram and trigram features, passing the custom stop-word list to the vectorizer.
vectorizer = CountVectorizer(ngram_range=(2,3), stop_words=mystops)

# Learn the n-gram vocabulary from the training sentences and build their count matrix.
X_train_counts = vectorizer.fit_transform(X_train)
X_train_counts.shape

In [None]:
# use_idf=False: no inverse-document-frequency weighting is applied by this transformer.
transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)

# NOTE: the classifier cell below is fitted on X_train_counts, not on X_train_tf.
X_train_tf = transformer.transform(X_train_counts)
X_train_tf.shape

## classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

train

In [None]:
# Fit a Bernoulli naive Bayes model on the n-gram count matrix and the integer category labels.
classifier = BernoulliNB().fit(X_train_counts, target)

test

In [None]:
# Reuse the vectorizer and transformer fitted on the training sentences.
X_test_counts = vectorizer.transform(X_test)
# X_test_tf is computed here, but the prediction cell uses X_test_counts.
X_test_tf = transformer.transform(X_test_counts)

In [None]:
# Predicted integer category labels for the test sentences.
predicted = classifier.predict(X_test_counts)

In [None]:
# The classifier predicts integer labels; decode them to category names so the
# 'other' filter compares names with names and the printout is readable.
for doc, category in zip(X_test, le.inverse_transform(predicted)):
    # The original used `print(... if cond)` with no else branch, which is a SyntaxError.
    if category != 'other':
        print('%r => %s' % (doc, category))

In [None]:
# Display the array of predicted labels.
predicted

In [None]:
# Preview the classified sentences again, now including the category_label column.
comments_classified.head()

## PCA 

In [None]:
from sklearn import decomposition

In [None]:
# Project the training n-gram counts onto their first 10 principal components.
pca = decomposition.PCA(n_components=10)

# The original referenced X_train_count, which is never defined; the count matrix is X_train_counts.
# Densify first so this also works on scikit-learn versions whose PCA rejects sparse input.
X_train_dense = X_train_counts.toarray()
pca.fit(X_train_dense)
X_trans = pca.transform(X_train_dense)

In [None]:
def encode_target(df, target_column):
    """Return a copy of df with an integer-coded "Target" column.

    Each distinct value of target_column is numbered in order of first
    appearance, starting at 0.

    Args
    ----
    df -- pandas DataFrame; it is copied, not modified.
    target_column -- name of the column to encode.

    Returns
    -------
    encoded -- copy of df with the extra "Target" column.
    labels -- array of the distinct values from Series.unique();
              a value's position is its integer code.
    """
    encoded = df.copy()
    labels = encoded[target_column].unique()
    code_for = dict(zip(labels, range(len(labels))))
    encoded["Target"] = encoded[target_column].replace(code_for)
    return encoded, labels


In [None]:
# x: copy of comments_classified with the integer "Target" column; y: the distinct category values.
x,y = encode_target(comments_classified, 'category')

In [None]:
# Rule-based override: any sentence containing 'instead of' is labelled a substitution.
mentions_instead_of = comments_classified.sentence.str.contains('instead of').eq(True)
comments_classified.loc[mentions_instead_of, 'category'] = 'substitution'

# comments_classified

## Decision tree classifier

In [None]:
# sklearn.cross_validation was removed from scikit-learn; cross_val_score now lives in
# sklearn.model_selection. Fall back to the old module only on installs that lack the new one.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Deliberately small tree: max_depth=3 and max_features=10. It is only constructed here, not fitted.
tree_classifier = DecisionTreeClassifier(max_depth=3, max_features=10)

In [None]:
# (rows, columns) of the classified frame after the feature and label columns were added.
comments_classified.shape

In [None]:
# Column labels of the raw comments frame (preprocess_comments_data adds 'commentID' to it).
comments.columns