In [1]:
# Widen the notebook's `.container` element to the full browser width.
# NOTE(review): `float:center` is not a valid value for the CSS `float`
# property, so browsers ignore that declaration — confirm it can be dropped.
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; float:center}</style>")

In [2]:
from __future__ import division, print_function

import os
import glob
import math
import re
from collections import Counter
import datetime
from itertools import groupby

import pandas as pd
import numpy as np
import sklearn as skl
# import scipy as sp
# import scipy.stats as sps
import matplotlib.pyplot as plt
import seaborn as sns
import wordcloud as wc


import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
# from nltk import Text

import time

In [3]:
# Shared Porter stemmer instance; tokenize_stem() below calls porter.stem().
porter = PorterStemmer()

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# seaborn defaults: 'notebook' scaling context and 'ticks' axes style.
sns.set_context('notebook')
sns.set_style('ticks')

In [4]:
# Stop-word collections used by the tokenizers below.
# The NLTK list is fetched once and reused for every derived collection.
_english_stops = stopwords.words('english')

# extra tokens filtered out in addition to NLTK's English stop words
punctuation = ['.',',',':','!',';','-','?','"',"'",'(',')','—']
other = ['ive','ve', "i've", "i'v", 'i’ll', 'i’ve', 'i’v']  # 'deb','hideb','don','didn','twaittry','doesn','thank','heydeb',

# sets give fast membership tests
stops = set(_english_stops)
stops_punc = set(_english_stops + punctuation)

# full filter list (stop words + punctuation + extras), as a list and as a set
mystops = _english_stops + punctuation + other
mystops_set = set(mystops)

In [5]:
def tokenize_sentences(comment):
    """Split a comment into sentences.

    The comment is lowercased, its newlines are replaced with spaces
    (via remove_newlines), and the result is handed to nltk.sent_tokenize.
    """
    flattened = remove_newlines(comment.lower())
    sentence_list = nltk.sent_tokenize(flattened)
    return sentence_list
    
    
# def separate_sentences(frame, identifier, paragraph, how='merge'):
#     sentences = pd.concat([pd.Series(row[identifier], tokenize_sentences((row[paragraph]))) for _, row in frame.iterrows()]).reset_index()
#     sentences.columns = ['sentence', identifier]
    
#     if how == 'merge':
#         return sentences.merge(frame, left_on=identifier, right_on=identifier, how='outer', sort=False, suffixes=('_l','_r')) 
#     elif how == 'nomerge': 
#         return sentences
#     else: 
#         return sentences.merge(frame, left_on=identifier, right_on=identifier, how='outer', sort=False, suffixes=('_l','_r')) 

# slightly faster version
def separate_sentences(frame, identifier, paragraph, how='merge'):
    """Explode each row's text into one row per sentence.

    Parameters
    ----------
    frame : pandas.DataFrame
        Source rows.
    identifier : column label
        Column of `frame` whose values identify each source row.
    paragraph : column label
        Column of `frame` holding the text to split with tokenize_sentences.
    how : str, default 'merge'
        'nomerge' returns only the ['sentence', identifier] columns. Any other
        value (including the default 'merge') outer-merges the sentences back
        onto `frame` using `identifier` as the key.

    Returns
    -------
    pandas.DataFrame
        One row per sentence.
    """
    # Iterate the text column directly: iterrows() would build a full Series
    # per row just to read a single field from it.
    per_row_sentences = [tokenize_sentences(text) for text in frame[paragraph]]

    # Wide frame (one column per sentence position) -> long Series via stack().
    wide = pd.DataFrame(per_row_sentences, index=frame[identifier])
    sentences = wide.stack().reset_index()[[0, identifier]]  # the sentence column is labeled 0 here
    sentences.columns = ['sentence', identifier]

    if how == 'nomerge':
        return sentences

    # 'merge' and every unrecognised value share this path (previously two identical branches).
    return sentences.merge(frame, left_on=identifier, right_on=identifier, how='outer', sort=False, suffixes=('_l','_r'))


def make_lowercase(comment):
    """Return the comment lowercased, with newlines handled by remove_newlines."""
    lowered = comment.lower()
    return remove_newlines(lowered)


# def tokenize_aslist(comment):
#     comment = remove_newlines(comment)
#     if comment == []:
#         return None
#     else: 
#         return [word for word in word_tokenize(remove_newlines(comment).lower()) if word not in mystops]

    
def tokenize(comment):
    """Lowercase, word-tokenize and stop-word-filter a comment.

    Parameters
    ----------
    comment : str
        Raw text; newlines are replaced with spaces before tokenizing.

    Returns
    -------
    str or None
        The kept tokens joined by single spaces, or None when the comment
        produces no tokens at all (same contract as tokenize_stem, so such
        rows can be removed with dropna()).
    """
    tokens = word_tokenize(remove_newlines(comment).lower())

    # The earlier guard compared the comment *string* with [], which is never
    # true, so the None branch was unreachable. Test the token list instead.
    if not tokens:
        return None

    # mystops_set holds the same entries as mystops, with constant-time lookup.
    return ' '.join(word for word in tokens if word not in mystops_set)

    
# def tokenize_stem_aslist(comment):
#     tokens = word_tokenize(remove_newlines(comment).lower())
#     if tokens == []:
#         return None
#     else: 
#         return[porter.stem(word) for word in tokens if word not in mystops]
    
def tokenize_stem(comment):
    """Lowercase, word-tokenize, stop-word-filter and Porter-stem a comment.

    Parameters
    ----------
    comment : str
        Raw text; newlines are replaced with spaces before tokenizing.

    Returns
    -------
    str or None
        The stemmed, kept tokens joined by single spaces, or None when the
        comment produces no tokens at all.
    """
    tokens = word_tokenize(remove_newlines(comment).lower())
    if not tokens:
        return None

    # Filter against mystops_set (same entries as the mystops list) so each
    # membership test is a hash lookup instead of a scan of the whole list.
    return ' '.join(porter.stem(word) for word in tokens if word not in mystops_set)


def remove_newlines(comment):
    """Return `comment` with every newline character replaced by one space."""
    newline = re.compile(r"\n")
    return newline.sub(" ", comment)

import & sanity check

A bit of cleaning: remove any comments that have no comment text.

In [None]:
# Location of the scraped comments CSV. The default is the original absolute
# path; set the COMMENTS_CSV environment variable to load the file from
# somewhere else without editing the notebook.
comments_csv = os.environ.get(
    'COMMENTS_CSV',
    '/Users/kateliea/Documents/Insight/project/webscrapers/comments_smittenkitchen_100.csv',
)
# The first CSV column is used as the index.
comments = pd.read_csv(comments_csv, index_col=0)

In [8]:
# commentID mirrors the index so each row keeps its identity after reshaping
comments['commentID'] = comments.index

# remove any comments with no comment text
# (.copy() gives an independent frame, so the assignment below does not
#  write into a slice of the original)
comments = comments[pd.notnull(comments.usercomment)].copy()

# replace NaN usernames with 'anon'
# (assign the result back: an in-place fillna on a selected column is chained
#  assignment, which warns on recent pandas and has no effect under Copy-on-Write)
comments['username'] = comments['username'].fillna('anon')

# sanity check
# recipes.numberofcomments.sum(), len(comments), len(recipes)

### make new dataframe with sentences separated

In [None]:
def preprocess_comments_data(frame):
    """Split comments into sentences and add tokenized text columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Needs `usercomment` and `username` columns. A `commentID` column equal
        to the index is written onto this frame in place (behaviour kept from
        the original implementation); all later steps work on a copy.

    Returns
    -------
    pandas.DataFrame
        One row per sentence (see separate_sentences), with the added columns
        `usercomment_lower`, `tokens`, `tokens_stemmed`, `sentence_tokens` and
        `sentence_tokens_stemmed`. Previously nothing was returned, so the
        computed frame was lost.
    """
    # make sure commentIDs are unique ( = row identity)
    frame['commentID'] = frame.index

    # remove any rows with no comment text; copy so the writes below never
    # target a slice of the caller's frame
    frame = frame[pd.notnull(frame.usercomment)].copy()

    # replace NaN usernames with 'anon' (assigned back instead of a chained in-place fillna)
    frame['username'] = frame['username'].fillna('anon')

    # one row per sentence, merged back onto the comment-level columns
    frame_sentences = separate_sentences(frame, 'commentID', 'usercomment', how='merge')
    # NOTE(review): dropna() without `subset` removes a row when ANY column is
    # missing, not only the text columns — confirm that is intended.
    # .copy() replaces the former no-op drop([]) and makes the frame independent.
    frame_sentences = frame_sentences.dropna().copy()

    # tokenize data: whole-comment columns
    frame_sentences['usercomment'] = frame_sentences.usercomment.apply(remove_newlines)
    frame_sentences['usercomment_lower'] = frame_sentences.usercomment.apply(make_lowercase)

    frame_sentences['tokens'] = frame_sentences.usercomment.apply(tokenize)
    frame_sentences['tokens_stemmed'] = frame_sentences.usercomment.apply(tokenize_stem)

    # sentence-level columns
    frame_sentences['sentence_tokens'] = frame_sentences.sentence.apply(tokenize)
    frame_sentences['sentence_tokens_stemmed'] = frame_sentences.sentence.apply(tokenize_stem)

    # rows for which a tokenizer returned None are dropped here
    return frame_sentences.dropna()

#     frame_sentences['sentence_tokens_aslist'] = frame_sentences.sentence.apply(tokenize_aslist)
#     frame_sentences = frame_sentences.dropna()
#     frame_sentences = frame_sentences.drop([],axis=0)

#     frame_sentences['sentence_tokens_stemmed_aslist'] = frame_sentences.sentence.apply(tokenize_stem_aslist)
#     frame_sentences = frame_sentences.dropna()
#     frame_sentences = frame_sentences.drop([],axis=0)



### tokenize 

In [16]:
# Load the sentence-level comments from a CSV in the working directory; the
# first CSV column is used as the index.
# NOTE(review): the `category` column used as the classifier target further
# down presumably comes from this file — confirm.
comments_sentences = pd.read_csv('comments_classified_SK_filtered2000.csv', index_col=0)

In [18]:
# sanity check on the frame that was just loaded: the recorded output
# (15 columns, including 'category' and 'sentence') matches comments_sentences,
# so show its columns rather than those of the raw `comments` frame
comments_sentences.shape, comments_sentences.columns

((1999, 15),
 Index(['category', 'sentence', 'commentID', 'child_id', 'children',
        'comment_time', 'recipenumber', 'title', 'url', 'usercomment',
        'username', 'usersite', 'usercomment_lower', 'tokens',
        'tokens_stemmed'],
       dtype='object'))

In [19]:
# Tokenize every sentence once, plain and stemmed. Both tokenizers return the
# kept tokens joined by single spaces.
sentence_tokens = comments_sentences.sentence.apply(tokenize)
sentence_tokens_stemmed = comments_sentences.sentence.apply(tokenize_stem)

# The *_aslist columns used to be built with tokenize_aslist /
# tokenize_stem_aslist, but both helpers are commented out in this notebook,
# so this cell raised NameError on a fresh kernel. Splitting the joined strings
# on whitespace yields the token lists (word tokens contain no whitespace);
# na_action='ignore' keeps missing values missing.
comments_sentences = comments_sentences.copy()
comments_sentences['sentence_tokens_aslist'] = sentence_tokens.map(lambda joined: joined.split(), na_action='ignore')
comments_sentences['sentence_tokens_stemmed_aslist'] = sentence_tokens_stemmed.map(lambda joined: joined.split(), na_action='ignore')
comments_sentences['sentence_tokens'] = sentence_tokens
comments_sentences['sentence_tokens_stemmed'] = sentence_tokens_stemmed

# One dropna() replaces the four dropna()/drop([]) pairs: the tokenizers work
# row by row, so dropping incomplete rows once at the end keeps the same rows.
comments_sentences = comments_sentences.dropna()

## word relevancy - tf-idf

In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [22]:
# Series.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values returns the same NumPy array and works on old and new pandas.
X_train = comments_sentences.sentence_tokens_stemmed.values  # stemmed, space-joined sentence tokens
target = comments_sentences.category.values                   # label for each sentence

In [23]:
# Bag-of-words step: a CountVectorizer with default settings.
vectorizer = CountVectorizer()

# Fit on the stemmed sentence strings and transform them in one call;
# the shape is displayed as the cell output.
X_train_counts = vectorizer.fit_transform(X_train)
X_train_counts.shape

(1999, 2456)

In [24]:
# Fit the tf-idf weighting on the count matrix, then re-weight that same matrix.
transformer = TfidfTransformer(use_idf=True)
transformer.fit(X_train_counts)
X_train_tf = transformer.transform(X_train_counts)

# displayed as the cell output
X_train_tf.shape

(1999, 2456)

In [25]:
# sanity check: number of labels, to compare with the row count of X_train_tf
target.shape

(1999,)

## classifier

In [26]:
from sklearn.naive_bayes import MultinomialNB

In [27]:
# Multinomial naive Bayes fitted on the tf-idf features and their labels.
classifier = MultinomialNB()
classifier.fit(X_train_tf, target);  # trailing ';' keeps the cell output empty, as before

In [32]:
# .values replaces .as_matrix(), which was removed in pandas 1.0.
# NOTE(review): the frame had 1999 rows when it was loaded, so [:1999] selects
# the same sentences the classifier was fitted on — this is not a held-out
# test set. The strings would also need vectorizer.transform(...) and
# transformer.transform(...) before being passed to classifier.predict(...).
X_test = comments_sentences.sentence_tokens_stemmed[:1999].values

In [None]:
# NOTE(review): `comments_classified` is not defined anywhere in the visible
# part of this notebook; this cell presumably relies on state from another
# cell or session — confirm, or point it at the intended frame.
comments_classified.head()

## PCA 

In [None]:
from sklearn import decomposition

In [None]:
# PCA needs a dense numeric matrix, but tokens_stemmed is a column of
# space-joined token strings, so fitting on it directly fails. Vectorize the
# text first (counts -> tf-idf, the same steps used for the sentences above)
# and densify the sparse result.
# NOTE(review): assumes `comments` carries a `tokens_stemmed` column — confirm.
X_text = comments.tokens_stemmed.dropna()  # the vectorizer cannot handle missing values
X_counts = CountVectorizer().fit_transform(X_text)
X = TfidfTransformer(use_idf=True).fit_transform(X_counts).toarray()

pca = decomposition.PCA(n_components=10)

# fit the components, then project the same rows onto them
pca.fit(X)
X_trans = pca.transform(X)