In [4]:
# Inject a CSS rule that stretches the notebook's `.container` element to the full browser width.
# NOTE(review): `float:center` is not a valid CSS `float` value, so browsers ignore that declaration — confirm intent.
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; float:center}</style>")

In [5]:
from __future__ import division, print_function
import os
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment import SentimentAnalyzer
porter = PorterStemmer()

## set styles, stopwords, define functions

In [6]:
# NLTK's English stopword list, fetched once and reused for every collection below.
english_stopwords = stopwords.words('english')

# Punctuation marks and contraction fragments to filter out together with the stopwords.
punctuation = ['.',',',':','!',';','-','?','"',"'",'(',')','—']
# Candidates considered but currently disabled:
# 'deb','hideb','don','didn','twaittry','doesn','thank','heydeb'
other = ['ive','ve', "i've", "i'v", 'i’ll', 'i’ve', 'i’v']

# Set versions for fast membership tests.
stops = set(english_stopwords)
stops_punc = stops.union(punctuation)

# List and set forms of the full filter (stopwords + punctuation + extra fragments).
mystops = english_stopwords + punctuation + other
mystops_set = set(mystops)

## keyword dictionary for training

In [7]:
# subs = ['had trouble','had issue','exchange','recommend','change','easier','gluten free','gluten-free','vegan','vegetarian','easier','replace with','replac','adjust','suggest','swap','switch','instead of','in place of','substitute','replace','i use','to make it','customiz','adjust','instead','exchang','opted to use','better with']
# add = ['will ad','will do','next time','going to add','add more','better with','includ','i up','should have ad']
# omit = ['omit','leave out','remov','left out','eliminat','add less','forgot','decreas','dispensed with','delet','reduc','should have left out']

# Keyword lists used by the classification cell: a sentence is labelled with a
# category when it contains one of that category's phrases (tested with
# Series.str.contains on the 'sentence' column).
# Several entries are truncated (e.g. 'remov', 'includ'), presumably so that one
# entry matches several word forms — TODO confirm.

# Phrases that label a sentence as a 'substitution'.
subs = ['exchange','recommend','easier','replace with','substitute','replace','i use','to make it','instead','exchang']

# Phrases that label a sentence as an 'addition'.
add = ['will ad','will do','next time','going to add','add more','better with','includ','i up','should have ad']

# Phrases that label a sentence as an 'omission'.
omit = ['omit','leave out','remov','eliminat','add less','forgot','decreas','should have left out']

In [8]:
def tokenize_sentences(comment):
    """Split a comment into a list of lowercased sentences.

    The comment is lowercased, its newline characters are replaced with
    spaces (via ``remove_newlines``), and the result is handed to
    ``nltk.sent_tokenize``.
    """
    flattened = remove_newlines(comment.lower())
    return nltk.sent_tokenize(flattened)
    
# slightly faster version
def separate_sentences(frame, identifier, paragraph, how='merge'):
    """Explode a text column into one row per sentence.

    Parameters
    ----------
    frame : pandas.DataFrame
        Source table.
    identifier : str
        Column whose values tag each sentence with the row it came from.
    paragraph : str
        Column holding the text to split with ``tokenize_sentences``.
    how : str, default 'merge'
        'nomerge' returns only the ['sentence', identifier] table; 'merge'
        (and any other value) outer-merges that table back onto ``frame``.

    Returns
    -------
    pandas.DataFrame
    """
    # One list of sentences per input row, indexed by the identifier column.
    per_row_sentences = (tokenize_sentences(row[paragraph]) for _, row in frame.iterrows())
    stacked = pd.DataFrame(per_row_sentences, index=frame[identifier]).stack()

    # After reset_index the sentence text sits in the column labelled 0.
    sentences = stacked.reset_index()[[0, identifier]]
    sentences.columns = ['sentence', identifier]

    if how == 'nomerge':
        return sentences

    # 'merge' and every unrecognised value take the same path.
    return sentences.merge(frame, left_on=identifier, right_on=identifier, how='outer', sort=False, suffixes=('_l','_r'))


def make_lowercase(comment):
    """Return the comment lowercased, with newline characters replaced by spaces."""
    lowered = comment.lower()
    return remove_newlines(lowered)

   
def tokenize(comment):
    """Tokenize a comment and return its tokens joined by single spaces.

    Newlines are replaced with spaces and the text is lowercased before it
    is passed to ``word_tokenize``.

    Returns
    -------
    str or None
        The space-joined tokens, or None when the comment yields no tokens.
        (The previous check compared the string against ``[]``, which is
        never true, so the None branch could not be reached; the check is
        now made on the token list, as in ``tokenize_stem``.)
    """
    tokens = word_tokenize(remove_newlines(comment).lower())
    if not tokens:
        return None
    return ' '.join(tokens)

def ngram(comment, n=None):
    """Return the word n-grams of a comment as a list of tuples.

    Parameters
    ----------
    comment : str
        Text to process; newlines are replaced with spaces, the text is
        lowercased and then split on whitespace.
    n : int, optional
        Size of the n-grams. When omitted, the notebook-level variable
        ``gram`` is used, which keeps the earlier calling convention
        (``gram = 2`` followed by ``.apply(ngram)``) working.

    Returns
    -------
    list of tuple, or None
        None when the comment contains no words. (The previous check
        compared the string against ``[]`` and therefore never fired.)

    Raises
    ------
    NameError
        If ``n`` is omitted and no global ``gram`` has been defined.
    """
    if n is None:
        # Legacy fallback: read the size from the module-level `gram`.
        n = gram
    words = remove_newlines(comment).lower().split()
    if not words:
        return None
    return list(ngrams(words, n))
    
def tokenize_stem(comment):
    """Tokenize a comment, Porter-stem every token and join the stems with spaces.

    Newlines are replaced with spaces and the text is lowercased before
    tokenizing. Returns None when the comment yields no tokens.
    """
    words = word_tokenize(remove_newlines(comment).lower())
    if not words:
        return None
    stems = (porter.stem(word) for word in words)
    return ' '.join(stems)

def tokenize_stem_stops(comment):
    """Tokenize a comment, drop filtered tokens, stem the rest and join with spaces.

    Newlines are replaced with spaces and the text is lowercased before
    tokenizing. Tokens found in ``mystops_set`` (stopwords, punctuation and
    the extra fragments) are removed before stemming.

    Returns
    -------
    str or None
        None when the comment yields no tokens at all. If every token is
        filtered out, the result is an empty string rather than None.
    """
    tokens = word_tokenize(remove_newlines(comment).lower())
    if not tokens:
        return None
    # Membership is tested against the set (same entries as the `mystops`
    # list) so each lookup does not scan the whole list.
    return ' '.join(porter.stem(word) for word in tokens if word not in mystops_set)


def remove_newlines(comment):
    """Return ``comment`` with every newline character replaced by one space."""
    newline_pattern = r"\n"
    replacement = " "
    return re.sub(newline_pattern, replacement, comment)


def preprocess_comments_data(frame):
    """Clean a comments table and derive a one-row-per-sentence table from it.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'usercomment' and 'username'. The frame is
        copied first, so the caller's object is left untouched.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``comments``: the cleaned comments with the added columns
        'commentID', 'usercomment_lower', 'tokens' and 'tokens_stemmed'.
        ``sentences``: one row per sentence, merged with the comment columns
        and extended with 'sentence_tokens', 'sentence_tokens_stemmed_stops'
        and 'sentence_tokens_stemmed'.

    Notes
    -----
    Rows with a missing value in ANY column are dropped from both results.
    Original index labels are kept, so the returned indexes may have gaps.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    comments = frame.copy()

    # commentID mirrors the row index (= row identity).
    comments['commentID'] = comments.index

    # Remove rows with no comment text; copy so later assignments write to
    # an independent frame rather than a slice of the input.
    comments = comments.loc[comments['usercomment'].notnull()].copy()

    # Replace NaN usernames with 'anon'. Assigning the result back is
    # required: an in-place fillna on a selected column is not guaranteed
    # to reach the frame.
    comments['username'] = comments['username'].fillna('anon')

    # Normalise and tokenize the comment text.
    comments['usercomment'] = comments['usercomment'].apply(remove_newlines)
    comments['usercomment_lower'] = comments['usercomment'].apply(make_lowercase)
    comments['tokens'] = comments['usercomment'].apply(tokenize)
    comments['tokens_stemmed'] = comments['usercomment'].apply(tokenize_stem)

    comments = comments.dropna()

    # One row per sentence. Drop incomplete rows before tokenizing, since
    # the tokenizers cannot handle a missing sentence.
    sentences = separate_sentences(comments, 'commentID', 'usercomment', how='merge')
    sentences = sentences.dropna().copy()

    sentences['sentence_tokens'] = sentences['sentence'].apply(tokenize)
    sentences['sentence_tokens_stemmed_stops'] = sentences['sentence'].apply(tokenize_stem_stops)
    sentences['sentence_tokens_stemmed'] = sentences['sentence'].apply(tokenize_stem)

    # Tokenizers return None for sentences without tokens; drop those rows.
    sentences = sentences.dropna()

    return comments, sentences


## import data & sanity check

In [9]:
fname = 'training_data'

# NOTE(review): absolute, machine-specific path — this cell only runs where the file exists.
comments_csv_path = '/Users/kateliea/Documents/Insight/project/data/partial_SK_data/comments_only_100.csv'

# The first CSV column becomes the row index.
comments = pd.read_csv(comments_csv_path, index_col=0)
comments.columns

Index(['child_id', 'children', 'commentID', 'comment_time', 'recipenumber',
       'title', 'url', 'usercomment', 'username', 'usersite',
       'usercomment_lower', 'tokens', 'tokens_stemmed'],
      dtype='object')

## make new dataframe with sentences separated, tokenize everything

In [10]:
# Build both tables: the cleaned per-comment frame and the per-sentence frame
# (one row per sentence, merged with the comment columns and tokenized).
comments_only, comments_with_sentences = preprocess_comments_data(comments)

In [11]:
comments_with_sentences.columns

Index(['sentence', 'commentID', 'child_id', 'children', 'comment_time',
       'recipenumber', 'title', 'url', 'usercomment', 'username', 'usersite',
       'usercomment_lower', 'tokens', 'tokens_stemmed', 'sentence_tokens',
       'sentence_tokens_stemmed_stops', 'sentence_tokens_stemmed'],
      dtype='object')

## classification of training data

In [12]:
# Label each sentence by keyword lookup; sentences matching nothing stay 'other'.
# Order matters: later entries overwrite earlier ones, so a sentence matching
# several lists ends up as addition > substitution > omission.
keyword_categories = [
    ('omission', omit),
    ('substitution', subs),
    ('addition', add),
]

comments_with_sentences['category'] = 'other'

for category_label, phrases in keyword_categories:
    for phrase in phrases:
        # regex=False: the phrases are plain substrings, not patterns.
        # na=False: a missing sentence never counts as a match.
        is_match = comments_with_sentences['sentence'].str.contains(phrase, regex=False, na=False)
        comments_with_sentences.loc[is_match, 'category'] = category_label

# Non-null sentence count per category; categories with no rows report 0.
sentence_counts = comments_with_sentences.groupby('category')['sentence'].count()
print('other, %i; additions, %i; substitutions, %i; omissions, %i' % tuple(
    sentence_counts.get(label, 0) for label in ('other', 'addition', 'substitution', 'omission')))

print(comments_with_sentences.columns)

other, 91915; additions, 1053; substitutions, 3942; omissions, 406
Index(['sentence', 'commentID', 'child_id', 'children', 'comment_time',
       'recipenumber', 'title', 'url', 'usercomment', 'username', 'usersite',
       'usercomment_lower', 'tokens', 'tokens_stemmed', 'sentence_tokens',
       'sentence_tokens_stemmed_stops', 'sentence_tokens_stemmed', 'category'],
      dtype='object')


In [13]:
# Drop incomplete rows, then collapse the three keyword categories into a
# single 'helpful' label (everything else keeps its original label).
comments_with_sentences = comments_with_sentences.dropna()

helpful_categories = ['addition','substitution','omission']
simplified = comments_with_sentences['category'].replace(helpful_categories, 'helpful')
comments_with_sentences['category_simple'] = simplified

comments_with_sentences['category_simple'].unique()

array(['other', 'helpful'], dtype=object)

In [16]:
# Score every sentence with VADER and store the four polarity components in
# the columns sent_neg, sent_neu, sent_pos and sent_compound.
score_keys = ['neg', 'neu', 'pos', 'compound']

# Build the analyzer once instead of once per sentence.
sentiment_analyzer = SentimentIntensityAnalyzer()
polarity_rows = [sentiment_analyzer.polarity_scores(sentence)
                 for sentence in comments_with_sentences['sentence']]
polarity_frame = pd.DataFrame(polarity_rows, columns=score_keys)

# Assign by position (.values). The frame's index has gaps after the earlier
# dropna calls, so using the enumerate counter as a .loc label would write
# scores to the wrong rows and add new rows for labels that do not exist.
for key in score_keys:
    comments_with_sentences['sent_' + key] = polarity_frame[key].values

In [None]:
def analyze_sentiment(sentence, analyzer=None):
    """Return the VADER polarity scores for one sentence.

    Parameters
    ----------
    sentence : str
        Text to score.
    analyzer : object, optional
        Any object with a ``polarity_scores(text)`` method. Pass a shared
        ``SentimentIntensityAnalyzer`` when scoring many sentences so it is
        not rebuilt on every call; when omitted, a new one is created.

    Returns
    -------
    dict
        Whatever ``analyzer.polarity_scores(sentence)`` returns; the
        notebook reads the keys 'neg', 'neu', 'pos' and 'compound' from it.
    """
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
    return analyzer.polarity_scores(sentence)

In [17]:
# Write the per-sentence table to '<fname>.csv' in the current working directory.
fname = 'training_data'
output_csv_path = fname + '.csv'
comments_with_sentences.to_csv(output_csv_path)