# Init

## Import necessary packages

In [40]:
import pandas as pd
import re
import string
import nltk
import matplotlib.pyplot as plt
import numpy as np

stopwords = nltk.corpus.stopwords.words('english')
# for special words you don't want to lemmatize
nolem = ['sas','python']

wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()

## Define Step by step text cleaning function

In [41]:
def clean_text(text, join=False, stem='lemmatize'):
    # lower case and remove punctuation, here word is char so need '' to join these char back
    text = ''.join([word.lower() for word in text if word not in string.punctuation])
    
    # break down to tokens (single word)
    tokens = re.split('\W+', text)
    
    # special words no lemmatize, and remove stop words
    # elif list comprehension: [if-1 condition_1 else_2 for_statement <condition_2>]
    
    if stem == 'lemmatize':
        if join == True:
            text = ' '.join([word if word in nolem else wn.lemmatize(word) for word in tokens if word not in stopwords ])
        else:
            text = [word if word in nolem else wn.lemmatize(word) for word in tokens if word not in stopwords ]
    elif stem == 'stem':
        if join == True:
            text = ' '.join([word if word in nolem else ps.stem(word) for word in tokens if word not in stopwords ])
        else:
            text = [word if word in nolem else ps.stem(word) for word in tokens if word not in stopwords ]
    #nostop = [word for word in tokens if word not in stopwords]
    #text = [wn.lemmatize(word) if word not in nolem else word for word in nostop]
    #text=[]
    #for word in tokens:
    #    if word in nolem:
    #        text.append(word)
    #    elif word not in stopwords:
    #        text.append(wn.lemmatize(word))
    return text

# Load data via Mongo

In [42]:
import pymongo
from pymongo import MongoClient
import pprint
from bson.son import SON

In [43]:
# create connection
client = MongoClient()
db = client.Yelp

In [44]:
# An example to print one record. The dictionary{} is optional.
pprint.pprint(db.review.find_one({'stars' : 4}))

{'_id': ObjectId('5bb546f235e9995049a63b7e'),
 'business_id': 'yEOu75XjwczngvWWlr0M_A',
 'cool': 0,
 'date': '2016-02-17',
 'funny': 0,
 'review_id': '-STQDcMcBcWX0H_NrsfR2Q',
 'stars': 4,
 'text': 'Great for the price, only $25 for long hair! Was hesitant about dry '
         'cut but it turned out good!',
 'useful': 1,
 'user_id': 'LKcdcMrq2xDzIThSHiZmLg'}


In [45]:
print('There are',len(db.business.distinct('business_id')),'businesses in',len(db.business.distinct('city')),'cities')

There are 188593 businesses in 1111 cities


## Aggregation in Mongo

In [46]:
# 1st step create pipeline
pipeline = [
    {"$unwind": "$city"},
    {"$group": {"_id": "$city",
                "count": {"$sum": 1}}
    },
    {"$sort": SON([("count", -1), ("_id", -1)])}
]

In [47]:
# print result
pprint.pprint(list(db.business.aggregate(pipeline)))

[{'_id': 'Las Vegas', 'count': 28865},
 {'_id': 'Phoenix', 'count': 18633},
 {'_id': 'Toronto', 'count': 18233},
 {'_id': 'Charlotte', 'count': 9204},
 {'_id': 'Scottsdale', 'count': 8822},
 {'_id': 'Calgary', 'count': 7384},
 {'_id': 'Pittsburgh', 'count': 6804},
 {'_id': 'Mesa', 'count': 6239},
 {'_id': 'Montréal', 'count': 6045},
 {'_id': 'Henderson', 'count': 4815},
 {'_id': 'Tempe', 'count': 4492},
 {'_id': 'Chandler', 'count': 4272},
 {'_id': 'Madison', 'count': 3509},
 {'_id': 'Cleveland', 'count': 3506},
 {'_id': 'Glendale', 'count': 3469},
 {'_id': 'Gilbert', 'count': 3397},
 {'_id': 'Mississauga', 'count': 2954},
 {'_id': 'Peoria', 'count': 1868},
 {'_id': 'Markham', 'count': 1699},
 {'_id': 'North Las Vegas', 'count': 1508},
 {'_id': 'Champaign', 'count': 1243},
 {'_id': 'Scarborough', 'count': 1175},
 {'_id': 'North York', 'count': 1140},
 {'_id': 'Surprise', 'count': 1119},
 {'_id': 'Richmond Hill', 'count': 978},
 {'_id': 'Concord', 'count': 975},
 {'_id': 'Brampton', 'co

In [47]:
# The result of lmit() is a cursor object.
# You have to use for loop to print out the cursor.

# for data in db.review.find().limit(2):
#     pprint.pprint(data)

In [48]:
# reading Mongo result into pandas
df=pd.DataFrame(list(db.review.find().limit(1000)))

In [49]:
df.shape

(1000, 10)

In [50]:
df.head()

Unnamed: 0,_id,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,5bb546f235e9995049a63b77,pomGBqfbxcqPv14c3XH-ZQ,0,2012-11-13,0,dDl8zu1vWPdKGihJrwQbpw,5,I love this place! My fiance And I go here atl...,0,msQe1u7Z_XuqjGoqhB0J5g
1,5bb546f235e9995049a63b78,iCQpiavjjPzJ5_3gPD5Ebg,0,2011-02-25,0,x7mDIiDB3jEiPGPHOmDzyw,2,The pizza was okay. Not the best I've had. I p...,0,msQe1u7Z_XuqjGoqhB0J5g
2,5bb546f235e9995049a63b79,elqbBhBfElMNSrjFqW3now,0,2011-02-25,0,Er4NBWCmCD4nM8_p1GRdow,2,Back in 2005-2007 this place was my FAVORITE t...,2,msQe1u7Z_XuqjGoqhB0J5g
3,5bb546f235e9995049a63b7a,jtQARsP6P-LbkyjbO1qNGg,1,2014-10-23,1,LZp4UX5zK3e-c5ZGSeo3kA,1,Terrible. Dry corn bread. Rib tips were all fa...,3,msQe1u7Z_XuqjGoqhB0J5g
4,5bb546f235e9995049a63b7b,yFumR3CWzpfvTH2FCthvVw,0,2016-06-15,0,STiFMww2z31siPY7BWNC2g,5,I have been an Emerald Club member for a numbe...,0,TlvV-xJhmh7LCwJYXkV-cg


# Clean Text with ntlk package

## Clean text

In [14]:
df['review']=df['text'].apply(clean_text)

# Apply CountVectorizer

In [57]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(df['text'])
print(X_counts.shape)
print(count_vect.get_feature_names())
print(X_counts)

# Vectorizers output sparse matrices
X_counts_df=pd.DataFrame(X_counts.toarray())
X_counts_df.columns = count_vect.get_feature_names()
X_counts_df.head()

(1000, 9237)
  (0, 2432)	1
  (0, 7052)	1
  (0, 2799)	1
  (0, 1608)	1
  (0, 8515)	1
  (0, 6794)	1
  (0, 8261)	1
  (0, 1438)	1
  (0, 5440)	1
  (0, 7971)	1
  (0, 4692)	1
  (0, 7655)	1
  (0, 4880)	1
  (0, 3653)	1
  (0, 677)	1
  (0, 1448)	1
  (0, 503)	2
  (0, 3289)	1
  (0, 4037)	1
  (0, 6203)	1
  (0, 8970)	1
  (0, 714)	1
  (0, 3572)	1
  (0, 3126)	1
  (0, 6093)	1
  :	:
  (999, 7992)	1
  (999, 9146)	2
  (999, 8059)	2
  (999, 8886)	2
  (999, 8516)	1
  (999, 1596)	1
  (999, 2896)	1
  (999, 7651)	1
  (999, 2424)	1
  (999, 5692)	2
  (999, 6310)	1
  (999, 1894)	1
  (999, 8201)	1
  (999, 9125)	1
  (999, 9128)	1
  (999, 7220)	1
  (999, 3611)	2
  (999, 2901)	1
  (999, 3520)	1
  (999, 2575)	1
  (999, 5958)	2
  (999, 3580)	1
  (999, 3289)	4
  (999, 3572)	1
  (999, 6093)	2


Unnamed: 0,Unnamed: 1,0,025,063,070,0700,099,1,10,100,...,zio,zippoooo,zipps,zizzy,zone,zoot,zteca,zucchini,zz,à
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Apply CountVectorizer (w/ N-Grams)

In [58]:
ngram_vect = CountVectorizer(ngram_range=(1,2))
X_counts_ngram=ngram_vect.fit_transform(df['text'])
print(X_counts_ngram.shape)
print(ngram_vect.get_feature_names())

X_counts_ngram_df=pd.DataFrame(X_counts_ngram.toarray())
X_counts_ngram_df.columns=ngram_vect.get_feature_names()
X_counts_ngram_df.head()

(1000, 76609)


Unnamed: 0,00,00 add,00 along,00 delivery,00 dish,00 figuring,00 for,00 found,00 icees,00 it,...,zoot,zoot suits,zucchini,zucchini fries,zucchini fritti,zucchini hot,zucchini light,zucchini red,zz,zz ward
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
ngram_vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

# Apply CountVectorizer TF-IDF

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(df['text'])
print(X_tfidf.shape)
print(tfidf_vect.get_feature_names())
X_tfidf_df=pd.DataFrame(X_tfidf.toarray())
X_tfidf_df.columns=tfidf_vect.get_feature_names()
X_tfidf_df.head()

(1000, 9237)


Unnamed: 0,Unnamed: 1,0,025,063,070,0700,099,1,10,100,...,zio,zippoooo,zipps,zizzy,zone,zoot,zteca,zucchini,zz,à
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Feature Engineering: Feature Creation

# Create feature for text message length

# Create feature for % of text that is punctuation

In [11]:
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(' ')), 3) * 100
tipsdf['punct%']=tipsdf['TIPS'].apply(lambda x: count_punct(x))

In [12]:
tipsdf.head()

Unnamed: 0,TIPS,label,year,TIPS_len,punct%
0,Don't be afraid to ask a ton of questions.,Ask Questions,2018,34,5.9
1,"More specific questions yield more helpful information. When you ask for clarification, try to f...",Ask Questions,2018,158,2.5
2,Always ask questions in class. It's easier to clear it up while you're learning it rather than t...,Ask Questions,2018,102,3.9
3,Ask questions! Not only will this help you learn material faster but learning to ask the right q...,Ask Questions,2018,157,1.3
4,Make friends with people around and be a nice person. Enjoy your time at IAA.,Building Relationships/Teamwork,2018,63,3.2


# Evaluate created features

In [10]:
tipsdf['TIPS_len']=tipsdf['TIPS'].apply(lambda x: len(x) - x.count(' '))

In [39]:
from matplotlib import pyplot
%matplotlib inline

In [14]:
'''
bins=np.linspace(0,200,40)

pyplot.hist(tipsdf['TIPS_len'],bins, alpha=0.5, normed=True)
#pyplot.hist(tipsdf[''],bins)
'''

"\nbins=np.linspace(0,200,40)\n\npyplot.hist(tipsdf['TIPS_len'],bins, alpha=0.5, normed=True)\n#pyplot.hist(tipsdf[''],bins)\n"

In [15]:
'''
bins=np.linspace(0,50,40)

pyplot.hist(tipsdf['punct%'],bins, alpha=0.5, normed=True, label='punct%')
pyplot.legend(loc='upper right')
pyplot.show()
'''

"\nbins=np.linspace(0,50,40)\n\npyplot.hist(tipsdf['punct%'],bins, alpha=0.5, normed=True, label='punct%')\npyplot.legend(loc='upper right')\npyplot.show()\n"

# Feature Engineering: Transformations

In [16]:
'''
for i in [1,2,3,4,5]:
    pyplot.hist(tipsdf['punct%']**(1/i), bins=40)
    pyplot.title('Transformation: 1/{}'.format(i))
    pyplot.show()
'''

"\nfor i in [1,2,3,4,5]:\n    pyplot.hist(tipsdf['punct%']**(1/i), bins=40)\n    pyplot.title('Transformation: 1/{}'.format(i))\n    pyplot.show()\n"

# Machine Learning!

In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

print(dir(RandomForestClassifier))
print(RandomForestClassifier())

['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_cache', '_abc_negative_cache', '_abc_negative_cache_version', '_abc_registry', '_estimator_type', '_get_param_names', '_make_estimator', '_set_oob_score', '_validate_X_predict', '_validate_estimator', '_validate_y_class_weight', 'apply', 'decision_path', 'feature_importances_', 'fit', 'get_params', 'predict', 'predict_log_proba', 'predict_proba', 'score', 'set_params']
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0,

In [41]:
tfidf_vect=TfidfVectorizer(analyzer=clean_text)
X_tfidf=tfidf_vect.fit_transform(df['text'])

In [42]:
X_features=pd.concat([df['text'], tipsdf['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)

NameError: name 'tipsdf' is not defined

In [20]:
from sklearn.model_selection import KFold, cross_val_score

In [21]:
# run in parellel
rf=RandomForestClassifier(n_jobs=-1)
k_fold=KFold(n_splits=10)
cross_val_score(rf, X_features, tipsdf['label'], cv=k_fold, n_jobs=-1)

array([0.5       , 0.64545455, 0.38181818, 0.63636364, 0.46363636,
       0.4       , 0.31818182, 0.50909091, 0.36363636, 0.29090909])

In [22]:
tipsdf['TIPS_cleaned']=tipsdf['TIPS'].apply(lambda x: clean_text(x))

In [23]:
# using set-list to convert a unique list

def ulist(lists):
    return list(set(lists))

tipsdf['TIPS_unique']=tipsdf['TIPS_cleaned'].apply(lambda x: ulist(x))
#tipsdf['TIPS_ngram_unique']=tipsdf['TIPS_ngrams'].apply(lambda x: ulist(x))

In [24]:
# create dataframe of word frequency for each year

yearwordlist={}
for year in tipsyear:
    wordlist= {}
    for tips in tipsdf[tipsdf['year']==year]['TIPS_unique']:
        for word in tips:
            wordlist[word] = wordlist.get(word,0) + 1
    yearwordlist[year]=wordlist
    
yeardf=pd.DataFrame(yearwordlist)

In [24]:
language1=language.divide(year_people, axis=0)

# Sentiment

In [63]:
from sentiment_module import sentiment
term = 'happy'
print(sentiment.exist( term ))
print(sentiment.sentiment( term )

True
{'valence': 8.21, 'arousal': 6.49}


In [66]:
df.sentiment.sentiment("it was the best of times it was the worst of times".split())

{'valence': 5.0307617694606375, 'arousal': 4.939546556471719}