# NLP

In [8]:
import re
from string import punctuation, digits, ascii_lowercase

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import (train_test_split,cross_val_score,
                                      StratifiedShuffleSplit)
                                      
%matplotlib inline

In [9]:
reviews = pd.read_csv('yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.csv')

In [10]:
samp = reviews.sample(n=10000)

Define lists of escape sequences, digits, stopwords to use for parseing. Also defines the type of stemming to be used. 

In [11]:
escapes = ''.join([chr(char) for char in range(1, 32)])
removeables = escapes + digits 
stops = [str(word) for word in stopwords.words('english')] + list(ascii_lowercase)
sno = nltk.stem.SnowballStemmer('english')

Create function to process the text and then use it to make a new coloumn in the dataframe where all texted has been processed. 

In [12]:
def parse_text(text, stem=True):
    ''' This function takes a review string and removes all escape sequences,
        digits, punctuation, http links, and stop words. Furthermore, every
        word in the string will be stemmed using nltk's snowball stemmer.
        Every word is also transformed to be lowercase.'''
    
    text = re.sub(r"http\S+", " ", text)
    regex = re.compile('[%s]' % re.escape(punctuation))
    text = regex.sub(' ', text)
    text = text.translate(None, removeables)
    text = text.decode('utf8')
    if stem == True:
        text = ' '.join([sno.stem(word.lower()) for word in text.split() if word.lower() not in set(stops)])
    else:
        text = ' '.join([word.lower() for word in text.split() if word.lower() not in set(stops)])
    return text

In [13]:
samp['parsed_text']=samp.text.apply(parse_text,stem=False)

### Bag of words, tf-idf vectorization
Create the bag of words representation. Find counts of each word in document and in whole courpus. Then create the tfidf representation. Worth also considering binary count vecorizing, supposed to work better for smaller sample sets.

In [14]:
corpus = samp.parsed_text.tolist()

In [15]:
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(corpus)

In [16]:
words = count_vect.get_feature_names()

In [None]:
# len(words) = 20197

### Exploring count data

In [17]:
stars = samp.stars.reshape(10000,1)
counts = X_counts.toarray()

In [18]:
C = np.concatenate((stars,counts),axis=1)

In [19]:
df = pd.DataFrame(C)

In [20]:
df.columns = ['star_rating'] + words

In [21]:
star_counts = dict(samp.stars.value_counts())

In [22]:
groups = df.groupby(['star_rating']).sum().T

In [23]:
for k,v in star_counts.items():
    groups[k] = groups[k]/v

In [25]:
groups.sort([5],ascending=False).head(30)

  if __name__ == '__main__':


star_rating,1,2,3,4,5
great,0.114424,0.237089,0.353175,0.539394,0.583479
place,0.500403,0.647887,0.584921,0.574242,0.496132
food,0.534247,0.793427,0.680159,0.562121,0.436736
good,0.280419,0.637324,0.86746,0.72197,0.389069
time,0.445608,0.444836,0.36746,0.307197,0.329174
service,0.438356,0.484742,0.353175,0.313258,0.316446
like,0.402901,0.610329,0.54127,0.430682,0.309209
one,0.467365,0.471831,0.383333,0.358333,0.284253
best,0.064464,0.123239,0.134127,0.158712,0.282256
love,0.044319,0.116197,0.114286,0.167045,0.273272


### TFIDF

In [26]:
vectorizer = TfidfVectorizer(min_df=1)
X_tfidf = vectorizer.fit_transform(corpus)

In [27]:
tfidfs = X_tfidf.toarray()
T = np.concatenate((stars,tfidfs),axis=1)
df_tfidf = pd.DataFrame(T)
df_tfidf.columns = ['star_rating'] + words
tfidf_groups = df.groupby(['star_rating']).sum().T


In [28]:
for k,v in star_counts.items():
    tfidf_groups[k] = tfidf_groups[k]/v

In [32]:
tfidfs[:5][:5]

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [None]:
y = np.array(samp.stars.tolist())

### Train-Test split

Need to split the dataset into train and test, then use cross val to traing the classifier.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.30,
                                                    stratify=y)

In [None]:
sss = StratifiedShuffleSplit(y_train, 5, test_size=0.3, random_state=0)

### Training a classifier

In [None]:
mnb = MultinomialNB().fit(X_train,y_train)

In [None]:
preds = mnb.predict(X_test)

In [None]:
pd.Series(preds).hist()

In [None]:
pd.Series(y_test).hist()

### Accuracy Metrics

importnatn to consider what is the best accuracy measure to use to test results. notice that star rating is a ordered 