In [23]:
# usual imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib notebook
from sklearn.cross_validation import train_test_split
# Each is a different implemntation of a text transform tool: Bag of Words & Tfidf
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer


#### Let's first play with Yelp data. Earlier, we performed sentiment analysis on this dataset and were able to achieve 80% accuracy using Random Forest.  Let's check and see if we can beat that with our new tools! For this practice project you shall refer to our earlier codes i.e. [notebook 1](https://github.com/ga-students/SF-DAT-20/blob/master/Code/Lecture13.ipynb) and [notebook 2](https://github.com/ga-students/SF-DAT-20/blob/master/Code/Lecture13-Practice-Solution.ipynb)

In [24]:
# let's load data and put it in a dataframe
rows = []
with open('/Users/karla/SF-dat/SF-DAT-20/Data/yelp_labelled.txt') as f:
    for i,line in enumerate(f.readlines()):
        row = (line.split('\n')[0]).split('\t')
        if row[1] == '':
            row[1] = np.nan
        else:
            row[1] = int(row[1])
        rows.append(row)

In [25]:
Yelp_data = pd.DataFrame(rows,columns=['text','sentiment'])
Yelp_data.dropna(inplace = True)
Yelp_data.head()

Unnamed: 0,text,sentiment
0,Wow... Loved this place.,1
3,Crust is not good.,0
4,Not tasty and the texture was just nasty.,0
10,Stopped by during the late May bank holiday of...,1
11,The selection on the menu was great and so wer...,1


#### Split data to 80% training and 20% test set. 

In [26]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
count_vect = CountVectorizer(stop_words='english')
bag_o_words = count_vect.fit_transform(Yelp_data['text'])
bag_o_words


<1000x1820 sparse matrix of type '<type 'numpy.int64'>'
	with 4904 stored elements in Compressed Sparse Row format>

In [27]:
count_vect.get_feature_names()


[u'00',
 u'10',
 u'100',
 u'11',
 u'12',
 u'15',
 u'17',
 u'1979',
 u'20',
 u'2007',
 u'23',
 u'30',
 u'30s',
 u'35',
 u'40',
 u'40min',
 u'45',
 u'4ths',
 u'5lb',
 u'70',
 u'85',
 u'90',
 u'99',
 u'absolute',
 u'absolutely',
 u'absolutley',
 u'accident',
 u'accommodations',
 u'accomodate',
 u'accordingly',
 u'accountant',
 u'ache',
 u'acknowledged',
 u'actual',
 u'actually',
 u'added',
 u'affordable',
 u'afternoon',
 u'ago',
 u'ahead',
 u'airline',
 u'airport',
 u'ala',
 u'albondigas',
 u'allergy',
 u'almonds',
 u'amazing',
 u'ambiance',
 u'ambience',
 u'ample',
 u'andddd',
 u'angry',
 u'annoying',
 u'anticipated',
 u'anymore',
 u'anytime',
 u'anyways',
 u'apart',
 u'apologize',
 u'apology',
 u'app',
 u'appalling',
 u'apparently',
 u'appealing',
 u'appetite',
 u'appetizer',
 u'appetizers',
 u'apple',
 u'approval',
 u'area',
 u'aren',
 u'arepas',
 u'aria',
 u'array',
 u'arrived',
 u'arrives',
 u'arriving',
 u'article',
 u'ask',
 u'asked',
 u'asking',
 u'assure',
 u'ate',
 u'atmosphere'

#### Here are few libararies we do need from here on

In [28]:
y = Yelp_data['sentiment']
X = Yelp_data['text']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2)


In [15]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import BernoulliNB


#### Use Pipeline and define CountVectorizer() as 'vect' and MultiNomial Naive Bayes as your 'clf' - classifier. Then set your parameters to

'vect__min_df':[1,2,3,5,10], 

'vect__max_df':[50,100,150,200,500,1000,1200], 

'clf__alpha':[0,0.1,0.2,0.5,.8,1]


In [29]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('clf', MultinomialNB())])


#### Using GridSearchCV find the best parameters and use it to calculate test error. Did you beat Random Forest?

In [32]:
parameters = {'vect__min_df':[1,2,3,5,10],
              'vect__max_df':[50,100,150,200,500,1000,1200],
              'clf__alpha':[0,0.1,0.2,0.5,.8,1]}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=1)


In [36]:
fit_grid = gs_clf.fit(X_train,y_train)
print fit_grid.score(X_test,y_test)
print fit_grid.best_params_

0.805
{'vect__max_df': 100, 'vect__min_df': 1, 'clf__alpha': 1}


#### Use Pipeline and define CountVectorizer() as 'vect' and Bernoulli Naive Bayes as your 'clf' - classifier. Then set your parameters to

'vect__min_df':[1,2,3,5,10], 

'vect__max_df':[50,100,150,200,500,1000,1200], 

'clf__alpha':[0,0.1,0.2,0.5,.8,1]


In [37]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('clf', BernoulliNB())])
parameters = {'vect__min_df':[1,2,3,5,10],
              'vect__max_df':[50,100,150,200,500,1000,1200],
              'clf__alpha':[0,0.1,0.2,0.5,.8,1]}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=1)



#### Using GridSearchCV find the best parameters and use it to calculate test error. Did you beat Random Forest?


In [38]:
fit_grid = gs_clf.fit(X_train,y_train)
print fit_grid.score(X_test,y_test)
print fit_grid.best_params_


0.795
{'vect__max_df': 200, 'vect__min_df': 1, 'clf__alpha': 0.2}


#### What parameters are chosen by GridSearchCV?

In [None]:
This Bernoulli model gives the worst result.

#### Now it's time for a new dataset! Let's play with SMS dataset. We would like to develop a model by which filter spam/ham text messages. Let's explore this dataset first.

In [None]:
import pandas as pd
url = "https://raw.githubusercontent.com/ga-students/SF-DAT-20/master/Data/SMSSpamCollection.tsv"
col_names = ['label', 'message']
smsData = pd.read_csv(url, sep='\t', header = 0,names=col_names)
smsData.head(5)

In [None]:
smsData.shape

#### Repeat the procedure you applied on Yelp data on SMS data. Can you get better results by using Bernoulli Naive Bayes or MultiNomial Naive Bayes? What is the best score on test set using best tuning parameters?

Answer: 

#### Print out misclassified instances in your test set. 