In [1]:
# usual imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib notebook
from sklearn.cross_validation import train_test_split
# Each is a different implemntation of a text transform tool: Bag of Words & Tfidf
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer


#### Let's first play with Yelp data. Earlier, we performed sentiment analysis on this dataset and were able to achieve 80% accuracy using Random Forest.  Let's check and see if we can beat that with our new tools! For this practice project you shall refer to our earlier codes i.e. [notebook 1](https://github.com/ga-students/SF-DAT-20/blob/master/Code/Lecture13.ipynb) and [notebook 2](https://github.com/ga-students/SF-DAT-20/blob/master/Code/Lecture13-Practice-Solution.ipynb)

In [2]:
# let's load data and put it in a dataframe
rows = []
with open('/Users/hamed/Desktop/SF-DAT-20/SF-DAT-20/Data/yelp_labelled.txt') as f:
    for i,line in enumerate(f.readlines()):
        row = (line.split('\n')[0]).split('\t')
        if row[1] == '':
            row[1] = np.nan
        else:
            row[1] = int(row[1])
        rows.append(row)

In [3]:
Yelp_data = pd.DataFrame(rows,columns=['text','sentiment'])
Yelp_data.dropna(inplace = True)
Yelp_data.head()

Unnamed: 0,text,sentiment
0,Wow... Loved this place.,1
3,Crust is not good.,0
4,Not tasty and the texture was just nasty.,0
10,Stopped by during the late May bank holiday of...,1
11,The selection on the menu was great and so wer...,1


#### Split data to 80% training and 20% test set. 

In [9]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
# Resetting our data
X_train,X_test,y_train,y_test = train_test_split(Yelp_data['text'],Yelp_data['sentiment'],test_size=0.2)

#### Here are few libararies we do need from here on

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import BernoulliNB


#### Use Pipeline and define CountVectorizer() as 'vect' and MultiNomial Naive Bayes as your 'clf' - classifier. Then set your parameters to

'vect__min_df':[1,2,3,5,10], 

'vect__max_df':[50,100,150,200,500,1000,1200], 

'clf__alpha':[0,0.1,0.2,0.5,.8,1]


In [11]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('clf', MultinomialNB())])
parameters = {'vect__min_df':[1,2,3,5,10],
              'vect__max_df':[50,100,150,200,500,1000,1200],
              'clf__alpha':[0,0.1,0.2,0.5,.8,1]}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=1)

#### Using GridSearchCV find the best parameters and use it to calculate test error. Did you beat Random Forest?

In [12]:
fit_grid = gs_clf.fit(X_train,y_train)


In [13]:
fit_grid.score(X_test,y_test)  #3% better than Random Forest


0.83999999999999997

#### Use Pipeline and define CountVectorizer() as 'vect' and Bernoulli Naive Bayes as your 'clf' - classifier. Then set your parameters to

'vect__min_df':[1,2,3,5,10], 

'vect__max_df':[50,100,150,200,500,1000,1200], 

'clf__alpha':[0,0.1,0.2,0.5,.8,1]


In [14]:

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('clf', MultinomialNB())])
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('clf', BernoulliNB())])
parameters = {'vect__min_df':[1,2,3,5,10],
              'vect__max_df':[50,100,200,500,1000,1200],
              'clf__alpha':[0,0.1,0.2,0.5,.8,1]}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=1)

#### Using GridSearchCV find the best parameters and use it to calculate test error. Did you beat Random Forest?


In [15]:
fit_grid = gs_clf.fit(X_train,y_train)
fit_grid.score(X_test,y_test)


0.81999999999999995

#### What parameters are chosen by GridSearchCV?

In [16]:
gs_clf.best_params_

{'clf__alpha': 0.1, 'vect__max_df': 200, 'vect__min_df': 1}

#### Now it's time for a new dataset! Let's play with SMS dataset. We would like to develop a model by which filter spam/ham text messages. Let's explore this dataset first.

In [17]:
import pandas as pd
url = "https://raw.githubusercontent.com/ga-students/SF-DAT-20/master/Data/SMSSpamCollection.tsv"
col_names = ['label', 'message']
smsData = pd.read_csv(url, sep='\t', header = 0,names=col_names)
smsData.head(5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [18]:
smsData.shape

(5572, 2)

#### Repeat the procedure you applied on Yelp data on SMS data. Can you get better results by using Bernoulli Naive Bayes or MultiNomial Naive Bayes? What is the best score on test set using best tuning parameters?

In [19]:
X_train,X_test,y_train,y_test = train_test_split(smsData['message'],smsData['label'],test_size=0.2)

In [20]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('clf', MultinomialNB())])
parameters = {'vect__min_df':[1,2,3,5,10],
              'vect__max_df':[50,100,200,500,1000,1200],
              'clf__alpha':[0,0.1,0.2,0.5,.8,1]}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=1)

In [21]:
fit_grid = gs_clf.fit(X_train,y_train)
fit_grid.score(X_test,y_test)

0.98654708520179368

In [22]:
fit_grid.best_params_

{'clf__alpha': 0.2, 'vect__max_df': 200, 'vect__min_df': 3}

In [23]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('clf', BernoulliNB())])
parameters = {'vect__min_df':[1,2,3,5,10],
              'vect__max_df':[50,100,200,500,1000,1200],
              'clf__alpha':[0,0.1,0.2,0.5,.8,1]}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=1)

In [24]:
fit_grid = gs_clf.fit(X_train,y_train)
fit_grid.score(X_test,y_test)

0.99192825112107619

In [25]:
fit_grid.best_params_

{'clf__alpha': 0.1, 'vect__max_df': 1000, 'vect__min_df': 3}

Answer: The best model we found was based on a Bernoulli Naive Bayes algorithm. Our accuracy is more than 99%!

#### Print out misclassified instances in your test set. 

In [26]:
#Misclassified instances
count  = range(len(y_test))
for i in count:
    if fit_grid.predict(X_test)[i] != y_test.values[i]:
        print (X_test.values[i])

You have 1 new message. Please call 08712400200.
Hey...Great deal...Farm tour 9am to 5pm $95/pax, $50 deposit by 16 May
Ever thought about living a good life with a perfect partner? Just txt back NAME and AGE to join the mobile community. (100p/SMS)
Can U get 2 phone NOW? I wanna chat 2 set up meet Call me NOW on 09096102316 U can cum here 2moro Luv JANE xx Calls£1/minmoremobsEMSPOBox45PO139WA
How come it takes so little time for a child who is afraid of the dark to become a teenager who wants to stay out all night?
For sale - arsenal dartboard. Good condition but no doubles or trebles!
You have 1 new message. Call 0207-083-6089
Hello. We need some posh birds and chaps to user trial prods for champneys. Can i put you down? I need your address and dob asap. Ta r
Do you ever notice that when you're driving, anyone going slower than you is an idiot and everyone driving faster than you is a maniac?
