In [7]:
# usual imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


%matplotlib notebook

from sklearn.cross_validation import train_test_split

# Each is a different implemntation of a text transform tool: Bag of Words & Tfidf
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

#### read yelp_labelled data and split it using \n and \t

In [4]:
df = pd.read_csv("../Data/yelp_labelled.txt",
                 names=['Text','Sentiment'],
                 na_values=np.nan,sep='\t',
                 error_bad_lines=False)
df.count()

Text         3729
Sentiment    1000
dtype: int64

#### Put your yelp data into a dataframe and drop na values.

In [5]:
df.dropna(inplace=True)
df.count()

Text         1000
Sentiment    1000
dtype: int64

#### Using Pipeline, RandomForestClasifier, and GridSearchCV, play with min_df and max_df on your yelp data. Split your data to test and training. You can use either of CountVetorizer or TfidfVectorizer

In [8]:
count_vect = CountVectorizer(stop_words="english")
word_bag = count_vect.fit_transform(df["Text"])
word_bag

<1000x1820 sparse matrix of type '<type 'numpy.int64'>'
	with 4904 stored elements in Compressed Sparse Row format>

In [9]:
count_vect.get_feature_names()

[u'00',
 u'10',
 u'100',
 u'11',
 u'12',
 u'15',
 u'17',
 u'1979',
 u'20',
 u'2007',
 u'23',
 u'30',
 u'30s',
 u'35',
 u'40',
 u'40min',
 u'45',
 u'4ths',
 u'5lb',
 u'70',
 u'85',
 u'90',
 u'99',
 u'absolute',
 u'absolutely',
 u'absolutley',
 u'accident',
 u'accommodations',
 u'accomodate',
 u'accordingly',
 u'accountant',
 u'ache',
 u'acknowledged',
 u'actual',
 u'actually',
 u'added',
 u'affordable',
 u'afternoon',
 u'ago',
 u'ahead',
 u'airline',
 u'airport',
 u'ala',
 u'albondigas',
 u'allergy',
 u'almonds',
 u'amazing',
 u'ambiance',
 u'ambience',
 u'ample',
 u'andddd',
 u'angry',
 u'annoying',
 u'anticipated',
 u'anymore',
 u'anytime',
 u'anyways',
 u'apart',
 u'apologize',
 u'apology',
 u'app',
 u'appalling',
 u'apparently',
 u'appealing',
 u'appetite',
 u'appetizer',
 u'appetizers',
 u'apple',
 u'approval',
 u'area',
 u'aren',
 u'arepas',
 u'aria',
 u'array',
 u'arrived',
 u'arrives',
 u'arriving',
 u'article',
 u'ask',
 u'asked',
 u'asking',
 u'assure',
 u'ate',
 u'atmosphere'

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report

X = word_bag
y = df["Sentiment"]
y.head()
X_train,X_test,y_train,y_test = train_test_split(df['Text'],df['Sentiment'],test_size=0.2)
text_clf = Pipeline([('vect', CountVectorizer()),
                    ("clf", RandomForestClassifier())])



In [16]:
parameters = {"vect__min_df": [1,2,3],
              "vect__max_df" : [50, 100, 200, 300, 400, 600],
              "clf__n_estimators" : [600, 1000]}

gs_clf = GridSearchCV(text_clf, parameters)
fit_grid = gs_clf.fit(X_train, y_train)

In [27]:
print fit_grid.score(X_test, y_test)
print fit_grid.best_params_

0.77
{'vect__min_df': 1, 'clf__n_estimators': 200, 'vect__max_df': 100}


#### How much test error do you get based on the optimizer you found above?

Test error is 0.23 for optimizer

#### Look over few X_test instances and compare the category predicted for the observation and the actual review sentence. 

In [25]:
print fit_grid.predict(X_test)[:5]

print y_test.head()
X_test[fit_grid.predict(X_test) == y_test].count()


[ 0.  0.  1.  0.  1.]
3151    0
91      1
2311    1
1429    0
1115    1
Name: Sentiment, dtype: float64


154

## Bonus Quetions: Can you find the test instances that are correctly classified and thos that are misclassified?

In [33]:
#Misclassified instances
misclassified = X_test[fit_grid.predict(X_test) != y_test]
misclassified.head()

91                 My first visit to Hiro was a delight!
741        The Veggitarian platter is out of this world!
696                     They know how to make them here.
759    The goat taco didn't skimp on the meat and wow...
81     seems like a good quick place to grab a bite o...
Name: Text, dtype: object

In [35]:
#Correctly Classified instances
correct = X_test[fit_grid.predict(X_test) == y_test]
correct.head()

3151    The building itself seems pretty neat; the bat...
2311                    The jalapeno bacon is soooo good.
1429    Seafood was limited to boiled shrimp and crab ...
1115    I will continue to come here on ladies night a...
1679                            AVOID THIS ESTABLISHMENT!
Name: Text, dtype: object