# NATURAL LANGUAGE PROCESSING

## Data Preparation

In [1]:
import numpy as np
import pandas as pd
import nltk 

df = pd.read_csv('yelp.csv')
df = df[['stars','text', 'cool','useful','funny']]
df.head()

Unnamed: 0,stars,text,cool,useful,funny
0,5,My wife took me here on my birthday for breakf...,2,5,0
1,5,I have no idea why some people give bad review...,0,0,0
2,4,love the gyro plate. Rice is so good and I als...,0,1,0
3,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",1,2,0
4,5,General Manager Scott Petello is a good egg!!!...,0,0,0


### Punctuation, Text Length, Tokenization

In [2]:
# remove punctuation, add a clolumn with text length, , make lower cases

df['text'] = df['text'].str.lower() 
df['text lenght'] = (df['text'].str.split('[\W_]+'))
df['text lenght'] = df['text lenght'].str.len()

df.head()

# NOTE TO MYSELF
# '\w' is a special character that will match any alphanumeric A-z, a-z, 0-9, along with underscores;
# '+' means that the previous character in the regex can appear as many times as we want
# This means that '\w+'' will match arbitrary sequences of alphanumeric characters and underscores.

Unnamed: 0,stars,text,cool,useful,funny,text lenght
0,5,my wife took me here on my birthday for breakf...,2,5,0,161
1,5,i have no idea why some people give bad review...,0,0,0,266
2,4,love the gyro plate. rice is so good and i als...,0,1,0,16
3,5,"rosie, dakota, and i love chaparral dog park!!...",1,2,0,79
4,5,general manager scott petello is a good egg!!!...,0,0,0,89


In [3]:
# prepare a column with the text splited (without puntuation), 
df['text_split'] = (df['text'].str.split('[\W_]+'))
df['text_split'] = df['text_split'].astype(str)
df.head()

Unnamed: 0,stars,text,cool,useful,funny,text lenght,text_split
0,5,my wife took me here on my birthday for breakf...,2,5,0,161,"['my', 'wife', 'took', 'me', 'here', 'on', 'my..."
1,5,i have no idea why some people give bad review...,0,0,0,266,"['i', 'have', 'no', 'idea', 'why', 'some', 'pe..."
2,4,love the gyro plate. rice is so good and i als...,0,1,0,16,"['love', 'the', 'gyro', 'plate', 'rice', 'is',..."
3,5,"rosie, dakota, and i love chaparral dog park!!...",1,2,0,79,"['rosie', 'dakota', 'and', 'i', 'love', 'chapa..."
4,5,general manager scott petello is a good egg!!!...,0,0,0,89,"['general', 'manager', 'scott', 'petello', 'is..."


In [4]:
# this does exactly the same as above - but it keeps the punctuation and doe snot have the ''
from nltk.tokenize import word_tokenize

df['tokenized'] = df['text'].apply(nltk.word_tokenize)
df.head()

Unnamed: 0,stars,text,cool,useful,funny,text lenght,text_split,tokenized
0,5,my wife took me here on my birthday for breakf...,2,5,0,161,"['my', 'wife', 'took', 'me', 'here', 'on', 'my...","[my, wife, took, me, here, on, my, birthday, f..."
1,5,i have no idea why some people give bad review...,0,0,0,266,"['i', 'have', 'no', 'idea', 'why', 'some', 'pe...","[i, have, no, idea, why, some, people, give, b..."
2,4,love the gyro plate. rice is so good and i als...,0,1,0,16,"['love', 'the', 'gyro', 'plate', 'rice', 'is',...","[love, the, gyro, plate, ., rice, is, so, good..."
3,5,"rosie, dakota, and i love chaparral dog park!!...",1,2,0,79,"['rosie', 'dakota', 'and', 'i', 'love', 'chapa...","[rosie, ,, dakota, ,, and, i, love, chaparral,..."
4,5,general manager scott petello is a good egg!!!...,0,0,0,89,"['general', 'manager', 'scott', 'petello', 'is...","[general, manager, scott, petello, is, a, good..."


In [6]:
nltk.word_tokenize(df.iloc[0]['text'])

['my',
 'wife',
 'took',
 'me',
 'here',
 'on',
 'my',
 'birthday',
 'for',
 'breakfast',
 'and',
 'it',
 'was',
 'excellent',
 '.',
 'the',
 'weather',
 'was',
 'perfect',
 'which',
 'made',
 'sitting',
 'outside',
 'overlooking',
 'their',
 'grounds',
 'an',
 'absolute',
 'pleasure',
 '.',
 'our',
 'waitress',
 'was',
 'excellent',
 'and',
 'our',
 'food',
 'arrived',
 'quickly',
 'on',
 'the',
 'semi-busy',
 'saturday',
 'morning',
 '.',
 'it',
 'looked',
 'like',
 'the',
 'place',
 'fills',
 'up',
 'pretty',
 'quickly',
 'so',
 'the',
 'earlier',
 'you',
 'get',
 'here',
 'the',
 'better',
 '.',
 'do',
 'yourself',
 'a',
 'favor',
 'and',
 'get',
 'their',
 'bloody',
 'mary',
 '.',
 'it',
 'was',
 'phenomenal',
 'and',
 'simply',
 'the',
 'best',
 'i',
 "'ve",
 'ever',
 'had',
 '.',
 'i',
 "'m",
 'pretty',
 'sure',
 'they',
 'only',
 'use',
 'ingredients',
 'from',
 'their',
 'garden',
 'and',
 'blend',
 'them',
 'fresh',
 'when',
 'you',
 'order',
 'it',
 '.',
 'it',
 'was',
 'ama

In [5]:
# QUESTION

# Does the format difference means something? 
    # 'text_split ' ['my', 'wife', 'took', 'me', 'here']
        - This is a list of strings
    # 'tokenized' [my, wife, took, me, here]
        - This is a list of strings
        
        Hence, no difference.


# WHy 1/ works and not 2/ (TypeError: expected string or bytes-like object) ?
    # 1/ df['tokenized'] = df['text'].apply(nltk.word_tokenize)
    # 2/ tk = (nltk.word_tokenize(df['text']))   >> https://pythonhealthcare.org/2018/12/14/101-pre-processing-data-tokenization-stemming-and-removal-of-stop-words/

    # 1/ works because when you use .apply(), that method is applied on every single record individually.
    # 2/ tk = (nltk.word_tokenize(df['text'])) <-----  This does not work because you are trying to pass a series, while 
    # it can only consume a string. Modifying it a little bit will work. Have a look at the next cell.

In [7]:
# Putting [0] means I'm taking a string and then passing it inside the method.

tk = (nltk.word_tokenize(df['text'][0]))
print(tk)

['my', 'wife', 'took', 'me', 'here', 'on', 'my', 'birthday', 'for', 'breakfast', 'and', 'it', 'was', 'excellent', '.', 'the', 'weather', 'was', 'perfect', 'which', 'made', 'sitting', 'outside', 'overlooking', 'their', 'grounds', 'an', 'absolute', 'pleasure', '.', 'our', 'waitress', 'was', 'excellent', 'and', 'our', 'food', 'arrived', 'quickly', 'on', 'the', 'semi-busy', 'saturday', 'morning', '.', 'it', 'looked', 'like', 'the', 'place', 'fills', 'up', 'pretty', 'quickly', 'so', 'the', 'earlier', 'you', 'get', 'here', 'the', 'better', '.', 'do', 'yourself', 'a', 'favor', 'and', 'get', 'their', 'bloody', 'mary', '.', 'it', 'was', 'phenomenal', 'and', 'simply', 'the', 'best', 'i', "'ve", 'ever', 'had', '.', 'i', "'m", 'pretty', 'sure', 'they', 'only', 'use', 'ingredients', 'from', 'their', 'garden', 'and', 'blend', 'them', 'fresh', 'when', 'you', 'order', 'it', '.', 'it', 'was', 'amazing', '.', 'while', 'everything', 'on', 'the', 'menu', 'looks', 'excellent', ',', 'i', 'had', 'the', 'whit

## Remove Stop Words

### Simple Example with: NLTK

In [8]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sent = "This is a sample sentence, showing off the stop words filtration."

stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(example_sent)

filtered_sentence = [w for w in word_tokens if not w in stop_words]

filtered_sentence = []

for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

print(word_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


### Remove Stop Words on df with NLTK

In [9]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_stop_words(corpus):
    removed_stop_words = []
    for review in corpus:
        removed_stop_words.append(
            ' '.join([word for word in review.split() 
                      if word not in stop_words]))
    return removed_stop_words

df['cleantext'] = df['text'].str.lower()
df['cleantext'] = df['cleantext'].str.replace('[\W_]+',' ')
df['cleantext'] = remove_stop_words(df['cleantext'])
df.head()

Unnamed: 0,stars,text,cool,useful,funny,text lenght,text_split,tokenized,cleantext
0,5,my wife took me here on my birthday for breakf...,2,5,0,161,"['my', 'wife', 'took', 'me', 'here', 'on', 'my...","[my, wife, took, me, here, on, my, birthday, f...",wife took birthday breakfast excellent weather...
1,5,i have no idea why some people give bad review...,0,0,0,266,"['i', 'have', 'no', 'idea', 'why', 'some', 'pe...","[i, have, no, idea, why, some, people, give, b...",idea people give bad reviews place goes show p...
2,4,love the gyro plate. rice is so good and i als...,0,1,0,16,"['love', 'the', 'gyro', 'plate', 'rice', 'is',...","[love, the, gyro, plate, ., rice, is, so, good...",love gyro plate rice good also dig candy selec...
3,5,"rosie, dakota, and i love chaparral dog park!!...",1,2,0,79,"['rosie', 'dakota', 'and', 'i', 'love', 'chapa...","[rosie, ,, dakota, ,, and, i, love, chaparral,...",rosie dakota love chaparral dog park convenien...
4,5,general manager scott petello is a good egg!!!...,0,0,0,89,"['general', 'manager', 'scott', 'petello', 'is...","[general, manager, scott, petello, is, a, good...",general manager scott petello good egg go deta...


In [10]:
# Lambda method

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

word_tokens = df['tokenized'].astype(str).tolist()


# You can simply do something like this.

df['tokenized'].apply(lambda x: [item for item in x if item not in stop_words])


0       [wife, took, birthday, breakfast, excellent, ....
1       [idea, people, give, bad, reviews, place, ., g...
2       [love, gyro, plate, ., rice, good, also, dig, ...
3       [rosie, ,, dakota, ,, love, chaparral, dog, pa...
4       [general, manager, scott, petello, good, egg, ...
                              ...                        
9995    [first, visit, ..., lunch, today, -, used, gro...
9996    [called, house, deliciousness, !, could, go, i...
9997    [recently, visited, olive, ivy, business, last...
9998    [nephew, moved, scottsdale, recently, bunch, f...
9999    [4-5, locations.., 4.5, star, average.., think...
Name: tokenized, Length: 10000, dtype: object

## Stemming and Lemmanization

Stemming reduces related words to a common stem.\
It is an optional process step, and it is useful to test accuracy with and without stemming. 

In [11]:
from nltk.stem import PorterStemmer
stemming = PorterStemmer()
my_list = ['frightening', 'frightened', 'frightens']

# Using a Python list comprehension method to apply to all words in my_list
print ([stemming.stem(word) for word in my_list])


['frighten', 'frighten', 'frighten']


In [12]:
import nltk
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

word_data = "player learning a play was playing very well"
# First Word tokenization
nltk_tokens = nltk.word_tokenize(word_data)
#Next find the roots of the word
for w in nltk_tokens:
       print("Actual: %s  Stem: %s"  % (w,porter_stemmer.stem(w)))

Actual: player  Stem: player
Actual: learning  Stem: learn
Actual: a  Stem: a
Actual: play  Stem: play
Actual: was  Stem: wa
Actual: playing  Stem: play
Actual: very  Stem: veri
Actual: well  Stem: well


In [13]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

word_data = "player learning a play was playing very well"
nltk_tokens = nltk.word_tokenize(word_data)
for w in nltk_tokens:
       print("Actual: %s  Lemma: %s"  % (w,wordnet_lemmatizer.lemmatize(w)))

Actual: player  Lemma: player
Actual: learning  Lemma: learning
Actual: a  Lemma: a
Actual: play  Lemma: play
Actual: was  Lemma: wa
Actual: playing  Lemma: playing
Actual: very  Lemma: very
Actual: well  Lemma: well


In [14]:
# Stemming
from nltk.stem.porter import PorterStemmer

def get_stemmed_text(corpus):
    stemmer = PorterStemmer()
    return [' '.join([stemmer.stem(word) for word in review.split()]) for review in corpus]

df['stemmedtext'] = get_stemmed_text(df['cleantext'])
df.head()

Unnamed: 0,stars,text,cool,useful,funny,text lenght,text_split,tokenized,cleantext,stemmedtext
0,5,my wife took me here on my birthday for breakf...,2,5,0,161,"['my', 'wife', 'took', 'me', 'here', 'on', 'my...","[my, wife, took, me, here, on, my, birthday, f...",wife took birthday breakfast excellent weather...,wife took birthday breakfast excel weather per...
1,5,i have no idea why some people give bad review...,0,0,0,266,"['i', 'have', 'no', 'idea', 'why', 'some', 'pe...","[i, have, no, idea, why, some, people, give, b...",idea people give bad reviews place goes show p...,idea peopl give bad review place goe show plea...
2,4,love the gyro plate. rice is so good and i als...,0,1,0,16,"['love', 'the', 'gyro', 'plate', 'rice', 'is',...","[love, the, gyro, plate, ., rice, is, so, good...",love gyro plate rice good also dig candy selec...,love gyro plate rice good also dig candi select
3,5,"rosie, dakota, and i love chaparral dog park!!...",1,2,0,79,"['rosie', 'dakota', 'and', 'i', 'love', 'chapa...","[rosie, ,, dakota, ,, and, i, love, chaparral,...",rosie dakota love chaparral dog park convenien...,rosi dakota love chaparr dog park conveni surr...
4,5,general manager scott petello is a good egg!!!...,0,0,0,89,"['general', 'manager', 'scott', 'petello', 'is...","[general, manager, scott, petello, is, a, good...",general manager scott petello good egg go deta...,gener manag scott petello good egg go detail l...


In [16]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
def get_lemmatized_text(corpus):
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

df['lemmatext'] = get_lemmatized_text(df['cleantext'])
df.head()

Unnamed: 0,stars,text,cool,useful,funny,text lenght,text_split,tokenized,cleantext,stemmedtext,lemmatext
0,5,my wife took me here on my birthday for breakf...,2,5,0,161,"['my', 'wife', 'took', 'me', 'here', 'on', 'my...","[my, wife, took, me, here, on, my, birthday, f...",wife took birthday breakfast excellent weather...,wife took birthday breakfast excel weather per...,wife took birthday breakfast excellent weather...
1,5,i have no idea why some people give bad review...,0,0,0,266,"['i', 'have', 'no', 'idea', 'why', 'some', 'pe...","[i, have, no, idea, why, some, people, give, b...",idea people give bad reviews place goes show p...,idea peopl give bad review place goe show plea...,idea people give bad review place go show plea...
2,4,love the gyro plate. rice is so good and i als...,0,1,0,16,"['love', 'the', 'gyro', 'plate', 'rice', 'is',...","[love, the, gyro, plate, ., rice, is, so, good...",love gyro plate rice good also dig candy selec...,love gyro plate rice good also dig candi select,love gyro plate rice good also dig candy selec...
3,5,"rosie, dakota, and i love chaparral dog park!!...",1,2,0,79,"['rosie', 'dakota', 'and', 'i', 'love', 'chapa...","[rosie, ,, dakota, ,, and, i, love, chaparral,...",rosie dakota love chaparral dog park convenien...,rosi dakota love chaparr dog park conveni surr...,rosie dakota love chaparral dog park convenien...
4,5,general manager scott petello is a good egg!!!...,0,0,0,89,"['general', 'manager', 'scott', 'petello', 'is...","[general, manager, scott, petello, is, a, good...",general manager scott petello good egg go deta...,gener manag scott petello good egg go detail l...,general manager scott petello good egg go deta...


### Word Count: Python style

In [17]:
# word count, make lower cases at the same time
word_count = pd.Series(' '.join(df['lemmatext']).lower().split()).value_counts()
word_count = pd.DataFrame(word_count, columns =['freq'])
wordcount = word_count.reset_index(inplace=True)
word_count.head()

Unnamed: 0,index,freq
0,place,7397
1,good,6857
2,food,6357
3,great,5128
4,like,5109


# Classification

## Pipeline for Machine Learning

Purpose: use the text in order to derive a predicted rating

**Methodology:** 

* a/ <ins>Vectorization</ins>: Count how many times does a word occur in each message (Known as term frequency)

* b/ <ins>Term Weighting :</ins> Weigh the counts, so that frequent tokens get lower weight (inverse document frequency)

* c / <ins>Normalization:</ins> Normalize the vectors to unit length, to abstract from the original text length (L2 norm)


* d/ <ins>Machine Learning</ins>

**Implementation:**

*  Vectorization is done with a  bag of words object transformed into a dataframe (a sparse matrix)

*  Term Weighting and Normalization is done via TF-IDF built in scikit-learns' TfidfTransformer

**Pipeline 1: simple case**

Using a pipeline will streamline the whole process, improve and clarify the implementation

In [18]:
# Pipeline Related
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# ML Related
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,classification_report


In [102]:
from sklearn.model_selection import train_test_split

yelp = df[(df['stars'] == 1) | (df['stars'] == 5)]
yelp

X = yelp['lemmatext']
y = yelp['stars']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)


In [103]:
pipeline = Pipeline([
    ('bow', CountVectorizer()),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])


In [104]:
pipeline.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [105]:
predictions = pipeline.predict(X_test)

In [106]:
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

[[  0 228]
 [  0 998]]
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       228
           5       0.81      1.00      0.90       998

    accuracy                           0.81      1226
   macro avg       0.41      0.50      0.45      1226
weighted avg       0.66      0.81      0.73      1226



  _warn_prf(average, modifier, msg_start, len(result))


**Pipeline2 : Iterating through models for model selection**

In [25]:
# Pipeline Related
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [27]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import confusion_matrix,classification_report

In [100]:
from sklearn.model_selection import train_test_split

yelp = df[(df['stars'] == 1) | (df['stars'] == 5)]
yelp

X = yelp['lemmatext']
y = yelp['stars']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [101]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier()
    ]

for classifier in classifiers:
    pipe = Pipeline([('bow', CountVectorizer()), 
                    ('tfidf', TfidfTransformer()),
                    ('classifier', MultinomialNB()) ])
    pipe.fit(X_train, y_train)   
    print(classifier)
    print("model score: %.3f" % pipe.score(X_test, y_test))  
    
   

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')
model score: 0.814
SVC(C=0.025, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)
model score: 0.814
NuSVC(break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
      max_iter=-1, nu=0.5, probability=True, random_state=None, shrinking=True,
      tol=0.001, verbose=False)
model score: 0.814
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                  

**Pipeline3 : Implementing GridSearchCV**

To cross-validate and select the best parameter configuration at the same time, you can use GridSearchCV

In [69]:
# Pipeline Related
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# ML Related
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,classification_report

from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import confusion_matrix,classification_report


from sklearn.model_selection import GridSearchCV

In [97]:
from sklearn.model_selection import train_test_split

yelp = df[(df['stars'] == 1) | (df['stars'] == 5)]
yelp


X = yelp['lemmatext']
y = yelp['stars']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [98]:
# Intake 1

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('ML', LinearSVC()) ])

vect__max_df = [0.8,0.9,1.0]
ML__C = [0.1,1.0]   

param_grid = dict(vect__max_df=vect__max_df, ML__C = ML__C)

# do 5-fold cross validation for each of the 6 possible combinations of the parameter values above
grid = GridSearchCV(pipeline, cv=5, param_grid = param_grid)
grid.fit(X_train,y_train)


# summarize results
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))



Best: 0.933916 using {'ML__C': 1.0, 'vect__max_df': 0.8}
0.860490 (0.003008) with: {'ML__C': 0.1, 'vect__max_df': 0.8}
0.860490 (0.003008) with: {'ML__C': 0.1, 'vect__max_df': 0.9}
0.860490 (0.003008) with: {'ML__C': 0.1, 'vect__max_df': 1.0}
0.933916 (0.008155) with: {'ML__C': 1.0, 'vect__max_df': 0.8}
0.933916 (0.008155) with: {'ML__C': 1.0, 'vect__max_df': 0.9}
0.933916 (0.008155) with: {'ML__C': 1.0, 'vect__max_df': 1.0}


In [99]:
# Intake 2

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('ML', KNeighborsClassifier()) ])

ML__n_neighbors = [5,6,7,8,9,10,11,12,13,1,4,15]   
ML__weights = ['uniform', 'distance']

KNeighborsClassifier()

param_grid = dict(ML__n_neighbors = ML__n_neighbors, ML__weights = ML__weights)

# do 5-fold cross validation for each of the 6 possible combinations of the parameter values above
grid = GridSearchCV(pipeline, cv=5, param_grid = param_grid)
grid.fit(X_train,y_train)


# summarize results
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.869580 using {'ML__n_neighbors': 12, 'ML__weights': 'uniform'}
0.855245 (0.009976) with: {'ML__n_neighbors': 5, 'ML__weights': 'uniform'}
0.855245 (0.009976) with: {'ML__n_neighbors': 5, 'ML__weights': 'distance'}
0.860490 (0.013695) with: {'ML__n_neighbors': 6, 'ML__weights': 'uniform'}
0.856643 (0.002925) with: {'ML__n_neighbors': 6, 'ML__weights': 'distance'}
0.860490 (0.006762) with: {'ML__n_neighbors': 7, 'ML__weights': 'uniform'}
0.860490 (0.006762) with: {'ML__n_neighbors': 7, 'ML__weights': 'distance'}
0.868531 (0.004870) with: {'ML__n_neighbors': 8, 'ML__weights': 'uniform'}
0.860839 (0.003600) with: {'ML__n_neighbors': 8, 'ML__weights': 'distance'}
0.862238 (0.005462) with: {'ML__n_neighbors': 9, 'ML__weights': 'uniform'}
0.862238 (0.005462) with: {'ML__n_neighbors': 9, 'ML__weights': 'distance'}
0.867832 (0.006216) with: {'ML__n_neighbors': 10, 'ML__weights': 'uniform'}
0.860490 (0.001308) with: {'ML__n_neighbors': 10, 'ML__weights': 'distance'}
0.863636 (0.002925) w

# Sentiment Analysis