# Applying TFIDF

### Using the Library

In [1]:
import pandas as pd

df = pd.read_csv('./Reviews.csv.zip')

coconut_df = pd.read_csv('./coconut_water.csv', index_col = 0)

In [15]:
document = coconut_df.Text.iloc[0]

In [32]:
nlp(document)[0].is_alpha

True

In [51]:
from spacy.lang.en.stop_words import STOP_WORDS
import spacy 

nlp = spacy.load("en_core_web_sm")

def spacy_tokenizer(document):
    tokens = [ word.lemma_.lower().strip() for word in nlp(document) if word.is_alpha]
    return tokens

In [52]:
spacy_tokenizer(document)

['must',
 'admit',
 'the',
 'taste',
 'of',
 'coconut',
 'water',
 'be',
 'well',
 'take',
 'a',
 'long',
 'time',
 'to',
 'get',
 'through',
 'the',
 'supply',
 'of',
 'coconut',
 'water']

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer


vectorizer = TfidfVectorizer(stop_words = STOP_WORDS, tokenizer = spacy_tokenizer)

In [54]:
vectors = vectorizer.fit_transform(coconut_df.Text)

  'stop_words.' % sorted(inconsistent))


In [67]:
X = pd.DataFrame(vectors.toarray(), columns = vectorizer.get_feature_names())

In [68]:
X.iloc[0].sort_values(ascending = False)[:10]

supply        0.568040
admit         0.540922
long          0.368176
time          0.300971
water         0.267879
coconut       0.260485
taste         0.137787
zombie        0.000000
flavored      0.000000
flavourful    0.000000
Name: 0, dtype: float64

In [69]:
X.iloc[1].sort_values(ascending = False)[:10]

choc          0.471929
stuff         0.387192
case          0.254795
milk          0.253180
intolerant    0.251842
cow           0.251842
lactose       0.224700
forward       0.215962
dark          0.188820
smooth        0.185088
Name: 1, dtype: float64

### Classify Text

In [71]:
y = coconut_df.Score

In [72]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,  y, stratify = y)

In [74]:
from sklearn.linear_model import LogisticRegression

In [75]:
model = LogisticRegression()
model.fit(X_train, y_train)

model.score(X_test, y_test)

0.5789473684210527

In [79]:
pd.Series(model.coef_[0], X.columns).sort_values(ascending = False)

concentrate    1.392313
buy            1.208115
product        1.057128
bad            0.830559
new            0.716045
                 ...   
refreshing    -0.609906
good          -0.611452
great         -0.638816
love          -0.725603
chocolate     -0.845501
Length: 2289, dtype: float64

### Only Term Frequency

In [141]:
vectorizer_no_idf = TfidfVectorizer(tokenizer = spacy_tokenizer, use_idf = False)
vectors_no_idf = vectorizer_no_idf.fit_transform(coconut_df.Text)

In [142]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(vectors_no_idf,  y, stratify = y)

In [143]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

model.score(X_test, y_test)

0.6052631578947368

In [146]:
from sklearn.linear_model import LogisticRegressionCV

model_cv = LogisticRegressionCV(max_iter=2000)
model_cv.fit(X_train, y_train)

model_cv.score(X_test, y_test)

0.6491228070175439

In [147]:
model_cv.get_params()

{'Cs': 10,
 'class_weight': None,
 'cv': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1.0,
 'l1_ratios': None,
 'max_iter': 2000,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'refit': True,
 'scoring': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0}

In [149]:
pd.Series(model_cv.coef_[0],vectorizer_no_idf.get_feature_names()).sort_values(ascending = False)[:10]

concentrate    2.277066
this           1.837528
new            1.607290
buy            1.518834
plastic        1.507224
i              1.397353
product        1.381952
water          1.277999
bad            1.255077
have           1.117246
dtype: float64

In [152]:
pd.Series(model_cv.coef_[-1],vectorizer_no_idf.get_feature_names()).sort_values(ascending = False)[:15]

great        2.814344
chocolate    2.538254
love         2.095730
and          1.911344
good         1.449914
as           1.406424
delicious    1.045572
try          1.020831
all          0.964422
many         0.919306
more         0.908969
favorite     0.901128
healthy      0.858939
can          0.851005
milk         0.805698
dtype: float64

### Bow Model

In [107]:
from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizor = CountVectorizer(tokenizer = spacy_tokenizer, stop_words=STOP_WORDS, ngram_range=(1,1))

In [108]:
bow_vectors = bow_vectorizor.fit_transform(coconut_df.Text)

  'stop_words.' % sorted(inconsistent))


In [109]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(bow_vectors,  y, stratify = y)

In [111]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter = 1000)
model.fit(X_train, y_train)

model.score(X_test, y_test)

0.6052631578947368

Notice that the bag of words model performs better than TFIDF, but not better than using the term frequency model.

### Using Ngrams

### Summary