# Tfidf Bag of Words

## Loading data

In [55]:
import pandas as pd
train = pd.read_csv('train_cleaned.csv')
test = pd.read_csv('test_cleaned.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...


## Utility functions

In [72]:
def prepare_submission(model, X, y, X_test, name):
    model.fit(X,y)
    pred = model.predict(X_test)
    submission = pd.DataFrame({"id":test['id'], "target":pred})
    submission.to_csv(name+'.csv', index=False)

# Creating the model

In [86]:
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
from sklearn.feature_extraction.text import TfidfVectorizer

In [87]:
vectorizer = TfidfVectorizer(min_df=2)
X = vectorizer.fit_transform(train['cleaned_text'])
X_test = vectorizer.transform(test['cleaned_text'])
X.max()

1.0

In [88]:
X.min()

0.0

In [89]:
vectorizer.get_feature_names()

['00',
 '01',
 '02',
 '03',
 '04',
 '05',
 '06',
 '07',
 '08',
 '087809233445',
 '09',
 '10',
 '100',
 '1023',
 '11',
 '1141',
 '13',
 '130',
 '14',
 '15',
 '15pm',
 '16',
 '17',
 '18',
 '20',
 '2009',
 '2014',
 '2015',
 '21',
 '22',
 '24',
 '25',
 '26',
 '267',
 '27',
 '28',
 '29',
 '30',
 '300',
 '300000',
 '30a',
 '30pm',
 '32',
 '35',
 '36',
 '38pm',
 '40',
 '41',
 '45',
 '45pm',
 '495',
 '50',
 '52',
 '54',
 '57',
 '5km',
 '5th',
 '65',
 '6773',
 '75',
 '77',
 '83',
 '8whts',
 '90',
 '94',
 '98',
 '99',
 '__',
 'a1',
 'a5',
 'a_',
 'aa',
 'aba',
 'abandon',
 'abandoned',
 'abbott',
 'abbswinston',
 'abc',
 'abcnews',
 'abe',
 'abia',
 'ability',
 'ablaze',
 'able',
 'abomination',
 'abortion',
 'about',
 'above',
 'absolute',
 'absolutely',
 'abstorm',
 'abuse',
 'abused',
 'abusing',
 'ac',
 'acc',
 'accept',
 'access',
 'accident',
 'accidentally',
 'accidents',
 'according',
 'account',
 'accounts',
 'accused',
 'accuses',
 'accustomed',
 'acdelco',
 'acid',
 'acoustic',
 'acqu

In [90]:
len(vectorizer.get_feature_names())

6213

In [91]:
y = train['target']

In [92]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(max_iter = 1000)

In [93]:
from sklearn.model_selection import cross_validate
scores = cross_validate(clf, X, y, cv=cv, return_train_score=True, scoring='f1')
scores

{'fit_time': array([0.06124711, 0.13653231, 0.04688573, 0.05641532, 0.06427431]),
 'score_time': array([0.00187802, 0.00197506, 0.00192547, 0.00178385, 0.00185061]),
 'test_score': array([0.75355054, 0.75171233, 0.75279931, 0.75446049, 0.75643225]),
 'train_score': array([0.84294401, 0.84121977, 0.84279109, 0.8395688 , 0.84032904])}

In [94]:
scores['test_score'].mean()

0.7537909845013462

In [95]:
prepare_submission(clf, X, y, X_test, 'tfid_logisticregression')

## Try a naive Bayes Model

In [82]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB(alpha=0.4)

In [83]:
from sklearn.model_selection import cross_validate
scores = cross_validate(nb, X, y, cv=cv, return_train_score=True, scoring='f1')
scores

{'fit_time': array([0.00411582, 0.00370026, 0.0037303 , 0.00450373, 0.00361228]),
 'score_time': array([0.00226736, 0.00206447, 0.00319815, 0.00181079, 0.0018971 ]),
 'test_score': array([0.75311721, 0.75170068, 0.73738238, 0.72431507, 0.76295667]),
 'train_score': array([0.84265881, 0.84219374, 0.84221569, 0.84345451, 0.83665677])}

In [84]:
scores['test_score'].mean()

0.7458944006694941

Doesn't help much...

In [85]:
prepare_submission(nb, X, y, X_test, 'tfid_multinomialnb')

In [14]:
vectorizer

TfidfVectorizer(min_df=2)

In [15]:
vectorizer.get_feature_names()

['00',
 '01',
 '02',
 '03',
 '04',
 '05',
 '06',
 '07',
 '08',
 '087809233445',
 '09',
 '10',
 '100',
 '1023',
 '11',
 '1141',
 '13',
 '130',
 '14',
 '15',
 '15pm',
 '16',
 '17',
 '18',
 '20',
 '2009',
 '2014',
 '2015',
 '21',
 '22',
 '24',
 '25',
 '26',
 '267',
 '27',
 '28',
 '29',
 '30',
 '300',
 '300000',
 '30a',
 '30pm',
 '32',
 '35',
 '36',
 '38pm',
 '40',
 '41',
 '45',
 '45pm',
 '495',
 '50',
 '52',
 '54',
 '57',
 '5km',
 '5th',
 '65',
 '6773',
 '75',
 '77',
 '83',
 '8whts',
 '90',
 '94',
 '98',
 '99',
 '__',
 'a1',
 'a5',
 'a_',
 'aa',
 'aba',
 'abandon',
 'abandoned',
 'abbott',
 'abbswinston',
 'abc',
 'abcnews',
 'abe',
 'abia',
 'ability',
 'ablaze',
 'able',
 'abomination',
 'abortion',
 'about',
 'above',
 'absolute',
 'absolutely',
 'abstorm',
 'abuse',
 'abused',
 'abusing',
 'ac',
 'acc',
 'accept',
 'access',
 'accident',
 'accidentally',
 'accidents',
 'according',
 'account',
 'accounts',
 'accused',
 'accuses',
 'accustomed',
 'acdelco',
 'acid',
 'acoustic',
 'acqu

In [19]:
df_idf = pd.DataFrame(vectorizer.idf_, index=vectorizer.get_feature_names(), columns=["idf_weights"]) 
df_idf

Unnamed: 0,idf_weights
00,6.712015
01,7.532996
02,8.544597
03,8.832279
04,7.628306
...,...
zombie,8.139132
zone,6.404530
zones,8.832279
zouma,8.321453


In [25]:
df_idf.sort_values(by=['idf_weights'],ascending=False)

Unnamed: 0,idf_weights
zss,8.832279
oped,8.832279
director,8.832279
directors,8.832279
opp,8.832279
...,...
and,2.784119
of,2.538243
to,2.498407
in,2.452156


In [29]:
idf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

In [32]:
idf['mh370']

5.6682111074095625