# Simple model
Let's try a bag of word model using only the tweet text.

# Loading data

In [1]:
import pandas as pd
train = pd.read_csv('train_cleaned.csv')
test = pd.read_csv('test_cleaned.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...


# Creating the model

In [2]:
len(train)

7561

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(train['cleaned_text'])
X.max()

13

In [5]:
vectorizer.get_feature_names()

['00',
 '0000',
 '00pm',
 '01',
 '02',
 '02pm',
 '03',
 '033',
 '04',
 '05',
 '06',
 '07',
 '08',
 '0853',
 '087809233445',
 '0880',
 '09',
 '10',
 '100',
 '1000',
 '101',
 '1017',
 '1023',
 '10m',
 '10pm',
 '11',
 '1141',
 '115',
 '12am',
 '12m',
 '12pm',
 '13',
 '130',
 '133',
 '13pm',
 '14',
 '14000',
 '141',
 '15',
 '15p',
 '15pm',
 '15pmemergency',
 '15t',
 '16',
 '1600',
 '163',
 '17',
 '17000',
 '1717',
 '18',
 '19',
 '1984',
 '1pack',
 '1st',
 '20',
 '2002',
 '2007',
 '2009',
 '2010',
 '2013',
 '2014',
 '2015',
 '21',
 '22',
 '22pm',
 '23',
 '231a',
 '2327564d',
 '238',
 '23928835',
 '24',
 '25',
 '250',
 '26',
 '263789f4',
 '265v',
 '267',
 '27',
 '28',
 '28pm',
 '29',
 '29pm',
 '30',
 '300',
 '300000',
 '300m',
 '30_',
 '30a',
 '30bst',
 '30p',
 '30pm',
 '30pmhs',
 '31pm',
 '32',
 '33333',
 '33pm',
 '34',
 '3460',
 '35',
 '350',
 '36',
 '37b',
 '38',
 '38pm',
 '3a',
 '3g',
 '3km',
 '3others',
 '40',
 '4000',
 '400dr',
 '405',
 '40mln',
 '40pm',
 '41',
 '410',
 '415',
 '43',
 

In [6]:
len(vectorizer.get_feature_names())

14695

In [7]:
y = train['target']

In [8]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(max_iter = 1000)

In [9]:
from sklearn.model_selection import cross_validate
scores = cross_validate(clf, X, y, cv=3, return_train_score=True, scoring='f1')
scores

{'fit_time': array([0.342659  , 0.22020388, 0.24398232]),
 'score_time': array([0.0021944 , 0.00283074, 0.00204086]),
 'test_score': array([0.62784248, 0.6103764 , 0.67172484]),
 'train_score': array([0.95326216, 0.95581562, 0.9503445 ])}

In [10]:
scores['test_score'].mean()

0.6366479063787999

In [11]:
scores['train_score'].mean()

0.9531407595376038

## Submit

In [12]:
X_test = vectorizer.transform(test['cleaned_text'])

In [13]:
def prepare_submission(model, X, y, name):
    model.fit(X,y)
    pred = model.predict(X_test)
    submission = pd.DataFrame({"id":test['id'], "target":pred})
    submission.to_csv(name+'.csv', index=False)

In [14]:
prepare_submission(clf, X, y, 'bag_of_words1')

# Conclusion
The simple model performs better than the baseline, but seems to horribly overfit the data. Some things we could try:
 * Regularization of the model
 * Using less features

## Regularize the model

In [15]:
from sklearn.linear_model import LogisticRegressionCV
clf = LogisticRegressionCV(Cs=[0.000000001, 0.0000001, 0.000001, 0.00001, 0.001, 0.01, 0.1, 1], max_iter = 1000)

In [16]:
from sklearn.model_selection import cross_validate
scores = cross_validate(clf, X, y, cv=3, return_train_score=True, scoring='f1')
scores

{'fit_time': array([2.67092943, 2.56501245, 2.67746353]),
 'score_time': array([0.00222397, 0.00207424, 0.00236821]),
 'test_score': array([0.61681159, 0.59123683, 0.67610063]),
 'train_score': array([0.84263703, 0.84205154, 0.83599493])}

In [17]:
scores['test_score'].mean()

0.6280496835478074

In [18]:
scores['train_score'].mean()

0.8402278345351964

In [19]:
prepare_submission(clf, X, y, 'bag_of_words_cv')

Didn't seem to help much, also not in the public score..

## Reduce the number of features

In [20]:
vectorizer = CountVectorizer(min_df=2)
X = vectorizer.fit_transform(train['cleaned_text'])

In [21]:
X.max()

13

In [22]:
len(vectorizer.get_feature_names())

6213

In [23]:
clf = LogisticRegressionCV(Cs=[0.000000001, 0.0000001, 0.000001, 0.00001, 0.001, 0.01, 0.1, 1], max_iter = 1000)
scores = cross_validate(clf, X, y, cv=3, return_train_score=True, scoring='f1')
scores

{'fit_time': array([1.44132328, 1.31011128, 1.37084222]),
 'score_time': array([0.00211024, 0.00238442, 0.00208759]),
 'test_score': array([0.61885483, 0.59234609, 0.67574646]),
 'train_score': array([0.82720122, 0.82802875, 0.81945505])}

In [24]:
scores['test_score'].mean()

0.6289824611162446

Doesn't seem to help much, also with the public score