In [9]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

## Data exploration

In [2]:
# Read the training and test CSV files into DataFrames (paths are relative to
# the notebook's working directory).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv")
)

In [7]:
# Show the text of the second training row whose target is 1.
target_is_one = train_df["target"] == 1
train_df.loc[target_is_one, "text"].values[1]

'Forest fire near La Ronge Sask. Canada'

In [8]:
# Show the text of the second training row whose target is 0.
target_is_zero = train_df["target"] == 0
train_df.loc[target_is_zero, "text"].values[1]

'I love fruits'

## Vectorize

In [10]:
# Bag-of-words vectorizer with scikit-learn's default settings.
count_vectorizer = feature_extraction.text.CountVectorizer()

# Small demo: fit on just the first 5 tweets and get their token counts.
first_five_texts = train_df["text"].head(5)
example_train_vectors = count_vectorizer.fit_transform(first_five_texts)

In [11]:
# The vectors are stored sparsely (only non-zero entries are kept to save
# space), so densify the first row once before printing its shape and contents.
first_row_dense = example_train_vectors[0].todense()
print(first_row_dense.shape)
print(first_row_dense)

(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [16]:
# Fit the vectorizer on the full training text, then encode the test text with
# that same fitted vectorizer (transform only, no refit) so both matrices share
# one feature space.
train_texts = train_df["text"]
test_texts = test_df["text"]
train_vectors = count_vectorizer.fit_transform(train_texts)
test_vectors = count_vectorizer.transform(test_texts)

### Using Tf-idf

In [45]:
# Swap in a TF-IDF vectorizer (default settings).
# NOTE: the variable keeps the name `count_vectorizer` even though it now holds
# a TfidfVectorizer, because later cells refer to it under this name.
count_vectorizer = feature_extraction.text.TfidfVectorizer()

# Same small demo as before: fit on the first 5 tweets only.
first_five_texts = train_df["text"].head(5)
example_train_vectors = count_vectorizer.fit_transform(first_five_texts)

In [47]:
# Densify the first example row once, then print its shape and its weights.
first_row_dense = example_train_vectors[0].todense()
print(first_row_dense.shape)
print(first_row_dense)

(1, 54)
[[0.         0.         0.         0.23336118 0.28924517 0.23336118
  0.         0.         0.         0.         0.         0.
  0.28924517 0.28924517 0.         0.         0.         0.
  0.28924517 0.         0.         0.         0.         0.
  0.         0.28924517 0.         0.         0.         0.28924517
  0.         0.         0.         0.         0.28924517 0.
  0.         0.         0.         0.28924517 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.28924517 0.23336118 0.         0.28924517 0.        ]]


In [48]:
# Refit on the full training text and encode the test text with the same fitted
# vectorizer. When the notebook is run top to bottom, `count_vectorizer` is the
# TfidfVectorizer assigned just above, so these replace the earlier count-based
# matrices.
train_texts = train_df["text"]
test_texts = test_df["text"]
train_vectors = count_vectorizer.fit_transform(train_texts)
test_vectors = count_vectorizer.transform(test_texts)

## Model

In [52]:
## The feature vectors are very wide (one column per vocabulary term), so we
## want a model that shrinks its weights toward 0 without discarding any word
## outright. Ridge regularisation does exactly that, so we use a ridge
## classifier with scikit-learn's default settings.
clf = linear_model.RidgeClassifier()

In [57]:
# 5-fold cross-validation on the training vectors, scored with F1.
# The last line displays the per-fold scores.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
scores

array([0.62962963, 0.55507372, 0.64457332, 0.59444444, 0.72337043])

In [58]:
# Train the final classifier on all of the training data.
clf.fit(X=train_vectors, y=train_df["target"])

RidgeClassifier()

## Submission

In [59]:
# Load the submission template and fill its `target` column with the model's
# predictions for the test vectors.
# NOTE(review): this relies on the template rows being in the same order as
# test_df — confirm against the data files.
sample_submission = pd.read_csv("data/sample_submission.csv").assign(
    target=clf.predict(test_vectors)
)

In [60]:
# Preview the first rows of the submission frame before writing it out.
sample_submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1


In [61]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
sample_submission.to_csv("data/submission.csv", index=False)

## Visual check

In [62]:
# Pair each test tweet's text with its predicted label (cast to bool) so the
# predictions can be checked by eye.
df_check = pd.DataFrame(
    {
        "text": test_df["text"],
        "is_disaster": sample_submission["target"].astype(bool),
    }
)

In [63]:
# Spot-check five random tweets that the model flagged as disasters.
# `is_disaster` was built with .astype(bool), so it is used directly as the row
# mask instead of being compared with `== True`. The fixed random_state makes
# the displayed sample identical on every rerun.
df_check[df_check["is_disaster"]].sample(5, random_state=42)

Unnamed: 0,text,is_disaster
1547,Highway Patrol reports uptick in statewide ped...,True
2522,Watch This Airport Get Swallowed Up By A Sands...,True
1055,@FixWMATA @AdamTuss Do you know about path of ...,True
2811,Grandma's friends survived to the attack in Hi...,True
2117,Byproduct of #metal price meltdown is a higher...,True


In [64]:
# Spot-check five random tweets that the model flagged as NOT disasters.
# `is_disaster` was built with .astype(bool), so the mask is inverted with `~`
# instead of being compared with `== False`. The fixed random_state makes the
# displayed sample identical on every rerun.
df_check[~df_check["is_disaster"]].sample(5, random_state=42)

Unnamed: 0,text,is_disaster
792,Husband's back from Edinburgh and crashed out....,False
1153,Track : Apollo Brown - Detonate ft. M.O.P. | ...,False
2790,Check out my Lightroom collection ÛÏJordyne S...,False
2671,That organic freestyle with Casey veggies is c...,False
477,@jonathanshainin I think the bomb raises all s...,False
