In [13]:
from libs.preprocess import *
import pandas as pd

## Data exploration

In [14]:
## Read the training and test CSV files from the shared data folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

In [15]:
## Text of the tweet at position 1 among the rows whose target is 1.
train_df.loc[train_df["target"] == 1, "text"].values[1]

'Forest fire near La Ronge Sask. Canada'

In [16]:
## Text of the tweet at position 1 among the rows whose target is 0.
train_df.loc[train_df["target"] == 0, "text"].values[1]

'I love fruits'

In [17]:
## Rebuild the `text` column row by row with the `mix_all_text` helper
## (presumably it merges several columns into one string — confirm in the helper).
## NOTE: `text` is overwritten in place, so re-running this cell applies the
## helper to its own previous output.
train_df["text"] = train_df.apply(mix_all_text, axis=1)
test_df["text"] = test_df.apply(mix_all_text, axis=1)

In [18]:
## Show ten random rows of the `text` column after the mixing step.
train_df.loc[:, ["text"]].sample(n=10)

Unnamed: 0,text
4010,May Allah help all those suffering from the #P...
137,airplane%20accident Pennsylvania Strict liabil...
4512,The hurricane mixxtail kinda tastes like the w...
6641,"terrorist Sanganer, Rajasthan @rahulkanwal why..."
2763,"devastation Las Vegas, Nevada We haven't seen ..."
3435,Chick masturbates a guy until she gets explode...
6250,"snowstorm Neath, South Wales #NowPlaying Last ..."
2981,drowning New York The Drowning Girl by Caitlin...
1980,#Rohingya houses in #Kyee NockThie hamlet from...
2234,"deluge Fort Fizz, Ohio Vince McMahon once agai..."


## Data cleaning

In [19]:
## Replace the `text` column of both frames with the output of `cleaning_function`.
train_df["text"] = cleaning_function(train_df["text"])
test_df["text"] = cleaning_function(test_df["text"])

In [20]:
## Show ten random rows of the `text` column after cleaning.
train_df.loc[:, ["text"]].sample(n=10)

Unnamed: 0,text
233,annihilated pa officially skipping out on fant...
1128,bombed warwick ri dollarocracy also storm in r...
2759,devastation contactsimplenewsgmailcom years a...
622,bioterrorism hudson valley ny volunteers neede...
433,localarsonist lmfao
5460,police toronto options to fix your facebook c...
5117,finnish nuclear plant to move ahead after fina...
4209,hazard uk our tipster previews chelsea v swans...
2912,i am that girl on tv that sadly turns her musi...
2750,devastation uk obsolete devastation from broad...


## Vectorize

### Using a simple count vectorizer

In [21]:
## Count-based vectorizer created with its default settings.
count_vectorizer = feature_extraction.text.CountVectorizer()

## Fit on the first 5 tweets only, to get a small example matrix to inspect.
example_train_vectors = count_vectorizer.fit_transform(train_df["text"].iloc[:5])

In [22]:
## The vectors are sparse (only non-zero elements are stored), so convert the
## first row to a dense matrix once before printing its shape and contents.
first_example_dense = example_train_vectors[0].todense()
print(first_example_dense.shape)
print(first_example_dense)

(1, 52)
[[0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0
  0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [23]:
## Fit the vectorizer on the full training text, then encode the test text with
## the already-fitted vectorizer (no refit on test).
## NOTE: the TF-IDF section further down reassigns `train_vectors` and
## `test_vectors`, so these count-based matrices are not the ones fed to the SVD.
train_vectors = count_vectorizer.fit_transform(raw_documents=train_df["text"])
test_vectors = count_vectorizer.transform(raw_documents=test_df["text"])

### Using Tf-idf (Term frequency - Inverse document frequency)

In [24]:
## NOTE: `count_vectorizer` is rebound to a TF-IDF vectorizer here; the name is
## kept unchanged because later cells refer to it.
count_vectorizer = feature_extraction.text.TfidfVectorizer()

## Fit on the first 5 tweets only, to get a small example matrix to inspect.
example_train_vectors = count_vectorizer.fit_transform(train_df["text"].iloc[:5])

In [25]:
## Convert the first example row to a dense matrix once, then print its shape
## and its contents.
first_example_dense = example_train_vectors[0].todense()
print(first_example_dense.shape)
print(first_example_dense)

(1, 52)
[[0.         0.23336118 0.28924517 0.23336118 0.         0.
  0.         0.         0.         0.         0.28924517 0.28924517
  0.         0.         0.         0.         0.28924517 0.
  0.         0.         0.         0.         0.         0.28924517
  0.         0.         0.         0.28924517 0.         0.
  0.         0.         0.28924517 0.         0.         0.
  0.         0.28924517 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.28924517
  0.23336118 0.         0.28924517 0.        ]]


In [26]:
## Fit the vectorizer on the full training text, then encode the test text with
## the already-fitted vectorizer (no refit on test). These assignments replace
## any earlier `train_vectors` / `test_vectors`.
train_vectors = count_vectorizer.fit_transform(raw_documents=train_df["text"])
test_vectors = count_vectorizer.transform(raw_documents=test_df["text"])

## LSA (Latent Semantic Analysis)

In [27]:
## Reduce the vectorized text to 30 components. The SVD is fitted on the
## training vectors only and then applied unchanged to the test vectors.
svd = decomposition.TruncatedSVD(
    n_components=30,
    n_iter=20,
    random_state=42,
)
train_matrix = svd.fit_transform(train_vectors)
test_matrix = svd.transform(test_vectors)

## Model

In [69]:
## Ridge classification applies L2 regularisation, which pushes the model's
## weights toward 0 without completely discounting any single feature.
## NOTE(review): the earlier rationale ("our vectors are really big") does not
## match the pipeline as written — the classifier is cross-validated and fitted
## on `train_matrix`, the 30-component output of the TruncatedSVD step, not on
## the raw sparse vectors.
from sklearn import linear_model
clf = linear_model.RidgeClassifier()

In [70]:
## 3-fold cross-validated F1 score of the classifier on the reduced features.
## NOTE: the vectorizer and the SVD were fitted on the whole training set before
## this split, so every validation fold has already influenced the features it
## is scored on.
scores = model_selection.cross_val_score(
    clf,
    train_matrix,
    y=train_df["target"],
    scoring="f1",
    cv=3,
)
print("Scores:", scores.mean(), scores.std())

Scores: 0.60326889879472 0.05383427566871838


In [71]:
## Fit the classifier on the complete reduced training matrix.
clf.fit(X=train_matrix, y=train_df["target"])

RidgeClassifier()

## Submission

In [41]:
## Load the submission template and set its `target` column to the model's
## predictions for the test matrix.
sample_submission = pd.read_csv("../data/sample_submission.csv").assign(
    target=clf.predict(test_matrix)
)

In [42]:
## Preview the first five rows of the submission.
sample_submission.head(n=5)

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,1


In [43]:
## Write the submission file without the DataFrame index column.
sample_submission.to_csv(path_or_buf="../data/submission.csv", index=False)

## Visual check

In [44]:
## Pair each test text with its predicted label (cast to bool) for a manual check.
df_check = pd.DataFrame(
    {
        "text": test_df["text"],
        "is_disaster": sample_submission["target"].astype(bool),
    }
)

In [45]:
## Five random tweets predicted as disasters (`is_disaster` is a boolean column,
## so it can be used directly as the row mask).
df_check.loc[df_check["is_disaster"]].sample(n=5)

Unnamed: 0,text,is_disaster
2442,fears over missing migrants in med rescuers se...,True
544,buildingsonfire arsehole squad blainescronuts ...,True
1575,fire san jose state university joseph fire,True
1917,injured nj az loganmeadows christianstec hes ...,True
1873,hostage visit our dedicated website is egypt ...,True


In [46]:
## Five random tweets predicted as non-disasters (`is_disaster` is a boolean
## column, so its negation can be used directly as the row mask).
df_check.loc[~df_check["is_disaster"]].sample(n=5)

Unnamed: 0,text,is_disaster
1295,drowning morioh japan tinyjecht those eyes des...,False
3027,talplays twitchbang maliceqt zelse rhyareegami...,False
1425,live on periscope wild wing epicentre https tc...,False
983,the horror community is deluged with cruddy os...,False
2987,watching casino royale shes clearly traumatise...,False
