In [355]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing, decomposition

## Data exploration

In [409]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

In [410]:
train_df[train_df.target == 1].text.values[1]

'Forest fire near La Ronge Sask. Canada'

In [411]:
train_df[train_df.target == 0].text.values[1]

'I love fruits'

In [433]:
def mix_all_text(x):
    if not (pd.isna(x.keyword) or pd.isna(x.location)):
        return str(x.keyword) + ' ' + str(x.location) + ' ' + str(x.text)  
    else:
        return x.text

In [434]:
train_df['text'] = train_df.apply(lambda x: mix_all_text(x) ,axis=1)
test_df['text'] = test_df.apply(lambda x: mix_all_text(x) ,axis=1)

In [456]:
train_df[['text']].sample(10)

Unnamed: 0,text
4277,heatwave liverpool heatwave greatbritishbakeof...
618,bioterrorism creation of ai climate change bio...
4019,floods who is bringing the tornadoes and flood...
1715,collided tennessee collided on page of of afte...
4464,hostages render assistance gain as proxy for y...
2886,drought at work drought mane im not a raiders ...
2358,demolition us pr demolition treyarch davidvond...
1105,bombed photo bombed http tco artumhmbhh
6450,suicidebombing australia suicidebombing erdoga...
1290,burned upper st clair pa burned thomasvissman ...


## Data cleaning

In [436]:
import regex as re
def cleaning_function(a):
    """
    function used for clining a list of strings
    """
    a1 = lower_array(a)
    a2 = remove_punct_array(a1)
    return a2

def lower_array(a):
    return [(str(word)).lower() for word in a]

def remove_punct(s):
    # replace - and / by space
    s0 = s.replace('-', ' ').replace('/', ' ')
    # replace 2+ spaces by 1 space
    t = re.compile(r"\s+")
    s1 = t.sub(" ", s0).strip()
    # remove punctuations
    s2 = re.sub(r'[^A-Za-z ]+', '', s1)
    # remove first space   
    s3 = re.sub('^\s', '', s2)
    # remove last space   
    s4 = s3.rstrip()
    return s4

def remove_punct_array(a):
    return [remove_punct(str(word)) for word in a]

In [437]:
train_df.text = cleaning_function(train_df.text)
test_df.text = cleaning_function(test_df.text)

## Vectorize

### Using a simple count vextorizer

In [438]:
count_vectorizer = feature_extraction.text.CountVectorizer()

## let's get counts for the first 5 tweets in the data
example_train_vectors = count_vectorizer.fit_transform(train_df["text"][0:5])

In [439]:
## we use .todense() here because these vectors are "sparse" (only non-zero elements are kept to save space)
print(example_train_vectors[0].todense().shape)
print(example_train_vectors[0].todense())

(1, 52)
[[0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0
  0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [440]:
train_vectors = count_vectorizer.fit_transform(train_df["text"])
test_vectors = count_vectorizer.transform(test_df["text"])

### Using Tf-idf (Term frequency - Inverse document frequency)

In [441]:
count_vectorizer = feature_extraction.text.TfidfVectorizer()
example_train_vectors = count_vectorizer.fit_transform(train_df["text"][0:5])

In [442]:
print(example_train_vectors[0].todense().shape)
print(example_train_vectors[0].todense())

(1, 52)
[[0.         0.23336118 0.28924517 0.23336118 0.         0.
  0.         0.         0.         0.         0.28924517 0.28924517
  0.         0.         0.         0.         0.28924517 0.
  0.         0.         0.         0.         0.         0.28924517
  0.         0.         0.         0.28924517 0.         0.
  0.         0.         0.28924517 0.         0.         0.
  0.         0.28924517 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.28924517
  0.23336118 0.         0.28924517 0.        ]]


In [443]:
train_vectors = count_vectorizer.fit_transform(train_df["text"])
test_vectors = count_vectorizer.transform(test_df["text"])

## LSA (Latent Semantic Analysis)

In [444]:
svd = decomposition.TruncatedSVD(n_components=30, n_iter=20, random_state=42)
train_matrix = svd.fit_transform(train_vectors)
test_matrix = svd.transform(test_vectors)

## Model

In [445]:
## Our vectors are really big, so we want to push our model's weights
## toward 0 without completely discounting different words - ridge regression 
## is a good way to do this.
clf = linear_model.RidgeClassifier()

In [448]:
scores = model_selection.cross_val_score(clf, train_matrix, train_df["target"], cv=3, scoring="f1")
scores

array([0.6122449 , 0.54136986, 0.66298343])

In [449]:
clf.fit(train_matrix, train_df["target"])

RidgeClassifier()

## Submission

In [450]:
sample_submission = pd.read_csv("data/sample_submission.csv")
sample_submission["target"] = clf.predict(test_matrix)

In [451]:
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,1


In [452]:
sample_submission.to_csv("data/submission.csv", index=False)

## Visual check

In [453]:
df_check = pd.DataFrame({'text':test_df['text'] , 'is_disaster': sample_submission["target"].astype(bool)}) 

In [454]:
df_check[df_check.is_disaster == True].sample(5)

Unnamed: 0,text,is_disaster
2418,rescue usa rescue beauty deals http tco eudept...,True
61,airplaneaccident california usa airplaneaccide...,True
547,buildingsonfire las vegas buildingsonfire ther...,True
2279,pandemonium nigeria pandemonium pandemonium in...,True
2022,loudbang kenya loudbang ibliz breaking news un...,True


In [455]:
df_check[df_check.is_disaster == False].sample(5)

Unnamed: 0,text,is_disaster
1451,evacuation my school is so fucking dumb they j...,False
3067,violentstorm hippiexox if the word violent com...,False
1771,hazard road hazard e confederate ave se morela...,False
3095,volcano baker louisiana volcano whats the hott...,False
744,collide lehigh acres fl collide collide gatewa...,False
