In [285]:
## Import by module name: ".py" is the file extension, not part of the module
## path ("nlp_jzar.py" would be resolved as a submodule called "py").
## NOTE(review): no other import is visible in this notebook, so names such as
## pd, feature_extraction, decomposition, linear_model, model_selection,
## mix_all_text and cleaning_function presumably all arrive through this star
## import - consider importing them explicitly.
from nlp_jzar import *

## Data exploration

In [286]:
## Load the train and test splits from the local data/ directory.
train_df, test_df = (
    pd.read_csv("data/train.csv"),
    pd.read_csv("data/test.csv"),
)

In [287]:
## Second tweet (position 1) among the rows whose target is 1.
train_df.loc[train_df["target"] == 1, "text"].values[1]

'Forest fire near La Ronge Sask. Canada'

In [288]:
## Second tweet (position 1) among the rows whose target is 0.
train_df.loc[train_df["target"] == 0, "text"].values[1]

'I love fruits'

In [227]:
## Replace the text column with the row-wise result of mix_all_text.
## NOTE(review): judging from the sample output, the helper appears to prepend
## keyword and location to the tweet - confirm in nlp_jzar. The column is
## overwritten in place, so re-running this cell feeds already-mixed text back
## into the helper.
train_df['text'] = train_df.apply(mix_all_text, axis=1)
test_df['text'] = test_df.apply(mix_all_text, axis=1)

In [228]:
## Eyeball 10 random rows; no random_state is passed, so the rows differ per run.
train_df[['text']].sample(n=10)

Unnamed: 0,text
1266,What progress we are making. In the Middle Ag...
3358,I got evacuated from the cinema 30 mins throug...
3701,"fear sitting on the fence, New York @phnotf so..."
4363,School Bus Hijacker Given Parole After 39 Year...
5345,pandemonium illinois. united state Pandemoniu...
2145,deaths Does it really matter! Deaths 3 http://...
3903,flattened Delhi @twilightfairy flattened frog?
5041,mudslide Ireland @MarianKeyes Rubber Mudslide!...
1885,crushed wherever there's netflix BHAVANA'S MOM...
3777,"fire%20truck Woodcreek HS, Roseville, CA Your ..."


## Data cleaning

In [230]:
## Run the shared cleaning helper over the whole text column of each split.
## NOTE(review): from the sample output it looks like it lower-cases and strips
## punctuation - confirm in nlp_jzar.
train_df["text"] = cleaning_function(train_df["text"])
test_df["text"] = cleaning_function(test_df["text"])

In [231]:
## Eyeball 10 random cleaned rows; no random_state is passed, so the rows differ per run.
train_df[['text']].sample(n=10)

Unnamed: 0,text
1302,burning hampton roads va the minute fat burni...
1258,buildingsonfire ma via pa charred remains of h...
4811,i dont laugh out loud at many things but man i...
7006,white twister black shift knob mx thread size ...
2149,its time to reduce gun deaths http tco iladqebxpn
638,bioterrorism searching for bae the threat ant...
3240,he came to a land which was engulfed in tribal...
387,arson jerusalem mourning notices for stabbing ...
5619,refugees bc us asia or europe newlyweds feed s...
3944,flood knoxville tn flood advisory issued augus...


## Vectorize

### Using a simple count vectorizer

In [289]:
## Bag-of-words vectorizer with the library's default settings.
count_vectorizer = feature_extraction.text.CountVectorizer()

## Demo only: fit on the first 5 tweets to get a small matrix we can inspect.
first_five_tweets = train_df["text"].head(5)
example_train_vectors = count_vectorizer.fit_transform(first_five_tweets)

In [290]:
## The vectors are sparse (only non-zero entries are stored), so densify the
## first tweet's row once and print both its shape and its contents.
first_tweet_dense = example_train_vectors[0].todense()
print(first_tweet_dense.shape)
print(first_tweet_dense)

(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [291]:
## Refit the vectorizer on the full training text (this replaces the 5-tweet
## demo fit), then encode the test text with that same fitted vocabulary
## (transform only, no refit).
train_vectors = count_vectorizer.fit_transform(train_df["text"])
test_vectors = count_vectorizer.transform(test_df["text"])

### Using Tf-idf (Term frequency - Inverse document frequency)

In [246]:
## Swap in a tf-idf vectorizer. The variable keeps the name count_vectorizer,
## so it overwrites the plain count vectorizer defined earlier.
count_vectorizer = feature_extraction.text.TfidfVectorizer()

## Demo only: fit on the first 5 tweets to get a small matrix we can inspect.
first_five_tweets = train_df["text"].head(5)
example_train_vectors = count_vectorizer.fit_transform(first_five_tweets)

In [247]:
## Densify the first tweet's sparse row once, then print its shape and contents.
first_tweet_dense = example_train_vectors[0].todense()
print(first_tweet_dense.shape)
print(first_tweet_dense)

(1, 52)
[[0.         0.23336118 0.28924517 0.23336118 0.         0.
  0.         0.         0.         0.         0.28924517 0.28924517
  0.         0.         0.         0.         0.28924517 0.
  0.         0.         0.         0.         0.         0.28924517
  0.         0.         0.         0.28924517 0.         0.
  0.         0.         0.28924517 0.         0.         0.
  0.         0.28924517 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.28924517
  0.23336118 0.         0.28924517 0.        ]]


In [248]:
## Refit on the full training text and encode the test text with the same
## fitted vectorizer. This overwrites train_vectors / test_vectors, so whichever
## vectorizer cell ran last determines what the later cells consume.
train_vectors = count_vectorizer.fit_transform(train_df["text"])
test_vectors = count_vectorizer.transform(test_df["text"])

## LSA (Latent Semantic Analysis)

In [278]:
## Reduce the vectorized tweets to 30 components with truncated SVD
## (random_state=42 fixes the seed). The decomposition is fitted on the
## training vectors only and then applied unchanged to the test vectors.
svd = decomposition.TruncatedSVD(n_components=30, n_iter=20, random_state=42)
train_matrix = svd.fit_transform(train_vectors)
test_matrix = svd.transform(test_vectors)

## Model

In [295]:
## Our vectors are really big, so we want to push the model's weights toward 0
## without completely discounting individual words - ridge regression is a
## good way to do this. The classifier is created with default settings.
clf = linear_model.RidgeClassifier()

In [297]:
## 5-fold cross-validated F1 score of the classifier on the LSA features.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_matrix,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
scores

array([0.6025641 , 0.50168919, 0.56985004, 0.50781969, 0.67275495])

In [298]:
## Fit on the LSA-reduced training features: these are what the model was
## cross-validated on, and they match test_matrix, which is what clf.predict()
## receives for the submission. Fitting on train_vectors instead would leave
## the model expecting a different number of features than test_matrix has.
clf.fit(train_matrix, train_df["target"])

RidgeClassifier()

## Submission

In [300]:
## Start from the sample submission file and overwrite its target column with
## the classifier's predictions for the LSA-reduced test features.
sample_submission = pd.read_csv("data/sample_submission.csv").assign(
    target=clf.predict(test_matrix)
)

In [301]:
## Quick look at the first five predicted rows.
sample_submission.head(n=5)

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


In [302]:
## Write the predictions to disk; index=False keeps the DataFrame index out of the file.
sample_submission.to_csv("data/submission.csv", index=False)

## Visual check

In [243]:
## Pair each test tweet with its predicted label (cast to bool) for a manual
## sanity check. The two columns are combined by index.
df_check = pd.DataFrame(
    {
        'text': test_df['text'],
        'is_disaster': sample_submission["target"].astype(bool),
    }
)

In [244]:
## Five random tweets the model flagged as disasters.
df_check.loc[df_check["is_disaster"].eq(True)].sample(n=5)

Unnamed: 0,text,is_disaster
1593,firetruck fredericksburg va last night was a b...,True
2279,pandemonium nigeria pandemonium in aba as woma...,True
2520,sandstorm usa watch this airport get swallowed...,True
321,bleeding london england tombrevoort bleeding c...,True
869,cyclone mansfield ohio localeventcountdown cu...,True


In [245]:
## Five random tweets the model flagged as non-disasters.
df_check.loc[df_check["is_disaster"].eq(False)].sample(n=5)

Unnamed: 0,text,is_disaster
31,i hate badging shit in accident,False
1659,flood new york pcs w cree led work light offro...,False
1894,deosl that does nit change the fact extracting...,False
1228,deepwater drill company gains m transocean one...,False
1407,emergencyservices anchorage ak providence heal...,False
