In [42]:
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import PorterStemmer 

# EDA
Check relations b/w:
* keyword vs target
* location vs target
* target 1 & 0 ratio. (this ratio can be compared for training and test datasets)

In [21]:
train_data_og = pd.read_csv('./train.csv')
test_data_og = pd.read_csv('./test.csv')

print(train_data_og.dtypes)
display(train_data_og.sample(n= 5).style)
print(train_data_og.isnull().sum(), test_data_og.isnull().sum())

print(f"keyword vs target: \n{train_data_og[['keyword','target']].groupby(['keyword'])['target'].sum().sort_values(ascending=False)}")

id           int64
keyword     object
location    object
text        object
target       int64
dtype: object


Unnamed: 0,id,keyword,location,text,target
6424,9185,suicide%20bomber,nigeria,Suicide Bomber Kills More Than a Dozen in Saudi Mosque: Saudi Arabia have started experiencing some terrorist ... http://t.co/GuAJ2t910b,1
489,709,attacked,MAURITIUS,Israeli helicopters that attacked civilians in Gaza just completed exercises in Greece.,1
2991,4298,dust%20storm,Idaho,@NWSPocatello BG-16: So far brunt of storm just to our north. Grayed out w/ dust & rain to N blue sky interspersed w/ clouds to S.,0
6877,9860,traumatised,Kirkwall,WHY THE DEEP ROADS THO HAHAHAHA IM SO TRAUMATISED BY THE DEEP ROADS LOLOL,0
7488,10710,wreck,new york,act my age was a MESS everyone was so wild it was so fun my videos a wreck,0


id             0
keyword       61
location    2533
text           0
target         0
dtype: int64 id             0
keyword       26
location    1105
text           0
dtype: int64
keyword vs target: 
keyword
derailment     39
wreckage       39
outbreak       39
typhoon        37
debris         37
               ..
electrocute     1
epicentre       1
body%20bags     1
blazing         1
aftershock      0
Name: target, Length: 221, dtype: int64


# Feature Engineering
* (DONE) perform word tokenization
* (DONE) Remove stop words
* (DONE) Remove https links
* (DONE) Remove punctuations
* (DONE) lowercase strings
* (DONE) apply stemming : reducing words to their root words
* apply lemmatization? Study more about this
* vectorization: Bag of Words

In [106]:
train_data = train_data_og.copy()

train_data= train_data.replace(to_replace= {'text':{r'http\S+':'',r'[0-9]+':'',r'[^A-Za-z0-9 ]+':''}}, regex=True)

train_data['text'] = train_data['text'].str.lower().apply(word_tokenize)

stopwords_en = set(stopwords.words('english'))
punctuation_en = set(punctuation)
stopwords_punctuations_en = stopwords_en.union(punctuation_en)

train_data['text'] = train_data['text'].apply(lambda x: [word for word in x if word not in stopwords_punctuations_en and len(word)>2 ])

porter = PorterStemmer()
train_data['text'] = train_data['text'].apply(lambda x : [porter.stem(word) for word in x ] )

display(train_data.sample(n=5).style)

Unnamed: 0,id,keyword,location,text,target
4060,5769,forest%20fires,,"['heartdiseas', 'forest', 'servic', 'say', 'spend', 'half', 'budget', 'fire']",1
3028,4349,earthquake,Earth,"['earthquak', 'occur', 'near', 'mount', 'helen', 'area', 'washington', 'utc', 'earthquak']",1
4478,6370,hostages,,"['new', 'free', 'porn', 'clip', 'take', 'hostag', 'danger', 'favor', 'free', 'adult', 'sex', 'tube']",0
5368,7659,panic,Narnia,"['ad', 'video', 'youtub', 'playlist', 'panic', 'disco', 'girlsgirlsboy', 'offici', 'video']",0
485,702,attacked,"Texas, USA","['messeymetoo', 'feel', 'attack']",0


In [104]:
display(train_data[['text']].sample(n=10).style)
print(train_data.loc[4961,'text'])

Unnamed: 0,text
6324,"['stretcher', 'witter', 'rexyy', 'towel', 'show', 'pictur']"
3830,"['good', 'info', 'help', 'first', 'respond', 'cope', 'individu', 'resili', 'factsheet', 'respond']"
1665,"['like', 'youtub', 'video', 'sqwizzix', 'call', 'duti', 'piano', 'entertain', 'musician', 'collid']"
2330,"['three', 'home', 'demolish', 'unrecogn', 'arab', 'villag', 'intern', 'middl', 'east', 'media', 'center']"
1470,"['peterjuk', 'good', 'ground', 'believ', 'polit', 'militari', 'catastroph', 'crime', 'plan', 'commit', 'individu']"
53,"['polic', 'arsonist', 'deliber', 'set', 'black', 'church', 'north', 'carolinaablaz']"
936,"['blown', 'away', 'extens', 'noth', 'weve', 'seen', 'mani', 'option', 'one']"
4803,"['toxicsavior', 'loud', 'bang', 'froze', 'spot', 'slowli', 'everi', 'head', 'turn', 'toward', 'one', 'thing', 'hate']"
5851,"['well', 'done', 'everyon', 'applaud', 'terrif', 'never', 'ruin', 'anyth']"
6822,"['hollywood', 'movi', 'trap', 'miner', 'releas', 'chile', 'zippednew']"


['nprfreshair', 'realli', 'cant', 'believ', 'skip', 'republican', 'meltdowni', 'mean', 'debat']


## Observations:
* Some tokens contain: "'s", "--", decimal numbers etc.

# Model Selection
* Naive Bayes