In [283]:
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import PorterStemmer 
from io import StringIO

# models
from sklearn.feature_extraction.text import CountVectorizer

# vizualization
import seaborn as sns
sns.set_theme()
import matplotlib.pyplot as plt

# EDA
Check relations b/w:
* keyword vs target
* location vs target
* target 1 & 0 ratio. (this ratio can be compared for training and test datasets)

In [308]:
train_data_og = pd.read_csv('./train.csv')
test_data_og = pd.read_csv('./test.csv')

# print(train_data_og.dtypes)
# display(train_data_og.sample(n= 5).style)
# print(train_data_og.isnull().sum(), test_data_og.isnull().sum())

train_data = train_data_og.copy()
print(f"% of real disaster tweets: {np.round(np.sum(train_data['target'])/len(train_data)*100,2)}%")
train_data['target_sum'] = train_data.groupby('keyword')['target'].transform('sum')


plt.figure(figsize=(8, 72), dpi=100)
# sns.countplot(data = train_data, y = train_data.sort_values(by='target_sum', ascending= False)['keyword'],  hue='target')
train_data.drop(columns=['target_sum'], inplace=True)

train_data['target_sum'] = train_data.groupby('location')['target'].transform('sum')
# sns.countplot(data = train_data, y = train_data.sort_values(by='target_sum', ascending= False)['location'].head(800),  hue='target')
train_data.drop(columns=['target_sum'], inplace=True)

print(f"Unique keywords: {len(train_data['keyword'].unique())} and unique locations: {len(train_data['location'].unique())}")
print(f"Missing keywords in train set: {np.round(train_data['keyword'].isnull().sum()/len(train_data)*100,2)}%")
print(f"Missing location in train set: {np.round(train_data['location'].isnull().sum()/len(train_data)*100,2)}%")
print(f"Missing keywords in test set: {np.round(test_data_og['keyword'].isnull().sum()/len(train_data)*100,2)}%")
print(f"Missing location in test set: {np.round(test_data_og['location'].isnull().sum()/len(train_data)*100,2)}%")


% of real disaster tweets: 42.97%
Unique keywords: 222 and unique locations: 3342
Missing keywords in train set: 0.8%
Missing location in train set: 33.27%
Missing keywords in test set: 0.34%
Missing location in test set: 14.51%


<Figure size 800x7200 with 0 Axes>

## Observation:
* We have almost a balanced set for classification. 42.97% are real and rest are fake disasters.
* Unique keywords: 222 and unique locations: 3342 (including null)
* Clearly, many keywords have high/low ratio of target count. ie, they can be used to identify target.
* % of missing locations is high in train set (>30%) and test set (>10%).
* locations are not good indicator of a real/fake disaster. Only a few of them show a correlation. No need to use this as a feature, for now.

# Feature Engineering
* Deal with missing keywords
* (DONE) perform word tokenization
* (DONE) Remove stop words
* (DONE) Remove https links: checking their relavence is out of scope.
* (DONE) Remove punctuations
* (DONE) lowercase strings
* (DONE) apply stemming : reducing words to their stem/root words by removing suffixes
* (X) apply lemmatization: reducing words to its lexeme form or inflected form. words used in the same context.


Lemmatization is more complex than stemming:
* it needs parts of speech, if its done for individual words otherwise there is no way to understand the context of the word.
* to get this POS we need a lookup dictionary like WordNet (by princeton) and then convert this tag to a tag that nltk will understand.
* no need to do stemming and lemmatization together. choose the one which is good enough. Ofc, lemmatization is more like fine tuning.
* Apply lemmatization before removing stop words.  

In [309]:
# remove urls, numbers, eveything except strings, alphanum,hashtags and spaces.
train_data= train_data.replace(to_replace= {'text':{r'http\S+':'',r'[0-9]+':'',r'[^A-Za-z0-9# ]+':''}}, regex=True)

train_data['text'] = train_data['text'].str.lower().apply(word_tokenize)

stopwords_en = set(stopwords.words('english'))
punctuation_en = set(punctuation)
stopwords_punctuations_en = stopwords_en.union(punctuation_en)

train_data['text'] = train_data['text'].apply(lambda x: [word for word in x if word not in stopwords_punctuations_en and len(word)>2 ])

porter = PorterStemmer()
train_data['text'] = train_data['text'].apply(lambda x : ' '.join([porter.stem(word) for word in x ]) ) 
#converting back to string because CountVectorizer inputs Sting and not list

display(train_data.sample(n=5).style)

Unnamed: 0,id,keyword,location,text,target
3998,5677,floods,,dead due flood myanmar naypyidaw aug prensa latina death toll rose today myanmar,1
6589,9435,survivors,Anywhere Safe,lawfulsurvivor tdog hole apart store sever survivor glenn moral andrea jacqui merl,1
172,247,ambulance,Jackson,twelv fear kill pakistani air ambul helicopt crash,1
7178,10287,weapon,//RP\ ot @Mort3mer\\,honey aint angel like scream word weapon well ahead take best shot woman wan leav,0
1928,2771,curfew,,aptlyengineerd curfew,0


## Observations:
* (DONE) Some tokens contain: "'s", "--", decimal numbers etc.
* (DONE) Tweets containing consecutive # hashtags are concatenated together because we remove special chars first then tokenize. This was many tokens will be unused for classification. Check train_data.loc[6626,'text']

# Vectorization
* Bag of Words
* TFIDF  
We use the preprocessing steps tested out above to create a func.

In [311]:
bow = CountVectorizer(stop_words=None, analyzer='word')
display(train_data.sample(n=5).style)
print(train_data['text'].dtype)
x = bow.fit_transform(train_data['text'])

print(x)


Unnamed: 0,id,keyword,location,text,target
33,50,ablaze,AFRICA,africanbaz break newsnigeria flag set ablaz aba,1
2766,3975,devastation,Devon/London,devast smash phone,0
804,1167,blight,"Vancouver, BC",parksboardfact first zippolin one want use commun never ask blight park moveit,0
527,762,avalanche,Score Team Goals Buying @,tix calgari flame col avalanch preseason scotiabank saddledom,0
371,531,army,,beyonc pick fan armi beyhiv,0


object
  (0, 3029)	1
  (0, 9763)	1
  (0, 3614)	1
  (0, 7434)	1
  (0, 321)	1
  (0, 4451)	1
  (1, 4443)	1
  (1, 4293)	1
  (1, 8144)	1
  (1, 10181)	1
  (1, 10423)	1
  (1, 1805)	1
  (2, 9954)	1
  (2, 697)	1
  (2, 10738)	2
  (2, 9149)	2
  (2, 8387)	1
  (2, 8520)	1
  (2, 3947)	1
  (2, 8685)	1
  (2, 4012)	1
  (3, 3947)	1
  (3, 8685)	1
  (3, 8988)	1
  (3, 9780)	1
  :	:
  (7609, 12037)	1
  (7610, 12961)	1
  (7610, 5255)	1
  (7610, 12774)	1
  (7611, 1853)	1
  (7611, 9232)	1
  (7611, 10052)	1
  (7611, 5913)	1
  (7611, 6993)	1
  (7611, 10641)	1
  (7611, 6004)	1
  (7611, 12096)	1
  (7611, 11561)	1
  (7611, 2370)	1
  (7611, 3635)	2
  (7611, 9278)	1
  (7611, 8334)	1
  (7612, 13251)	1
  (7612, 1766)	1
  (7612, 5493)	1
  (7612, 8210)	1
  (7612, 6745)	1
  (7612, 8362)	1
  (7612, 21)	1
  (7612, 9720)	1


# Model Selection
* Multinomial Naive Bayes: good for text classification where data is represented as word counts, ie multinomially distributed data.
* Complement Naive Bayes: faster and better than MNB. Takes features not present in a class for all documents, to learn.
* Ridge Regression.