In [242]:
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import PorterStemmer 

# models
from sklearn.feature_extraction.text import CountVectorizer

# vizualization
import seaborn as sns
sns.set_theme()
import matplotlib.pyplot as plt

# EDA
Check relations b/w:
* keyword vs target
* location vs target
* target 1 & 0 ratio. (this ratio can be compared for training and test datasets)

In [239]:
train_data_og = pd.read_csv('./train.csv')
test_data_og = pd.read_csv('./test.csv')

# print(train_data_og.dtypes)
# display(train_data_og.sample(n= 5).style)
# print(train_data_og.isnull().sum(), test_data_og.isnull().sum())

train_data = train_data_og.copy()
print(f"% of real disaster tweets: {np.round(np.sum(train_data['target'])/len(train_data)*100,2)}%")
train_data['target_sum'] = train_data.groupby('keyword')['target'].transform('sum')


plt.figure(figsize=(8, 72), dpi=100)
# sns.countplot(data = train_data, y = train_data.sort_values(by='target_sum', ascending= False)['keyword'],  hue='target')
train_data.drop(columns=['target_sum'], inplace=True)

train_data['target_sum'] = train_data.groupby('location')['target'].transform('sum')
# sns.countplot(data = train_data, y = train_data.sort_values(by='target_sum', ascending= False)['location'].head(800),  hue='target')
train_data.drop(columns=['target_sum'], inplace=True)

print(f"Unique keywords: {len(train_data['keyword'].unique())} and unique locations: {len(train_data['location'].unique())}")
print(f"Missing keywords in train set: {np.round(train_data['keyword'].isnull().sum()/len(train_data)*100,2)}%")
print(f"Missing location in train set: {np.round(train_data['location'].isnull().sum()/len(train_data)*100,2)}%")
print(f"Missing keywords in test set: {np.round(test_data_og['keyword'].isnull().sum()/len(train_data)*100,2)}%")
print(f"Missing location in test set: {np.round(test_data_og['location'].isnull().sum()/len(train_data)*100,2)}%")


% of real disaster tweets: 42.97%
Unique keywords: 222 and unique locations: 3342
Missing keywords in train set: 0.8%
Missing location in train set: 33.27%
Missing keywords in test set: 0.34%
Missing location in test set: 14.51%


<Figure size 800x7200 with 0 Axes>

## Observation:
* We have almost a balanced set for classification. 42.97% are real and rest are fake disasters.
* Unique keywords: 222 and unique locations: 3342 (including null)
* Clearly, many keywords have high/low ratio of target count. ie, they can be used to identify target.
* % of missing locations is high in train set (>30%) and test set (>10%).
* locations are not good indicator of a real/fake disaster. Only a few of them show a correlation. No need to use this as a feature, for now.

# Feature Engineering
* Deal with missing keywords
* (DONE) perform word tokenization
* (DONE) Remove stop words
* (DONE) Remove https links: checking their relavence is out of scope.
* (DONE) Remove punctuations
* (DONE) lowercase strings
* (DONE) apply stemming : reducing words to their stem/root words by removing suffixes
* (X) apply lemmatization: reducing words to its lexeme form or inflected form. words used in the same context.
* vectorization: Bag of Words

Lemmatization is more complex than stemming:
* it needs parts of speech, if its done for individual words otherwise there is no way to understand the context of the word.
* to get this POS we need a lookup dictionary like WordNet (by princeton) and then convert this tag to a tag that nltk will understand.
* no need to do stemming and lemmatization together. choose the one which is good enough. Ofc, lemmatization is more like fine tuning.
* Apply lemmatization before removing stop words.

In [240]:
# remove urls, numbers, eveything except strings, alphanum,hashtags and spaces.
train_data= train_data.replace(to_replace= {'text':{r'http\S+':'',r'[0-9]+':'',r'[^A-Za-z0-9# ]+':''}}, regex=True)

train_data['text'] = train_data['text'].str.lower().apply(word_tokenize)

stopwords_en = set(stopwords.words('english'))
punctuation_en = set(punctuation)
stopwords_punctuations_en = stopwords_en.union(punctuation_en)

train_data['text'] = train_data['text'].apply(lambda x: [word for word in x if word not in stopwords_punctuations_en and len(word)>2 ])

porter = PorterStemmer()
train_data['text'] = train_data['text'].apply(lambda x : [porter.stem(word) for word in x ] )

display(train_data.sample(n=5).style)

Unnamed: 0,id,keyword,location,text,target
707,1021,blazing,New York,"['morgan', 'silver', 'dollar', 'gem', 'blaze', 'satin', 'rare', 'proof', 'like', 'full']",0
6043,8638,seismic,,"['subcontractor', 'work', 'french', 'seismic', 'survey', 'group', 'cgg', 'kidnap', 'cairo', 'held', 'islam', 'state', 'compani', 'said']",1
2931,4212,drowned,"Dreieich, Germany","['via', 'dwenglish', 'hundr', 'fear', 'drown', 'migrant', 'boat', 'capsiz', 'libya', 'ufoublogeurop']",1
4418,6281,hijacking,,"['murder', 'stori', 'america', 'first', 'hijack']",1
2338,3364,demolition,"Murray Hill, New Jersey","['remain', 'section', 'greyston', 'psychiatr', 'hospit', 'demolit', 'pic']",0


## Observations:
* (DONE) Some tokens contain: "'s", "--", decimal numbers etc.
* (DONE) Tweets containing consecutive # hashtags are concatenated together because we remove special chars first then tokenize. This was many tokens will be unused for classification. Check train_data.loc[6626,'text']

# Model Selection
* Bag of Words
* TFIDF
* Naive Bayes

In [243]:
bow = CountVectorizer(preprocessor=preprocess_text)
x = bow.fit_transform(train_data['text'])

print(x)


TypeError: expected string or bytes-like object