In [436]:
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import PorterStemmer 
from io import StringIO

# models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import RidgeClassifier

# vizualization
import seaborn as sns
sns.set_theme()
import matplotlib.pyplot as plt

# EDA
Check relations b/w:
* keyword vs target
* location vs target
* target 1 & 0 ratio. (this ratio can be compared for training and test datasets)

In [437]:
train_data_og = pd.read_csv('./train.csv')
test_data_og = pd.read_csv('./test.csv')

# print(train_data_og.dtypes)
# display(train_data_og.sample(n= 5).style)
# print(train_data_og.isnull().sum(), test_data_og.isnull().sum())

train_data = train_data_og.copy()
test_data = test_data_og.copy()
print(f"% of real disaster tweets: {np.round(np.sum(train_data['target'])/len(train_data)*100,2)}%")
train_data['target_sum'] = train_data.groupby('keyword')['target'].transform('sum')


plt.figure(figsize=(8, 72), dpi=100)
# sns.countplot(data = train_data, y = train_data.sort_values(by='target_sum', ascending= False)['keyword'],  hue='target')
train_data.drop(columns=['target_sum'], inplace=True)

train_data['target_sum'] = train_data.groupby('location')['target'].transform('sum')
# sns.countplot(data = train_data, y = train_data.sort_values(by='target_sum', ascending= False)['location'].head(800),  hue='target')
train_data.drop(columns=['target_sum'], inplace=True)

print(f"Unique keywords: {len(train_data['keyword'].unique())} and unique locations: {len(train_data['location'].unique())}")
print(f"Missing keywords in train set: {np.round(train_data['keyword'].isnull().sum()/len(train_data)*100,2)}%")
print(f"Missing location in train set: {np.round(train_data['location'].isnull().sum()/len(train_data)*100,2)}%")
print(f"Missing keywords in test set: {np.round(test_data_og['keyword'].isnull().sum()/len(train_data)*100,2)}%")
print(f"Missing location in test set: {np.round(test_data_og['location'].isnull().sum()/len(train_data)*100,2)}%")


% of real disaster tweets: 42.97%
Unique keywords: 222 and unique locations: 3342
Missing keywords in train set: 0.8%
Missing location in train set: 33.27%
Missing keywords in test set: 0.34%
Missing location in test set: 14.51%


<Figure size 800x7200 with 0 Axes>

## Observation:
* We have almost a balanced set for classification. 42.97% are real and rest are fake disasters.
* Unique keywords: 222 and unique locations: 3342 (including null)
* Clearly, many keywords have high/low ratio of target count. ie, they can be used to identify target.
* % of missing locations is high in train set (>30%) and test set (>10%).
* locations are not good indicator of a real/fake disaster. Only a few of them show a correlation. No need to use this as a feature, for now.

# Feature Engineering
* Deal with missing keywords
* (DONE) perform word tokenization
* (DONE) Remove stop words
* (DONE) Remove https links: checking their relavence is out of scope.
* (DONE) Remove punctuations
* (DONE) lowercase strings
* (DONE) apply stemming : reducing words to their stem/root words by removing suffixes
* (X) apply lemmatization: reducing words to its lexeme form or inflected form. words used in the same context.


Lemmatization is more complex than stemming:
* it needs parts of speech, if its done for individual words otherwise there is no way to understand the context of the word.
* to get this POS we need a lookup dictionary like WordNet (by princeton) and then convert this tag to a tag that nltk will understand.
* no need to do stemming and lemmatization together. choose the one which is good enough. Ofc, lemmatization is more like fine tuning.
* Apply lemmatization before removing stop words.  

In [438]:
# remove urls, numbers, eveything except strings, alphanum,hashtags and spaces.

stopwords_json_en = set(["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","known","knows","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two","u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","z","zero"])
stopwords_en = set(stopwords.words('english'))
punctuation_en = set(punctuation)
stopwords_punctuations_en = set.union(stopwords_json_en,stopwords_en,punctuation_en)
porter = PorterStemmer()

combined_dataset = [train_data,test_data]

for dataset in combined_dataset:

    dataset.replace(to_replace= {'text':{r'http\S+':'',r'[0-9]+':'',r'@\S+':'', r'[^A-Za-z0-9# ]+':''}}, regex=True, inplace=True)

    dataset['text'] = dataset['text'].str.lower().apply(word_tokenize)

    dataset['text'] = dataset['text'].apply(lambda x: [word for word in x if word not in stopwords_punctuations_en and len(word)>2 ])

    dataset['text'] = dataset['text'].apply(lambda x : ' '.join([porter.stem(word) for word in x ]) ) 
    #converting back to string because CountVectorizer accepts Sting and not list




## Observations:
* (DONE) Some tokens contain: "'s", "--", decimal numbers etc.
* (DONE) Tweets containing consecutive # hashtags are concatenated together because we remove special chars first then tokenize. This was many tokens will be unused for classification. Check train_data.loc[6626,'text']
* (DONE) @someone should be removed all together and not just removing @. since this is a use handle and these words will be unnecessary in the dictionary. id: 3456
* Use stronger/longer list of stopwords.

# Vectorization
* (DONE) Bag of Words
* (DONE) TFIDF  

In [439]:
bow = TfidfVectorizer(stop_words=None, analyzer='word')
display(train_data.sample(n=5).style)
print(train_data['text'].dtype)
x = bow.fit_transform(train_data['text'])

print(len(bow.get_feature_names_out())) 
# before preprocessing: 21637 features
# after preprocessing: 11328 features

Unnamed: 0,id,keyword,location,text,target
5300,7570,outbreak,,famili sue legionnair famili affect fatal outbreak legionnair disea,1
6490,9278,sunk,NYC,ltlt lip sunk bed arm cross head watch captain number bodi,0
6775,9707,tornado,,pretti teen hayden ryan pose strip purpl top view download video,0
5586,7971,razed,WorldWide,news latest home raze northern california wildfir york time taf,1
1269,1831,burned,"Erie, PA",dont burn,0


object
11328


# Model Selection
* Multinomial Naive Bayes: good for text classification where data is represented as word counts, ie multinomially distributed data.
* Complement Naive Bayes: faster and better than MNB. Takes features not present in a class for all documents, to learn.
* Ridge Regression.

In [440]:


train_text, test_text, train_labels, test_labels = train_test_split(train_data['text'],train_data['target'])

bow = TfidfVectorizer(stop_words=None, analyzer='word')
train_features = bow.fit_transform(train_text)
test_features = bow.transform(test_text)

mnb = MultinomialNB(force_alpha=True)
mnb.fit(train_features, train_labels)

print(f"Multinomial NB classifier accuracy: {np.round(mnb.score(test_features,test_labels),2)*100}%")



Multinomial NB classifier accuracy: 80.0%


In [441]:
cnb = ComplementNB(force_alpha = True)
cnb.fit(train_features,train_labels)

print(f"Complement NB classifier accuracy: {np.round(cnb.score(test_features,test_labels),2)*100}%")


Complement NB classifier accuracy: 80.0%


In [442]:
rc = RidgeClassifier()
rc.fit(train_features, train_labels)

print(f"Ridge classifier accuracy: {np.round(rc.score(test_features,test_labels),2)*100}%")

Ridge classifier accuracy: 80.0%


# Final submission

In [443]:
mnb = MultinomialNB(force_alpha=True)
mnb.fit(bow.fit_transform(train_data['text']),train_data['target'])

predictions = mnb.predict(bow.transform(test_data['text']))

result = pd.DataFrame({'id': test_data['id'], 'target': predictions})

result.to_csv('./final_submission.csv',index=False)

## Score:
* Without preprocessing: 0.79619
* After preprocssing: 0.79221

Observations:  
* Preprocessing did not help much. maybe try different vectorization technique, NN model.
* Using stronger stopwords brought down acc by 0.8%
* Using TFIDF over BOW did not change the score.
