In [33]:
import pandas as pd
import numpy as np
import requests
import time
from sklearn.pipeline                import Pipeline
from sklearn.model_selection         import train_test_split, GridSearchCV
from sklearn.linear_model            import LogisticRegression
from sklearn.ensemble                import BaggingClassifier,RandomForestClassifier,ExtraTreesClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes             import GaussianNB, MultinomialNB
from sklearn.tree                    import DecisionTreeClassifier
from sklearn.feature_extraction      import _stop_words
import re
import string
import gensim
%matplotlib inline

In [16]:
train = pd.read_csv('disaster_relevance_TRAIN.csv')

In [17]:
df.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,choose_one,choose_one:confidence,choose_one_gold,keyword,location,text,tweetid,userid
0,778243823,True,golden,156,,Relevant,1.0,Relevant,,,Just happened a terrible car crash,1.0,
1,778243824,True,golden,152,,Relevant,1.0,Relevant,,,Our Deeds are the Reason of this #earthquake M...,13.0,
2,778243825,True,golden,137,,Relevant,1.0,Relevant,,,"Heard about #earthquake is different cities, s...",14.0,
3,778243826,True,golden,136,,Relevant,0.9603,Relevant,,,"there is a forest fire at spot pond, geese are...",15.0,
4,778243827,True,golden,138,,Relevant,1.0,Relevant,,,Forest fire near La Ronge Sask. Canada,16.0,


In [18]:
df.shape

(10876, 13)

In [19]:
df.columns


Index(['_unit_id', '_golden', '_unit_state', '_trusted_judgments',
       '_last_judgment_at', 'choose_one', 'choose_one:confidence',
       'choose_one_gold', 'keyword', 'location', 'text', 'tweetid', 'userid'],
      dtype='object')

In [20]:
df.dtypes

_unit_id                   int64
_golden                     bool
_unit_state               object
_trusted_judgments         int64
_last_judgment_at         object
choose_one                object
choose_one:confidence    float64
choose_one_gold           object
keyword                   object
location                  object
text                      object
tweetid                  float64
userid                   float64
dtype: object

In [21]:
train = train[['choose_one', 'text']]

In [22]:
train.isnull().sum()

choose_one    0
text          0
dtype: int64

In [23]:
print(train.shape)
train['choose_one'].unique()

(10876, 2)


array(['Relevant', 'Not Relevant', "Can't Decide"], dtype=object)

In [24]:
train = train[train['choose_one'] != "Can't Decide"]
print(train.shape)
train['choose_one'].unique()

(10860, 2)


array(['Relevant', 'Not Relevant'], dtype=object)

In [25]:
X = train['text']
y = train['choose_one'].map({'Relevant':1, 'Not Relevant':0})

X[0], y[0]

('Just happened a terrible car crash', 1)

In [26]:
y.value_counts(normalize=True)

0    0.569705
1    0.430295
Name: choose_one, dtype: float64

In [30]:

def clean(text):
    
    #URLs
    text = re.sub(r'https?://\S+|www\.\S+','', text)
    
    #HTMLs
    text = re.sub(r'<.*?>', '', text)
    
    #Emojis
    emojis = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    text = emojis.sub(r'', text)
    
    #Line breaks
    text = re.sub(r'\n', '', text)
    
    # Alphabets only
    text = re.sub(r'[^a-zA-Z ]', '', text)
    
    return text
    
X = X.apply(clean)

In [31]:

def clean2(tweet):
  # Acronyms and miswritten words
  tweet = re.sub(r"Typhoon-Devastated", "typhoon devastated", tweet)
  tweet = re.sub(r"TyphoonDevastated", "typhoon devastated", tweet)
  tweet = re.sub(r"typhoondevastated", "typhoon devastated", tweet)
  tweet = re.sub(r"MH370", "Malaysia Airlines Flight", tweet)
  tweet = re.sub(r"MH", "Malaysia Airlines Flight", tweet)
  tweet = re.sub(r"mh370", "Malaysia Airlines Flight", tweet)
  tweet = re.sub(r"year-old", "years old", tweet)
  tweet = re.sub(r"yearold", "years old", tweet)
  tweet = re.sub(r"yr old", "years old", tweet)
  tweet = re.sub(r"PKK", "Kurdistan Workers Party", tweet)
  tweet = re.sub(r"MP", "madhya pradesh", tweet)
  tweet = re.sub(r"rly", "railway", tweet)
  tweet = re.sub(r"CDT", "Central Daylight Time", tweet)
  tweet = re.sub(r"sensorsenso", "sensor senso", tweet)
  tweet = re.sub(r"pm", "", tweet)
  tweet = re.sub(r"PM", "", tweet)
  tweet = re.sub(r"nan", '', tweet)
  tweet = re.sub(r"terrorismturn", "terrorism turn", tweet)
  tweet = re.sub(r"epicente", "epicenter", tweet)
  tweet = re.sub(r"epicenterr", "epicenter", tweet)
  tweet = re.sub(r"WAwildfire", "Washington Wildfire", tweet)
  tweet = re.sub(r"prebreak", "pre break", tweet)
  tweet = re.sub(r"nowplaying", "now playing", tweet)
  tweet = re.sub(r"RT", "retweet", tweet)
  tweet = re.sub(r"EbolaOutbreak", "Ebola Outbreak", tweet)
  tweet = re.sub(r"LondonFire", "London Fire", tweet)
  tweet = re.sub(r"IDFire", "Idaho Fire", tweet)
  tweet = re.sub(r"withBioterrorism&use", "with Bioterrorism & use", tweet)
  tweet = re.sub(r"NASAHurricane", "NASA Hurricane", tweet)
  tweet = re.sub(r"withweapons", "with weapons", tweet)
  tweet = re.sub(r"NuclearPower", "Nuclear Power", tweet)
  tweet = re.sub(r"WhiteTerrorism", "White Terrorism", tweet)
  tweet = re.sub(r"MyanmarFlood", "Myanmar Flood", tweet)
  tweet = re.sub(r"ExtremeWeather", "Extreme Weather", tweet)

  # Special characters
  tweet = re.sub(r"%20", "", tweet)
  tweet = re.sub(r"%", "", tweet)
  tweet = re.sub(r"@", "", tweet)
  tweet = re.sub(r"#", '', tweet)
  tweet = re.sub(r"'", '', tweet)
  tweet = re.sub(r"\x89û_", '', tweet)
  tweet = re.sub(r"\x89ûò", '', tweet)
  tweet = re.sub(r"16yr", "16 year", tweet)
  tweet = re.sub(r"re\x89û_", '', tweet)
  tweet = re.sub(r"\x89û", '', tweet)
  tweet = re.sub(r"\x89Û", '', tweet)
  tweet = re.sub(r"re\x89Û", "re ", tweet)
  tweet = re.sub(r"re\x89û", "re ", tweet)
  tweet = re.sub(r"\x89ûª", "'", tweet)
  tweet = re.sub(r"\x89û", '', tweet)
  tweet = re.sub(r"\x89ûò", '', tweet)
  tweet = re.sub(r"\x89Û_", "", tweet)
  tweet = re.sub(r"\x89ÛÒ", "", tweet)
  tweet = re.sub(r"\x89ÛÓ", "", tweet)
  tweet = re.sub(r"\x89ÛÏWhen", "When", tweet)
  tweet = re.sub(r"\x89ÛÏ", "", tweet)
  tweet = re.sub(r"China\x89Ûªs", "China's", tweet)
  tweet = re.sub(r"let\x89Ûªs", "let's", tweet)
  tweet = re.sub(r"\x89Û÷", "", tweet)
  tweet = re.sub(r"\x89Ûª", "", tweet)
  tweet = re.sub(r"\x89Û\x9d", "", tweet)
  tweet = re.sub(r"å_", "", tweet)
  tweet = re.sub(r"\x89Û¢", "", tweet)
  tweet = re.sub(r"\x89Û¢åÊ", "", tweet)
  tweet = re.sub(r"fromåÊwounds", "from wounds", tweet)
  tweet = re.sub(r"åÊ", "", tweet)
  tweet = re.sub(r"åÈ", "", tweet)
  tweet = re.sub(r"JapÌ_n", "Japan", tweet)    
  tweet = re.sub(r"Ì©", "e", tweet)
  tweet = re.sub(r"å¨", "", tweet)
  tweet = re.sub(r"SuruÌ¤", "Suruc", tweet)
  tweet = re.sub(r"åÇ", "", tweet)
  tweet = re.sub(r"å£3million", f"3 million", tweet)
  tweet = re.sub(r"åÀ", "", tweet)

  # Contractions
  tweet = re.sub(r"he's", "he is", tweet)
  tweet = re.sub(r"there's", "there is", tweet)
  tweet = re.sub(r"We're", "We are", tweet)
  tweet = re.sub(r"That's", "That is", tweet)
  tweet = re.sub(r"won't", "will not", tweet)
  tweet = re.sub(r"they're", "they are", tweet)
  tweet = re.sub(r"Can't", "Cannot", tweet)
  tweet = re.sub(r"wasn't", "was not", tweet)
  tweet = re.sub(r"don\x89Ûªt", "do not", tweet)
  tweet = re.sub(r"aren't", "are not", tweet)
  tweet = re.sub(r"isn't", "is not", tweet)
  tweet = re.sub(r"What's", "What is", tweet)
  tweet = re.sub(r"haven't", "have not", tweet)
  tweet = re.sub(r"hasn't", "has not", tweet)
  tweet = re.sub(r"There's", "There is", tweet)
  tweet = re.sub(r"He's", "He is", tweet)
  tweet = re.sub(r"It's", "It is", tweet)
  tweet = re.sub(r"You're", "You are", tweet)
  tweet = re.sub(r"I'M", "I am", tweet)
  tweet = re.sub(r"Im", "I am", tweet)
  tweet = re.sub(r"shouldn't", "should not", tweet)
  tweet = re.sub(r"wouldn't", "would not", tweet)
  tweet = re.sub(r"i'm", "I am", tweet)
  tweet = re.sub(r"I\x89Ûªm", "I am", tweet)
  tweet = re.sub(r"I'm", "I am", tweet)
  tweet = re.sub(r"Isn't", "is not", tweet)
  tweet = re.sub(r"Here's", "Here is", tweet)
  tweet = re.sub(r"you've", "you have", tweet)
  tweet = re.sub(r"you\x89Ûªve", "you have", tweet)
  tweet = re.sub(r"we're", "we are", tweet)
  tweet = re.sub(r"what's", "what is", tweet)
  tweet = re.sub(r"couldn't", "could not", tweet)
  tweet = re.sub(r"we've", "we have", tweet)
  tweet = re.sub(r"it\x89Ûªs", "it is", tweet)
  tweet = re.sub(r"doesn\x89Ûªt", "does not", tweet)
  tweet = re.sub(r"It\x89Ûªs", "It is", tweet)
  tweet = re.sub(r"Here\x89Ûªs", "Here is", tweet)
  tweet = re.sub(r"who's", "who is", tweet)
  tweet = re.sub(r"I\x89Ûªve", "I have", tweet)
  tweet = re.sub(r"y'all", "you all", tweet)
  tweet = re.sub(r"can\x89Ûªt", "cannot", tweet)
  tweet = re.sub(r"would've", "would have", tweet)
  tweet = re.sub(r"it'll", "it will", tweet)
  tweet = re.sub(r"we'll", "we will", tweet)
  tweet = re.sub(r"wouldn\x89Ûªt", "would not", tweet)
  tweet = re.sub(r"We've", "We have", tweet)
  tweet = re.sub(r"he'll", "he will", tweet)
  tweet = re.sub(r"Y'all", "You all", tweet)
  tweet = re.sub(r"Weren't", "Were not", tweet)
  tweet = re.sub(r"Didn't", "Did not", tweet)
  tweet = re.sub(r"they'll", "they will", tweet)
  tweet = re.sub(r"they'd", "they would", tweet)
  tweet = re.sub(r"DON'T", "DO NOT", tweet)
  tweet = re.sub(r"That\x89Ûªs", "That is", tweet)
  tweet = re.sub(r"they've", "they have", tweet)
  tweet = re.sub(r"i'd", "I would", tweet)
  tweet = re.sub(r"should've", "should have", tweet)
  tweet = re.sub(r"You\x89Ûªre", "You are", tweet)
  tweet = re.sub(r"where's", "where is", tweet)
  tweet = re.sub(r"Don\x89Ûªt", "Do not", tweet)
  tweet = re.sub(r"we'd", "we would", tweet)
  tweet = re.sub(r"i'll", "I will", tweet)
  tweet = re.sub(r"weren't", "were not", tweet)
  tweet = re.sub(r"They're", "They are", tweet)
  tweet = re.sub(r"Can\x89Ûªt", "Cannot", tweet)
  tweet = re.sub(r"you\x89Ûªll", "you will", tweet)
  tweet = re.sub(r"I\x89Ûªd", "I would", tweet)
  tweet = re.sub(r"let's", "let us", tweet)
  tweet = re.sub(r"it's", "it is", tweet)
  tweet = re.sub(r"can't", "can not", tweet)
  tweet = re.sub(r"cant", "can not", tweet)
  tweet = re.sub(r"don't", "do not", tweet)
  tweet = re.sub(r"dont", "do not", tweet)
  tweet = re.sub(r"you're", "you are", tweet)
  tweet = re.sub(r"i've", "I have", tweet)
  tweet = re.sub(r"that's", "that is", tweet)
  tweet = re.sub(r"i'll", "I will", tweet)
  tweet = re.sub(r"doesn't", "does not", tweet)
  tweet = re.sub(r"i'd", "I would", tweet)
  tweet = re.sub(r"didn't", "did not", tweet)
  tweet = re.sub(r"ain't", "am not", tweet)
  tweet = re.sub(r"you'll", "you will", tweet)
  tweet = re.sub(r"I've", "I have", tweet)
  tweet = re.sub(r"Don't", "do not", tweet)
  tweet = re.sub(r"I'll", "I will", tweet)
  tweet = re.sub(r"I'd", "I would", tweet)
  tweet = re.sub(r"Let's", "Let us", tweet)
  tweet = re.sub(r"you'd", "You would", tweet)
  tweet = re.sub(r"It's", "It is", tweet)
  tweet = re.sub(r"Ain't", "am not", tweet)
  tweet = re.sub(r"Haven't", "Have not", tweet)
  tweet = re.sub(r"Could've", "Could have", tweet)
  tweet = re.sub(r"youve", "you have", tweet)  
  tweet = re.sub(r"donå«t", "do not", tweet)

  return tweet

X = X.apply(clean2)

In [34]:
def clean3(text):
    
    #Punctuations
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    
    #Extra white-space
    text = re.sub(r' +', ' ', text)
    
    return text.lower()

X = X.apply(clean3)


In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42, 
                                                    stratify=y,
                                                    test_size=0.2)

In [36]:
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

# Shapes
print('X_train/test: ', X_train.shape,X_test.shape)
print('y_train/test: ', y_train.shape,y_test.shape)

X_train/test:  (8688,) (2172,)
y_train/test:  (8688,) (2172,)


In [10]:
X = df['text'] #Features matrix
y = df['keyword'] #Target value

# Modeling

In [37]:
y.value_counts()

0    6187
1    4673
Name: choose_one, dtype: int64

In [38]:
#baseline model accuracy
baseline_accuracy = y.value_counts()[1]/(y.shape[0])
baseline_accuracy

0.4302946593001842

In [39]:
#Splitting dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

#Instantiating a pipeline for CountVectorizer
pipe_cvec = Pipeline(steps = [('vectorizer', CountVectorizer()), 
                        ('model', LogisticRegression())])

#Setting grid search parameters
hyperparams_cvec = {'vectorizer__max_features':[1000,2500,5000],
                    'vectorizer__ngram_range':[(1,1),(2,2),(1,2), (1,3),(2,3),(3,3)],
                    'vectorizer__stop_words':[None, 'english']
                   }

#Instantiating a pipeline for TF-IDF Vectorizer
pipe_tfidf = Pipeline(steps = [('vectorizer', TfidfVectorizer()), 
                        ('model', LogisticRegression())])

#Setting grid search parameters
hyperparams_tfidf = {'vectorizer__max_features':[1000,2500,5000],
                    'vectorizer__ngram_range':[(1,1),(2,2),(1,2), (1,3),(2,3),(3,3)],
                    'vectorizer__stop_words':[None, 'english'],
                   }
#Instantiating grid search with 3-fold cross-validation
gs_cvec = GridSearchCV(pipe_cvec,
                      hyperparams_cvec,
                      cv=3,
                       n_jobs=-1)
gs_tfidf = GridSearchCV(pipe_tfidf,
                       hyperparams_tfidf,
                       cv=3,
                       n_jobs=-1)

#Fitting grid search
#results_cvec = gs_cvec.fit(X_train,y_train)
#results_tfidf = gs_tfidf.fit(X_train,y_train)

In [40]:
results_cvec = gs_cvec.fit(X_train,y_train)
results_tfidf = gs_tfidf.fit(X_train,y_train)

In [41]:
#cvec best score
results_cvec.best_score_

0.7959484346224678

In [42]:

#Best accuracy on the training set for a CountVectorizer pipeline
results_cvec.score(X_train,y_train)

0.8831184775936157

In [43]:
#Best accuracy on the training set for a TF-IDF pipeline
results_tfidf.score(X_train,y_train)

0.8699815837937385

In [44]:
#Our best Logistic Regression model's parameters 
results_cvec.best_estimator_

Pipeline(steps=[('vectorizer',
                 CountVectorizer(max_features=2500, stop_words='english')),
                ('model', LogisticRegression())])

In [45]:
round(results_cvec.score(X_train,y_train),4)

0.8831

In [46]:
round(results_cvec.best_score_, 4)

0.7959

In [47]:
#Initializing a dictionary to store model scores
scores = {}

#Adding our scores
scores.update({'LR_cvec':[round(results_cvec.score(X_train,y_train),4)*100, round(results_cvec.best_score_, 4)*100 ]})
scores

{'LR_cvec': [88.31, 79.59]}

## Naive bayes

In [48]:
# Instantiating model
mnb = MultinomialNB()

#Instantiating Count Vectorizer
cvec=CountVectorizer()

#Vectorizing Transforming our features matriz
X_mnb = cvec.fit_transform(X)

#Trasforming our sparse matrix to array
X_mnb = X_mnb.toarray()

#Train-test split with stratification for this particular method
X_train_mnb, X_test_mnb, y_train_mnb, y_test_mnb = train_test_split(X_mnb, y, random_state=123)

#Fitting the model
result_mnb = mnb.fit(X_train_mnb,y_train_mnb)

In [49]:
#Getting model's accuracy on trainig set
result_mnb.score(X_train_mnb, y_train_mnb)

0.9010435850214856

In [50]:
#Getting model's accuracy on testing set
result_mnb.score(X_test_mnb, y_test_mnb)

0.8007366482504604

In [51]:
#Updating scores dictionary
scores.update({'Naive Bayes':[round(result_mnb.score(X_train_mnb, y_train_mnb),4)*100, 
                              round(result_mnb.score(X_test_mnb, y_test_mnb),4)*100]})

scores

{'LR_cvec': [88.31, 79.59], 'Naive Bayes': [90.10000000000001, 80.07]}

# Decision tree 

In [52]:
#Instantiating the model
tree = DecisionTreeClassifier()

#Instantiating Count Vectorizer
cvec = CountVectorizer()

#Vectorizing and transforming our features matrix
X_train_cvec=cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

#Fitting the model
result_tree_cvec = tree.fit(X_train_cvec, y_train)

In [53]:
#Accuracy score training set
result_tree_cvec.score(X_train_cvec, y_train)

0.9880908532842234

In [27]:
#Accuracy score testing set
result_tree_cvec.score(X_test_cvec, y_test)

0.8876945885841364

In [54]:
#Updating scores dictionary
scores.update({'Decision Trees':[round(result_tree_cvec.score(X_train_cvec, y_train),4)*100, 
                              round(result_tree_cvec.score(X_test_cvec, y_test),4)*100]})

scores

{'LR_cvec': [88.31, 79.59],
 'Naive Bayes': [90.10000000000001, 80.07],
 'Decision Trees': [98.81, 72.49]}

## Naive bayes

In [55]:
#Updating scores dictionary
scores.update({'Naive Bayes':[round(result_mnb.score(X_train_mnb, y_train_mnb),4)*100, 
                              round(result_mnb.score(X_test_mnb, y_test_mnb),4)*100]})

scores

{'LR_cvec': [88.31, 79.59],
 'Naive Bayes': [90.10000000000001, 80.07],
 'Decision Trees': [98.81, 72.49]}

## Decision Trees

In [56]:
#Instantiating the model
tree = DecisionTreeClassifier()

#Instantiating Count Vectorizer
cvec = CountVectorizer()

#Vectorizing and transforming our features matrix
X_train_cvec=cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

#Fitting the model
result_tree_cvec = tree.fit(X_train_cvec, y_train)

In [57]:
#Accuracy score training set
result_tree_cvec.score(X_train_cvec, y_train)

0.9880908532842234

In [58]:
#Accuracy score testing set
result_tree_cvec.score(X_test_cvec, y_test)

0.7289134438305709

In [61]:
#Updating scores dictionary
scores.update({'Decision Trees':[round(result_tree_cvec.score(X_train_cvec, y_train),4)*100, 
                              round(result_tree_cvec.score(X_test_cvec, y_test),4)*100]})

scores

{'LR_cvec': [88.31, 79.59],
 'Naive Bayes': [90.10000000000001, 80.07],
 'Decision Trees': [98.81, 72.89]}

# random forest

In [62]:
#Instantiating the model
rf = RandomForestClassifier()

#Fitting the model
results_rf_cvec = rf.fit(X_train_cvec, y_train)

#Accuracy score training set
results_rf_cvec.score(X_train_cvec, y_train)

0.9880908532842234

In [63]:
#Accuracy score testing set
results_rf_cvec.score(X_test_cvec, y_test)

0.7900552486187845

In [64]:
#Setting a dictionary of parameters for a grid search
rf_params = { 'n_estimators': np.arange(20,200,10) }

#Instantiating a grid search with the parameters chosen above and a 5-fold cross-validation
gs = GridSearchCV(rf, param_grid=rf_params, cv=5)

#Fitting our grid search 
gs.fit(X_train_cvec, y_train)

#Getting our best score for our best parameter
gs.best_score_

0.7970534069981584

In [67]:
#Getting our best estimator's training score
gs_best = gs.best_estimator_
gs_best.score(X_train_cvec,y_train)

0.9880908532842234

In [68]:
#Getting our best parameter
gs.best_params_

{'n_estimators': 110}