This pipeline predicts the classification of a given typical disaster emergency response text message. For example, if the input is: "My house is burning down", the classification category should be "Fire". This application is useful to identify emergency response services required.

In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
import nltk
nltk.download(['stopwords','punkt'],download_dir='./')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package stopwords to ./...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to ./...
[nltk_data]   Package punkt is already up-to-date!


In [31]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, \
    classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_score

In [5]:
stop_words = set(stopwords.words('english'))
# tokenizer = nltk.data.load("./tokenizers/punkt/english.pickle")

In [7]:
data_df = pd.read_csv('./data.csv', index_col='Unnamed: 0')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [8]:
data_df.describe()

Unnamed: 0,id,aid_centers,aid_related,buildings,child_alone,clothing,cold,death,direct_report,earthquake,...,request,search_and_rescue,security,shelter,shops,storm,tools,transport,water,weather_related
count,26113.0,26113.0,26113.0,26113.0,26113.0,26113.0,26113.0,26113.0,26113.0,26113.0,...,26113.0,26113.0,26113.0,26113.0,26113.0,26113.0,26113.0,26113.0,26113.0,26113.0
mean,15231.433309,0.011833,0.413779,0.050741,0.0,0.015395,0.020182,0.045495,0.193352,0.093785,...,0.170298,0.027687,0.018037,0.08804,0.004595,0.093134,0.006089,0.045724,0.063723,0.278214
std,8828.502813,0.108137,0.492519,0.219473,0.0,0.123119,0.140624,0.20839,0.394934,0.291535,...,0.375902,0.164079,0.133088,0.283359,0.067635,0.290625,0.077795,0.208891,0.244264,0.448128
min,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7454.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,15672.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,22933.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,30265.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
data_df = data_df[data_df.message != '#NAME?']

In [12]:
data_df.message.describe()

count                              26109
unique                             26109
top       Earthquake'''THE NEW HAITI''' 
freq                                   1
Name: message, dtype: object

In [13]:
data_df.columns

Index(['id', 'message', 'original', 'genre', 'aid_centers', 'aid_related',
       'buildings', 'child_alone', 'clothing', 'cold', 'death',
       'direct_report', 'earthquake', 'electricity', 'fire', 'floods', 'food',
       'hospitals', 'infrastructure_related', 'medical_help',
       'medical_products', 'military', 'missing_people', 'money', 'offer',
       'other_aid', 'other_infrastructure', 'other_weather', 'refugees',
       'related', 'request', 'search_and_rescue', 'security', 'shelter',
       'shops', 'storm', 'tools', 'transport', 'water', 'weather_related'],
      dtype='object')

In [14]:
X = data_df.iloc[:,1] # take the message as input
y = data_df.iloc[:,4:] # rest of the columns are one-hot encoded output

In [15]:
X.sample(5)

16741    The island is regularly buffeted by the region...
16832    Road conditions are expected to be a substanti...
11972    http://twitpic.com/18uv8k - [Pics before the e...
22202    The unregistered refugees in Pakistan helped b...
20169    As a result, prices of cereals, particularly f...
Name: message, dtype: object

In [26]:
for col in y.columns:
    print(f'{col} :: {y[col].mean()}')
# this shows that the data is skewed and whenever child_alone == 1, the model wont perform as expected

aid_centers :: 0.011834999425485465
aid_related :: 0.4138036692328316
buildings :: 0.05074878394423379
child_alone :: 0.0
clothing :: 0.015396989543835459
cold :: 0.020184610670649968
death :: 0.045501551189245086
direct_report :: 0.1933432915852771
earthquake :: 0.09379907311654985
electricity :: 0.020299513577693516
fire :: 0.010800873262093532
floods :: 0.08192577272204987
food :: 0.11122601401815466
hospitals :: 0.010839174231108048
infrastructure_related :: 0.06514994829369183
medical_help :: 0.07943620973610632
medical_products :: 0.05009766747098702
military :: 0.03282393044544027
missing_people :: 0.011375387797311272
money :: 0.02305718334673867
offer :: 0.0044429124056838635
other_aid :: 0.13144892565781915
other_infrastructure :: 0.04393121145964993
other_weather :: 0.05258723045693056
refugees :: 0.03339844498065801
related :: 0.7730284576199777
request :: 0.17032440920755296
search_and_rescue :: 0.027691600597495116
security :: 0.01803975640583707
shelter :: 0.088053927764

In [28]:
y.isna().any()

aid_centers               False
aid_related               False
buildings                 False
child_alone               False
clothing                  False
cold                      False
death                     False
direct_report             False
earthquake                False
electricity               False
fire                      False
floods                    False
food                      False
hospitals                 False
infrastructure_related    False
medical_help              False
medical_products          False
military                  False
missing_people            False
money                     False
offer                     False
other_aid                 False
other_infrastructure      False
other_weather             False
refugees                  False
related                   False
request                   False
search_and_rescue         False
security                  False
shelter                   False
shops                     False
storm   

In [29]:
X.isna().any()

False

In [32]:
non_alphabetic_characters_re =  re.compile(r'[^a-zA-Z0-9]',flags=re.M)
url_re = re.compile(r'^https?:\/\/.*[\r\n]*',flags=re.M)

def tokenize(text):
    text = text.lower()
    text = re.sub(non_alphabetic_characters_re,"",text)
    text = re.sub(url_re,"",text)
    words = text.split()
    words = [w for w in words if w not in stop_words]
    words = [SnowballStemmer(language='english').stem(w) for w in words]
    return words

In [49]:
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=tokenize)), #create the CountVectorizer object
    ('tfidf', TfidfTransformer()), #create Tfidftransformer object    
    ('clf', MultiOutputClassifier(GridSearchCV(RandomForestClassifier(),param_grid={'n_estimators':[50,100]},cv=2,
                                               n_jobs=-1))) #create the Classifier object
])
# note: gridsearchcv defaults to cv of 5

In [50]:
scores = -1 * cross_val_score(pipeline, X, y,
                              cv=2,
                              scoring='neg_mean_absolute_error')

KeyboardInterrupt: 

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [51]:
model = pipeline.fit(X_train, y_train)

In [56]:
model.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('vectorizer',
                 CountVectorizer(tokenizer=<function tokenize at 0x7f4ef90a2040>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=GridSearchCV(cv=2,
                                                              estimator=RandomForestClassifier(),
                                                              n_jobs=-1,
                                                              param_grid={'n_estimators': [50,
                                                                                           100]})))])>

In [54]:
import pickle
pickle.dump(model, open('./model.pb', 'wb'))

In [57]:
y_pred = pipeline.predict(X_test)

In [36]:
print(scores)

[0.07242436 0.10216749]


In [65]:
X_test.head()

12973    :( @AliVelshi: UPDATE: U.S. death toll from Su...
10708    i have some canned foods , cereals and s bit o...
19434    To save Ahmedabad and Rajkot TAX from overload...
16736    Hubei experienced its longest low temperature ...
22495    So far, one month rations of bulgar wheat, veg...
Name: message, dtype: object

In [58]:
y_test=pd.DataFrame(data=y_test,columns=y.columns)
y_pred=pd.DataFrame(data=y_pred,columns=y.columns,index=y_test.index)

In [59]:
y_test.head()

Unnamed: 0,aid_centers,aid_related,buildings,child_alone,clothing,cold,death,direct_report,earthquake,electricity,...,request,search_and_rescue,security,shelter,shops,storm,tools,transport,water,weather_related
9683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1598,0,1,0,0,0,0,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1
3351,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
11734,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25217,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [71]:
y_pred.head()

Unnamed: 0,aid_centers,aid_related,buildings,child_alone,clothing,cold,death,direct_report,earthquake,electricity,...,request,search_and_rescue,security,shelter,shops,storm,tools,transport,water,weather_related
12973,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
10708,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19434,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16736,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22495,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
accuracy = (y_pred == y_test).mean().tolist()
import statistics
accuracy = statistics.mean(accuracy)
print("Accuracy:\n",accuracy)


Accuracy:
 0.9060917485850462


In [61]:
results_dic1={}
for column in y_pred.columns:
    print(column)
    print(classification_report(y_test[column], y_pred[column]))
    print('_____________________________________________________')

    results_dic1[column]=classification_report(y_test[column], y_pred[column],output_dict=True)['weighted avg']

aid_centers
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5164
           1       0.00      0.00      0.00        58

    accuracy                           0.99      5222
   macro avg       0.49      0.50      0.50      5222
weighted avg       0.98      0.99      0.98      5222

_____________________________________________________
aid_related
              precision    recall  f1-score   support

           0       0.59      1.00      0.74      3066
           1       0.67      0.00      0.00      2156

    accuracy                           0.59      5222
   macro avg       0.63      0.50      0.37      5222
weighted avg       0.62      0.59      0.44      5222

_____________________________________________________
buildings
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      4964
           1       0.00      0.00      0.00       258

    accuracy                           0

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.89      1.00      0.94      4664
           1       0.00      0.00      0.00       558

    accuracy                           0.89      5222
   macro avg       0.45      0.50      0.47      5222
weighted avg       0.80      0.89      0.84      5222

_____________________________________________________
hospitals
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5171
           1       0.00      0.00      0.00        51

    accuracy                           0.99      5222
   macro avg       0.50      0.50      0.50      5222
weighted avg       0.98      0.99      0.99      5222

_____________________________________________________
infrastructure_related
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      4900
           1       0.00      0.00      0.00       322

    accuracy                           0.