This pipeline predicts the categories of a typical disaster emergency-response text message. For example, if the input is "My house is burning down", the predicted category should be "Fire". The application is useful for identifying which emergency response services are required.

In [18]:
import pandas as pd
import numpy as np
import re

In [37]:
import nltk
nltk.download(['stopwords','punkt'],download_dir='./')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package stopwords to ./...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to ./...
[nltk_data]   Package punkt is already up-to-date!


In [44]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, \
    classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

In [42]:
# The import cell downloads the NLTK packages into the working directory
# (download_dir='./'), which is not guaranteed to be on NLTK's search path.
# Register it so the corpus lookup below also works on a fresh kernel.
if './' not in nltk.data.path:
    nltk.data.path.append('./')

# English stop words, kept as a set for fast membership tests in tokenize().
stop_words = set(stopwords.words('english'))

In [4]:
# Load the labelled messages, using the header-less 'Unnamed: 0' column as the row index.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the recorded run emitted a warning on this cell, presumably
# pandas' mixed-dtype DtypeWarning — confirm.
data_df = pd.read_csv(
    './message_classification_data.csv',
    index_col='Unnamed: 0',
    low_memory=False,
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
# Summary statistics for the numeric columns. In the recorded output the
# displayed label columns have min 0 and max 1, except child_alone whose mean
# and max are both 0 (no positive examples in the data).
data_df.describe()

Unnamed: 0,id,aid_centers,aid_related,buildings,child_alone,clothing,cold,death,direct_report,earthquake,...,request,search_and_rescue,security,shelter,shops,storm,tools,transport,water,weather_related
count,26113.0,26113.0,26113.0,26113.0,26113.0,26113.0,26113.0,26113.0,26113.0,26113.0,...,26113.0,26113.0,26113.0,26113.0,26113.0,26113.0,26113.0,26113.0,26113.0,26113.0
mean,15231.433309,0.011833,0.413779,0.050741,0.0,0.015395,0.020182,0.045495,0.193352,0.093785,...,0.170298,0.027687,0.018037,0.08804,0.004595,0.093134,0.006089,0.045724,0.063723,0.278214
std,8828.502813,0.108137,0.492519,0.219473,0.0,0.123119,0.140624,0.20839,0.394934,0.291535,...,0.375902,0.164079,0.133088,0.283359,0.067635,0.290625,0.077795,0.208891,0.244264,0.448128
min,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7454.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,15672.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,22933.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,30265.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
# Summary of the raw message text. The recorded output shows 26110 unique
# values in 26113 rows, with '#NAME?' as the most frequent value (4 rows) —
# presumably a spreadsheet export artifact; TODO confirm and consider dropping those rows.
data_df.message.describe()

count      26113
unique     26110
top       #NAME?
freq           4
Name: message, dtype: object

In [7]:
# List the column names: in the recorded output the first four are
# id, message, original and genre, followed by the category columns.
data_df.columns

Index(['id', 'message', 'original', 'genre', 'aid_centers', 'aid_related',
       'buildings', 'child_alone', 'clothing', 'cold', 'death',
       'direct_report', 'earthquake', 'electricity', 'fire', 'floods', 'food',
       'hospitals', 'infrastructure_related', 'medical_help',
       'medical_products', 'military', 'missing_people', 'money', 'offer',
       'other_aid', 'other_infrastructure', 'other_weather', 'refugees',
       'related', 'request', 'search_and_rescue', 'security', 'shelter',
       'shops', 'storm', 'tools', 'transport', 'water', 'weather_related'],
      dtype='object')

In [51]:
# Select the input text and the targets by column name rather than by
# position, so the selection stays correct if the CSV's column order changes.
NON_LABEL_COLUMNS = ['id', 'message', 'original', 'genre']

X = data_df['message']                       # message text used as model input
y = data_df.drop(columns=NON_LABEL_COLUMNS)  # every remaining column is a category label

In [13]:
# Peek at five random messages; the seed is fixed so a re-run shows the same rows.
X.sample(5, random_state=42)

13230    The undersea quakes triggered enormous tidal w...
25726    Boko Haram staged its most deadly assault on t...
5536     Good evening, we are located in the Marc area,...
20531    Prices of vegetables, pork and chicken have be...
23479    We are trying to prevent such food rioting by ...
Name: message, dtype: object

In [14]:
# Peek at five random label rows; the seed is fixed so a re-run shows the same rows.
y.sample(5, random_state=42)

Unnamed: 0,aid_centers,aid_related,buildings,child_alone,clothing,cold,death,direct_report,earthquake,electricity,...,request,search_and_rescue,security,shelter,shops,storm,tools,transport,water,weather_related
25099,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23591,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15202,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9415,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22866,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0


In [40]:
# Matches every character that is not an ASCII letter or digit
# (despite the name, digits are kept).
non_alphabetic_characters_re =  re.compile(r'[^a-zA-Z0-9]',flags=re.M)

# Build the stemmer once; constructing a new one for every token is wasted work.
_stemmer = SnowballStemmer(language='english')

def tokenize(text):
    """Turn a raw message into a list of stemmed, lower-case tokens.

    Steps: lower-case the text, replace every character that is not an ASCII
    letter or digit with a space, split on whitespace, drop English stop
    words, and stem what remains with the Snowball stemmer.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order. May be empty.
    """
    text = non_alphabetic_characters_re.sub(" ", text.lower())
    # Stop words are removed before stemming, so the lookup sees whole words.
    return [_stemmer.stem(w) for w in text.split() if w not in stop_words]

In [41]:
# Sanity check: print the first five messages next to their token lists.
# Iterating over head(5) avoids relying on the index labels being 0..4.
for message in X.head(5):
    print(message, " :: ", tokenize(message))

Weather update - a cold front from Cuba that could pass over Haiti  ::  ['weather', 'updat', 'cold', 'front', 'cuba', 'could', 'pass', 'haiti']
Is the Hurricane over or is it not over  ::  ['hurrican']
Looking for someone but no name  ::  ['look', 'someon', 'name']
UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.  ::  ['un', 'report', 'leogan', '80', '90', 'destroy', 'hospit', 'st', 'croix', 'function', 'need', 'suppli', 'desper']
says: west side of Haiti, rest of the country today and tonight  ::  ['say', 'west', 'side', 'haiti', 'rest', 'countri', 'today', 'tonight']


In [45]:
# Text-classification pipeline: bag-of-words counts -> TF-IDF weighting ->
# one random forest per label column.
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=tokenize)),  # token counts using the custom tokenizer
    ('tfidf', TfidfTransformer()),  # re-weight the counts by TF-IDF
    # random_state is fixed so repeated fits give the same forests and scores.
    ('clf', MultiOutputClassifier(RandomForestClassifier(random_state=42), n_jobs=-1)),
])


In [55]:
# Hold out 20% of the rows as a test set; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [58]:
# Fit the vectorizer, the TF-IDF transform and the per-label classifiers on the training split.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 CountVectorizer(tokenizer=<function tokenize at 0x7f31c0758430>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier(),
                                       n_jobs=-1))])

In [66]:
# Predict every label column for the held-out messages.
y_pred = pipeline.predict(X_test)

In [65]:
# Show the first few held-out messages (the index holds the original row labels).
X_test.head()

12973    :( @AliVelshi: UPDATE: U.S. death toll from Su...
10708    i have some canned foods , cereals and s bit o...
19434    To save Ahmedabad and Rajkot TAX from overload...
16736    Hubei experienced its longest low temperature ...
22495    So far, one month rations of bulgar wheat, veg...
Name: message, dtype: object

In [69]:
# Put the true labels and the predictions into DataFrames that share the
# label column names and the test-set row index, so they can be compared
# element by element.
y_test = pd.DataFrame(y_test, columns=y.columns)
y_pred = pd.DataFrame(
    y_pred,
    index=y_test.index,
    columns=y.columns,
)

In [70]:
# True labels for the first few held-out messages.
y_test.head()

Unnamed: 0,aid_centers,aid_related,buildings,child_alone,clothing,cold,death,direct_report,earthquake,electricity,...,request,search_and_rescue,security,shelter,shops,storm,tools,transport,water,weather_related
12973,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
10708,0,1,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
19434,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16736,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22495,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [71]:
# Predicted labels for the same first few held-out messages.
y_pred.head()

Unnamed: 0,aid_centers,aid_related,buildings,child_alone,clothing,cold,death,direct_report,earthquake,electricity,...,request,search_and_rescue,security,shelter,shops,storm,tools,transport,water,weather_related
12973,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
10708,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19434,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16736,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22495,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [72]:
# Per-label accuracy: the fraction of held-out rows where the prediction
# equals the true label, computed separately for each label column.
matches = y_pred == y_test
accuracy = matches.mean()

print("Accuracy:\n", accuracy)


Accuracy:
 aid_centers               0.987555
aid_related               0.777905
buildings                 0.954241
child_alone               1.000000
clothing                  0.986215
cold                      0.980662
death                     0.962282
direct_report             0.852767
earthquake                0.971664
electricity               0.979131
fire                      0.990427
floods                    0.951752
food                      0.940647
hospitals                 0.989087
infrastructure_related    0.933372
medical_help              0.920735
medical_products          0.953284
military                  0.970706
missing_people            0.989661
money                     0.978173
offer                     0.995788
other_aid                 0.864254
other_infrastructure      0.956347
other_weather             0.943519
refugees                  0.968409
related                   0.817729
request                   0.892973
search_and_rescue         0.972621
security 

In [None]:
# Per-label precision/recall/F1: print the text report for every label column
# and keep each label's 'weighted avg' row in results_dic1.
# zero_division=0 reports 0 without a warning when a label has no predicted
# (or no true) positives; the default also reports 0 but warns for every such label.
results_dic1={}
for column in y_pred.columns:
    print(column)
    print(classification_report(y_test[column], y_pred[column], zero_division=0))
    print('_____________________________________________________')

    results_dic1[column]=classification_report(
        y_test[column], y_pred[column], output_dict=True, zero_division=0
    )['weighted avg']