# Exploration for Disaster Message Categorization

In [1]:
import pandas as pd
import nltk
import sqlite3


## Exploring Message Categories

In [2]:
cats_raw_df = pd.read_csv("../data/disaster_categories.csv", header=0, index_col='id')
cats_raw_df.sample(20)

Unnamed: 0_level_0,categories
id,Unnamed: 1_level_1
22020,related-1;request-0;offer-0;aid_related-0;medi...
5280,related-1;request-0;offer-0;aid_related-0;medi...
26493,related-1;request-0;offer-0;aid_related-1;medi...
27356,related-0;request-0;offer-0;aid_related-0;medi...
9868,related-0;request-0;offer-0;aid_related-0;medi...
25221,related-1;request-0;offer-0;aid_related-1;medi...
29960,related-1;request-0;offer-0;aid_related-1;medi...
24744,related-0;request-0;offer-0;aid_related-0;medi...
14528,related-2;request-0;offer-0;aid_related-0;medi...
782,related-1;request-1;offer-0;aid_related-1;medi...


In [31]:
def get_categories(df: pd.DataFrame):
    """
    get category names from 1st item in data frame. assumes categories appear in same order in every row

    :param df: data frame to sample
    :return: list of category name strings
    """
    tokens = df.iloc[0]['categories'].split(";")
    return [tok.split("-")[0] for tok in tokens]

def get_category_values(line: str):
    """
    Parse a message categories line, return values as list

    :param line: categories string for single disaster message
    :return: list of 1-hot encoded categories
    """
    tokens = line.split(";")
    return pd.Series([int(tok.split("-")[1]) for tok in tokens])

In [35]:
columns = get_categories(cats_raw_df)
test = cats_raw_df['categories'].apply(get_category_values)
test.columns = columns
test.sample(20)

Unnamed: 0_level_0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14487,1,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
58,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
26411,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23532,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28070,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15930,1,1,0,1,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
11287,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
19083,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15030,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
21014,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# number of valid entries
test.count()

related                   26248
request                   26248
offer                     26248
aid_related               26248
medical_help              26248
medical_products          26248
search_and_rescue         26248
security                  26248
military                  26248
child_alone               26248
water                     26248
food                      26248
shelter                   26248
clothing                  26248
money                     26248
missing_people            26248
refugees                  26248
death                     26248
other_aid                 26248
infrastructure_related    26248
transport                 26248
buildings                 26248
electricity               26248
tools                     26248
hospitals                 26248
shops                     26248
aid_centers               26248
other_infrastructure      26248
weather_related           26248
floods                    26248
storm                     26248
fire    

In [38]:
# number of messages with category
test.sum()

related                   20316
request                    4480
offer                       119
aid_related               10878
medical_help               2087
medical_products           1314
search_and_rescue           724
security                    471
military                    860
child_alone                   0
water                      1674
food                       2930
shelter                    2319
clothing                    406
money                       604
missing_people              299
refugees                    876
death                      1196
other_aid                  3448
infrastructure_related     1705
transport                  1203
buildings                  1335
electricity                 534
tools                       159
hospitals                   283
shops                       120
aid_centers                 309
other_infrastructure       1151
weather_related            7304
floods                     2158
storm                      2448
fire    

In [39]:
# fraction of messages with category
test.sum() / test.count()

related                   0.774002
request                   0.170680
offer                     0.004534
aid_related               0.414432
medical_help              0.079511
medical_products          0.050061
search_and_rescue         0.027583
security                  0.017944
military                  0.032764
child_alone               0.000000
water                     0.063776
food                      0.111628
shelter                   0.088350
clothing                  0.015468
money                     0.023011
missing_people            0.011391
refugees                  0.033374
death                     0.045565
other_aid                 0.131362
infrastructure_related    0.064957
transport                 0.045832
buildings                 0.050861
electricity               0.020344
tools                     0.006058
hospitals                 0.010782
shops                     0.004572
aid_centers               0.011772
other_infrastructure      0.043851
weather_related     

## Exploring messages

In [40]:
msg_raw = pd.read_csv("../data/disaster_messages.csv", header=0, index_col='id')
msg_raw.sample(20)

Unnamed: 0_level_0,message,original,genre
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
937,i have a problem talking to people in port au ...,Mwen gen you poblem pou m pale ak moun mwen yo...,direct
28082,He said regular sorties were being carried out...,,news
4023,I don't have monies to go give blood in Port-a...,"Mwen pa gen kob, pou m ale bay san Port au pri...",direct
19547,Large swathes of India are reeling under a sco...,,news
14237,We will not survive in this chilling winter an...,mar jain gey is shadeed sardi mein. main aur m...,direct
7370,we don't understend the message which sent by ...,Mwen pa konprann mesaj la ONG voye a.,direct
29296,"With the emergency phase over, the Russian Red...",,news
22344,The profiles of these and other victims in the...,,news
2920,"142, Ruelle Beaulieu, Mon Repos 44 Carrefour","142, ruelle beaulieu mon repos 44 carrefour",direct
25475,The Strategic Climate Fund will support pilot ...,,news


In [41]:
# distribution of message genres
msg_raw['genre'].value_counts()

news      13068
direct    10782
social     2398
Name: genre, dtype: int64

In [42]:
# fraction of messages that have an original in a different language
msg_raw['original'].count() / len(msg_raw)

0.3879914660164584

In [45]:
# message counts by genre
msg_raw.groupby('genre').count()

Unnamed: 0_level_0,message,original
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
direct,10782,10184
news,13068,0
social,2398,0


False

In [54]:
# some messages have NOTES
noted_msg = msg_raw.loc[msg_raw['message'].str.contains('NOTES')]
noted_msg

Unnamed: 0_level_0,message,original,genre
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3539,NOTES: Personal. Not an emergency.,"prezidan preval mande,pou tout moun ki konn bw...",direct
3579,NOTES: aclet dry We are counting on you.,aclet dry. nous comptons sur vous.,direct
3583,NOTES: talking about gas prices,GEN SERI DE MESAJ FOK NOU TA BIEN PRECIZE YO L...,direct
3651,NOTES: Historic earthquake in Haiti.,"istorik tranblemannte an ayiti svp,",direct
3737,NOTES: They are requesting that somebody to pa...,Mesaj avek la otorite sil vou ple mezanmi pase...,direct
...,...,...,...
11161,NOTES: I already translated the message is not...,Je vous salut dans le nom de Dieu merci a la r...,direct
11167,NOTES: this messsage doesn't concern this job,metenan je te salye nan nom jesus ti na pa ape...,direct
11168,NOTES: This text is not clear. it doesn't has ...,BEGIN:VCARD N:W TEL:999 END:VCARD,direct
11171,"NOTES: I allready translate this message, it's...",Mw se rolain mw ta renmen konnen Eske pa gen k...,direct


In [55]:
# get noted indices for dropping
noted_msg.index

Int64Index([ 3539,  3579,  3583,  3651,  3737,  3761,  3775,  3820,  3871,
             3877,
            ...
            11100, 11109, 11110, 11138, 11143, 11161, 11167, 11168, 11171,
            11172],
           dtype='int64', name='id', length=331)

In [56]:
# drop noted messages
cleaned_msg = msg_raw.drop(noted_msg.index, axis=0)
cleaned_msg

Unnamed: 0_level_0,message,original,genre
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct
7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct
8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct
9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct
12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct
...,...,...,...
30261,The training demonstrated how to enhance micro...,,news
30262,A suitable candidate has been selected and OCH...,,news
30263,"Proshika, operating in Cox's Bazar municipalit...",,news
30264,"Some 2,000 women protesting against the conduc...",,news
