In [24]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
import csv

In [3]:
data_dir='~/data/crisis/ecuador_earthquake_2016/'

In [19]:
path = os.path.join(data_dir,'conversations.csv' )
conv = pd.read_csv(path, parse_dates=['timestamp'],dtype={'id':object, 'conversation_id':object,'in_reply_to_status_id':object})
conv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150308 entries, 0 to 150307
Data columns (total 13 columns):
id                       150308 non-null object
screen_name              150308 non-null object
latitude                 387 non-null float64
longitude                387 non-null float64
lang                     150308 non-null object
in_reply_to_status_id    78242 non-null object
text                     150308 non-null object
timestamp                150293 non-null datetime64[ns]
conversation_id          150308 non-null object
conversation_deep        150308 non-null int64
num_replies              150308 non-null int64
num_users                150308 non-null int64
url                      150308 non-null object
dtypes: datetime64[ns](1), float64(2), int64(3), object(7)
memory usage: 14.9+ MB


## split conversations by language

In [4]:
roots = conv[conv.in_reply_to_status_id.isnull()]
roots.shape

(72066, 13)

In [5]:
conv_ids = roots[roots.lang=='en'].conversation_id
conv_en =  conv[conv.conversation_id.isin(conv_ids)]
path = os.path.join(data_dir,'en','conversations.csv' )
conv_en.to_csv(path, index=False)
conv_en.shape

(39520, 13)

In [7]:
conv_ids = roots[roots.lang=='es'].conversation_id
conv_es = conv[conv.conversation_id.isin(conv_ids)]
path = os.path.join(data_dir,'es','conversations.csv' )
conv_es.to_csv('../data/ecuador_earthquake_2016/es/conversations.csv', index=False)
conv_es.shape

(94776, 13)

In [13]:
conv_ids = roots[~roots.lang.isin(['en','es'])].conversation_id
conv_others = conv[conv.conversation_id.isin(conv_ids)]
conv_others.to_csv('../data/ecuador_earthquake_2016/other/conversations.csv', index=False)
conv_others.shape

(16012, 13)

# convert annotated dialogs to crisisNLP format

In [245]:
path = os.path.join(data_dir,'en','dialogs_annotated.csv' )
dialogs = pd.read_csv(path, parse_dates=['timestamp'],dtype={'id':object, 'conversation_id':object,'in_reply_to_status_id':object})
dialogs.shape, dialogs.columns

((1007, 61), Index(['id', 'screen_name', 'latitude', 'longitude', 'lang',
        'in_reply_to_status_id', 'text', 'conversation_id', 'conversation_deep',
        'num_replies', 'num_users', 'url', 'timestamp', 'dialog_id', 'turns',
        'crisis_related', 'outcome_prevention_ack',
        'outcome_situational_awareness', 'outcome_relief_coordination',
        'people_deaths', 'people_wounded', 'people_missing', 'people_evacuated',
        'people_other', 'infra_buildings', 'infra_roads', 'infra_houses',
        'infra_business', 'infra_other', 'request_info', 'request_goods',
        'request_services', 'request_other', 'offer_info', 'offer_goods',
        'offer_services', 'offer_other', 'informative', 'update', 'precaution',
        'emotional', 'expressive_positive', 'expressive_negative', 'complain',
        'suggest_action', 'promise', 'sarcasm', 'insult', 'yes_no_question',
        'wh_question', 'open_question', 'yes_answer', 'no_answer',
        'response_ack', 'response_oth

In [246]:
dialogs[dialogs.in_reply_to_status_id.isnull()].crisis_related.value_counts()

1.0    93
0.0    64
Name: crisis_related, dtype: int64

In [191]:
tweets  =dialogs.drop_duplicates('id').copy()
tweets.shape

(695, 61)

In [192]:
tweets[tweets.people_deaths==1].shape

(22, 61)

In [193]:
def convert_to_8figure_labels(tweets, dialogs):
    tweets['choose_one_category'] = 'na'

    crt= dialogs[dialogs.in_reply_to_status_id.isnull()&
                 dialogs.crisis_related==1].conversation_id.unique()
    #crt.shape
    tweets.loc[tweets.conversation_id.isin(crt) & 
            (
                (tweets.people_deaths ==1) |
                (tweets.people_wounded==1)
            ), 'choose_one_category']= 'dead'
    tweets.loc[tweets.conversation_id.isin(crt) & 
            (
                (tweets.people_missing ==1) |
                (tweets.people_other==1)
            ), 'choose_one_category']= 'missing'
    tweets.loc[tweets.conversation_id.isin(crt) & 
            (
                (tweets.people_evacuated==1) 
            ), 'choose_one_category']= 'evacuated'
    tweets.loc[tweets.conversation_id.isin(crt) & 
            (
                (tweets.infra_buildings==1) |
                (tweets.infra_roads==1) |
                (tweets.infra_houses==1) |
                (tweets.infra_business==1) |
                (tweets.infra_other==1) 

            ), 'choose_one_category']= 'infra'
    tweets.loc[tweets.conversation_id.isin(crt) & 
            (
                (tweets.request_info==1) | (tweets.request_goods==1) |
                (tweets.request_services==1) | (tweets.request_other==1) |
                (tweets.offer_info==1) | (tweets.offer_goods==1) |
                (tweets.offer_services==1) | (tweets.offer_other==1) |
                (tweets.complain==1) | (tweets.promise==1)
            ), 'choose_one_category']= 'help'
    tweets.loc[tweets.conversation_id.isin(crt) & 
            (
                (tweets.precaution==1) 
            ), 'choose_one_category']= 'caution'
    tweets.loc[tweets.conversation_id.isin(crt) & 
            (
                (tweets.emotional==1) |
                (tweets.expressive_positive==1) |
                (tweets.expressive_negative ==1) 
            ), 'choose_one_category']= 'support'
    tweets.loc[tweets.conversation_id.isin(crt) & 
            (
                (tweets.informative==1) |
                (tweets.update==1) |
                (tweets.other_subcat ==1)
            ), 'choose_one_category']= 'other'
    print(tweets.choose_one_category.value_counts())
    return tweets

In [227]:
path = os.path.join(data_dir,'en','conversations.csv' )
conversations = pd.read_csv(path, parse_dates=['timestamp'],dtype={'id':object, 'conversation_id':object,'in_reply_to_status_id':object})
conversations.shape, dialogs.columns

((39520, 13), Index(['id', 'screen_name', 'latitude', 'longitude', 'lang',
        'in_reply_to_status_id', 'text', 'timestamp', 'conversation_id',
        'conversation_deep', 'num_replies', 'num_users', 'url', 'dialog_id',
        'turns', 'crisis_related', 'outcome_prevention_ack',
        'outcome_situational_awareness', 'outcome_relief_coordination',
        'people_deaths', 'people_wounded', 'people_missing', 'people_evacuated',
        'people_other', 'infra_buildings', 'infra_roads', 'infra_houses',
        'infra_business', 'infra_other', 'request_info', 'request_goods',
        'request_services', 'request_other', 'offer_info', 'offer_goods',
        'offer_services', 'offer_other', 'informative', 'update', 'precaution',
        'emotional', 'expressive_positive', 'expressive_negative', 'complain',
        'suggest_action', 'promise', 'sarcasm', 'insult', 'yes_no_question',
        'wh_question', 'open_question', 'yes_answer', 'no_answer',
        'response_ack', 'response_ot

In [228]:
conversations[conversations.text.str.contains('earthquake')].shape

(13799, 13)

In [153]:
dialogs[dialogs.text.str.contains('earthquake')].shape

(105, 61)

In [235]:
eq_convs = conversations[conversations.text.str.contains('earthquake') & conversations.in_reply_to_status_id.isnull()]
eq_convs.shape

(13348, 13)

In [236]:
eq_convs = conversations[~conversations.text.str.contains('earthquake',case=False)]
eq_convs.shape

(19451, 13)

In [237]:
path = os.path.join(data_dir,'2016_ecuador_eq_en.csv' )
tweets_annotated = pd.read_csv(path, parse_dates=['timestamp'],
    dtype={'id':object, 'conversation_id':object,'in_reply_to_status_id':object})
tweets_annotated.shape

(2695, 7)

In [238]:
eq_convs = eq_convs[~eq_convs.id.isin(tweets_annotated.id)]
eq_convs.shape

(18841, 13)

In [239]:
eq_convs_sample = eq_convs.sample(1500, random_state=1) 
eq_convs_sample.shape

(1500, 13)

In [240]:
path = os.path.join(data_dir,'en','tweets_na.csv' )
eq_convs_sample.to_csv(path, index=False, quoting=csv.QUOTE_ALL)

In [22]:
eq_convs_sample = eq_convs.sample(2000, random_state=1) 
eq_convs_sample.shape

(2000, 13)

In [28]:
eq_convs_sample[eq_convs_sample.text.str.startswith('RT')].shape

(24, 13)

In [25]:
path = os.path.join(data_dir,'en','conversations_sample.csv' )
eq_convs_sample.to_csv(path, index=False, quoting=csv.QUOTE_ALL)

In [241]:
path = os.path.join(data_dir,'en','tweets_annotated.csv' )
tweets_annotated = pd.read_csv(path, parse_dates=['timestamp'],
    dtype={'id':object, 'conversation_id':object,'in_reply_to_status_id':object})
tweets_annotated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 6 columns):
id                     2000 non-null object
screen_name            2000 non-null object
text                   2000 non-null object
url                    2000 non-null object
timestamp              2000 non-null datetime64[ns]
choose_one_category    2000 non-null object
dtypes: datetime64[ns](1), object(5)
memory usage: 93.8+ KB


In [242]:
tweets_annotated.columns

Index(['id', 'screen_name', 'text', 'url', 'timestamp', 'choose_one_category'], dtype='object')

In [243]:
tweets_annotated.choose_one_category.value_counts()

dead         1151
other         365
support       222
help          141
infra          68
caution        36
na              7
evacuated       7
missing         3
Name: choose_one_category, dtype: int64

In [247]:
tweets = dialogs.drop_duplicates('id').copy()
tweets.shape

(695, 61)

In [248]:
tweets = convert_to_8figure_labels(tweets, dialogs)

na         439
support    103
help        68
other       62
dead        14
infra        5
caution      3
missing      1
Name: choose_one_category, dtype: int64


In [249]:
cols=['id', 'screen_name', 'text', 'url', 'timestamp', 'choose_one_category']
tweets = tweets[cols]
tweets.shape

(695, 6)

In [250]:
path = os.path.join(data_dir,'en','tweets_na_annotated.csv' )
tweets_na_annotated = pd.read_csv(path, parse_dates=['timestamp'],
    dtype={'id':object, 'conversation_id':object,'in_reply_to_status_id':object})
tweets_na_annotated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 6 columns):
id                     1500 non-null object
screen_name            1500 non-null object
text                   1499 non-null object
url                    1500 non-null object
timestamp              1500 non-null datetime64[ns]
choose_one_category    1500 non-null object
dtypes: datetime64[ns](1), object(5)
memory usage: 70.4+ KB


In [252]:
tweets8 = pd.concat([tweets_annotated, tweets, tweets_na_annotated])
tweets8.shape

(4195, 6)

In [253]:
tweets8.loc[tweets8.choose_one_category=='dead','choose_one_category'] = 'injured_or_dead_people'
tweets8.loc[tweets8.choose_one_category=='missing','choose_one_category'] = 'missing_trapped_or_found_people'
tweets8.loc[tweets8.choose_one_category=='evacuated','choose_one_category'] = 'displaced_people_and_evacuations'
tweets8.loc[tweets8.choose_one_category=='infra','choose_one_category'] = 'infrastructure_and_utilities_damage'
tweets8.loc[tweets8.choose_one_category=='help','choose_one_category'] = 'donation_needs_or_offers_or_volunteering_services'
tweets8.loc[tweets8.choose_one_category=='caution','choose_one_category'] = 'caution_and_advice'
tweets8.loc[tweets8.choose_one_category=='support','choose_one_category'] = 'sympathy_and_emotional_support'
tweets8.loc[tweets8.choose_one_category=='other','choose_one_category'] = 'other_useful_information'
tweets8.loc[tweets8.choose_one_category=='na','choose_one_category'] = 'not_related_or_irrelevant'

In [254]:
tweets8['crisis_related'] = 'no'

ix = (tweets8.choose_one_category!='not_related_or_irrelevant')

tweets8.loc[ix,'crisis_related'] = 'yes'

In [255]:
tweets8.crisis_related.value_counts()

yes    2249
no     1946
Name: crisis_related, dtype: int64

In [257]:
tweets8.shape

(4195, 7)

In [256]:
path = os.path.join(data_dir,'en','2016_ecuador_eq_en.csv' )
tweets8.to_csv(path, index=False)

# annotations spanish

In [205]:
path = os.path.join(data_dir,'es','dialogs_annotated.csv' )
dialogs = pd.read_csv(path, parse_dates=['timestamp'],dtype={'id':object, 'conversation_id':object,'in_reply_to_status_id':object})
dialogs.shape, dialogs.columns

((3572, 61), Index(['id', 'screen_name', 'latitude', 'longitude', 'lang',
        'in_reply_to_status_id', 'text', 'timestamp', 'conversation_id',
        'conversation_deep', 'num_replies', 'num_users', 'url', 'dialog_id',
        'turns', 'crisis_related', 'outcome_prevention_ack',
        'outcome_situational_awareness', 'outcome_relief_coordination',
        'people_deaths', 'people_wounded', 'people_missing', 'people_evacuated',
        'people_other', 'infra_buildings', 'infra_roads', 'infra_houses',
        'infra_business', 'infra_other', 'request_info', 'request_goods',
        'request_services', 'request_other', 'offer_info', 'offer_goods',
        'offer_services', 'offer_other', 'informative', 'update', 'precaution',
        'emotional', 'expressive_positive', 'expressive_negative', 'complain',
        'suggest_action', 'promise', 'sarcasm', 'insult', 'yes_no_question',
        'wh_question', 'open_question', 'yes_answer', 'no_answer',
        'response_ack', 'response_oth

In [206]:
path = os.path.join(data_dir,'es','conversations.csv' )
conversations = pd.read_csv(path, parse_dates=['timestamp'],dtype={'id':object, 'conversation_id':object,'in_reply_to_status_id':object})
conversations.shape, dialogs.columns

((94776, 13), Index(['id', 'screen_name', 'latitude', 'longitude', 'lang',
        'in_reply_to_status_id', 'text', 'timestamp', 'conversation_id',
        'conversation_deep', 'num_replies', 'num_users', 'url', 'dialog_id',
        'turns', 'crisis_related', 'outcome_prevention_ack',
        'outcome_situational_awareness', 'outcome_relief_coordination',
        'people_deaths', 'people_wounded', 'people_missing', 'people_evacuated',
        'people_other', 'infra_buildings', 'infra_roads', 'infra_houses',
        'infra_business', 'infra_other', 'request_info', 'request_goods',
        'request_services', 'request_other', 'offer_info', 'offer_goods',
        'offer_services', 'offer_other', 'informative', 'update', 'precaution',
        'emotional', 'expressive_positive', 'expressive_negative', 'complain',
        'suggest_action', 'promise', 'sarcasm', 'insult', 'yes_no_question',
        'wh_question', 'open_question', 'yes_answer', 'no_answer',
        'response_ack', 'response_ot

In [109]:
eq_convs = conversations[conversations.text.str.contains('terremoto') & conversations.in_reply_to_status_id.isnull()]
eq_convs.shape

(29530, 13)

In [112]:
eq_convs = eq_convs.drop_duplicates('text')
eq_convs.shape

(28963, 13)

In [207]:
path = os.path.join(data_dir,'es','tweets_annotated.csv' )
tweets_annotated = pd.read_csv(path, parse_dates=['timestamp'],
    dtype={'id':object, 'conversation_id':object,'in_reply_to_status_id':object})
tweets_annotated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2014 entries, 0 to 2013
Data columns (total 6 columns):
id                     2014 non-null object
screen_name            2014 non-null object
text                   2014 non-null object
url                    2014 non-null object
timestamp              2013 non-null datetime64[ns]
choose_one_category    2014 non-null object
dtypes: datetime64[ns](1), object(5)
memory usage: 94.5+ KB


In [208]:
tweets_annotated = tweets_annotated.drop_duplicates('id')
tweets_annotated.shape

(2014, 6)

In [209]:
tweets_annotated = tweets_annotated.drop_duplicates('text')
tweets_annotated.shape

(2007, 6)

In [137]:
eq_convs = eq_convs[~eq_convs.id.isin(tweets_annotated.id)]
eq_convs.shape

(27438, 13)

In [122]:
eq_convs_sample = eq_convs.sample(1000, random_state=1) 
eq_convs_sample.shape

(1000, 13)

In [123]:
path = os.path.join(data_dir,'es','tweets_sample.csv' )
eq_convs_sample.to_csv(path, index=False, quoting=csv.QUOTE_ALL)

In [210]:
tweets = dialogs.drop_duplicates('id').copy()
tweets.shape

(2193, 61)

In [211]:
tweets = convert_to_8figure_labels(tweets, dialogs)


na         1680
other       304
support     132
help         64
infra         6
caution       4
dead          3
Name: choose_one_category, dtype: int64


In [212]:
cols=['id', 'screen_name', 'text', 'url', 'timestamp', 'choose_one_category']
tweets = tweets[cols]
tweets.shape

(2193, 6)

In [213]:
tweets8 = pd.concat([tweets_annotated, tweets])
tweets8.shape

(4200, 6)

In [214]:
tweets8.choose_one_category.value_counts()

na           1862
other         767
dead          624
support       452
help          235
infra         157
caution        61
missing        30
evacuated      12
Name: choose_one_category, dtype: int64

In [215]:
tweets8.loc[tweets8.choose_one_category=='dead','choose_one_category'] = 'injured_or_dead_people'
tweets8.loc[tweets8.choose_one_category=='missing','choose_one_category'] = 'missing_trapped_or_found_people'
tweets8.loc[tweets8.choose_one_category=='evacuated','choose_one_category'] = 'displaced_people_and_evacuations'
tweets8.loc[tweets8.choose_one_category=='infra','choose_one_category'] = 'infrastructure_and_utilities_damage'
tweets8.loc[tweets8.choose_one_category=='help','choose_one_category'] = 'donation_needs_or_offers_or_volunteering_services'
tweets8.loc[tweets8.choose_one_category=='caution','choose_one_category'] = 'caution_and_advice'
tweets8.loc[tweets8.choose_one_category=='support','choose_one_category'] = 'sympathy_and_emotional_support'
tweets8.loc[tweets8.choose_one_category=='other','choose_one_category'] = 'other_useful_information'
tweets8.loc[tweets8.choose_one_category=='na','choose_one_category'] = 'not_related_or_irrelevant'

In [216]:
tweets8['crisis_related'] = 'no'

ix = (tweets8.choose_one_category!='not_related_or_irrelevant')

tweets8.loc[ix,'crisis_related'] = 'yes'

In [224]:
tweets8.crisis_related.value_counts()

yes    2338
no     1862
Name: crisis_related, dtype: int64

In [217]:
path = os.path.join(data_dir,'es','2016_ecuador_eq_es.csv' )
tweets8.to_csv(path, index=False)

## merge languages

In [220]:
path = os.path.join(data_dir,'2016_ecuador_eq_es.csv' )
tweets_es = pd.read_csv(path, parse_dates=['timestamp'],
    dtype={'id':object, 'conversation_id':object,'in_reply_to_status_id':object})
tweets_es.shape

(4200, 7)

In [221]:
path = os.path.join(data_dir,'2016_ecuador_eq_en.csv' )
tweets_en = pd.read_csv(path, parse_dates=['timestamp'],
    dtype={'id':object, 'conversation_id':object,'in_reply_to_status_id':object})
tweets_en.shape

(2695, 7)

In [222]:
tweets_all = pd.concat([tweets_es, tweets_en])
tweets_all.shape

(6895, 7)

In [223]:
path = os.path.join(data_dir,'2016_ecuador_eq.csv' )
tweets_all.to_csv(path, index=False)

## create annotators

In [333]:
path = os.path.join(data_dir,'2016_ecuador_eq_es.csv' )
tweets_es = pd.read_csv(path, parse_dates=['timestamp'],
    dtype={'id':object, 'conversation_id':object,'in_reply_to_status_id':object})
tweets_es.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4168 entries, 0 to 4167
Data columns (total 10 columns):
id                        4168 non-null object
screen_name               4168 non-null object
text                      4167 non-null object
url                       4168 non-null object
timestamp                 4167 non-null datetime64[ns]
choose_one_category       4168 non-null object
crisis_related            4168 non-null object
choose_one_category_a1    4168 non-null object
choose_one_category_a2    4168 non-null object
choose_one_category_a3    4168 non-null object
dtypes: datetime64[ns](1), object(9)
memory usage: 325.7+ KB


In [334]:
tweets_es.drop_duplicates('id', inplace=True)
tweets_es.shape

(4168, 10)

In [335]:
#tweets_es.to_csv(path, index=False)

In [342]:
tweets_es['choose_one_category_a1'] = tweets_es.choose_one_category
tweets_es['choose_one_category_a2'] = tweets_es.choose_one_category
tweets_es['choose_one_category_a3'] = tweets_es.choose_one_category

In [343]:
def set_noise(tweets, field, num, seed):
    noise_tweets = tweets.sample(num, random_state=seed).copy()
    noise_tweets.loc[noise_tweets.choose_one_category=='other_useful_information',field]='not_related_or_irrelevant'
    noise_tweets.loc[noise_tweets.choose_one_category=='not_related_or_irrelevant',field]='other_useful_information'
    noise_tweets.loc[noise_tweets.choose_one_category=='injured_or_dead_people',field]='missing_trapped_or_found_people'
    noise_tweets.loc[noise_tweets.choose_one_category=='sympathy_and_emotional_support',field]='caution_and_advice'
    noise_tweets.loc[noise_tweets.choose_one_category=='caution_and_advice',field]='other_useful_information'
    noise_tweets.loc[noise_tweets.choose_one_category=='displaced_people_and_evacuations',field]='other_useful_information'
    noise_tweets.loc[noise_tweets.choose_one_category=='donation_needs_or_offers_or_volunteering_services',field]='infrastructure_and_utilities_damage'
    noise_tweets.loc[noise_tweets.choose_one_category=='infrastructure_and_utilities_damage',field]='injured_or_dead_people'
    noise_tweets.loc[noise_tweets.choose_one_category=='missing_trapped_or_found_people',field]='displaced_people_and_evacuations'
    return noise_tweets


In [344]:
noise_tweets=set_noise(tweets_es, 'choose_one_category_a1', 800, 1)
tweets_bak = tweets_es[~tweets_es.id.isin(noise_tweets.id)]
tweets_bak.shape, noise_tweets.shape

((3368, 10), (800, 10))

In [345]:
noise_tweets2=set_noise(tweets_bak, 'choose_one_category_a2', 500, 2)
tweets_bak = tweets_bak[~tweets_bak.id.isin(noise_tweets2.id)]
tweets_bak.shape, noise_tweets2.shape

((2868, 10), (500, 10))

In [346]:
tweet_all = pd.concat([tweets_bak, noise_tweets,noise_tweets2], sort=False)
tweet_all.shape

(4168, 10)

In [347]:
tweet_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4168 entries, 0 to 3241
Data columns (total 10 columns):
id                        4168 non-null object
screen_name               4168 non-null object
text                      4167 non-null object
url                       4168 non-null object
timestamp                 4167 non-null datetime64[ns]
choose_one_category       4168 non-null object
crisis_related            4168 non-null object
choose_one_category_a1    4168 non-null object
choose_one_category_a2    4168 non-null object
choose_one_category_a3    4168 non-null object
dtypes: datetime64[ns](1), object(9)
memory usage: 358.2+ KB


In [348]:
tweet_all.to_csv(path, index=False)

In [372]:
path = os.path.join(data_dir,'2016_ecuador_eq_en.csv' )
tweets = pd.read_csv(path, parse_dates=['timestamp'],
    dtype={'id':object, 'conversation_id':object,'in_reply_to_status_id':object})
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4192 entries, 0 to 4191
Data columns (total 10 columns):
id                        4192 non-null object
screen_name               4192 non-null object
text                      4191 non-null object
url                       4192 non-null object
timestamp                 4192 non-null datetime64[ns]
choose_one_category       4192 non-null object
crisis_related            4192 non-null object
choose_one_category_a1    4192 non-null object
choose_one_category_a2    4192 non-null object
choose_one_category_a3    4192 non-null object
dtypes: datetime64[ns](1), object(9)
memory usage: 327.6+ KB


In [373]:
tweets.drop_duplicates('id', inplace=True)
tweets.shape
#tweets.id.nunique()

(4192, 10)

In [374]:
tweets['choose_one_category_a1'] = tweets.choose_one_category
tweets['choose_one_category_a2'] = tweets.choose_one_category
tweets['choose_one_category_a3'] = tweets.choose_one_category

In [375]:
noise_tweets=set_noise(tweets, 'choose_one_category_a1', 900, 1)
tweets_bak = tweets[~tweets.id.isin(noise_tweets.id)]
tweets_bak.shape, noise_tweets.shape

((3292, 10), (900, 10))

In [376]:
noise_tweets2=set_noise(tweets_bak, 'choose_one_category_a2', 600, 2)
tweets_bak = tweets_bak[~tweets_bak.id.isin(noise_tweets2.id)]
tweets_bak.shape, noise_tweets2.shape

((2692, 10), (600, 10))

In [377]:
tweet_all = pd.concat([tweets_bak, noise_tweets,noise_tweets2], sort=False)
tweet_all.shape

(4192, 10)

In [378]:
tweet_all.to_csv(path, index=False)

In [379]:
tweet_all.choose_one_category.unique()

array(['injured_or_dead_people', 'other_useful_information',
       'sympathy_and_emotional_support',
       'infrastructure_and_utilities_damage',
       'donation_needs_or_offers_or_volunteering_services',
       'caution_and_advice', 'displaced_people_and_evacuations',
       'missing_trapped_or_found_people', 'not_related_or_irrelevant'],
      dtype=object)