# Data cleanup

In [1]:
import pandas as pd
# Read the labelled training tweets from disk and preview the first five rows.
train = pd.read_csv(filepath_or_buffer='data/train.csv')
train.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [2]:
# Read the test tweets from disk and preview the first five rows.
test = pd.read_csv(filepath_or_buffer='data/test.csv')
test.head(n=5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


## Remove duplicates

In [3]:
# Row count of the training frame at this point in the notebook.
len(train.index)

7613

In [4]:
# Two rows count as duplicates only when keyword, location, text AND target all
# match; `id` is not part of the comparison. The frame is modified in place and
# the surviving rows keep their original index labels.
dedup_columns = ['keyword', 'location', 'text', 'target']
train.drop_duplicates(subset=dedup_columns, inplace=True)

In [5]:
# Row count of the training frame at this point in the notebook.
len(train.index)

7561

## Remove hashtags from text

In [6]:
# Build `cleaned_text` from `text` for both splits: every literal '#' becomes a
# space (so the hashtag word itself is kept) and surrounding whitespace is
# trimmed.
# regex=False pins the literal-match behaviour: the default value of `regex` in
# Series.str.replace changed between pandas 1.x and 2.0.
for frame in (train, test):
    frame['cleaned_text'] = (
        frame['text']
        .str.replace('#', " ", regex=False)
        .str.strip()
    )

In [7]:
# Total number of '#' characters across the raw training tweets.
hashtags_per_tweet = train['text'].str.count('#')
sum(hashtags_per_tweet)

3359

In [8]:
# Total number of '#' characters left in the cleaned training tweets.
hashtags_per_cleaned_tweet = train['cleaned_text'].str.count('#')
sum(hashtags_per_cleaned_tweet)

0

## Remove URLs, smileys and mentions (@) from text
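Using [Tweet Preprocessor](https://pypi.org/project/tweet-preprocessor/). Note that with its default options `preprocessor.clean` also strips emojis, reserved words (RT, FAV), any remaining `#hashtags` and, depending on the library version, numbers; hashtag words survive here only because their `#` was already replaced with a space above.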

In [9]:
import preprocessor

In [10]:
# Run tweet-preprocessor's clean() over every training tweet, overwriting the
# `cleaned_text` column with the result.
train['cleaned_text'] = train['cleaned_text'].apply(preprocessor.clean)

In [11]:
# Run tweet-preprocessor's clean() over every test tweet, overwriting the
# `cleaned_text` column with the result.
test['cleaned_text'] = test['cleaned_text'].apply(preprocessor.clean)

In [12]:
# Spot-check: cleaned text of the row at position 200 (positional, not by label).
train.iloc[200]['cleaned_text']

'Twelve feared killed in Pakistani air ambulance helicopter crash worldnews'

# Keyword cleanup

In [13]:
# Distinct keyword values among the training rows that have a keyword.
has_keyword = ~train['keyword'].isnull()
train.loc[has_keyword, 'keyword'].unique()

array(['ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'derailed

In [14]:
# Keywords contain URL-encoded spaces (e.g. 'airplane%20accident'). Decode them
# in BOTH splits so the train and test files saved below share one keyword
# vocabulary; previously only `train` was decoded.
# regex=False makes the literal match explicit regardless of pandas version.
# Missing keywords (NaN) are left as NaN.
for frame in (train, test):
    frame['keyword'] = frame['keyword'].str.replace('%20', " ", regex=False)

In [15]:
# Distinct keyword values among the training rows that have a keyword.
has_keyword = ~train['keyword'].isnull()
train.loc[has_keyword, 'keyword'].unique()

array(['ablaze', 'accident', 'aftershock', 'airplane accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown up', 'body bag', 'body bagging', 'body bags',
       'bomb', 'bombed', 'bombing', 'bridge collapse',
       'buildings burning', 'buildings on fire', 'burned', 'burning',
       'burning buildings', 'bush fires', 'casualties', 'casualty',
       'catastrophe', 'catastrophic', 'chemical emergency', 'cliff fall',
       'collapse', 'collapsed', 'collide', 'collided', 'collision',
       'crash', 'crashed', 'crush', 'crushed', 'curfew', 'cyclone',
       'damage', 'danger', 'dead', 'death', 'deaths', 'debris', 'deluge',
       'deluged', 'demolish', 'demolished', 'demolition', 'derail',
       'derailed', 'derailment', 'desolate',

# Saving data

In [16]:
# Write the test frame to the working directory, without the index column.
test.to_csv(path_or_buf='test_cleaned.csv', index=False)

In [17]:
# Write the training frame to the working directory, without the index column.
train.to_csv(path_or_buf='train_cleaned.csv', index=False)

In [18]:
# Show the row at position 99. Its index label can differ from its position
# because dropping duplicates earlier did not renumber the index.
train.iloc[99, :]

id                                                            144
keyword                                                  accident
location                                                       UK
text            .@NorwayMFA #Bahrain police had previously die...
target                                                          1
cleaned_text    . Bahrain police had previously died in a road...
Name: 100, dtype: object