# Data cleanup

In [354]:
import pandas as pd
# Load the labelled training tweets and preview the first five rows.
TRAIN_CSV = 'data/train.csv'
train = pd.read_csv(TRAIN_CSV)
train.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [355]:
# Load the test tweets and preview the first five rows.
TEST_CSV = 'data/test.csv'
test = pd.read_csv(TEST_CSV)
test.head(n=5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


## Remove duplicates

In [356]:
# Number of rows currently in the training frame.
train.shape[0]

7613

In [357]:
# Drop rows that repeat the same keyword/location/text/target combination,
# keeping the first occurrence of each. The original index labels are kept.
DEDUP_COLUMNS = ['keyword', 'location', 'text', 'target']
train.drop_duplicates(subset=DEDUP_COLUMNS, keep='first', inplace=True)

In [358]:
# Number of rows currently in the training frame.
train.shape[0]

7561

## Remove hashtag symbols (#) from text

In [359]:
# Replace every '#' with a space (the hashtag word itself is kept) and trim
# leading/trailing whitespace; same treatment for train and test.
for frame in (train, test):
    without_hash = frame['text'].str.replace('#', " ")
    frame['cleaned_text'] = without_hash.str.strip()

In [360]:
# Total number of '#' characters across the raw training tweets.
hashes_per_raw_tweet = train['text'].str.count('#')
sum(hashes_per_raw_tweet)

3359

In [361]:
# Total number of '#' characters left in the cleaned training tweets.
hashes_per_cleaned_tweet = train['cleaned_text'].str.count('#')
sum(hashes_per_cleaned_tweet)

0

## Remove URLs, smileys and mentions (@) from text
Using Tweet Preprocessor: https://pypi.org/project/tweet-preprocessor/

In [362]:
import preprocessor

In [363]:
# Run tweet-preprocessor's clean() over every training tweet.
train['cleaned_text'] = train['cleaned_text'].apply(preprocessor.clean)

In [364]:
# Run tweet-preprocessor's clean() over every test tweet.
test['cleaned_text'] = test['cleaned_text'].apply(preprocessor.clean)

In [365]:
# Collapse runs of whitespace into a single space.
# Done with the pandas string accessor so this cell does not depend on `re`,
# which is only imported further down the notebook (on a fresh
# "Restart & Run All" the previous `re.sub` call raised NameError here).
# Applied to both frames so train and test receive identical preprocessing.
for frame in (train, test):
    frame['cleaned_text'] = frame['cleaned_text'].str.replace(r'\s+', ' ', regex=True)

In [366]:
# Spot-check one cleaned tweet by position.
train['cleaned_text'].iat[200]

'Twelve feared killed in Pakistani air ambulance helicopter crash worldnews'

# Keyword cleanup

In [367]:
# Distinct non-missing keywords, in order of first appearance.
train['keyword'].dropna().unique()

array(['ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'derailed

In [368]:
# Keywords contain the URL escape "%20" where a space is meant
# (e.g. 'airplane%20accident'); turn it back into a real space.
# Applied to both frames so train and test share one keyword vocabulary
# (previously only train was decoded). regex=False makes the literal match
# explicit regardless of the installed pandas default. Missing keywords stay NaN.
for frame in (train, test):
    frame['keyword'] = frame['keyword'].str.replace('%20', " ", regex=False)

In [369]:
# Distinct non-missing keywords, in order of first appearance.
train['keyword'].dropna().unique()

array(['ablaze', 'accident', 'aftershock', 'airplane accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown up', 'body bag', 'body bagging', 'body bags',
       'bomb', 'bombed', 'bombing', 'bridge collapse',
       'buildings burning', 'buildings on fire', 'burned', 'burning',
       'burning buildings', 'bush fires', 'casualties', 'casualty',
       'catastrophe', 'catastrophic', 'chemical emergency', 'cliff fall',
       'collapse', 'collapsed', 'collide', 'collided', 'collision',
       'crash', 'crashed', 'crush', 'crushed', 'curfew', 'cyclone',
       'damage', 'danger', 'dead', 'death', 'deaths', 'debris', 'deluge',
       'deluged', 'demolish', 'demolished', 'demolition', 'derail',
       'derailed', 'derailment', 'desolate',

# GloVe Twitter cleanup

In [371]:
import re
from preprocessor.defines import Patterns

# Regex fragments shared by the emoticon patterns in glove_cleanup.
# Raw strings keep the backslash in `\-` without relying on Python passing an
# invalid escape sequence through (which emits a SyntaxWarning on recent versions).
eyes = r"[8:=;]"      # one "eyes" character
nose = r"['`\-]?"     # optional "nose" character

def split_hashtag(m):
    """Turn a matched hashtag into a ``<hashtag>`` token followed by its words.

    ``m`` is a regex match whose text starts with ``#``. If the part after the
    ``#`` has no lower-case letters it is emitted unchanged; otherwise it is
    split in front of every upper-case ASCII letter and the pieces are joined
    with single spaces (``#testTest`` -> ``<hashtag> test Test``).
    """
    body = m.group()[1:]
    if body == body.upper():
        return f"<hashtag> {body}"
    # Zero-width split: each piece starts at an upper-case letter. A leading
    # empty piece (body starting with a capital) is removed by strip().
    pieces = re.split("(?=[A-Z])", body)
    return "<hashtag> " + " ".join(pieces).strip()
            
def glove_cleanup(tweet):
    """Rewrite a raw tweet using placeholder tokens and return it lower-cased.

    Substitutions are applied in this order (the order matters):
    URLs -> ``<url>``, ``/`` padded with spaces, mentions -> ``<user>``,
    emoticons -> ``<smile>`` / ``<lolface>`` / ``<sadface>`` / ``<neutralface>``,
    ``<3`` -> ``<heart>``, numbers -> ``<number>``, hashtags via
    ``split_hashtag``, repeated ``!?.`` -> ``<repeat>``, elongated words ->
    ``<elong>``, all-caps words -> ``<allcaps>``, then whitespace runs are
    collapsed to one space.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The tokenised, lower-cased tweet.
    """
    tweet = re.sub(Patterns.URL_PATTERN, '<url>', tweet)
    # Pad slashes only after URLs have been replaced, so "a/b" becomes two words
    # without breaking URLs apart.
    tweet = re.sub("/", " / ", tweet)
    tweet = re.sub(Patterns.MENTION_PATTERN, "<user>", tweet)
    # The mouth classes list only lower-case 'd' / 'p', and lower-casing happens
    # at the very end, so match case-insensitively to also catch ":D" and ":P".
    tweet = re.sub(rf"{eyes}{nose}[)d]+|[(d]+{nose}{eyes}", "<smile>", tweet, flags=re.IGNORECASE)
    tweet = re.sub(rf"{eyes}{nose}p+", "<lolface>", tweet, flags=re.IGNORECASE)
    # Raw (f-)strings below: same patterns as before, without invalid escape
    # sequences such as "\(" or "\d" in ordinary string literals.
    tweet = re.sub(rf"{eyes}{nose}\(+|\)+{nose}{eyes}", "<sadface>", tweet)
    tweet = re.sub(rf"{eyes}{nose}[\/|l*]", "<neutralface>", tweet)
    tweet = re.sub("<3", "<heart>", tweet)
    tweet = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>", tweet)
    tweet = re.sub(r"#\S+", split_hashtag, tweet)
    tweet = re.sub(r'([!?.]){2,}', r'\1 <repeat>', tweet)
    tweet = re.sub(r'\b(\S*?)(\S)\2{2,}\b', r'\1\2 <elong>', tweet)
    # Must run before lower(): it relies on the original casing.
    tweet = re.sub(r'(\b[A-Z][A-Z]+\b)', r'\1 <allcaps>', tweet)
    tweet = re.sub(r'\s+', ' ', tweet)  # remove whitespace repetition
    return tweet.lower()

In [372]:
# Hashtag handling: lower-case, camelCase, PascalCase and all-caps tags.
hashtag_demo = "blah blah #test #testTest #TestTest #BLAH"
glove_cleanup(hashtag_demo)

'blah blah <hashtag> test <hashtag> test test <hashtag> test test <hashtag> blah <allcaps>'

In [373]:
# Emoticons, heart, slash padding, mention, numbers, repeats, elongation, all-caps.
token_demo = 'Start :( ): <3 blah/blag @bleg :-)) (-: :p 1234 2.7 blah 8-l test... elonggg BA AAb'
glove_cleanup(token_demo)

'start <sadface> <sadface> <heart> blah / blag <user> <smile> <smile> <lolface> <number> <number> blah <neutralface> test. <repeat> elong <elong> ba <allcaps> aab'

In [374]:
# Build the GloVe-style column from the raw text (not from cleaned_text),
# identically for train and test.
for frame in (train, test):
    frame['glove_cleaned_text'] = frame['text'].apply(glove_cleanup)

In [375]:
# Compare raw and GloVe-cleaned text side by side for the first rows.
preview_columns = ['text', 'glove_cleaned_text']
train[preview_columns].head()

Unnamed: 0,text,glove_cleaned_text
0,Our Deeds are the Reason of this #earthquake M...,our deeds are the reason of this <hashtag> ear...
1,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask. canada
2,All residents asked to 'shelter in place' are ...,all residents asked to 'shelter in place' are ...
3,"13,000 people receive #wildfires evacuation or...",<number> people receive <hashtag> wildfires ev...
4,Just got sent this photo from Ruby #Alaska as ...,just got sent this photo from ruby <hashtag> a...


# Saving data

In [376]:
# Write the cleaned test frame to disk; the DataFrame index is not written.
TEST_OUTPUT_CSV = 'test_cleaned.csv'
test.to_csv(TEST_OUTPUT_CSV, index=False)

In [377]:
# Write the cleaned training frame to disk; the DataFrame index is not written.
TRAIN_OUTPUT_CSV = 'train_cleaned.csv'
train.to_csv(TRAIN_OUTPUT_CSV, index=False)

In [378]:
# Inspect one full row by position (all columns).
train.iloc[99, :]

id                                                                  144
keyword                                                        accident
location                                                             UK
text                  .@NorwayMFA #Bahrain police had previously die...
target                                                                1
cleaned_text          . Bahrain police had previously died in a road...
glove_cleaned_text    .<user> <hashtag> bahrain police had previousl...
Name: 100, dtype: object