In [1]:
import os
import sys

import pandas as pd
import re

import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)

In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
data_bn = "data"
data_dir = os.path.abspath(
                        os.path.join(__name__, 
                        os.pardir, os.pardir, data_bn)
                        )

In [4]:
train_bn = "train.csv"
test_bn = "test.csv"
train_fn = os.path.join(data_dir, train_bn)
test_fn = os.path.join(data_dir, test_bn)

In [5]:
df_train = pd.read_csv(train_fn)

In [6]:
df_train.head()
#iplot(fig)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
df_train['text'][2]

"All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"

In [8]:
text = "text"
hashtag = "hashtag"
at = "at"
href = "href"

In [9]:
def to_lower(df):
    
    df[text] = df[text].apply(lambda x: x.casefold())
    return df


def hash_handling(df):
    '''
    '''
    reg_hash_full = re.compile("(#)\w+")
    reg_hash = re.compile("(#)")
    
    f = lambda x: [y.group() for y in reg_hash_full.finditer(x)]
    g = lambda x: ' '.join(x)
    
    df[hashtag] = df[text].apply(f).apply(g)
    df[text] = df[text].apply(lambda x: reg_hash.sub('', x))
    
    return df

def at_handling(df):
    '''
    '''
    reg_at = re.compile("(@)")
    reg_at_full = re.compile("(@)\w+")
    
    f = lambda x: [y.group() for y in reg_at_full.finditer(x)]
    g = lambda x: ' '.join(x)
    
    df[at] = df[text].apply(f).apply(g)
    df[text] = df[text].apply(lambda x: reg_at_full.sub(' ', x))
    
    return df


def href_handling(df):
    '''
    '''
    reg_href_full = re.compile("(htt)\S+")
    
    f = lambda x: len(list(reg_href_full.finditer(x)))
    
    df[href] = df[text].apply(f)
    df[text] = df[text].apply(lambda x: reg_href_full.sub(' ', x))
    
    return df


def html_special_handling(df):
    '''
    '''
    reg_html = re.compile("(&)\w+(;)")
    df[text] = df[text].apply(lambda x: reg_html.sub(' ', x))
    
    return df
    
    
def x89_byte_handling(df):
    '''
    '''
    reg_x89 = re.compile(b"\xc2\x89\S+")
    df[text] = df[text].apply(lambda x: reg_x89.sub(b' ', x.encode('utf-8')))
    
    return df
    
    
def special_char_handling(df):
    '''
    '''
    reg_special = re.compile(b"[^\w\s]")
    df[text] = df[text].apply(lambda x: reg_special.sub(b' ', x))
    
    return df


def contraction_handling(df):
    '''
    '''
    reg_contract = re.compile(b"\s[sm(nt)]\s")
    df[text] = df[text].apply(lambda x: reg_contract.sub(b' ', x))
    
    return df
    

def preprocess(df):
    
    df = to_lower(df)
    df = hash_handling(df)
    df = at_handling(df)
    df = href_handling(df)
    df = html_special_handling(df)
    df = x89_byte_handling(df)
    df = special_char_handling(df)
    df = contraction_handling(df)
    
    return df

In [10]:
df_p = preprocess(df_train)

In [11]:
df_p

Unnamed: 0,id,keyword,location,text,target,hashtag,at,href
0,1,,,b'our deeds are the reason of this earthquake ...,1,#earthquake,,0
1,4,,,b'forest fire near la ronge sask canada',1,,,0
2,5,,,b'all residents asked to shelter in place ar...,1,,,0
3,6,,,b'13 000 people receive wildfires evacuation o...,1,#wildfires,,0
4,7,,,b'just got sent this photo from ruby alaska as...,1,#alaska #wildfires,,0
5,8,,,b'rockyfire update california hwy 20 close...,1,#rockyfire #cafire #wildfires,,0
6,10,,,b'flood disaster heavy rain causes flash flood...,1,#flood #disaster,,0
7,13,,,b'i on top of the hill and i can see a fire in...,1,,,0
8,14,,,b'there an emergency evacuation happening now ...,1,,,0
9,15,,,b'i afraid that the tornado is coming to our a...,1,,,0
