In [1]:
import os
import sys

import pandas as pd
import re

import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)

import nltk

# NLTK's English stop-word list, extended with a few extra short tokens.
stopwords = [*nltk.corpus.stopwords.words("english"), "u", "im", "us", "r", "rt"]

In [2]:
# Show frames without row/column truncation and with up to 160 characters per cell.
for _option, _value in (
    ("display.max_rows", None),
    ("display.max_columns", None),
    ("display.max_colwidth", 160),
):
    pd.set_option(_option, _value)

In [3]:
# The data directory is a sibling of the notebook's working directory: <cwd>/../data.
# It is resolved explicitly from the working directory; ``__name__`` is a module
# name ("__main__" inside a notebook), not a path, so it must not be used as a
# path component.
data_bn = "data"
data_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, data_bn))

In [4]:
# File names of the two splits and their full paths under ``data_dir``.
train_bn, test_bn = "train.csv", "test.csv"
train_fn, test_fn = (os.path.join(data_dir, bn) for bn in (train_bn, test_bn))

In [5]:
# Read both CSV splits into DataFrames.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_fn, test_fn))

In [6]:
# Stack the train rows on top of the test rows; each frame keeps its own index labels.
df_X = pd.concat([df_train, df_test], axis=0)

In [7]:
# Preview the first 30 rows of the raw training data.
df_train.head(30)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1
5,8,,,#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires,1
6,10,,,"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas",1
7,13,,,I'm on top of the hill and I can see a fire in the woods...,1
8,14,,,There's an emergency evacuation happening now in the building across the street,1
9,15,,,I'm afraid that the tornado is coming to our area...,1


In [8]:
# Column names shared by the preprocessing helpers: the tweet text itself plus
# the derived hashtag, @-mention and link-count columns.
text, hashtag, at, href = "text", "hashtag", "at", "href"

In [9]:
def to_lower(df):
    '''
    Case-fold every entry of the ``text`` column.

    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.
    '''
    def _fold(entry):
        return entry.casefold()

    folded = df[text].apply(_fold)
    df[text] = folded
    return df


def hash_handling(df):
    '''
    Extract hashtags into their own column and strip the ``#`` markers from the text.

    Every ``#word`` token in the ``text`` column is collected, in order of
    appearance and with its ``#`` kept, into one space-separated string stored
    in the ``hashtag`` column (empty string when a row has none).  In ``text``
    itself every ``#`` character is replaced by a space, so the tagged word
    stays part of the tweet.

    The frame is modified in place and returned.
    '''
    # Raw literal: a backslash-w inside a plain string literal is an invalid
    # escape sequence and triggers a warning at compile time.
    reg_hash_full = re.compile(r"#\w+")

    df[hashtag] = df[text].apply(lambda x: ' '.join(reg_hash_full.findall(x)))
    # Only the marker is blanked out here, not the whole tag.
    df[text] = df[text].apply(lambda x: x.replace('#', ' '))

    return df

def at_handling(df):
    '''
    Move @-mentions out of the text into their own column.

    Every ``@word`` token in the ``text`` column is collected, in order of
    appearance and with its ``@`` kept, into one space-separated string stored
    in the ``at`` column (empty string when a row has none).  In ``text`` the
    whole mention is replaced by a single space.  A lone ``@`` that is not
    followed by a word character is left untouched.

    The frame is modified in place and returned.
    '''
    # Raw literal: a backslash-w inside a plain string literal is an invalid
    # escape sequence and triggers a warning at compile time.
    reg_at_full = re.compile(r"@\w+")

    df[at] = df[text].apply(lambda x: ' '.join(reg_at_full.findall(x)))
    df[text] = df[text].apply(lambda x: reg_at_full.sub(' ', x))

    return df


def href_handling(df):
    '''
    Count links per row and remove them from the text.

    A "link" is any run that starts with the letters ``htt`` and extends to
    the next whitespace.  The match is not anchored to a word start, so a word
    that merely contains ``htt`` is cut from that point on as well.  The number
    of matches is stored in the ``href`` column and each match is replaced by a
    single space in ``text``.

    The frame is modified in place and returned.
    '''
    # Raw literal: a backslash-S inside a plain string literal is an invalid
    # escape sequence and triggers a warning at compile time.
    reg_href_full = re.compile(r"htt\S+")

    df[href] = df[text].apply(lambda x: len(reg_href_full.findall(x)))
    df[text] = df[text].apply(lambda x: reg_href_full.sub(' ', x))

    return df


def html_special_handling(df):
    '''
    Replace named HTML entities in the ``text`` column with a single space.

    An entity is an ampersand, one or more word characters, and a semicolon
    (for example the escaped forms of ``&``, ``<`` and ``>``).  Numeric
    entities that contain ``#`` do not match and are left as they are.

    The frame is modified in place and returned.
    '''
    # Raw literal: a backslash-w inside a plain string literal is an invalid
    # escape sequence and triggers a warning at compile time.
    reg_html = re.compile(r"&\w+;")
    df[text] = df[text].apply(lambda x: reg_html.sub(' ', x))

    return df
    
    
def x89_byte_handling(df):
    '''
    Remove tokens that start with the U+0089 control character.

    Each entry of the ``text`` column is encoded to UTF-8, and every run that
    begins with the two-byte encoding of U+0089 and continues up to the next
    whitespace is replaced by a single space.  A trailing U+0089 with nothing
    after it does not match.

    NOTE: after this step the ``text`` column holds ``bytes`` objects, not
    ``str``.  The steps that follow it in ``preprocess`` use bytes patterns and
    rely on that.

    The frame is modified in place and returned.
    '''
    # Raw bytes literal: the regex engine itself interprets the hex escapes,
    # and the backslash-S no longer forms an invalid Python escape sequence.
    reg_x89 = re.compile(rb"\xc2\x89\S+")
    df[text] = df[text].apply(lambda x: reg_x89.sub(b' ', x.encode('utf-8')))

    return df
    
    
def special_char_handling(df):
    '''
    Blank out everything that is not an ASCII letter, digit or whitespace.

    Expects the ``text`` column to hold ``bytes``.  Because the pattern is a
    bytes pattern, the word class is ASCII-only: punctuation, underscores and
    every byte of a multi-byte UTF-8 character are each replaced by one space.

    The frame is modified in place and returned.
    '''
    # One pass: any byte that is neither a word byte nor whitespace, or an
    # underscore (which the word class would otherwise let through).
    reg_special = re.compile(rb"[^\w\s]|_")
    df[text] = df[text].apply(lambda x: reg_special.sub(b' ', x))

    return df


def contraction_handling(df):
    '''
    Drop stand-alone contraction fragments from the ``text`` column.

    Expects ``bytes`` entries.  Removes the fragments ``s``, ``m``, ``t``,
    ``nt``, ``ve`` and ``w`` whenever one stands between two whitespace
    characters, together with the whitespace in front of it; the whitespace
    after it is left in place.  Fragments at the very start or end of an entry
    (no whitespace on one side) are not touched.

    The frame is modified in place and returned.
    '''
    # The trailing whitespace is only looked at, not consumed.  If it were part
    # of the match, the second of two back-to-back fragments ("it s t time")
    # would have no leading whitespace left to match on and would survive.
    reg_contract = re.compile(rb"\s(?:s|m|t|nt|ve|w)(?=\s)")
    df[text] = df[text].apply(lambda x: reg_contract.sub(b'', x))

    return df


def encode_numerals(df):
    '''
    Replace numbers in the ``text`` column with the marker ``num``.

    Expects ``bytes`` entries.  A match is a run of digits, any whitespace
    that follows it, and an optional second run of digits; the whole match is
    replaced by ``num`` plus one space.  Two digit groups separated only by
    whitespace (such as ``13 000``) therefore become a single marker.

    The frame is modified in place and returned.
    '''
    num = b"num "
    # Raw bytes literal: backslash-d / backslash-s inside a plain literal are
    # invalid escape sequences and trigger a warning at compile time.
    reg_numerals = re.compile(rb"\d+\s*\d*")

    df[text] = df[text].apply(lambda x: reg_numerals.sub(num, x))

    return df
    
def remove_stopwords(df):
    '''
    Decode the ``text`` column back to ``str`` and drop stop words.

    Expects ``bytes`` entries.  Each entry is decoded as UTF-8 and split on
    whitespace; tokens found in the module-level ``stopwords`` collection are
    discarded and the rest are re-joined with single spaces.  An entry with no
    surviving tokens becomes the empty string.

    The frame is modified in place and returned.
    '''
    # Build the lookup once per call: set membership is constant-time, whereas
    # scanning the stop-word list for every token is linear in its length.
    stop_set = frozenset(stopwords)

    def _strip(raw):
        return ' '.join(tok for tok in raw.decode('utf-8').split() if tok not in stop_set)

    df[text] = df[text].apply(_strip)

    return df


def preprocess(df):
    '''
    Run the complete text-cleaning pipeline over ``df``.

    The steps are applied strictly in the order listed below.  The order
    matters: ``x89_byte_handling`` turns the ``text`` column into bytes, the
    steps after it work on bytes, and ``remove_stopwords`` decodes the column
    back to ``str`` at the end.

    Every step writes into the frame it receives, so the frame passed in is
    modified and is also the object that is returned.
    '''
    pipeline = (
        to_lower,
        hash_handling,
        at_handling,
        href_handling,
        html_special_handling,
        x89_byte_handling,
        special_char_handling,
        contraction_handling,
        encode_numerals,
        remove_stopwords,
    )

    for step in pipeline:
        df = step(df)

    return df

In [10]:
# preprocess() writes into the frame it is given, so ``df_p`` and ``df_train``
# are the same, now cleaned, object.
df_p = df_train.pipe(preprocess)

TypeError: sequence item 1: expected a bytes-like object, str found

In [None]:
def tokenize_dataframe(df, max_len=20):
    '''
    Split the ``text`` column on whitespace into one column per token position.

    Token columns are named ``text_00``, ``text_01``, ...; rows with fewer
    tokens than the widest row are padded with missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``text`` column holds whitespace-separated strings.
    max_len : int or None, default 20
        Maximum number of tokens kept per row; surplus tokens are dropped.
        ``None`` keeps every token.

    Returns
    -------
    (pandas.DataFrame, list of str)
        A new frame (the input is not modified) with the token columns
        appended, and the names of those columns in positional order.
    '''
    tokens = pd.DataFrame(
        [entry.split()[:max_len] for entry in df[text]],
        # Reuse the caller's index so the column-wise concat below lines up
        # row for row even when ``df`` does not carry a default 0..n-1 index.
        index=df.index,
    )
    tokens.columns = [text + "_{:02d}".format(pos) for pos in range(tokens.shape[1])]

    # Token columns left behind by an earlier call are replaced rather than
    # duplicated, so running this on an already tokenized frame is safe.
    reg_text = re.compile(re.escape(text) + r"_\d{2,}")
    stale = [col for col in df.columns if isinstance(col, str) and reg_text.fullmatch(col)]

    df = pd.concat([df.drop(columns=stale), tokens], axis=1)

    return df, list(tokens.columns)

In [None]:
# Rebind df_train to the frame returned by tokenize_dataframe (text plus one
# column per token position) and remember the names of the token columns.
df_train, text_cols = tokenize_dataframe(df_train)

In [None]:
import numpy as np
import sklearn.preprocessing as preprocessing

class LabelEncoderExt(preprocessing.LabelEncoder):
    '''
    Label encoder that tolerates labels it has not seen during fitting.

    The placeholder label ``UNK`` is always added to the fitted classes, and
    ``transform`` maps every label that is not among the fitted classes to
    that placeholder before encoding.
    '''

    # Placeholder label that stands in for anything unseen at fit time.
    UNK = "UNK"

    def __init__(self):

        super().__init__()

    def fit(self, y):
        '''
        Fit the encoder on the 1-D label collection ``y`` plus ``UNK``.

        Raises ``AssertionError`` when ``y`` is not one-dimensional.
        Returns the fitted encoder itself so calls can be chained.
        '''
        y = np.asarray(y)
        assert y.ndim == 1, "Require 1D array"
        y = np.concatenate((y, np.array([self.UNK])))
        super().fit(y)
        return self

    def transform(self, y):
        '''
        Encode ``y``, replacing labels unknown to the encoder with ``UNK``.

        ``y`` may be any array-like; it is never modified.
        '''
        y = np.asarray(y)
        # No ``assume_unique`` here: labels normally repeat, and asking numpy
        # to assume uniqueness on such input can report unseen labels as known.
        known = np.isin(y, self.classes_)
        # np.where builds a new array (the caller's data stays untouched) and
        # widens fixed-width string dtypes so that ``UNK`` is not truncated.
        return super().transform(np.where(known, y, self.UNK))

    def fit_transform(self, y):
        '''
        Fit on ``y`` and return its encoding.
        '''
        return self.fit(y).transform(y)

In [None]:
# Encoder shared by all token columns; it maps labels unseen at fit time to "UNK".
enc = LabelEncoderExt()

In [None]:
# Rows shorter than the widest one have missing values in their trailing token
# columns; turn those into the empty string before encoding.
df_train[text_cols] = df_train[text_cols].fillna('')

In [None]:
# Fit the encoder on all tokens at once (flattened to 1-D), then fold the integer
# codes back into the original rows x token-columns layout.
df_train[text_cols] = (
    enc.fit_transform(df_train[text_cols].to_numpy().flatten())
    .reshape(df_train[text_cols].shape)
)

In [None]:
# Peek at the first ten fitted labels.
enc.classes_[:10]

In [None]:
# Bounded preview: display.max_rows is set to None above, so a bare ``df_train``
# would render every row of the frame into the notebook output.
df_train.head(10)