In [None]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time
import joblib
from nltk.tokenize import TweetTokenizer
from keras.preprocessing.sequence import pad_sequences
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from nltk.stem.snowball import SnowballStemmer
tqdm.pandas()

In [None]:
def removeURL(data):
    """Delete URLs from every string in ``data``.

    Any run of non-whitespace characters that starts with ``http`` is
    removed; the whitespace surrounding it is left as it was.

    Args:
        data: Object with an ``apply`` method (e.g. a pandas Series) whose
            elements are strings.

    Returns:
        The result of ``data.apply`` with the URLs stripped from each element.
    """
    print('remove URL')
    url_pattern = re.compile(r'http\S+')

    def strip_urls(text):
        return url_pattern.sub('', text)

    return data.apply(strip_urls)

def removePUNC(data):
    """Remove punctuation from every element of ``data``.

    Elements may be token lists or plain strings:

    * token sequence -> a new list without the tokens that are exactly one
      punctuation character (every occurrence is dropped, and the input list
      is not modified);
    * string -> the same string with every punctuation character deleted.

    The apostrophe is not part of the punctuation set, so contractions such
    as ``don't`` are kept intact.

    Args:
        data: pandas Series of token lists or strings.

    Returns:
        A Series with the cleaned elements.
    """
    print('remove punctuation')
    punctuation = frozenset('!"#$%&()*+-/:;<=>?@[\\]^_`{|}~,.')
    strip_table = {ord(ch): None for ch in punctuation}

    def rm(x):
        if isinstance(x, str):
            return x.translate(strip_table)
        # Build a new list: list.remove() would only drop the first match
        # and would mutate the caller's tokens in place.
        return [token for token in x if token not in punctuation]

    # progress_apply only exists after tqdm.pandas() has been called; fall
    # back to the plain apply so the function also works without it.
    apply = getattr(data, 'progress_apply', data.apply)
    return apply(rm)
def removeHashtag(data):
    """Keep only the text that precedes the first ``#`` in each string.

    Everything from the first ``#`` onwards is discarded, including any
    non-hashtag text that follows it. Strings without ``#`` are returned
    unchanged.

    Args:
        data: Object with an ``apply`` method (e.g. a pandas Series) whose
            elements are strings.

    Returns:
        The result of ``data.apply`` with each string cut at its first ``#``.
    """
    print('remove hashtag')

    def before_first_hash(text):
        head, _, _ = text.partition('#')
        return head

    return data.apply(before_first_hash)

def removeWP(data):
    """Normalize whitespace in every string of ``data``.

    Leading and trailing whitespace is dropped and each internal run of
    whitespace is collapsed into a single space.

    Args:
        data: Object with an ``apply`` method (e.g. a pandas Series) whose
            elements are strings.

    Returns:
        The result of ``data.apply`` with whitespace normalized.
    """
    print('remove white space')

    def squeeze(text):
        words = text.split()
        return ' '.join(words)

    return data.apply(squeeze)
def removeLH(data):
    """Delete every literal ``<LH>`` marker from each string in ``data``.

    The marker is removed without inserting anything in its place, so the
    surrounding whitespace is left as it was.

    Args:
        data: Object with an ``apply`` method (e.g. a pandas Series) whose
            elements are strings.

    Returns:
        The result of ``data.apply`` with the markers removed.
    """
    print('remove <LH')
    marker = '<LH>'

    def drop_marker(text):
        return text.replace(marker, '')

    return data.apply(drop_marker)
    
def lemma(data):
    """Lemmatize every sentence in ``data`` (reduce words to their base form).

    Each sentence is run through the spaCy ``en_core_web_sm`` pipeline and the
    lemmas of its tokens are joined back together with single spaces.

    Args:
        data: pandas Series of sentence strings. ``progress_apply`` is used,
            so ``tqdm.pandas()`` must have been called beforehand.

    Returns:
        A Series of lemmatized sentence strings.
    """
    print('lemmatize')
    # The model is loaded on every call to this function.
    nlp = spacy.load('en_core_web_sm')

    def lemmatize_sentence(sent):
        lemmas = [token.lemma_ for token in nlp(sent)]
        return ' '.join(lemmas)

    return data.progress_apply(lemmatize_sentence)
    
           
def stemm(data):
    """Stem every word of every sentence in ``data``.

    Stemming unifies word forms regardless of part of speech (noun, verb,
    ...). Author's note: the output of ``lemma`` is cleaner than the stemmed
    output (e.g. the trailing "s" is removed).

    Each sentence is stemmed word by word with the English Snowball stemmer
    and the stems are joined back with single spaces. Passing the whole
    sentence to ``stem()`` would treat it as one word and return a string,
    and joining that string would put a space between every character.

    Args:
        data: Iterable of sentences. A sentence is either a string (split on
            whitespace) or an already tokenized sequence of words.

    Returns:
        list[str]: One stemmed sentence per input element, in input order.
    """
    stemmer = SnowballStemmer(language='english')
    output = []
    for sent in data:
        words = sent.split() if isinstance(sent, str) else sent
        output.append(' '.join(stemmer.stem(word) for word in words))
    return output

def tokenizer(data):
    """Tokenize every element of ``data`` with NLTK's ``TweetTokenizer``.

    Args:
        data: Object with an ``apply`` method (e.g. a pandas Series) whose
            elements are strings.

    Returns:
        The result of ``data.apply`` holding the ``TweetTokenizer.tokenize``
        output for each element.
    """
    tweet_tokenizer = TweetTokenizer()
    tokenized = data.apply(tweet_tokenizer.tokenize)
    return tokenized

def preprocess(data):
    """Run the ekphrasis ``TextPreProcessor`` over every element of ``data``.

    The processor is configured for Twitter text: it normalizes the listed
    term classes, annotates the listed features, fixes HTML tokens, unpacks
    hashtags and contractions, lowercases while tokenizing and replaces
    emoticons using the ``emoticons`` dictionary.

    Args:
        data: pandas Series of raw text strings. ``progress_apply`` is used,
            so ``tqdm.pandas()`` must have been called beforehand.

    Returns:
        A Series holding the ``pre_process_doc`` output for each element
        (presumably a list of tokens, since a tokenizer is configured --
        confirm against the ekphrasis documentation).
    """
    print('preprocessing')

    # Terms that will be normalized.
    # NOTE(review): 'url' is listed twice; presumably redundant -- confirm.
    normalize_terms = ['url', 'email', 'percent', 'money',
                       'phone', 'user', 'time', 'url',
                       'date', 'number']

    # Terms that will be annotated.
    annotate_terms = {"hashtag", "allcaps",
                      "elongated", "repeated",
                      'emphasis', 'censored'}

    # The tokenizer must take a string and return a list of tokens; a custom
    # callable could be passed instead of SocialTokenizer.
    social_tokenize = SocialTokenizer(lowercase=True).tokenize

    text_processor = TextPreProcessor(
        normalize=normalize_terms,
        annotate=annotate_terms,
        fix_html=True,  # fix HTML tokens
        segmenter="twitter",  # corpus whose word statistics drive word segmentation
        corrector="twitter",  # corpus whose word statistics drive spell correction
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # unpack contractions (can't -> can not)
        spell_correct_elong=False,  # no spell correction for elongated words
        tokenizer=social_tokenize,
        # Dictionaries used to replace tokens extracted from the text with
        # other expressions; more than one may be passed.
        dicts=[emoticons],
    )
    return data.progress_apply(text_processor.pre_process_doc)

def FirstClean(data):
    """Apply a first round of textual normalization to every string in ``data``.

    Per string, in order:

    1. ``wtf`` (with any number of trailing ``f``) is expanded.
    2. Typographic quotes are replaced by ASCII quotes, newlines/tabs/ellipses
       by a space, common contractions and chat abbreviations are expanded,
       and the ``<LH>`` marker is replaced by a space.
    3. ``fuckin`` is completed to ``fucking`` unless it is already followed by
       ``g`` (a plain substring replace would produce ``fuckingg``).
    4. Leading and trailing whitespace is stripped.

    Many patterns are delimited by spaces (e.g. ``' u '``, ``"n't "``). The
    text is therefore padded with one space on each side while replacing, so
    that these patterns also match at the very start or end of the text.

    Args:
        data: Object with an ``apply`` method (e.g. a pandas Series) whose
            elements are strings.

    Returns:
        The result of ``data.apply`` with the cleaned strings.
    """
    print('cleaning...')
    replace_pair = [('”', '"'),
                    ('“', '"'),
                    ('’', '\''),
                    ('´', '\''),
                    ('\n', ' '),
                    ('\t', ' '),
                    ('...', ' '),
                    ('--', ' -- '),

                    ('\'m ', ' am '),
                    ('n\'t ', ' not '),
                    ('\'ve ', ' have '),
                    ('\'re ', ' are '),
                    ('\'ll ', ' will '),
                    ('\'d ', ' would '),

                    (' canot ', ' can not '),
                    (' cannot ', ' can not '),
                    (' ca not ', ' can not '),
                    (' dint ', ' did not '),
                    (' ur ', ' you are '),
                    (' tbh ', ' to be honest '),
                    (' & ', ' and '),
                    (' u ', ' you '),
                    (' r ', ' are '),

                    ('<LH>', ' ')]
    wtf_pattern = re.compile('wtf+')
    # Negative lookahead keeps an already correct "fucking" untouched.
    fuckin_pattern = re.compile(r'fuckin(?!g)')

    def clean(text):
        text = wtf_pattern.sub(' what the fuck ', text).strip()
        # Pad so that space-delimited patterns match at the text boundaries.
        text = ' ' + text + ' '
        for old, new in replace_pair:
            text = text.replace(old, new)
        text = fuckin_pattern.sub('fucking', text)
        return text.strip()

    # One pass over the data instead of one pass per replacement pair.
    return data.apply(clean)
