In [1]:
## import statements
import pandas as pd
import os
import re
import string
import traceback
import dask.dataframe as dd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
def get_conf():
    """Return the notebook configuration as a nested dict.

    Returns:
        dict: ``{"path": <base directory>, "data": {<name>: <relative file path>}}``.
        The relative paths are meant to be appended to ``conf["path"]``.

    NOTE(review): ``path`` is an absolute directory on the author's machine;
    anyone else running the notebook has to change it.
    """
    dataset_dir = "DataSets/NLPwithDisasterTweets/"
    # File names inside the dataset directory, keyed by their config name.
    file_names = {
        "train_data_path": "train.csv",
        "modf_train_data_path": "modf_train_data.csv",
        "test_data_path": "test.csv",
        "modf_test_data_path": "modf_test_data.csv",
        "custom_stop_words": "custom_stop_word.txt",
    }

    return {
        "path": "/Users/jaydeepchakraborty/JC/git-projects/model_util/",
        "data": {key: dataset_dir + name for key, name in file_names.items()},
    }

In [3]:
def read_data(conf, ind="train"):
    """Load the raw train or test CSV and cast its columns to fixed dtypes.

    Args:
        conf: Configuration dict; ``conf["path"]`` is prepended to
            ``conf["data"]["train_data_path"]`` / ``["test_data_path"]``.
        ind: ``"train"`` or ``"test"``. The train split additionally casts
            the ``target`` column to ``int64``.

    Returns:
        pandas.DataFrame with ``id`` as ``int64`` and ``keyword``,
        ``location``, ``text`` as pandas ``string`` dtype.

    Raises:
        ValueError: If ``ind`` is neither ``"train"`` nor ``"test"``
            (previously this surfaced as an UnboundLocalError on ``df``).
    """
    # Columns shared by both splits.
    dtypes = {"id": 'int64', "keyword": 'string', "location": 'string', "text": 'string'}

    if ind == "train":
        path_key = "train_data_path"
        dtypes["target"] = 'int64'  # only the train split carries the label
    elif ind == "test":
        path_key = "test_data_path"
    else:
        raise ValueError(f"ind must be 'train' or 'test', got {ind!r}")

    df = pd.read_csv(conf["path"] + conf["data"][path_key])
    return df.astype(dtypes)

In [4]:
# text cleaning utils

## Expanding contractions: it's -> it is
def decontracted(phrase):
    """Expand English contractions in ``phrase`` (e.g. "it's" -> "it is").

    The rules are applied one after another with ``re.sub``. Matching is
    case-sensitive and only covers the ASCII apostrophe. Every ``'s`` is
    rewritten to " is", so possessives are expanded as well.

    Args:
        phrase: Text to expand.

    Returns:
        str: The text with all matching contractions expanded.
    """
    # Order matters: the two specific forms must run before the generic
    # "n't" rule, and "n't" must run before the bare "'t" rule.
    expansions = (
        # Specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # General
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )

    for pattern, replacement in expansions:
        phrase = re.sub(pattern, replacement, phrase)

    return phrase

## Removing Punctuations: , . ! ? : ;..
def remove_punctuations(text):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    Characters are removed outright (no space is left in their place), and
    only the ASCII punctuation set is affected.
    """
    # Mapping each punctuation character to None deletes it in one pass.
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

## Removing Numbers
def clean_number(text):
    """Strip all digits from ``text`` after separating them from letters.

    Steps:
      1. Insert a space between a digit run and a directly following letter
         ("123abc" -> "123 abc") so the word survives as its own token.
      2. Re-attach ordinal suffixes split by step 1 ("21 st " -> "21st ").
      3. Drop the comma inside numbers such as "1,000".
      4. Remove every remaining digit.

    Args:
        text: Text to clean.

    Returns:
        str: The text without any ``0-9`` characters. Whitespace left behind
        by removed numbers is not normalized here.
    """
    # Replacement templates are raw strings: "\g" is not a valid escape in a
    # normal string literal and triggers a warning on recent Python versions.
    text = re.sub(r'(\d+)([a-zA-Z])', r'\g<1> \g<2>', text)
    text = re.sub(r'(\d+) (th|st|nd|rd) ', r'\g<1>\g<2> ', text)
    text = re.sub(r'(\d+),(\d+)', r'\g<1>\g<2>', text)
    text = re.sub(r'[0-9]', '', text)
    return text

## Removing Whitespaces
def clean_whitespace(text):
    """Trim ``text`` and collapse each run of whitespace into one space."""
    trimmed = text.strip()
    return re.sub(r"\s+", " ", trimmed)

## Collapsing a doubled character inside a word: coool -> cool
def clean_repeat_words(text):
    """Drop one character from a doubled character inside each word.

    Within every run of word characters, the last position where a character
    is immediately followed by itself loses one copy ("coool" -> "cool").
    At most one character is removed per word, and ordinary double letters
    are affected too ("hello" -> "helo").

    NOTE(review): despite the function name, repeated *words* are not touched.
    """
    doubled_char = re.compile(r"(\w*)(\w)\2(\w*)")

    def _drop_one_copy(match):
        # Rebuild the word from prefix + single character + suffix.
        prefix, char, suffix = match.group(1), match.group(2), match.group(3)
        return prefix + char + suffix

    return doubled_char.sub(_drop_one_copy, text)

## Removing custom words
def clean_cust_word(phrase):
    """Remove the custom symbols ``#`` and ``√`` from ``phrase``."""
    for symbol in ("#", "√"):
        phrase = phrase.replace(symbol, "")
    return phrase

def clean_data_util(text):
    """Run the character-level cleaning pipeline on one string.

    Steps, in order: lowercase, expand contractions, remove ASCII
    punctuation, remove digits, collapse doubled characters, remove custom
    symbols, and finally normalize whitespace.

    Args:
        text: Raw text (must be a ``str``).

    Returns:
        str: Cleaned text with no leading/trailing whitespace and single
        spaces between words.
    """
    # Lowercase first: the contraction patterns are lowercase-only.
    text = text.lower()
    text = decontracted(text)
    text = remove_punctuations(text)
    text = clean_number(text)
    text = clean_repeat_words(text)
    text = clean_cust_word(text)
    # Whitespace is normalized last: dropping a custom symbol that sits
    # between spaces (e.g. "a √ b") would otherwise leave a double or
    # leading space in the returned text.
    text = clean_whitespace(text)

    return text

def filter_data_util(word_tokens):
    """Return the tokens longer than two characters, in their original order."""
    kept_tokens = []
    for token in word_tokens:
        if len(token) <= 2:
            continue  # drop tokens of length 0, 1 or 2
        kept_tokens.append(token)
    return kept_tokens

In [5]:
def _clean_cell_text(value, lemmatizer, final_stop_words):
    """Clean, tokenize, lemmatize and stop-word-filter one cell value."""
    # Missing / non-str cells are treated as empty text.
    raw_text = value if isinstance(value, str) else ""
    tokens = word_tokenize(clean_data_util(raw_text))
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    # Stop words are checked against the lemmatized form of each token.
    kept = [lemma for lemma in lemmas if lemma not in final_stop_words]
    return ' '.join(filter_data_util(kept))


def modf_func(row, final_stop_words, conf):
    """Clean the ``keyword``, ``location`` and ``text`` fields of one row.

    Each field goes through ``clean_data_util``, tokenization, WordNet
    lemmatization, stop-word removal and the short-token filter, and is
    written back as a single space-joined string.

    Args:
        row: Row-like object supporting ``row[column]`` get/set for the three
            columns above. It is modified in place.
        final_stop_words: Collection of words to drop (membership test only).
        conf: Configuration dict; accepted for call compatibility but not
            read by this function.

    Returns:
        The same ``row`` object with the three fields replaced.
    """
    # One lemmatizer instance is shared by all three columns of the row.
    lemmatizer = WordNetLemmatizer()

    for column in ('keyword', 'location', 'text'):
        row[column] = _clean_cell_text(row[column], lemmatizer, final_stop_words)

    return row

In [6]:
# stop words
def load_custom_stop_words(conf=None):
    """Build the stop-word set: NLTK English stop words plus custom ones.

    Args:
        conf: Optional configuration dict providing ``conf['path']`` and
            ``conf['data']['custom_stop_words']``. When omitted, the result
            of ``get_conf()`` is used, so the function no longer depends on
            a module-level ``conf`` variable having been created first.

    Returns:
        set[str]: NLTK's English stop words, extended with one word per
        non-blank line of the custom file if that file exists.
    """
    if conf is None:
        conf = get_conf()

    stop_words = set(stopwords.words('english'))
    custom_stop_words_path = conf['path'] + conf['data']['custom_stop_words']

    # The custom file is optional; without it only the NLTK list is returned.
    if os.path.isfile(custom_stop_words_path):
        with open(custom_stop_words_path, encoding="utf-8") as f:
            lines = f.read().splitlines()
        # Strip each entry and skip blank lines: tokens never carry
        # surrounding whitespace, so an unstripped entry could never match.
        stop_words.update(word for word in (line.strip() for line in lines) if word)

    return stop_words

In [7]:
def modf_data(df, conf, ind="train", npartitions=6):
    """Apply ``modf_func`` to every row of ``df`` in parallel via dask.

    Args:
        df: pandas DataFrame with ``id``, ``keyword``, ``location``, ``text``
            (and ``target`` for the train split).
        conf: Configuration dict, forwarded to ``modf_func``.
        ind: ``"train"`` or ``"test"``. For ``"train"``, rows whose cleaned
            ``text`` is empty are dropped; test rows are all kept.
        npartitions: Number of dask partitions to split ``df`` into.

    Returns:
        pandas.DataFrame: The computed (non-lazy) cleaned frame.

    Raises:
        ValueError: If ``ind`` is neither ``"train"`` nor ``"test"``
            (previously this surfaced as an UnboundLocalError).
    """
    # Validate before doing any expensive work.
    data_meta = {'id': 'int64', 'keyword': 'string', 'location': 'string', 'text': 'string'}
    if ind == "train":
        data_meta['target'] = 'int64'
    elif ind != "test":
        raise ValueError(f"ind must be 'train' or 'test', got {ind!r}")

    final_stop_words = load_custom_stop_words()

    ddf = dd.from_pandas(df, npartitions=npartitions)
    # .compute() materializes the result back into a pandas DataFrame.
    modf_df = ddf.apply(modf_func, meta=data_meta, args=(final_stop_words, conf,), axis=1).compute()

    if ind == "train":
        # removing the rows where text-column are blank
        modf_df = modf_df[modf_df['text'] != '']

    return modf_df

In [8]:
def save_data(df, conf, ind="train"):
    """Write ``df`` to the configured output CSV for the given split.

    Args:
        df: DataFrame to write (the index is not written).
        conf: Configuration dict; the target is ``conf["path"]`` plus
            ``conf['data']['modf_train_data_path']`` or
            ``conf['data']['modf_test_data_path']``.
        ind: ``"train"`` or ``"test"``.

    Raises:
        ValueError: If ``ind`` is neither ``"train"`` nor ``"test"``.
            Previously such a call returned without writing anything.
    """
    output_keys = {"train": "modf_train_data_path", "test": "modf_test_data_path"}
    if ind not in output_keys:
        raise ValueError(f"ind must be 'train' or 'test', got {ind!r}")

    df.to_csv(conf["path"] + conf['data'][output_keys[ind]], index=False)

In [11]:
%%time
# Driver: runs read -> clean -> save for the train split, then the test split.
if __name__ == "__main__":
    try:
        # Configuration dict with the base path and dataset file locations.
        conf = get_conf()
        
        # Training split: load the raw CSV, clean it, write the modified CSV.
        train_data_df = read_data(conf, ind="train")
        train_modf_df = modf_data(train_data_df, conf, ind="train")
        save_data(train_modf_df, conf, ind="train")
        
        # Test split: same three steps with ind="test".
        test_data_df = read_data(conf, ind="test")
        test_modf_df = modf_data(test_data_df, conf, ind="test")
        save_data(test_modf_df, conf, ind="test")
        
    except Exception as e:
        # Any failure is reported by printing the traceback; the exception is
        # not re-raised, and the bound name `e` is not used.
        traceback.print_exc()

CPU times: user 9.22 s, sys: 161 ms, total: 9.38 s
Wall time: 9.49 s
