In [2]:
import os
import re
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from cleantext import clean
from tqdm.auto import tqdm
from textblob import TextBlob
from joblib import Parallel, delayed
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [3]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/dknguyen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dknguyen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
def remove_non_alphanumeric(text):
    return re.sub(r'[^a-zA-Z0-9<>\s]', '', text)

In [5]:
def clean_text(text):
    text = clean(text,
        fix_unicode=True,               # fix various unicode errors
        to_ascii=True,                  # transliterate to closest ASCII representation
        lower=True,                     # lowercase text
        no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
        no_urls=False,                  # replace all URLs with a special token
        no_emails=False,                # replace all email addresses with a special token
        no_phone_numbers=False,         # replace all phone numbers with a special token
        no_numbers=False,               # replace all numbers with a special token
        no_digits=False,                # replace all digits with a special token
        no_currency_symbols=False,      # replace all currency symbols with a special token
        no_punct=True,                 # remove punctuations
        replace_with_punct="",          # instead of removing punctuations you may replace them
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang="en"                       # set to 'de' for German special handling
    )
    return remove_non_alphanumeric(text)

In [6]:
train_df = pd.read_csv("../data/raw_train.csv")
test_df = pd.read_csv("../data/test.csv")

In [7]:
train_df.drop(labels=["length"], axis=1, inplace=True)
test_df.drop(labels=["length"], axis=1, inplace=True)

In [8]:
train_df

Unnamed: 0,tweet,label
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...,0
1,glad i dot have taks tomorrow ! ! #thankful #s...,0
2,1-3 vs celtics in the regular season = were fu...,0
3,<user> i could actually kill that girl i'm so ...,0
4,<user> <user> <user> i find that very hard to ...,0
...,...,...
2499995,a warning sign ? (; rt <user> the negativity y...,1
2499996,<user> ff too thank youuu ) ),1
2499997,i just love shumpa ! that's my girl,1
2499998,the best way to start a day ! no matter what h...,1


In [9]:
test_df

Unnamed: 0,tweet
0,sea doo pro sea scooter ( sports with the port...
1,<user> shucks well i work all week so now i ca...
2,i cant stay away from bug thats my baby
3,<user> no ma'am ! ! ! lol im perfectly fine an...
4,"whenever i fall asleep watching the tv , i alw..."
...,...
9995,had a nice time w / my friend lastnite
9996,<user> no it's not ! please stop !
9997,not without my daughter ( dvd two-time oscar (...
9998,<user> have fun in class sweetcheeks


Remove duplications and tweets that are longer than 140 characters

In [10]:
train_df.drop_duplicates(subset="tweet", inplace=True)
train_df = train_df[train_df["tweet"].str.len() <= 140]
train_df

Unnamed: 0,tweet,label
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...,0
1,glad i dot have taks tomorrow ! ! #thankful #s...,0
2,1-3 vs celtics in the regular season = were fu...,0
3,<user> i could actually kill that girl i'm so ...,0
4,<user> <user> <user> i find that very hard to ...,0
...,...,...
2499995,a warning sign ? (; rt <user> the negativity y...,1
2499996,<user> ff too thank youuu ) ),1
2499997,i just love shumpa ! that's my girl,1
2499998,the best way to start a day ! no matter what h...,1


For the test set, since we cannot drop anything, we just truncate the tweets

In [11]:
def truncate_tweet(t):
    return t[:140]

In [12]:
test_df["tweet"] = test_df["tweet"].apply(truncate_tweet)

A general function for transforming the original data

In [13]:
def transform(df, funcs):
    df_ = df.copy(deep=True)
    tweets = df_["tweet"]

    for f in funcs:
        tweets = Parallel(n_jobs=32)(delayed(f)(t) for t in tqdm(tweets))
    
    df_["tweet"] = tweets
    return df_

In [None]:
stop_words = [
    'i',
    'me',
    'my',
    'myself',
    'we',
    'our',
    'ours',
    'ourselves',
    'you',
    'your',
    'yours',
    'yourself',
    'yourselves',
    'he',
    'him',
    'his',
    'himself',
    'she',
    'her',
    'hers',
    'herself',
    'it',
    'its',
    'itself',
    'they',
    'them',
    'their',
    'theirs',
    'themselves',
    'what',
    'which',
    'who',
    'whom',
    'this',
    'that',
    'these',
    'those',
    'am',
    'is',
    'are',
    'was',
    'were',
    'be',
    'been',
    'being',
    'have',
    'has',
    'had',
    'having',
    'do',
    'does',
    'did',
    'doing',
    'a',
    'an',
    'the',
    'and',
    'but',
    'if',
    'or',
    'because',
    'as',
    'until',
    'while',
    'of',
    'at',
    'by',
    'for',
    'with',
    'about',
    'against',
    'between',
    'into',
    'through',
    'during',
    'before',
    'after',
    'above',
    'below',
    'to',
    'from',
    'up',
    'down',
    'in',
    'out',
    'on',
    'off',
    'over',
    'under',
    'again',
    'further',
    'then',
    'once',
    'here',
    'there',
    'when',
    'where',
    'why',
    'how',
    'all',
    'any',
    'both',
    'each',
    'few',
    'more',
    'most',
    'other',
    'some',
    'such',
    'no',
    'nor',
    'not',
    'only',
    'own',
    'same',
    'so',
    'than',
    'too',
    'very',
    'can',
    'will',
    'just',
    'don',
    'should',
    'now'
]

In [None]:
from nltk.corpus import brown
from nltk import FreqDist
frequency_list = FreqDist(i.lower() for i in brown.words())
common_english_words = []
for word, freq in frequency_list.most_common()[:2000]:
    if len(word) > 1:
        common_english_words.append(word)

def find_hashtags_and_split_words(text):
    chunks = text.split(" ")
    result = []
    for chunk in chunks:
        if len(chunk) == 0:
            continue
        if chunk[0] == "#":
            result.append("#")
            current_word = ""
            for c in chunk[1:]:
                current_word += c
                if current_word in stop_words or current_word in common_english_words:
                    result.append(current_word)
                    current_word = ""
            if current_word != "":
                result.append(current_word)
        else:
            result.append(chunk)
    return " ".join(result)

In [None]:
find_hashtags_and_split_words("#shouldhavereaditwithaglassofwine")

In [None]:
hashtag_train_df = transform(train_df, [find_hashtags_and_split_words])

In [None]:
hashtag_test_df = transform(test_df, [find_hashtags_and_split_words])

In [None]:
hashtag_train_df.to_csv("../data/hashtag_train.csv", index=False)
hashtag_test_df.to_csv("../data/hashtag_test.csv", index=False)

Clean raw tweets using clean-text library

In [14]:
cleaned_train_df = transform(train_df, [clean_text])

  0%|          | 0/2260464 [00:00<?, ?it/s]

100%|██████████| 2260464/2260464 [00:30<00:00, 75254.06it/s] 


In [15]:
cleaned_test_df = transform(test_df, [clean_text])

100%|██████████| 10000/10000 [00:00<00:00, 32049.42it/s]


In [16]:
cleaned_train_df.to_csv("../data/cleaned_train.csv", index=False)
cleaned_test_df.to_csv("../data/cleaned_test.csv", index=False)

### Create variants of the cleaned_train.csv

#### Lemmatization

In [17]:
def lemmatize_text(text):
    blob = TextBlob(text)
    lemmatized_words = [word.lemmatize() for word in blob.words]
    lemmatized_words = ["<"+word+">" if word in ["user", "url"] else word for word in lemmatized_words]
    return ' '.join(lemmatized_words)

In [18]:
s = "<user>"
lemmatize_text(s)

'<user>'

In [19]:
cleaned_lemmatized_train_df = transform(train_df, [clean_text, lemmatize_text])

  1%|          | 12224/2260464 [00:00<00:49, 45277.25it/s]

100%|██████████| 2260464/2260464 [00:24<00:00, 93745.85it/s] 
100%|██████████| 2260464/2260464 [00:46<00:00, 48834.55it/s]


In [20]:
cleaned_lemmatized_test_df = transform(test_df, [clean_text, lemmatize_text])

100%|██████████| 10000/10000 [00:00<00:00, 32227.78it/s]
100%|██████████| 10000/10000 [00:00<00:00, 34356.14it/s]


In [21]:
cleaned_lemmatized_train_df.to_csv("../data/cleaned_lemmatized_train.csv", index=False)
cleaned_lemmatized_test_df.to_csv("../data/cleaned_lemmatized_test.csv", index=False)

#### Remove stop words

In [33]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split(" ") 
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

In [34]:
remove_stopwords("<user> went to the <url> and see")

'<user> went <url> see'

In [35]:
cleaned_stopword_removed_train_df = transform(train_df, [clean_text, remove_stopwords])



100%|██████████| 2260464/2260464 [00:28<00:00, 78500.24it/s] 
100%|██████████| 2260464/2260464 [00:21<00:00, 104912.73it/s]


In [36]:
cleaned_stopword_removed_test_df = transform(test_df, [clean_text, remove_stopwords])

100%|██████████| 10000/10000 [00:00<00:00, 32125.32it/s]
100%|██████████| 10000/10000 [00:00<00:00, 42180.56it/s]


In [37]:
cleaned_stopword_removed_train_df.to_csv("../data/cleaned_stopword_removed_train.csv", index=False)
cleaned_stopword_removed_test_df.to_csv("../data/cleaned_stopword_removed_test.csv", index=False)

#### Spelling correction

In [26]:
def correct_spelling(text):
    blob = TextBlob(text)
    corrected_tweet = blob.correct()
    return corrected_tweet

In [38]:
correct_spelling("<user> wnet to hoem after the patry")

TextBlob("<user> went to home after the party")

In [39]:
# cleaned_spelling_correction_train_df = transform(train_df, [clean_text, correct_spelling])



100%|██████████| 2260464/2260464 [00:28<00:00, 80388.56it/s]


In [None]:
# cleaned_spelling_correction_test_df = transform(test_df, [clean_text, correct_spelling])

In [None]:
# cleaned_spelling_correction_train_df.to_csv("../data/cleaned_spelling_corrected_train.csv", index=False)
# cleaned_spelling_correction_test_df.to_csv("../data/cleaned_spelling_corrected_test.csv", index=False)