In [1]:
import string
import modin.pandas as pd
import pyap
import nltk
import emoji
import re
import tabulate
import statistics
import textstat
import time
# from tqdm import tqdm
# from modin.config import ProgressBar
# ProgressBar.enable()

start_time = time.time()

nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet')

stopwords = set(nltk.corpus.stopwords.words('english'))
words = set(nltk.corpus.words.words())

df = pd.read_csv("data/train.csv", encoding="utf-8")
df.columns = ['0', "id", "datetime", "query", "username", "text"]

cleaned_df = df.copy()
stats_column = ["WordCount", "SentenceCount", "AvgSentenceLen", "MaxSentenceLen", "MinSentenceLen", "MaxWordLen",
                "Emoticons", "StopWords", "NumLowerCase", "NumSpecialChars", "Address", "PhoneNum", "AccountNum"]

[nltk_data] Downloading package words to
[nltk_data]     /Users/khantzawhein/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/khantzawhein/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/khantzawhein/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
2024-12-28 01:59:41,130	INFO worker.py:1821 -- Started a local Ray instance.


In [2]:
tokenizer = nltk.TweetTokenizer()

def find_addresses(text):
    return [address for address in pyap.parse(text, country='US')]


def find_emoticons_and_emojis(text):
    emojis = [i['emoji'] for i in emoji.emoji_list(text)]
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    return emojis + emoticons


def find_phone_numbers(text):
    return re.findall(r'^[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}$', text)


def find_account_numbers(text):
    return re.findall(r'\b\d{9,18}\b', text)


def find_special_chars(text):
    return re.findall(r'[^\w\s]', text)


def find_lowercase(text):
    return re.findall(r'[a-z]', text)


def find_stop_words(text):
    return [char for char in text if char in stopwords]


def word_tokenize(text):
    # Tweet mentions and hashtags aware tokenizer
    tokens = tokenizer.tokenize(text)
    return [token for token in tokens if token not in string.punctuation]


def sentence_tokenize(text):
    return nltk.sent_tokenize(text)


In [3]:
before_stat_start_time = time.time()

before_stats_for_rows = pd.DataFrame(columns=stats_column)
before_sentences = df["text"].apply(lambda x: sentence_tokenize(x))
before_words = df["text"].apply(lambda x: word_tokenize(x))

before_stats_for_rows["WordCount"] = before_words.apply(len)
before_stats_for_rows["SentenceCount"] = before_sentences.apply(len)
before_stats_for_rows["AvgSentenceLen"] = before_sentences.apply(
    lambda x: statistics.mean([len(sentence) for sentence in x]))
before_stats_for_rows["MaxSentenceLen"] = before_sentences.apply(lambda x: max([len(sentence) for sentence in x]))
before_stats_for_rows["MinSentenceLen"] = before_sentences.apply(lambda x: min([len(sentence) for sentence in x]))
before_stats_for_rows["MaxWordLen"] = before_words.apply(lambda x: max([len(word) for word in x]))
before_stats_for_rows["Emoticons"] = df["text"].apply(lambda x: len(find_emoticons_and_emojis(x)))
before_stats_for_rows["StopWords"] = df["text"].apply(lambda x: len(find_stop_words(x)))
before_stats_for_rows["NumLowerCase"] = df["text"].apply(lambda x: len(find_lowercase(x)))
before_stats_for_rows["NumSpecialChars"] = df["text"].apply(lambda x: len(find_special_chars(x)))
before_stats_for_rows["Address"] = df["text"].apply(lambda x: len(find_addresses(x)))
before_stats_for_rows["PhoneNum"] = df["text"].apply(lambda x: len(find_phone_numbers(x)))
before_stats_for_rows["AccountNum"] = df["text"].apply(lambda x: len(find_account_numbers(x)))

print(before_stats_for_rows.head(5))

aggregated_before_stats = before_stats_for_rows.agg(['sum', 'mean', 'max', 'min'])
print(aggregated_before_stats)

before_stat_end_time = time.time()


   WordCount  SentenceCount  AvgSentenceLen  MaxSentenceLen  MinSentenceLen  \
0         22              2           55.00             105               5   
1         18              2           44.00              46              42   
2         10              1           46.00              46              46   
3         21              4           26.75              46               8   
4          5              1           28.00              28              28   

   MaxWordLen  Emoticons  StopWords  NumLowerCase  NumSpecialChars  Address  \
0           8          0         45            81                6        0   
1           9          0         31            62                3        0   
2           5          0         18            37                0        0   
3          16          0         39            80                9        0   
4           9          0          8            22                1        0   

   PhoneNum  AccountNum  
0         0           0 

## Data Cleaning

In [4]:
def remove_stop_words(text):
    return " ".join([word for word in text.split() if word not in stopwords])


def remove_special_chars(text):
    return re.sub(r'[^\w\s]', '', text)


def remove_numbers(text):
    return re.sub(r'\d+', '', text)


def remove_punctuation(text):
    return "".join([char for char in text if char not in string.punctuation])


def lemmaize_tokens(tokens):
    lemmatizer = nltk.WordNetLemmatizer()
    return [lemmatizer.lemmatize(word, pos="v") for word in tokens]


def remove_non_english_words(tokens):
    return [token for token in tokens if token in words]


def remove_emoticons(text):
    text = re.sub(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', '', text)
    return emoji.replace_emoji(text, "")


def remove_phone_numbers(text):
    return re.sub(r'^[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}$', '', text)


def remove_account_numbers(text):
    return re.sub(r'\b\d{9,18}\b', '', text)


In [5]:
def clean_text(text):
    text = text.lower()
    # pipeline for cleaning text
    text = remove_stop_words(text)
    text = remove_special_chars(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = remove_emoticons(text)
    text = remove_phone_numbers(text)
    text = remove_account_numbers(text)
    tokens = word_tokenize(text)
    tokens = remove_non_english_words(tokens)
    tokens = lemmaize_tokens(tokens)
    text = " ".join(tokens).strip()

    return text if text else None


In [6]:
cleaning_start_time = time.time()


cleaned_df["text"] = df["text"].apply(lambda x: clean_text(x))

cleaned_df.dropna(subset=["text"], inplace=True)

# Clean text using NLTK:

print(cleaned_df.head(5))

cleaning_end_time = time.time()

   0          id                      datetime     query       username  \
0  0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY  scotthamilton   
1  0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY       mattycus   
2  0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY        ElleCTF   
3  0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY         Karoli   
4  0  1467811372  Mon Apr 06 22:20:00 PDT 2009  NO_QUERY       joy_wolf   

                                                text  
0  upset cant update it might cry result school t...  
1                        many time ball save rest go  
2                         whole body itchy like fire  
3                     no all mad here cant see there  
4                                         whole crew  


In [7]:
with open("data/cleaned_train.csv", "w") as file:
    cleaned_df.to_csv(file, index=False)

Please refer to https://modin.readthedocs.io/en/stable/supported_apis/defaulting_to_pandas.html for explanation.


In [8]:
#After Stats
after_stats_start_time = time.time()

after_stats_for_rows = pd.DataFrame(columns=stats_column)
after_sentences = cleaned_df["text"].apply(lambda x: sentence_tokenize(x))
after_words = cleaned_df["text"].apply(lambda x: word_tokenize(x))

after_stats_for_rows["WordCount"] = after_words.apply(len)
after_stats_for_rows["SentenceCount"] = after_sentences.apply(len)
after_stats_for_rows["AvgSentenceLen"] = after_sentences.apply(
    lambda x: statistics.mean([len(sentence) for sentence in x]) if x else 0)
after_stats_for_rows["MaxSentenceLen"] = after_sentences.apply(
    lambda x: max([len(sentence) for sentence in x]) if x else 0)
after_stats_for_rows["MinSentenceLen"] = after_sentences.apply(
    lambda x: min([len(sentence) for sentence in x]) if x else 0)
after_stats_for_rows["MaxWordLen"] = after_words.apply(lambda x: max([len(word) for word in x]) if x else 0)
after_stats_for_rows["Emoticons"] = cleaned_df["text"].apply(lambda x: len(find_emoticons_and_emojis(x)))
after_stats_for_rows["StopWords"] = cleaned_df["text"].apply(lambda x: len(find_stop_words(x)))
after_stats_for_rows["NumLowerCase"] = cleaned_df["text"].apply(lambda x: len(find_lowercase(x)))
after_stats_for_rows["NumSpecialChars"] = cleaned_df["text"].apply(lambda x: len(find_special_chars(x)))
after_stats_for_rows["Address"] = cleaned_df["text"].apply(lambda x: len(find_addresses(x)))
after_stats_for_rows["PhoneNum"] = cleaned_df["text"].apply(lambda x: len(find_phone_numbers(x)))
after_stats_for_rows["AccountNum"] = cleaned_df["text"].apply(lambda x: len(find_account_numbers(x)))

print(after_stats_for_rows.head(5))

aggregated_after_stats = after_stats_for_rows.agg(['sum', 'mean', 'max', 'min'])
print(aggregated_after_stats)

after_stats_end_time = time.time()


   WordCount  SentenceCount  AvgSentenceLen  MaxSentenceLen  MinSentenceLen  \
0         11              1              60              60              60   
1          6              1              27              27              27   
2          5              1              26              26              26   
3          7              1              30              30              30   
4          2              1              10              10              10   

   MaxWordLen  Emoticons  StopWords  NumLowerCase  NumSpecialChars  Address  \
0           6          0         27            50                0        0   
1           4          0         12            22                0        0   
2           5          0          9            22                0        0   
3           5          0          9            24                0        0   
4           5          0          1             9                0        0   

   PhoneNum  AccountNum  
0         0           0 

In [9]:
# Compare before and after stats
header = ["Stats", "Before", "After", "Diff"]
before_vocab_size = len(set([word for words in before_words for word in words]))
after_vocab_size = len(set([word for words in after_words for word in words]))

table = [
    ["Avg Sentence Length", aggregated_before_stats["AvgSentenceLen"]["mean"],
     aggregated_after_stats["AvgSentenceLen"]["mean"],
     aggregated_after_stats["AvgSentenceLen"]["mean"] - aggregated_before_stats["AvgSentenceLen"]["mean"]],
    ["Max Sentence Length", aggregated_before_stats["MaxSentenceLen"]["max"],
     aggregated_after_stats["MaxSentenceLen"]["max"],
     aggregated_after_stats["MaxSentenceLen"]["max"] - aggregated_before_stats["MaxSentenceLen"]["max"]],
    ["Min Sentence Length", aggregated_before_stats["MinSentenceLen"]["min"],
     aggregated_after_stats["MinSentenceLen"]["min"],
     aggregated_after_stats["MinSentenceLen"]["min"] - aggregated_before_stats["MinSentenceLen"]["min"]],
    ["Max Word Length", aggregated_before_stats["MaxWordLen"]["max"], aggregated_after_stats["MaxWordLen"]["max"],
     aggregated_after_stats["MaxWordLen"]["max"] - aggregated_before_stats["MaxWordLen"]["max"]],
    ["Word Count", aggregated_before_stats["WordCount"]["sum"], aggregated_after_stats["WordCount"]["sum"],
     aggregated_after_stats["WordCount"]["sum"] - aggregated_before_stats["WordCount"]["sum"]],
    ["Sentence Count", aggregated_before_stats["SentenceCount"]["sum"], aggregated_after_stats["SentenceCount"]["sum"],
     aggregated_after_stats["SentenceCount"]["sum"] - aggregated_before_stats["SentenceCount"]["sum"]],
    ["Vocab Size", before_vocab_size, after_vocab_size, after_vocab_size - before_vocab_size],
    ["Emoticons", aggregated_before_stats["Emoticons"]["sum"], aggregated_after_stats["Emoticons"]["sum"],
     aggregated_after_stats["Emoticons"]["sum"] - aggregated_before_stats["Emoticons"]["sum"]],
    ["Stop Words", aggregated_before_stats["StopWords"]["sum"], aggregated_after_stats["StopWords"]["sum"],
     aggregated_after_stats["StopWords"]["sum"] - aggregated_before_stats["StopWords"]["sum"]],
    ["Num Lower Case", aggregated_before_stats["NumLowerCase"]["sum"], aggregated_after_stats["NumLowerCase"]["sum"],
     aggregated_after_stats["NumLowerCase"]["sum"] - aggregated_before_stats["NumLowerCase"]["sum"]],
    ["Num Special Chars", aggregated_before_stats["NumSpecialChars"]["sum"],
     aggregated_after_stats["NumSpecialChars"]["sum"],
     aggregated_after_stats["NumSpecialChars"]["sum"] - aggregated_before_stats["NumSpecialChars"]["sum"]],
    ["Address", aggregated_before_stats["Address"]["sum"], aggregated_after_stats["Address"]["sum"],
     aggregated_after_stats["Address"]["sum"] - aggregated_before_stats["Address"]["sum"]],
    ["Phone Num", aggregated_before_stats["PhoneNum"]["sum"], aggregated_after_stats["PhoneNum"]["sum"],
     aggregated_after_stats["PhoneNum"]["sum"] - aggregated_before_stats["PhoneNum"]["sum"]],
    ["Account Num", aggregated_before_stats["AccountNum"]["sum"], aggregated_after_stats["AccountNum"]["sum"],
     aggregated_after_stats["AccountNum"]["sum"] - aggregated_before_stats["AccountNum"]["sum"]]
]
print("Stats Comparison Before and After Cleaning")
print(tabulate.tabulate(table, headers=header, tablefmt="pretty"))


Stats Comparison Before and After Cleaning
+---------------------+-------------------+-------------------+--------------------+
|        Stats        |      Before       |       After       |        Diff        |
+---------------------+-------------------+-------------------+--------------------+
| Avg Sentence Length | 48.43620461628908 | 29.05640842784777 | -19.37979618844131 |
| Max Sentence Length |       359.0       |       128.0       |       -231.0       |
| Min Sentence Length |        1.0        |        1.0        |        0.0         |
|   Max Word Length   |       124.0       |       22.0        |       -102.0       |
|     Word Count      |    21458036.0     |     8414769.0     |    -13043267.0     |
|   Sentence Count    |     2747515.0     |     1562107.0     |     -1185408.0     |
|     Vocab Size      |      859393       |       29412       |      -829981       |
|      Emoticons      |      16320.0      |        0.0        |      -16320.0      |
|     Stop Words      

In [10]:
# Readability Score

readability_score = cleaned_df["text"].apply(textstat.flesch_reading_ease).mean()

print(f"Readability Score: {readability_score}")

lexical_diversity = after_vocab_size / aggregated_after_stats["WordCount"]["sum"]

print(f"Lexical Diversity: {lexical_diversity}")

end_time = time.time()

print(f"Time taken for stats before cleaning: {round((before_stat_end_time - before_stat_start_time), 2)} seconds")
print(f"Time taken for cleaning: {round((cleaning_end_time - cleaning_start_time), 2)} seconds")
print(
    f"Time taken for stats after cleaning: {round((after_stats_end_time - after_stats_start_time), 2)} seconds")
print(f"Total time taken: {round((end_time - start_time), 2)} seconds")


Readability Score: 88.6915177449432
Lexical Diversity: 0.003495283114723648
Time taken for stats before cleaning: 23.21 seconds
Time taken for cleaning: 11.96 seconds
Time taken for stats after cleaning: 12.7 seconds
Total time taken: 66.2 seconds
