In [94]:
import pandas as pd
import spacy
import re
import emoji
import csv
from spacy.pipeline.lemmatizer import lemmatizer_score

# Load the raw spam dataset and print summary statistics for every column.
dataset = pd.read_csv("./spam.csv")
print(dataset.describe())
# Drop every column that contains at least one missing value (dropna's
# default is how="any"); per the summary above this removes the sparsely
# populated "Unnamed: N" columns and keeps v1 / v2.
dataset = dataset.dropna(axis=1)
# NOTE: not the last expression of the cell, so this preview is not rendered.
dataset.head(10)

# Open the output CSV. The csv module requires newline="" so that the writer
# controls line endings itself (otherwise platforms that translate "\n" emit
# blank rows between records); the encoding is pinned so the file does not
# depend on the platform default. The handle is intentionally left open here
# and closed in the final cell after all rows are written.
file_to_write = open("./spam_cleaned.csv", "w", newline="", encoding="utf-8")
new_csv = csv.writer(file_to_write)
new_csv.writerow(["label", "text", 'len'])

# Load the model
nlp = spacy.load("en_core_web_md")

          v1                      v2  \
count   5572                    5572   
unique     2                    5169   
top      ham  Sorry, I'll call later   
freq    4825                      30   

                                               Unnamed: 2  \
count                                                  50   
unique                                                 43   
top      bt not his girlfrnd... G o o d n i g h t . . .@"   
freq                                                    3   

                   Unnamed: 3 Unnamed: 4  
count                      12          6  
unique                     10          5  
top      MK17 92H. 450Ppw 16"    GNT:-)"  
freq                        2          2  


In [95]:
def clean_token(doc):
    """Return the normalized lemmas of the tokens in ``doc`` worth keeping.

    A token is discarded when it is punctuation, whitespace, a stop word,
    a quote character, contains non-ASCII characters, or looks like a number.
    (Emoticons are expected to be dropped as well, since they are made of
    punctuation.)

    Each surviving token is lemmatized, stripped of every "--" sequence and
    of a single leading / trailing "-", and finally lowercased.

    Args:
        doc: an iterable of spaCy-like tokens (a Doc or a Span).

    Returns:
        list[str]: the cleaned lemmas, in their original order.
    """
    kept = []
    for tok in doc:
        # Skip anything that carries no lexical content we care about.
        if (tok.is_punct or tok.is_space or tok.is_stop
                or tok.is_quote or not tok.is_ascii or tok.like_num):
            continue
        lemma = tok.lemma_
        lemma = lemma.replace("--", "")
        lemma = lemma.removesuffix("-")
        lemma = lemma.removeprefix("-")
        kept.append(lemma.lower())
    return kept

In [96]:
import nltk

# Make sure the NLTK "words" corpus is available locally, then collect its
# entries into a set so that membership checks are fast.
nltk.download('words')
words = {entry for entry in nltk.corpus.words.words()}

def remove_non_english_words(tokens):
    """Keep only the tokens found in the module-level ``words`` vocabulary.

    Args:
        tokens: an iterable of token strings.

    Returns:
        list[str]: the tokens that are members of ``words``, order preserved.
    """
    recognized = []
    for candidate in tokens:
        if candidate in words:
            recognized.append(candidate)
    return recognized

[nltk_data] Downloading package words to
[nltk_data]     /Users/khantzawhein/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [97]:
def process_text(text):
    """Clean one raw message and return it as a single space-joined string.

    Steps: strip HTML-like tags, turn unicode emojis into their textual
    names, run the spaCy pipeline, then for every sentence keep only the
    tokens that survive ``clean_token`` and ``remove_non_english_words``.

    Args:
        text: the raw message string.

    Returns:
        str: the cleaned sentences joined by single spaces (a sentence with
        no surviving tokens contributes an empty string).
    """
    # Remove HTML like tags
    without_tags = re.sub(r'<.*?>', '', text)
    # convert unicode emojis to words
    demojized = emoji.demojize(without_tags)
    sentence_texts = [
        " ".join(remove_non_english_words(clean_token(sentence)))
        for sentence in nlp(demojized).sents
    ]
    return " ".join(sentence_texts)

# Clean every message and write one output row per message:
# [label, cleaned text, raw message length scaled down by 1000].
# The two columns are paired positionally with zip() rather than looking rows
# up with the enumerate() counter, because Series[...] with an integer key is
# label-based and would only line up while the frame has a default RangeIndex.
for label, review in zip(dataset['v1'], dataset['v2']):
    new_csv.writerow([label, process_text(review), len(review) / 1000])

In [98]:
# Close the output CSV so that all buffered rows are flushed to disk.
file_to_write.close()