In [1]:
import cudf as pd
from nltk.tokenize import word_tokenize, wordpunct_tokenize, RegexpTokenizer
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet,stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from cuml.feature_extraction._tfidf_vectorizer import TfidfTransformer
import string
from collections import Counter
from nltk.util import ngrams
from symspellpy import SymSpell, Verbosity
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import cupy as np
import matplotlib.pyplot as plt
import os

In [2]:
if os.path.exists("~/nltk_data/corpora/stopwords.zip") == False:
    nltk.download("stopwords")

if os.path.exists("~/nltk_data/sentiment/vader_lexicon.zip") == False:
    nltk.download("vader_lexicon")

if os.path.exists("~/nltk_data/corpora/wordnet.zip") == False:
    nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ibrahim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ibrahim/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ibrahim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
df = pd.read_csv("Spam_SMS.csv")

In [6]:
df.shape

(5574, 2)

In [7]:
class Transformer():
    def __doc__(self):
        """
            Transformer Class to clean and prepare SMS Ham data:

            Attributes:
            -----------
                self.spam_bank (list):

                self.ham_bank  (list):

                self.spam_urls (list):

                self.ham_urls  (list):

                self.features  (pd.DataFrame):

                self.df  (pd.DataFrame):

                self.stopwords (list):

                self.lemmatizer (nltk.stem.WordNetLemmatizer):

                self.sia (nltk.sentiment.vader.SentimentIntensityAnalyzer):

                self.contractions (dict):

            Methods:
            --------
                self.tokenize_words(self, message: str) -> list:
        """

    def __init__(self, df: pd.DataFrame):
        self.spam_bank = [l for l in df[df["Class"] == "spam"]['Message']]
        # self.spam = ' '.join(df[df['Class'] == 'spam']["Message"])
        self.ham_bank = [l for l in df[df["Class"] == "ham"]["Message"]]
        self.spam_urls = [url for msg in self.spam_bank for url in re.findall(r'http[s]:\/\/[\S]+', msg)]
        self.ham_urls = [url for msg in self.ham_bank for url in re.findall(r'http[s]:\/\/[\S]+', msg)]
        self.features = pd.DataFrame({}).to_pandas()
        self.stopwords = set(stopwords.words('english'))
        self.df = df.copy()
        self.lemmatizer = WordNetLemmatizer()
        self.sia = SentimentIntensityAnalyzer()
        self.contractions = {
                                "can't": "cannot",
                                "won't": "will not",
                                "n't": " not",
                                "'re": " are",
                                "'s": " is",
                                "'d": " would",
                                "'ll": " will",
                                "'ve": " have",
                                "'m": " am",
                            }

    def tokenize_words(self, message: str) -> list:
        tokenizer = RegexpTokenizer(r"[^\s.,?!]+")
        tokens = tokenizer.tokenize(message)
        tokens = [t.lower() for t in tokens]
        
        return tokens
    
    def expand_contractions(self, text, contractions_dict):
        contractions_pattern = re.compile('({})'.format('|'.join(contractions_dict.keys())), 
                                        flags=re.IGNORECASE | re.DOTALL)
        def replace(match):
            return contractions_dict[match.group(0).lower()]
        
        return contractions_pattern.sub(replace, text)
    
    def clean_msg(self):
        self.features['clean_msg'] = self.df['Message'].str.lower()
        self.features["target"] = self.df["Class"].apply(lambda x: 1 if x == "spam" else 0)
        self.features['clean_msg'] = self.features['clean_msg'].apply(lambda x: self.expand_contractions(x, self.contractions))
        self.features['clean_msg'] = self.features['clean_msg'].str.replace(r'http[s]:\/\/[\S]+', '<url>', regex=True)
        self.features['clean_msg'] = self.features['clean_msg'].apply(lambda x: x.translate(str.maketrans('','', string.punctuation)))
        
        sym_spell = SymSpell(max_dictionary_edit_distance=4, prefix_length=7)
        dictionary = "frequency_dictionary_en_82_765.txt"
        sym_spell.load_dictionary(dictionary,term_index=0,count_index=1)

        def correct(msg: str) -> str:
            suggestions = []
            for word in msg.split():
                suggestion = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=3)
                if suggestion:
                    suggestions.append(suggestion[0].term)
                else:
                    suggestions.append(word)

            return " ".join(suggestions)
        
        self.features['clean_msg'] = self.features['clean_msg'].apply(lambda x: correct(x))
        self.features['clean_msg'] = self.features['clean_msg'].apply(lambda x: " ".join([word for word in x.split() if word not in self.stopwords]))
        self.features['clean_msg'] = self.features['clean_msg'].apply(lambda x: " ".join([self.lemmatizer.lemmatize(word) for word in x.split()]))
        self.features['clean_msg'] = self.features['clean_msg'].str.replace(r'\d+', '<num>', regex=True)
        self.features['clean_msg'] = self.features['clean_msg'].str.strip()
        self.features['clean_msg'] = self.features['clean_msg'].apply(lambda x: " ".join([word for word in x.split() if len(word) > 2]))

        return self.features
    
    def ngrams(self):
        spam_blob = self.features[self.features["target"]==1]["clean_msg"].str.cat()
        ham_blob = self.features[self.features["target"]==0]["clean_msg"].str.cat()

        spam_tokens = self.tokenize_words(spam_blob)
        ham_tokens = self.tokenize_words(ham_blob)

        spam_bigrams = Counter(list(ngrams(spam_tokens,2)))
        spam_trigrams = Counter(list(ngrams(spam_tokens,3)))

        ham_bigrams = Counter(list(ngrams(ham_tokens,2)))
        ham_trigrams = Counter(list(ngrams(ham_tokens,3)))

        return spam_bigrams, spam_trigrams, ham_bigrams, ham_trigrams

    
    def feature_eng(self):
        self.features['char_count'] = self.df['Message'].apply(len)
        self.features['word_count'] = self.features['clean_msg'].apply(lambda msg: len(self.tokenize_words(msg)))
        self.features['digit_count'] = self.df['Message'].apply(lambda x: sum(c.isdigit() for c in x))
        self.features['question_count'] = self.df['Message'].apply(lambda x: x.count('?'))
        self.features['exclamation_count'] = self.df['Message'].apply(lambda x: x.count('!'))
        self.features['dollar_count'] = self.df['Message'].apply(lambda x: x.count('$') + x.count('€') + x.count('£'))
        self.features['cap_ratio'] = self.df['Message'].apply(lambda x: sum(1 for c in x if c.isupper()) / len(x))
        self.features['unique_words'] = self.features['clean_msg'].apply(lambda x: len(set(x.split())))
        self.features['repitition_factor'] = self.features['word_count'].astype(float) / self.features['unique_words'].astype(float)
        self.features['sentiment'] = self.features['clean_msg'].apply(lambda x: self.sia.polarity_scores(x)['compound'])

        self.features.to_csv("prepared_data.csv")

        return self.features
    
    def word_count(self, word_bank: list) -> dict:
        pattern = r"[^\s./!?]+"
        tokenizer = RegexpTokenizer(pattern)
        counts = list()
        for msg in word_bank:
            words_count = dict()
            words = tokenizer.tokenize(msg)
            for word in words:
                if words_count.keys().__contains__(word) == False:
                    words_count[word] = words.count(word)
                else:
                    continue
            counts.append(words_count)

        return counts
    
    def cap_count(self, tokens: list) -> int:
        count = int(0)
        for word in tokens:
            if word.isupper() == True:
                count += 1
            else:
                continue
        
        return count
    

In [8]:
f = Transformer(df.to_pandas())
f.clean_msg()

Unnamed: 0,clean_msg,target
0,point crazy available bug great world buffet c...,0
1,lar joking,0
2,free entry wkly comp win cup final tit list ma...,1
3,dun say early hor already say,0
4,nah think life around though,0
...,...,...
5569,time tried contact £<num> pound prize claim ea...,1
5570,going esplanade home,0
5571,pity mood sony suggestion,0
5572,guy bitching acted like would interested buyin...,0


In [9]:
spam_bigrams, spam_trigrams, ham_bigrams, ham_trigrams = f.ngrams()

In [10]:
type(spam_bigrams.values())

dict_values

In [11]:
import pandas as pd

df_spam_bigrams = pd.DataFrame({"bigram": spam_bigrams.keys(), "count": spam_bigrams.values()})
df_spam_bigrams.to_csv("spam_bigrams.csv")
df_spam_trigrams = pd.DataFrame({"trigram": spam_trigrams.keys(), "count": spam_trigrams.values()})
df_spam_trigrams.to_csv("spam_trigrams.csv")
df_ham_bigrams = pd.DataFrame({"bigrams": ham_bigrams.keys(), "count": ham_bigrams.values()})
df_ham_bigrams.to_csv("ham_bigrams.csv")
df_ham_trigrams = pd.DataFrame({"trigrams": ham_trigrams.keys(), "count": ham_trigrams.values()})
df_ham_trigrams.to_csv("ham_trigrams.csv")

import cudf as pd

In [12]:
df_spam_bigrams

Unnamed: 0,bigram,count
0,"(free, entry)",8
1,"(entry, wkly)",5
2,"(wkly, comp)",4
3,"(comp, win)",2
4,"(win, cup)",2
...,...,...
5425,"(name, house)",1
5426,"(house, postcodetime)",1
5427,"(postcodetime, tried)",1
5428,"(£<num>, pound)",1


In [13]:
f.feature_eng()

Unnamed: 0,clean_msg,target,char_count,word_count,digit_count,question_count,exclamation_count,dollar_count,cap_ratio,unique_words,repitition_factor,sentiment
0,point crazy available bug great world buffet c...,0,111,10,0,0,0,0,0.027027,10,1.000000,0.4019
1,lar joking,0,29,2,0,0,0,0,0.068966,2,1.000000,0.2263
2,free entry wkly comp win cup final tit list ma...,1,155,20,25,0,0,0,0.064516,17,1.176471,0.7964
3,dun say early hor already say,0,49,6,0,0,0,0,0.040816,5,1.200000,0.0000
4,nah think life around though,0,61,5,0,0,0,0,0.032787,5,1.000000,-0.1027
...,...,...,...,...,...,...,...,...,...,...,...,...
5569,time tried contact £<num> pound prize claim ea...,1,160,13,21,0,1,1,0.056250,13,1.000000,0.7351
5570,going esplanade home,0,36,3,0,1,0,0,0.027778,3,1.000000,0.0000
5571,pity mood sony suggestion,0,57,4,0,1,0,0,0.035088,4,1.000000,-0.2960
5572,guy bitching acted like would interested buyin...,0,125,13,0,0,0,0,0.016000,13,1.000000,0.7506
