In [1]:
import cudf as pd
from nltk.tokenize import word_tokenize, wordpunct_tokenize, RegexpTokenizer
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet,stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from cuml.feature_extraction._tfidf_vectorizer import TfidfTransformer
import string
from collections import Counter
from nltk.util import ngrams
from symspellpy import SymSpell, Verbosity
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import cupy as np
import matplotlib.pyplot as plt
import os

In [2]:
if os.path.exists("~/nltk_data/corpora/stopwords.zip") == False:
    nltk.download("stopwords")

if os.path.exists("~/nltk_data/sentiment/vader_lexicon.zip") == False:
    nltk.download("vader_lexicon")

if os.path.exists("~/nltk_data/corpora/wordnet.zip") == False:
    nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ibrahim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ibrahim/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ibrahim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
help(TfidfTransformer)

Help on class TfidfTransformer in module cuml.feature_extraction._tfidf:

class TfidfTransformer(cuml.internals.base.Base)
 |  TfidfTransformer(*, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False, handle=None, verbose=False, output_type=None)
 |
 |  Transform a count matrix to a normalized tf or tf-idf representation
 |  Tf means term-frequency while tf-idf means term-frequency times inverse
 |  document-frequency. This is a common term weighting scheme in information
 |  retrieval, that has also found good use in document classification.
 |  The goal of using tf-idf instead of the raw frequencies of occurrence of a
 |  token in a given document is to scale down the impact of tokens that occur
 |  very frequently in a given corpus and that are hence empirically less
 |  informative than features that occur in a small fraction of the training
 |  corpus.
 |  The formula that is used to compute the tf-idf for a term t of a document d
 |  in a document set is tf-idf(t, d) = tf

In [4]:
df = pd.read_csv("Spam_SMS.csv")

In [5]:
df.shape

(5574, 2)

In [6]:
class Transformer():
    def __doc__(self):
        """
            Transformer Class to clean and prepare SMS Ham data:

            Attributes:
            -----------
                self.spam_bank (list):

                self.ham_bank  (list):

                self.spam_urls (list):

                self.ham_urls  (list):

                self.features  (pd.DataFrame):

                self.df  (pd.DataFrame):

                self.stopwords (list):

                self.lemmatizer (nltk.stem.WordNetLemmatizer):

                self.sia (nltk.sentiment.vader.SentimentIntensityAnalyzer):

                self.contractions (dict):

            Methods:
            --------
                self.tokenize_words(self, message: str) -> list:
        """

    def __init__(self, df: pd.DataFrame):
        self.spam_bank = [l for l in df[df["Class"] == "spam"]['Message']]
        # self.spam = ' '.join(df[df['Class'] == 'spam']["Message"])
        self.ham_bank = [l for l in df[df["Class"] == "ham"]["Message"]]
        self.spam_urls = [url for msg in self.spam_bank for url in re.findall(r'http[s]:\/\/[\S]+', msg)]
        self.ham_urls = [url for msg in self.ham_bank for url in re.findall(r'http[s]:\/\/[\S]+', msg)]
        self.features = pd.DataFrame({}).to_pandas()
        self.stopwords = set(stopwords.words('english'))
        self.df = df.copy()
        self.lemmatizer = WordNetLemmatizer()
        self.sia = SentimentIntensityAnalyzer()
        self.contractions = {
                                "can't": "cannot",
                                "won't": "will not",
                                "n't": " not",
                                "'re": " are",
                                "'s": " is",
                                "'d": " would",
                                "'ll": " will",
                                "'ve": " have",
                                "'m": " am",
                            }

    def tokenize_words(self, message: str) -> list:
        tokenizer = RegexpTokenizer(r"[^\s.,?!]+")
        tokens = tokenizer.tokenize(message)
        tokens = [t.lower() for t in tokens]
        
        return tokens
    
    def expand_contractions(self, text, contractions_dict):
        contractions_pattern = re.compile('({})'.format('|'.join(contractions_dict.keys())), 
                                        flags=re.IGNORECASE | re.DOTALL)
        def replace(match):
            return contractions_dict[match.group(0).lower()]
        
        return contractions_pattern.sub(replace, text)
    
    def clean_msg(self):
        self.features['clean_msg'] = self.df['Message'].str.lower()
        self.features["target"] = self.df["Class"].apply(lambda x: 1 if x == "spam" else 0)
        self.features['clean_msg'] = self.features['clean_msg'].apply(lambda x: self.expand_contractions(x, self.contractions))
        self.features['clean_msg'] = self.features['clean_msg'].str.replace(r'http[s]:\/\/[\S]+', '<url>', regex=True)
        self.features['clean_msg'] = self.features['clean_msg'].apply(lambda x: x.translate(str.maketrans('','', string.punctuation)))
        
        sym_spell = SymSpell(max_dictionary_edit_distance=4, prefix_length=7)
        dictionary = "frequency_dictionary_en_82_765.txt"
        sym_spell.load_dictionary(dictionary,term_index=0,count_index=1)

        def correct(msg: str) -> str:
            suggestions = []
            for word in msg.split():
                suggestion = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=3)
                if suggestion:
                    suggestions.append(suggestion[0].term)
                else:
                    suggestions.append(word)

            return " ".join(suggestions)
        
        self.features['clean_msg'] = self.features['clean_msg'].apply(lambda x: correct(x))
        self.features['clean_msg'] = self.features['clean_msg'].apply(lambda x: " ".join([word for word in x.split() if word not in self.stopwords]))
        self.features['clean_msg'] = self.features['clean_msg'].apply(lambda x: " ".join([self.lemmatizer.lemmatize(word) for word in x.split()]))
        self.features['clean_msg'] = self.features['clean_msg'].str.replace(r'\d+', '<num>', regex=True)
        self.features['clean_msg'] = self.features['clean_msg'].str.strip()
        self.features['clean_msg'] = self.features['clean_msg'].apply(lambda x: " ".join([word for word in x.split() if len(word) > 2]))

        return self.features
    
    def ngrams(self):
        spam_blob = self.features[self.features["target"]==1]["clean_msg"].str.cat()
        ham_blob = self.features[self.features["target"]==0]["clean_msg"].str.cat()

        spam_tokens = self.tokenize_words(spam_blob)
        ham_tokens = self.tokenize_words(ham_blob)

        spam_bigrams = Counter(list(ngrams(spam_tokens,2)))
        spam_trigrams = Counter(list(ngrams(spam_tokens,3)))

        ham_bigrams = Counter(list(ngrams(ham_tokens,2)))
        ham_trigrams = Counter(list(ngrams(ham_tokens,3)))

        return spam_bigrams, spam_trigrams, ham_bigrams, ham_trigrams

    
    def feature_eng(self):
        self.features['char_count'] = self.df['Message'].apply(len)
        self.features['word_count'] = self.features['clean_msg'].apply(lambda msg: len(self.tokenize_words(msg)))
        self.features['digit_count'] = self.df['Message'].apply(lambda x: sum(c.isdigit() for c in x))
        self.features['question_count'] = self.df['Message'].apply(lambda x: x.count('?'))
        self.features['exclamation_count'] = self.df['Message'].apply(lambda x: x.count('!'))
        self.features['dollar_count'] = self.df['Message'].apply(lambda x: x.count('$') + x.count('€') + x.count('£'))
        self.features['cap_ratio'] = self.df['Message'].apply(lambda x: sum(1 for c in x if c.isupper()) / len(x))
        self.features['unique_words'] = self.features['clean_msg'].apply(lambda x: len(set(x.split())))
        self.features['repitition_factor'] = self.features['word_count'].astype(float) / self.features['unique_words'].astype(float)
        self.features['sentiment'] = self.features['clean_msg'].apply(lambda x: self.sia.polarity_scores(x)['compound'])

        self.features.to_csv("prepared_data.csv")

        return self.features
    
    def word_count(self, word_bank: list) -> dict:
        pattern = r"[^\s./!?]+"
        tokenizer = RegexpTokenizer(pattern)
        counts = list()
        for msg in word_bank:
            words_count = dict()
            words = tokenizer.tokenize(msg)
            for word in words:
                if words_count.keys().__contains__(word) == False:
                    words_count[word] = words.count(word)
                else:
                    continue
            counts.append(words_count)

        return counts
    
    def cap_count(self, tokens: list) -> int:
        count = int(0)
        for word in tokens:
            if word.isupper() == True:
                count += 1
            else:
                continue
        
        return count
    

In [7]:
f = Transformer(df.to_pandas())
f.clean_msg()

Unnamed: 0,clean_msg,target
0,point crazy available bug great world buffet c...,0
1,lar joking,0
2,free entry wkly comp win cup final tit list ma...,1
3,dun say early hor already say,0
4,nah think life around though,0
...,...,...
5569,time tried contact £<num> pound prize claim ea...,1
5570,going esplanade home,0
5571,pity mood sony suggestion,0
5572,guy bitching acted like would interested buyin...,0


In [8]:
spam_bigrams, spam_trigrams, ham_bigrams, ham_trigrams = f.ngrams()

In [9]:
type(spam_bigrams.values())

dict_values

In [14]:
df_spam_bigrams = pd.DataFrame({"bigram": str(spam_bigrams.keys()), "count": spam_bigrams.values()})
df_spam_bigrams.to_csv("spam_bigrams.csv")
df_spam_trigrams = pd.DataFrame({"trigram": str(spam_trigrams.keys()), "count": spam_trigrams.values()})
df_spam_trigrams.to_csv("spam_trigrams.csv")
df_ham_bigrams = pd.DataFrame({"bigrams": str(ham_bigrams.keys()), "count": ham_bigrams.values()})
df_ham_bigrams.to_csv("ham_bigrams.csv")
df_ham_trigrams = pd.DataFrame({"trigrams": str(ham_trigrams.keys()), "count": ham_trigrams.values()})
df_ham_trigrams.to_csv("ham_trigrams.csv")

MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /home/ibrahim/anaconda3/envs/rapids/include/rmm/mr/device/cuda_memory_resource.hpp:62: cudaErrorMemoryAllocation out of memory

In [33]:
df_spam_bigrams

Unnamed: 0,bigram,count
0,"[free, entry]",8
1,"[entry, wkly]",5
2,"[wkly, comp]",4
3,"[comp, win]",2
4,"[win, cup]",2
...,...,...
5425,"[name, house]",1
5426,"[house, postcodetime]",1
5427,"[postcodetime, tried]",1
5428,"[£<num>, pound]",1


In [10]:
f.feature_eng()

Unnamed: 0,clean_msg,target,char_count,word_count,digit_count,question_count,exclamation_count,dollar_count,cap_ratio,unique_words,repitition_factor,sentiment
0,point crazy available bug great world buffet c...,0,111,10,0,0,0,0,0.027027,10,1.000000,0.4019
1,lar joking,0,29,2,0,0,0,0,0.068966,2,1.000000,0.2263
2,free entry wkly comp win cup final tit list ma...,1,155,20,25,0,0,0,0.064516,17,1.176471,0.7964
3,dun say early hor already say,0,49,6,0,0,0,0,0.040816,5,1.200000,0.0000
4,nah think life around though,0,61,5,0,0,0,0,0.032787,5,1.000000,-0.1027
...,...,...,...,...,...,...,...,...,...,...,...,...
5569,time tried contact £<num> pound prize claim ea...,1,160,13,21,0,1,1,0.056250,13,1.000000,0.7351
5570,going esplanade home,0,36,3,0,1,0,0,0.027778,3,1.000000,0.0000
5571,pity mood sony suggestion,0,57,4,0,1,0,0,0.035088,4,1.000000,-0.2960
5572,guy bitching acted like would interested buyin...,0,125,13,0,0,0,0,0.016000,13,1.000000,0.7506


In [None]:
sample = df_spam_bigrams.sort_values(by="count",ascending=False).head(15)
sample["bigram"] = sample["bigram"].apply(lambda x: x[0] + " " + x[1])
plt.barh(y = sample["bigram"].values, width = sample["count"],alpha=0.85)
plt.xlabel("Count of Bigrams")
plt.title("Most Commonly used Bigrams in Spam Messages")
plt.ylabel("Bigram")
plt.grid(True,linestyle='-.')
plt.savefig("Most Commonly used Bigrams in Spam Messages")
plt.show()

In [None]:
import seaborn as sns



In [None]:
f.features["sentiment"].values

In [24]:
description = f.features.describe()

In [None]:
f.features[f.features["question_count"] > 1]["clean_msg"].index

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.naive_bayes import N
from sklearn.metrics import classification_report,accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import mlflow
import seaborn as sns
from mlflow import create_experiment
import mlflow.sklearn

In [None]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.create_experiment("spams")
mlflow.set_experiment("spams")
mlflow.sklearn.autolog(disable=True)

In [13]:
spam_corpus = f.features[f.features["target"]==1]["clean_msg"].to_list()
tf = TfidfVectorizer()
tf.fit(spam_corpus)
spam_vocab = pd.DataFrame()
words = list()
counts = list()
idf = list()
for k,v,i in zip(tf.vocabulary_.keys(), tf.vocabulary_.values(),tf.idf_):
    words.append(k)
    counts.append(v)
    idf.append(i)

spam_vocab["word"] = words
spam_vocab["count"] = counts
spam_vocab["idf"] = idf

In [None]:
spam_vocab.sort_values(by="idf", ascending=False)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
            f.features["clean_msg"],
            f.features["target"],
            random_state=42,
            stratify=f.features["target"]
        )

In [24]:
f.features

Unnamed: 0,clean_msg,target,char_count,word_count,digit_count,question_count,exclamation_count,dollar_count,cap_ratio,unique_words,repitition_factor,sentiment
0,point crazy available bug great world buffet c...,0,111,10,0,0,0,0,0.027027,10,1.000000,0.4019
1,lar joking,0,29,2,0,0,0,0,0.068966,2,1.000000,0.2263
2,free entry wkly comp win cup final tit list ma...,1,155,20,25,0,0,0,0.064516,17,1.176471,0.7964
3,dun say early hor already say,0,49,6,0,0,0,0,0.040816,5,1.200000,0.0000
4,nah think life around though,0,61,5,0,0,0,0,0.032787,5,1.000000,-0.1027
...,...,...,...,...,...,...,...,...,...,...,...,...
5569,time tried contact £<num> pound prize claim ea...,1,160,13,21,0,1,1,0.056250,13,1.000000,0.7351
5570,going esplanade home,0,36,3,0,1,0,0,0.027778,3,1.000000,0.0000
5571,pity mood sony suggestion,0,57,4,0,1,0,0,0.035088,4,1.000000,-0.2960
5572,guy bitching acted like would interested buyin...,0,125,13,0,0,0,0,0.016000,13,1.000000,0.7506


In [1]:
pipe = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("svc", SVC())
])

pipe.fit(X_train, y_train)

NameError: name 'Pipeline' is not defined

In [23]:
y_train

4925    0
2349    0
1396    0
475     0
1473    0
       ..
3838    0
423     0
3824    0
1836    0
61      0
Name: target, Length: 4180, dtype: int64

In [13]:
class SVClassifier():
    def __init__(self):
        f.features = f.features.fillna(0)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            f.features["clean_msg"],
            f.features["target"],
            random_state=42,
            stratify=f.features["target"]
        )

        self.pipe = Pipeline([
            ("tfidf", TfidfVectorizer()),
            ("svc", SVC())
        ])

        self.grid = {
            "tfidf__max_df": [0.8,0.9],
            "tfidf__ngram_range": [(1,1), (1,2), (2,2), (2,3)],
            "svc__C": [0.1,1,10],
            "svc__kernel": ["linear","rbf"],
            "svc__gamma": ["scale", "auto"]
        }

        self.grid_search = GridSearchCV(
            estimator=self.pipe,
            param_grid=self.grid,
            scoring=["recall","precision","f1"],
            verbose=3,
            refit="recall",
            return_train_score=True,
            cv=4
        )

    def tuning(self):
        self.grid_search.fit(self.X_train, self.y_train)

    def train(self):
        with mlflow.start_run(run_name="svc_model_tuning"):
            mlflow.set_tag("model_name","svc")

            self.tuning()

            mlflow.log_param(self.grid_search.best_params_)
            mlflow.log_metric("Best Recall Score", self.grid_search.best_score_)

            results_df = pd.DataFrame(self.grid_search.cv_results_)
            mlflow.log_params(results_df.to_dict())

            for i in range(0, len(results_df["params"])):
                mlflow.log_metrics({"svc_mean_test_recall": results_df.at[i,"mean_test_recall"],
                                    "svc_mean_test_precision": results_df.at[i,"mean_test_precision"],
                                    "svc_mean_test_f1": results_df.at[i,"mean_test_f1"],
                                    "svc_mean_train_recall": results_df.at[i,"mean_train_recall"],
                                    "svc_mean_train_precision": results_df.at[i,"mean_train_precision"],
                                    "svc_mean_train_f1": results_df.at[i,"mean_train_f1"]}, step=i)
                
            y_pred = self.grid_search.best_estimator_.predict(X_test)
                
            report = classification_report(y_test, y_pred, output_dict=True)
            report = pd.DataFrame(report).transpose().to_csv("svc_classification_report.csv")
            mlflow.log_artifact("svc_classification_report.csv")

            pivot_table = results_df[['params','split0_test_recall', 'split1_test_recall',
                'split2_test_recall', 'split3_test_recall',
                'split4_test_recall', 'mean_test_recall']]
            pivot_table["params"] = pivot_table.index
            plt.figure(figsize=(8, 6))
            sns.heatmap(data=pivot_table, annot=True, cmap="YlGnBu")
            plt.title("SVC Grid Search Results - Recall")
            plt.savefig("grid_svc_heatmap.png")
            mlflow.log_artifact(local_path="grid_svc_heatmap.png", artifact_path="grid_svc_heatmap.png")

            mlflow.sklearn.log_model(self.grid_search.best_estimator_, artifact_path="best_svc_model")




In [14]:
svc = SVClassifier()
svc.train()

Fitting 4 folds for each of 96 candidates, totalling 384 fits
[CV 1/4] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear, tfidf__max_df=0.8, tfidf__ngram_range=(1, 1); f1: (train=0.879, test=0.841) precision: (train=0.982, test=0.981) recall: (train=0.795, test=0.736) total time=   0.5s
[CV 2/4] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear, tfidf__max_df=0.8, tfidf__ngram_range=(1, 1); f1: (train=0.877, test=0.884) precision: (train=0.985, test=1.000) recall: (train=0.790, test=0.793) total time=   0.5s
[CV 3/4] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear, tfidf__max_df=0.8, tfidf__ngram_range=(1, 1); f1: (train=0.874, test=0.865) precision: (train=0.988, test=0.973) recall: (train=0.783, test=0.779) total time=   0.4s
[CV 4/4] END svc__C=0.1, svc__gamma=scale, svc__kernel=linear, tfidf__max_df=0.8, tfidf__ngram_range=(1, 1); f1: (train=0.872, test=0.879) precision: (train=0.994, test=0.935) recall: (train=0.776, test=0.829) total time=   0.5s
[CV 1/4] END svc__C=0.

KeyboardInterrupt: 

In [None]:
grid = {
    # "n_neighbors": range(3,16,2),
    "weights": ["uniform", "distance"],
    "algorithm": ["auto","ball_tree","kd_tree","brute"],
    "n_jobs": range(2,8,2)
}

grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=grid,
    scoring=["recall","precision","f1"],
    verbose=2,
    refit="recall",
    return_train_score=True
)


with mlflow.start_run(run_name="knn_model_tuning") as run:
    mlflow.set_tag("model_name", "knn")

    grid_search.fit(X_train, y_train)

    mlflow.log_param('KNN Best Parameters', grid_search.best_params_)
    mlflow.log_metric('KNN Best Score', grid_search.best_score_)

    results_df = pd.DataFrame(grid_search.cv_results_)
    mlflow.log_param('All Results KNN', results_df.to_dict())

    for i in range(0, len(results_df["params"])):
        mlflow.log_metrics({"knn_mean_test_recall": results_df.at[i,"mean_test_recall"],
                            "knn_mean_test_precision": results_df.at[i,"mean_test_precision"],
                            "knn_mean_test_f1": results_df.at[i,"mean_test_f1"],
                            "knn_mean_train_recall": results_df.at[i,"mean_train_recall"],
                            "knn_mean_train_precision": results_df.at[i,"mean_train_precision"],
                            "knn_mean_train_f1": results_df.at[i,"mean_train_f1"]}, step=i)

    y_pred = grid_search.best_estimator_.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric('KNN Test Accuracy', test_accuracy)
    
    report = classification_report(y_test, y_pred, output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    report_csv = "classification_report.csv"
    report_df.to_csv(report_csv)
    mlflow.log_artifact(report_csv)

    pivot_table = results_df[['params','split0_test_recall', 'split1_test_recall',
       'split2_test_recall', 'split3_test_recall',
       'split4_test_recall', 'mean_test_recall']]
    pivot_table["params"] = pivot_table.index
    plt.figure(figsize=(8, 6))
    sns.heatmap(data=pivot_table, annot=True, cmap="YlGnBu")
    plt.title("KNN Grid Search Results - Recall")
    plt.savefig("KNN_grid_search_heatmap.png")
    mlflow.log_artifact(local_path="KNN_grid_search_heatmap.png", artifact_path="KNN_grid_search_heatmap.png")

    mlflow.sklearn.log_model(grid_search.best_estimator_, artifact_path="best_knn_model")

In [None]:
from sklearn.naive_bayes import MultinomialNB

X_train, X_test, y_train, y_test = train_test_split(
    df_features
)

In [None]:
pivot_table["params"] = pivot_table["params"].astype(str)

In [None]:
pivot_table.params.dtype

In [None]:
knn = KNeighborsClassifier(n_neighbors=5)

knn.fit(X_train, y_train)
y_preds = knn.predict(X_test)

print(cross_val_score(knn, X_train, y_train, cv=5, scoring='precision'))

print(classification_report(y_test, y_preds))

In [None]:
len(a[a==False])

In [None]:
f.features.iloc[3625,1]