# Spam Classifier
---

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import nltk

In [None]:
import os
# List the entries of the "Assets" directory (relative to the working directory).
os.listdir("Assets")

In [None]:
# Load the SMS dataset. A forward-slash path resolves on Windows, macOS and
# Linux, whereas the previous back-slash separator only worked on Windows.
df = pd.read_csv("Assets/spam.csv", encoding='ISO-8859-1')
# Drop the three unnamed extra columns and give the remaining two readable names.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
df.columns = ['target', 'document']
df.head()

In [None]:
# Count plot of the number of rows per value of the "target" column.
class_count_grid = sns.catplot(y="target", kind="count", data=df)
plt.show()

In [None]:
# Count the rows of each class with a boolean mask instead of filtering the frame.
spam_count = int((df['target'] == 'spam').sum())
ham_count = int((df['target'] == 'ham').sum())
print("Spam: ", spam_count)
print("Ham: ", ham_count)

**The dataset is imbalanced: spam messages are heavily underrepresented compared to ham, so plain accuracy can look good even for a model that rarely predicts spam.**

## Creating the Model
---


### Data Cleaning

In [None]:
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Build the cleaned corpus: lower-case each message, replace every non-letter
# with a space, drop English stop words and lemmatize the remaining tokens.
# NOTE(review): this needs the NLTK "stopwords" and "wordnet" resources to be
# available locally (e.g. via nltk.download) — confirm on a fresh environment.
lemmatizer = WordNetLemmatizer()

# Build the stop-word set once. It was previously rebuilt inside the
# comprehension for every single token, which is loop-invariant work.
stop_words = set(stopwords.words("english"))

corpus = []
for raw_doc in df["document"]:
    tokens = re.sub('[^a-zA-Z]', ' ', raw_doc.lower()).split()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    corpus.append(' '.join(lemmas))
corpus[0:5]

In [None]:
# Creating TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Turn the cleaned corpus into a dense TF-IDF matrix limited to 5000 features.
tv = TfidfVectorizer(max_features=5000)
tfidf_sparse = tv.fit_transform(corpus)
X = tfidf_sparse.toarray()

# Dummy-encode the labels, dropping the first dummy column, then flatten to 1-D.
# NOTE(review): presumably only two classes exist so one column remains — confirm.
target_dummies = pd.get_dummies(df['target'], drop_first=True)
y = target_dummies.values.reshape(-1)

# Preview the first rows and the shapes of the features and labels.
for preview in (X[0:5], X.shape, y[0:5], y.shape):
    print(preview)

### Preprocessing the Data and Finding the Best Algorithm

In [None]:
# Import classification algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

# Import the preprocessing tools
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score

# Hold out 20% of the data for a final check. The classes are imbalanced, so
# stratify on y to keep the same class ratio in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the features using statistics learned from the training split only.
# NOTE(review): the scaler is fit on all of X_train before cross-validation, so
# each validation fold has influenced the scaling; a Pipeline would avoid that.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

models = {"knn": KNeighborsClassifier(), "logreg": LogisticRegression(), "DTC": DecisionTreeClassifier()}

# The fold definition is identical for every model, so build it once.
kf = KFold(n_splits=6, random_state=42, shuffle=True)
results = [cross_val_score(model, X_train_scaled, y_train, cv=kf) for model in models.values()]

# Name the boxes through xticks instead of boxplot's `labels` keyword, which
# newer Matplotlib releases deprecate in favour of `tick_labels`.
plt.boxplot(results)
plt.xticks(range(1, len(models) + 1), list(models))
plt.ylabel("Cross-validation score")
plt.show()
print(results)

# Refit each model on the full training split and report its held-out test score.
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    test_score = model.score(X_test_scaled, y_test)
    print(name + ": " + str(test_score))

### Hyperparameter Tuning the Logistic Regression Model

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Randomized hyperparameter search for logistic regression with 5-fold CV.
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)

logreg = LogisticRegression()

# Each solver supports only some penalties, so the search space is a list of
# grids that contain valid combinations only. A single flat grid would sample
# pairs such as lbfgs + l1 that fail to fit.
# "No penalty" is the Python value None; the string 'None' is not accepted.
c_grid = np.logspace(-4, 4, 20)
params = [
    {'solver': ['lbfgs', 'newton-cg', 'sag'], 'penalty': ['l2', None], 'C': c_grid},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_grid},
    {'solver': ['saga'], 'penalty': ['l1', 'l2', None], 'C': c_grid},
    # elasticnet is only available with saga and needs an explicit l1_ratio.
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.25, 0.5, 0.75], 'C': c_grid},
]

# random_state makes the sampled candidates reproducible across runs.
cv = RandomizedSearchCV(logreg, params, cv = kf, n_iter = 2, random_state = 42)
cv.fit(X_train_scaled, y_train)
# The attribute is best_score_; the previous `bes_score_` raised AttributeError.
print(cv.best_params_, cv.best_score_)

In [None]:
# from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# y_pred = cv.predict(X_test_scaled)  # TODO: confirm this is the model to evaluate
# cm = confusion_matrix(y_test, y_pred)
# disp =  ConfusionMatrixDisplay(confusion_matrix=cm)
# disp.plot()