In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
import string
from nltk.stem import PorterStemmer
import pickle

In [3]:
# Load the raw dataset from spam.csv, decoding the file as latin-1.
# NOTE(review): presumably the file contains bytes that are not valid UTF-8 - confirm.
df = pd.read_csv('spam.csv', encoding='latin-1')

In [4]:
# Discard the three 'Unnamed' columns; none of the later cells read them.
df = df.drop(
    columns=[
        'Unnamed: 2',
        'Unnamed: 3',
        'Unnamed: 4',
    ]
)

In [5]:
# Give the two remaining columns descriptive names: v1 -> target, v2 -> text.
df = df.rename(
    columns={
        'v1': 'target',
        'v2': 'text',
    }
)

In [6]:
# Shared helpers: `encoder` turns the target labels into integers,
# `ps` is the stemmer used inside transform_text.
encoder = LabelEncoder()
ps = PorterStemmer()

In [7]:
# Replace the label column with integer codes produced by the LabelEncoder.
# NOTE(review): precision_score later treats code 1 as the positive class -
# confirm that the spam label is the one encoded as 1.
df['target'] = encoder.fit_transform(df['target'])

In [8]:
# Remove fully duplicated rows, keeping the first occurrence of each.
# The original index labels are kept, so the index is no longer contiguous.
df = df.drop_duplicates(keep='first')

In [9]:
# Length of each message in characters.
df['num_of_characters'] = df['text'].map(len)

In [10]:
# Number of tokens per message, as produced by nltk.word_tokenize.
df['num_of_words'] = (
    df['text']
    .map(nltk.word_tokenize)
    .map(len)
)

In [11]:
# Number of sentences per message, as produced by nltk.sent_tokenize.
df['num_of_sentences'] = (
    df['text']
    .map(nltk.sent_tokenize)
    .map(len)
)

In [12]:
def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps, in order: lowercase the text, tokenize it with nltk.word_tokenize,
    keep only alphanumeric tokens, drop English stopwords and punctuation,
    then stem every remaining token with the module-level PorterStemmer ``ps``.

    Args:
        text: Raw message string.

    Returns:
        The surviving stemmed tokens joined by single spaces (an empty string
        when no token survives the filters).
    """
    # Build the stopword lookup once per call. The previous version called
    # stopwords.words('english') for every single token and scanned the
    # resulting list linearly each time.
    stop_words = set(stopwords.words('english'))

    tokens = nltk.word_tokenize(text.lower())
    kept = [
        token
        for token in tokens
        if token.isalnum()
        and token not in stop_words
        and token not in string.punctuation
    ]
    return " ".join(ps.stem(token) for token in kept)
    

In [13]:
# Cleaned, stemmed version of every message; this column feeds the vectorizer.
df['transformed_text'] = df['text'].map(transform_text)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# NOTE(review): `cv` is not used by any cell visible here; only `tfidf` is fitted below.
cv = CountVectorizer()
# TF-IDF features, limited to at most 3000 terms via max_features.
tfidf = TfidfVectorizer(max_features=3000)

In [15]:
# Fit the TF-IDF vectorizer on every cleaned message and materialise the
# result as a dense array.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so test-set vocabulary/IDF statistics leak into the
# features; consider fitting on the training portion only.
X = tfidf.fit_transform(df['transformed_text']).toarray()

In [16]:
# Label vector (the integer-encoded target column) aligned row-for-row with X.
y = df['target'].values

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score

In [18]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [19]:
# One instance of each Naive Bayes variant, all with default settings.
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [20]:
# Fit the three Naive Bayes models on the same training split.
gnb.fit(X_train, y_train)
mnb.fit(X_train, y_train)
bnb.fit(X_train, y_train)

In [21]:
# Predict on the held-out split with each Naive Bayes model.
y_pred1 = gnb.predict(X_test)
y_pred2 = mnb.predict(X_test)
y_pred3 = bnb.predict(X_test)

# Print name, accuracy, confusion matrix and precision for each model, in that order.
for nb_label, nb_predictions in (
    ("GaussianNB", y_pred1),
    ("MultinomialNB", y_pred2),
    ("BernoulliNB", y_pred3),
):
    print(nb_label)
    print(accuracy_score(y_test, nb_predictions))
    print(confusion_matrix(y_test, nb_predictions))
    print(precision_score(y_test, nb_predictions))

GaussianNB
0.8694390715667312
[[788 108]
 [ 27 111]]
0.5068493150684932
MultinomialNB
0.9709864603481625
[[896   0]
 [ 30 108]]
1.0
BernoulliNB
0.9835589941972921
[[895   1]
 [ 16 122]]
0.991869918699187


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
import xgboost as xgb

In [115]:
# Logistic regression with an L1 penalty, fitted by the liblinear solver.
logistic_classifier = LogisticRegression(solver="liblinear", penalty='l1')

# Support vector classifier with a sigmoid kernel and gamma fixed at 1.0.
svm_classifier = SVC(kernel="sigmoid", gamma=1.0)

# Decision tree limited to a maximum depth of 5.
decision_tree_classifier = DecisionTreeClassifier(max_depth=5)

# k-nearest neighbours with the library's default settings.
knn_classifier = KNeighborsClassifier()

# Random forest of 50 estimators; random_state is pinned for repeatable runs.
random_forest_classifier = RandomForestClassifier(n_estimators=50, random_state=2)

# AdaBoost with 50 estimators and a pinned random_state.
adaboost_classifier = AdaBoostClassifier(n_estimators=50, random_state=2)

# Bagging ensemble of 50 estimators (default base estimator) with a pinned random_state.
bagging_classifier = BaggingClassifier(n_estimators=50, random_state=2)

# Extra-trees ensemble of 50 estimators with a pinned random_state.
extra_trees_classifier = ExtraTreesClassifier(n_estimators=50, random_state=2)

# Gradient boosting with 50 estimators and a pinned random_state.
gradient_boosting_classifier = GradientBoostingClassifier(n_estimators=50, random_state=2)

# XGBoost classifier with 50 estimators and a pinned random_state.
xgboost_classifier = xgb.XGBClassifier(n_estimators=50, random_state=2)

In [122]:
# Registry of the models to compare, keyed by the short label used in the
# printed report and in the results table. Dict insertion order is the
# evaluation order.
# Note: `mnb` is the MultinomialNB instance created earlier; the evaluation
# loop calls fit on every entry, so it gets refitted there.
clfs = {
    "NB": mnb,
    "SVC": svm_classifier,
    "logis": logistic_classifier,
    "DCTree": decision_tree_classifier,
    "knn": knn_classifier,
    "forest": random_forest_classifier,
    "adaboost": adaboost_classifier,
    "bagging": bagging_classifier,
    "extraTree": extra_trees_classifier,
    "GBoost": gradient_boosting_classifier,
    "xgBoost": xgboost_classifier
}

In [123]:
def train_classifier(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Args:
        clf: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.

    Returns:
        ``(accuracy, precision)`` computed by accuracy_score and
        precision_score from the predictions on ``X_test``.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions),
    )

In [124]:
# Train and score every registered classifier, printing its metrics as we go.
# `accuracy` and `precision` are parallel lists in clfs iteration order; they
# are read afterwards when the results table is built.
accuracy = []
precision = []
for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)
    print("For - ", name)
    print("Accuracy - ", current_accuracy)
    print("Precision - ", current_precision)
    accuracy.append(current_accuracy)
    precision.append(current_precision)

For -  NB
Accuracy -  0.9709864603481625
Precision -  1.0
For -  SVC
Accuracy -  0.9758220502901354
Precision -  0.9747899159663865
For -  logis
Accuracy -  0.9584139264990329
Precision -  0.9702970297029703
For -  DCTree
Accuracy -  0.9303675048355899
Precision -  0.8173076923076923
For -  knn
Accuracy -  0.9052224371373307
Precision -  1.0
For -  forest
Accuracy -  0.9758220502901354
Precision -  0.9829059829059829
For -  adaboost
Accuracy -  0.960348162475822
Precision -  0.9292035398230089
For -  bagging
Accuracy -  0.9584139264990329
Precision -  0.8682170542635659
For -  extraTree
Accuracy -  0.9748549323017408
Precision -  0.9745762711864406
For -  GBoost
Accuracy -  0.9468085106382979
Precision -  0.9191919191919192
For -  xgBoost
Accuracy -  0.9671179883945842
Precision -  0.9262295081967213


In [126]:
# One row per classifier: its label plus the accuracy and precision collected above.
# NOTE(review): the column key "algoritm" is misspelled ("algorithm"); it is left
# unchanged here because renaming it would change the frame's schema.
performance_df = pd.DataFrame({"algoritm": clfs.keys(), "accuracy": accuracy, "precision": precision})

In [127]:
# Show the comparison table as the cell's output.
performance_df

Unnamed: 0,algoritm,accuracy,precision
0,NB,0.970986,1.0
1,SVC,0.975822,0.97479
2,logis,0.958414,0.970297
3,DCTree,0.930368,0.817308
4,knn,0.905222,1.0
5,forest,0.975822,0.982906
6,adaboost,0.960348,0.929204
7,bagging,0.958414,0.868217
8,extraTree,0.974855,0.974576
9,GBoost,0.946809,0.919192


In [22]:
import pickle

# Persist the fitted TF-IDF vectorizer and the MultinomialNB model.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically; the previous version passed a bare open(...) to
# pickle.dump and never closed it.
# The vectorizer was fitted on the output of transform_text, so new messages
# need that same preprocessing before they are vectorized.
with open("vectorizer.pkl", 'wb') as f:
    pickle.dump(tfidf, f)

with open("model.pkl", 'wb') as f:
    pickle.dump(mnb, f)

In [21]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, StringTensorType

# The classifier consumes the dense TF-IDF matrix: one float row per message,
# with as many columns as X has features.
model_initial_type = [('float_input', FloatTensorType([None, X.shape[1]]))]

# The vectorizer consumes text, not floats, so its ONNX input must be declared
# as a string tensor. The previous version reused the float input type here.
# NOTE(review): [None, 1] means one text column per row - confirm this shape
# against the installed skl2onnx version.
vectorizer_initial_type = [('string_input', StringTensorType([None, 1]))]

# Convert the fitted MultinomialNB model and TF-IDF vectorizer to ONNX.
onnx_model = convert_sklearn(mnb, initial_types=model_initial_type)
vectorizer = convert_sklearn(tfidf, initial_types=vectorizer_initial_type)

# Write both ONNX graphs to disk.
with open("model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

with open("vectorizer.onnx", "wb") as f:
    f.write(vectorizer.SerializeToString())

In [27]:
# Number of TF-IDF feature columns (matches max_features on the vectorizer).
X.shape[1]

3000