### Importing libraries

In [34]:
from sklearn.naive_bayes import *
from sklearn.ensemble import *
from sklearn.neighbors import *
from sklearn.tree import *
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import *
from sklearn.multiclass import *
from sklearn.svm import *
from sklearn.model_selection import train_test_split
import pandas as pd
import csv
import time

### Test combinations of classifiers and vectorizers

In [35]:

"""
testing combinations of classifiers and vectorizers
1. tries combinatios of classifiers and vectorizers in scikit-learn lib
2. trains classifier on vectorized training data
3. tests on test data
4. prints classification score
v1 = first label 'ham' or 'spam'
v2 = actual message
"""
def perform(classifiers, vectorizers, train_data, test_data):
    for classifier in classifiers:
        for vectorizer in vectorizers:
            start = time.time()

            # vectorize to transform text to numerical vectors
            X_train = vectorizer.fit_transform(train_data.v2)

            # fit classifier on vectorized data with labels
            classifier.fit(X_train, train_data.v1)

            # vectorize test data and show score for each classifier
            X_test = vectorizer.transform(test_data.v2)
            classification_score = classifier.score(X_test, test_data.v1)

            end = time.time()
            elapsed_time = end - start
            score_string = f"{classifier.__class__.__name__} with {vectorizer.__class__.__name__}, Score: {classification_score}, Time: {elapsed_time:.2f}s"
            print(score_string)


In [36]:
# divide data-set 80/20 train/test
data = pd.read_csv('spam.csv', encoding='latin-1')

train_data, test_data = train_test_split(data, test_size=0.2, stratify=data['v1'])

### Training and testing cominbations of classifiers and vectorizers for this dataset

In [37]:
classifiers = [
    BernoulliNB(),
    LogisticRegression(max_iter=1000),
    PassiveAggressiveClassifier(),
    RidgeClassifier(),
    SGDClassifier(),
    OneVsRestClassifier(SVC(kernel='linear', probability=True)),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    MultinomialNB(),

]

vectorizers = [
    CountVectorizer(),
    TfidfVectorizer(),
]

perform(classifiers, vectorizers, train_data, test_data)

BernoulliNB with CountVectorizer, Score: 0.9721973094170404, Time: 0.28s
BernoulliNB with TfidfVectorizer, Score: 0.9721973094170404, Time: 0.41s
LogisticRegression with CountVectorizer, Score: 0.9847533632286996, Time: 0.22s
LogisticRegression with TfidfVectorizer, Score: 0.9695067264573991, Time: 0.21s
PassiveAggressiveClassifier with CountVectorizer, Score: 0.9856502242152466, Time: 0.16s
PassiveAggressiveClassifier with TfidfVectorizer, Score: 0.9865470852017937, Time: 0.11s
RidgeClassifier with CountVectorizer, Score: 0.9739910313901345, Time: 0.17s
RidgeClassifier with TfidfVectorizer, Score: 0.9874439461883409, Time: 0.14s
SGDClassifier with CountVectorizer, Score: 0.97847533632287, Time: 0.11s
SGDClassifier with TfidfVectorizer, Score: 0.9901345291479821, Time: 0.12s
OneVsRestClassifier with CountVectorizer, Score: 0.9847533632286996, Time: 3.36s
OneVsRestClassifier with TfidfVectorizer, Score: 0.9910313901345291, Time: 5.61s
KNeighborsClassifier with CountVectorizer, Score: 0.

### results

```
BernoulliNB with CountVectorizer, Score: 0.9721973094170404, Time: 0.28s
BernoulliNB with TfidfVectorizer, Score: 0.9721973094170404, Time: 0.41s
LogisticRegression with CountVectorizer, Score: 0.9847533632286996, Time: 0.22s
LogisticRegression with TfidfVectorizer, Score: 0.9695067264573991, Time: 0.21s
PassiveAggressiveClassifier with CountVectorizer, Score: 0.9856502242152466, Time: 0.16s
PassiveAggressiveClassifier with TfidfVectorizer, Score: 0.9865470852017937, Time: 0.11s
RidgeClassifier with CountVectorizer, Score: 0.9739910313901345, Time: 0.17s
RidgeClassifier with TfidfVectorizer, Score: 0.9874439461883409, Time: 0.14s
SGDClassifier with CountVectorizer, Score: 0.97847533632287, Time: 0.11s
SGDClassifier with TfidfVectorizer, Score: 0.9901345291479821, Time: 0.12s
OneVsRestClassifier with CountVectorizer, Score: 0.9847533632286996, Time: 3.36s
OneVsRestClassifier with TfidfVectorizer, Score: 0.9910313901345291, Time: 5.61s
KNeighborsClassifier with CountVectorizer, Score: 0.9192825112107623, Time: 0.38s
KNeighborsClassifier with TfidfVectorizer, Score: 0.9121076233183857, Time: 0.27s
DecisionTreeClassifier with CountVectorizer, Score: 0.9739910313901345, Time: 0.31s
DecisionTreeClassifier with TfidfVectorizer, Score: 0.95695067264574, Time: 0.54s
RandomForestClassifier with CountVectorizer, Score: 0.9713004484304932, Time: 1.46s
RandomForestClassifier with TfidfVectorizer, Score: 0.9739910313901345, Time: 1.62s
MultinomialNB with CountVectorizer, Score: 0.9847533632286996, Time: 0.11s
MultinomialNB with TfidfVectorizer, Score: 0.9560538116591928, Time: 0.13s
```



### Using OneVsRest classifier with tf-idf  vectorizer

In [38]:
import csv
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [39]:
data = pd.read_csv("spam.csv", encoding='latin-1')
train_data, test_data = train_test_split(data, test_size=0.2, stratify=data['v1'])

### Saving detailed results for running model on test set in a csv file

In [41]:
classifier = OneVsRestClassifier(SVC(kernel='linear', probability=True))
vectorizer = TfidfVectorizer()

# transform text to numerical vectors for classifier
X_train = vectorizer.fit_transform(train_data.v2)
# labels
y_train = train_data.v1

# trains support vector machine on vectorized data + labels
classifier.fit(X_train, y_train)

# transform test data
X_test = vectorizer.transform(test_data.v2)
# labels
y_test = test_data.v1

# score
print(classifier.score(X_test, y_test))

csv_array = []
for index, row in test_data.iterrows():
  answer = row[0]
  text = row[1]
  vectorize_text = vectorizer.transform([text])
  predict = classifier.predict(vectorize_text)[0]
  result = "correct" if predict == answer else "incorrect"
  csv_array.append([index, text, answer, predict, result])

correct_count = sum(1 for row in csv_array if row[4] == "correct")
incorrect_count = sum(1 for row in csv_array if row[4] == "incorrect")

print(f"Correct predictions: {correct_count}")
print(f"Incorrect predictions: {incorrect_count}")

# save test results to csv file
with open('test_results.csv', 'w', newline='') as csvfile:
  writer = csv.writer(csvfile, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
  writer.writerow(["id", "text", "answer", "predict", "result"])
  for row in csv_array:
    writer.writerow(row)





0.9820627802690582


  answer = row[0]
  text = row[1]


Correct predictions: 1095
Incorrect predictions: 20
