In [7]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, confusion_matrix, f1_score
import pickle

In [8]:
#load dataset
dataframe = pd.read_csv('spam_ham_dataset.csv')
print(dataframe.describe())

        Unnamed: 0    label_num
count  5171.000000  5171.000000
mean   2585.000000     0.289886
std    1492.883452     0.453753
min       0.000000     0.000000
25%    1292.500000     0.000000
50%    2585.000000     0.000000
75%    3877.500000     1.000000
max    5170.000000     1.000000


In [9]:
#split data
x = dataframe["text"]
y = dataframe["label"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [10]:
# extract features
# cv = CountVectorizer()

# cv.fit(x_train)
# features = cv.transform(x_train)

# tuned_parameters = {'kernel': ['rbf', 'linear'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}
# model = GridSearchCV(svm.SVC(), tuned_parameters)
# model2 = MultinomialNB()
# model3 = RandomForestClassifier()
# model4 = LogisticRegression()

# model.fit(features, y_train)
# model2.fit(features, y_train)
# model3.fit(features, y_train)
# model4.fit(features, y_train)

In [11]:
cv_file = "cv.pkl"
svm_file = 'svm_model.pkl'
nb_file = 'nb_model.pkl'
rf_file = 'rf_model.pkl'
lr_file = 'lr_model.pkl'

# #dump models into pickle files for quick access pickle.dump(cv, open(cv_file, "wb"))
# pickle.dump(cv, open(cv_file, 'wb'))
# print('cv has loaded')

# pickle.dump(model, open(svm_file, 'wb'))
# pickle.dump(model2, open(nb_file, 'wb'))
# pickle.dump(model3, open(rf_file, 'wb'))
# pickle.dump(model4, open(lr_file, 'wb'))
# print( 'models are loaded')

In [12]:
cv = pickle.load(open(cv_file, 'rb'))
loaded_svm_model = pickle.load(open(svm_file, 'rb'))
loaded_nb_model = pickle.load(open(nb_file, 'rb'))
loaded_rf_model = pickle.load(open(rf_file, 'rb'))
loaded_lr_model = pickle.load(open(lr_file, 'rb'))
prediction_set = x_test.tolist()
svm_predicted_set = []
nb_predicted_set = []
rf_predicted_set = []
lr_predicted_set = []

for row in prediction_set:
  row = [row]
  transform = cv.transform(row)
  svm_result = loaded_svm_model.predict(transform)
  nb_result = loaded_nb_model.predict(transform)
  rf_result = loaded_rf_model.predict(transform)
  lr_result = loaded_lr_model.predict(transform)
  svm_predicted_set.append (svm_result)
  nb_predicted_set.append(nb_result)
  rf_predicted_set.append(rf_result)
  lr_predicted_set.append(lr_result)



In [13]:
svm__confusion_matrix = confusion_matrix(y_test, svm_predicted_set, labels=['spam', 'ham'])
svm_sensitivity = svm__confusion_matrix[0,0]/(svm__confusion_matrix[0,0]+svm__confusion_matrix[0,1])
svm_specificity = svm__confusion_matrix[1,1]/(svm__confusion_matrix[1,0]+svm__confusion_matrix[1,1])

nb_confusion_matrix = confusion_matrix(y_test, nb_predicted_set, labels=['spam', 'ham'])
nb_sensitivity = nb_confusion_matrix[0,0]/(nb_confusion_matrix[0,0]+nb_confusion_matrix[0,1])
nb_specificity = nb_confusion_matrix[1,1]/(nb_confusion_matrix[1,0]+nb_confusion_matrix[1,1])

rf_confusion_matrix = confusion_matrix(y_test, rf_predicted_set, labels=['spam', 'ham'])
rf_sensitivity = rf_confusion_matrix[0,0]/(rf_confusion_matrix[0,0]+rf_confusion_matrix[0,1])
rf_specificity = rf_confusion_matrix[1,1]/(rf_confusion_matrix[1,0]+rf_confusion_matrix[1,1])

lr_confusion_matrix = confusion_matrix(y_test, lr_predicted_set, labels=['spam', 'ham'])
lr_sensitivity = lr_confusion_matrix[0,0]/(lr_confusion_matrix[0,0]+lr_confusion_matrix[0,1])
lr_specificity = lr_confusion_matrix[1,1]/(lr_confusion_matrix[1,0]+lr_confusion_matrix[1,1])

print('SVM')
print('confusion matrix for SVM: '+ str(svm__confusion_matrix))
print('accuracy of svm = ' + str(loaded_svm_model.score(cv.transform(x_test), y_test)))
print('precision of svm for ham and spam = ' + str(precision_score(y_test, svm_predicted_set, average=None)))
print('recall of svm for ham and spam = '+ str(recall_score(y_test, svm_predicted_set, average=None)))
print('sens and spec for svm model = ' + str(svm_sensitivity) + ' ' + str(svm_specificity))
print('f1 score for svm = ' + str(f1_score(y_test, svm_predicted_set, average=None)))
print('***********************')

print('Naive Bayes')
print('confusion matrix for Naive Bayes: '+ str(nb_confusion_matrix))
print('accuracy of naive bayes = ' + str(loaded_nb_model.score(cv.transform(x_test), y_test)))
print('precision of naive bayes for ham and spam = ' + str(precision_score(y_test, nb_predicted_set, average=None)))
print('recall of naive bayes for ham and spam = '+ str(recall_score(y_test, nb_predicted_set, average=None)))
print('sens and spec for naive bayes model = ' + str(nb_sensitivity) + ' ' + str(nb_specificity))
print('f1 score for naive bayes = ' + str(f1_score(y_test, nb_predicted_set, average=None)))
print('***********************')

print('Random Forest')
print('confusion matrix for Random Forest: '+ str(rf_confusion_matrix))
print('accuracy of Random Forest = ' + str(loaded_rf_model.score(cv.transform(x_test), y_test)))
print('precision of Random Forest for ham and spam = ' + str(precision_score(y_test, rf_predicted_set, average=None)))
print('recall of Random Forest for ham and spam = '+ str(recall_score(y_test, rf_predicted_set, average=None)))
print('sens and spec for Random Forest model = ' + str(rf_sensitivity) + ' ' + str(rf_specificity))
print('f1 score for Random Forest = ' + str(f1_score(y_test, rf_predicted_set, average=None)))
print('***********************')

print('Logistic Regression')
print('confusion matrix for Logistic Regression: '+ str(lr_confusion_matrix))
print('accuracy of Logistic Regression = ' + str(loaded_lr_model.score(cv.transform(x_test), y_test)))
print('precision of Logistic Regression for ham and spam = ' + str(precision_score(y_test, lr_predicted_set, average=None)))
print('recall of Logistic Regression for ham and spam = '+ str(recall_score(y_test, lr_predicted_set, average=None)))
print('sens and spec for Logistic Regression model = ' + str(lr_sensitivity) + ' ' + str(lr_specificity))
print('f1 score for Logistic Regression = ' + str(f1_score(y_test, lr_predicted_set, average=None)))
print('***********************')

SVM
confusion matrix for SVM: [[ 435   26]
 [  34 1212]]
accuracy of svm = 0.9648506151142355
precision of svm for ham and spam = [0.97899838 0.92750533]
recall of svm for ham and spam = [0.97271268 0.94360087]
sens and spec for svm model = 0.9436008676789588 0.9727126805778491
f1 score for svm = [0.97584541 0.93548387]
***********************
Naive Bayes
confusion matrix for Naive Bayes: [[ 439   22]
 [  18 1228]]
accuracy of naive bayes = 0.9765670767428236
precision of naive bayes for ham and spam = [0.9824     0.96061269]
recall of naive bayes for ham and spam = [0.98555377 0.95227766]
sens and spec for naive bayes model = 0.9522776572668112 0.985553772070626
f1 score for naive bayes = [0.98397436 0.95642702]
***********************
Random Forest
confusion matrix for Random Forest: [[ 427   34]
 [  22 1224]]
accuracy of Random Forest = 0.9671939074399531
precision of Random Forest for ham and spam = [0.97297297 0.95100223]
recall of Random Forest for ham and spam = [0.9823435  0.92

In [15]:

import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
# import nltk
# nltk.download('stopwords')
# Read the HTML file
with open('test.html', 'r') as file:
  html_content = file.read()

# Remove HTML tags
soup = BeautifulSoup(html_content, 'html.parser')
text = soup.get_text()

# Remove punctuation and convert to lowercase
text = re.sub(r'[^\w\s]', '', text.lower())

# Remove stop words
stop_words = set(stopwords.words('english'))
filtered_text = ' '.join(word for word in text.split() if word not in stop_words)

# Print the filtered text
print(filtered_text)
transform = cv.transform([filtered_text])
result1 = loaded_svm_model.predict(transform)
result2 = loaded_nb_model.predict(transform)
result3 = loaded_rf_model.predict(transform)
result4 = loaded_lr_model.predict(transform)
print('SVM: ' + str(result1))
print('Naive Bayes: ' + str(result2))
print('Random Forest: ' + str(result3))
print('Logistic Regression: ' + str(result4))

month going teach thing two may signals end school year upon us procrastinate papers assignments grading making glitch app thatteaches thing whether tutorial guides fan art favorite fun fact month building apps make others say today learned head glitchcomjams right learn jams participate see wha tthe community made past jams well inspiration month join friday 2pm eastern glitch jams live ill showing last months submissions looking inspiration month happy jamming cant wait see create see glitchcomjams jenn director community
SVM: ['spam']
Naive Bayes: ['spam']
Random Forest: ['ham']
Logistic Regression: ['spam']
