In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 1) Loading Data

In [2]:
# Read train_data.csv and test_data.csv (in that order) into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train_data.csv', 'test_data.csv')
)

# Show the first rows of the training frame as the cell output.
train_df.head()


Unnamed: 0,label,text,cleaned_text,preprocessed_tokens,punctuation_percentage,num_chars,num_sentences,num_words,num_misspellings,misspelling_percentage,...,emotions,deception_score,money_score,payment_score,celebration_score,achievement_score,url_presence,phone_number_presence,binary_label,pos_verbs_percentage
0,spam,"calgary , alberta , jul 7 , 2005 ( ccnmatthews...","calgary , alberta , jul 7 , 2005 ( ccnmatthews...","['calgari', 'alberta', 'jul', 'ccnmatthew', 'v...",2.566049,0.028841,0.01715,0.026634,38,0.038,...,"{'help': 0.004336513443191674, 'office': 0.011...",0.000867,0.015611,0.01301,0.001735,0.002602,0,0,1,0.152
1,ham,"louise , as of today there is $ 722 , 572 in s...","louise , as of today there is $ 722 , 572 in s...","['louis', 'today', 'schedul', 'c', 'tbg', 'set...",3.571429,0.002575,0.002309,0.00293,8,0.072727,...,"{'help': 0.0, 'office': 0.031007751937984496, ...",0.0,0.0,0.0,0.0,0.0,0,0,0,0.190909
2,ham,Lmao but its so fun...,Lmao but its so fun...,"['lmao', 'fun']",13.636364,9.6e-05,0.00033,0.00016,2,0.333333,...,"{'help': 0.0, 'office': 0.0, 'dance': 0.0, 'mo...",0.0,0.0,0.0,0.0,0.0,0,0,0,0.0
3,ham,hi : here ' s the presentation . - - - - - ori...,hi : here ' s the presentation . - - - - - ori...,"['hi', 'present', 'origin', 'messag', 'kitchen...",8.040712,0.008605,0.004947,0.007484,84,0.298932,...,"{'help': 0.0, 'office': 0.011441647597254004, ...",0.0,0.002288,0.0,0.011442,0.0,0,0,0,0.113879
4,ham,fraud,fraud,['fraud'],0.0,2.2e-05,0.00033,2.7e-05,0,0.0,...,"{'help': 0.0, 'office': 0.0, 'dance': 0.0, 'mo...",1.0,0.0,0.0,0.0,0.0,0,0,0,0.0


In [3]:
# Numeric columns that are fed to the models next to the text.
features = [
    'num_sentences',
    'misspelling_percentage',
    'pos_verbs_percentage',
    'spaces_percentage',
    'sentiment_score',
    'money_score',
    'payment_score',
    'celebration_score',
    'achievement_score',
    'url_presence',
    'phone_number_presence',
]


def _select_model_inputs(frame):
    """Return (text, numeric feature array, labels) taken from one data split.

    The text comes from the 'cleaned_text' column cast to str; the
    'preprocessed_tokens' column could be used here instead.
    """
    text = frame['cleaned_text'].astype(str)
    numeric = frame[features].values
    labels = frame['binary_label']
    return text, numeric, labels


train_text_data, train_numerical_features, train_labels = _select_model_inputs(train_df)
test_text_data, test_numerical_features, test_labels = _select_model_inputs(test_df)

# 2) Multinomial Naive Bayes Classification

In [4]:
# Turn the text into token-count vectors. The vocabulary is learned from the
# training text only and then reused unchanged for the test text.
vectorizer = CountVectorizer()
X_train_text_vector = vectorizer.fit_transform(train_text_data)
X_test_text_vector = vectorizer.transform(test_text_data)

# Append the numerical feature columns to the right of the token counts.
# format='csr' is requested explicitly: by default hstack returns a COO matrix,
# which does not support row indexing or slicing (needed for e.g. batching or
# cross-validation splits). The stored values are the same either way.
X_train = hstack((X_train_text_vector, train_numerical_features), format='csr')
X_test = hstack((X_test_text_vector, test_numerical_features), format='csr')

y_train = train_labels
y_test = test_labels

# Sanity checks: one label per row, and identical column layout for both splits.
assert X_train.shape[0] == len(y_train), "train features and labels differ in row count"
assert X_test.shape[0] == len(y_test), "test features and labels differ in row count"
assert X_train.shape[1] == X_test.shape[1], "train and test feature matrices differ in width"

In [5]:

# Fit a multinomial Naive Bayes classifier on the combined training matrix.
clf = MultinomialNB()
clf.fit(X_train, y_train)
# fit() returns the estimator itself, so both names refer to the same fitted model.
nb_model = clf

In [6]:

# Predict test labels with the Naive Bayes classifier and report its metrics.
y_pred = clf.predict(X_test)

nb_accuracy = accuracy_score(y_test, y_pred)
nb_report = classification_report(y_test, y_pred)
print("Accuracy:", nb_accuracy)
print("Classification Report:\n", nb_report)

Accuracy: 0.9227659395266571
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.90      0.93      7169
           1       0.86      0.96      0.91      4704

    accuracy                           0.92     11873
   macro avg       0.92      0.93      0.92     11873
weighted avg       0.93      0.92      0.92     11873



# 3) Logistic Regression

In [7]:


# Fit logistic regression on the same combined training matrix. The iteration
# cap is set to 1000; raise it further if needed.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
# fit() returns the estimator itself, so both names refer to the same fitted model.
lr_model = logreg

# Predict test labels and report the metrics.
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
lr_report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", lr_report)

Accuracy: 0.9687526320222353
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97      7169
           1       0.98      0.94      0.96      4704

    accuracy                           0.97     11873
   macro avg       0.97      0.96      0.97     11873
weighted avg       0.97      0.97      0.97     11873



# 4) Saving and loading

In [8]:
from joblib import dump, load

# Persist every fitted object needed to score new text. Both classifiers were
# trained on columns produced by `vectorizer`, so the fitted vectorizer is
# saved alongside them; without it the saved models cannot be applied to raw
# text in a fresh session.
dump(vectorizer, 'count_vectorizer.joblib')

# Save Naive Bayes model
dump(nb_model, 'naive_bayes_model.joblib')

# Save Logistic Regression model
dump(lr_model, 'logistic_regression_model.joblib')

# Reload the saved objects to confirm the round trip works.
# NOTE: joblib files are pickle-based and can execute arbitrary code when
# loaded; only load files that come from a trusted source.
vectorizer = load('count_vectorizer.joblib')

# Load Naive Bayes model
nb_model = load('naive_bayes_model.joblib')

# Load Logistic Regression model
lr_model = load('logistic_regression_model.joblib')

In [9]:
# Score the logistic regression model that was reloaded from disk on the test matrix.
y_pred = lr_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
reloaded_report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", reloaded_report)

Accuracy: 0.9687526320222353
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97      7169
           1       0.98      0.94      0.96      4704

    accuracy                           0.97     11873
   macro avg       0.97      0.96      0.97     11873
weighted avg       0.97      0.97      0.97     11873

