In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set up the visualization
sns.set(style="whitegrid")

In [100]:
import pandas as pd


csv_file_path = r"C:\Users\acer\Downloads\SMSSpamCollection.csv"


labels = []
sms_messages = []

with open(csv_file_path, 'r', newline='', encoding='utf-8') as file:
    for line in file:
        
        label = line[:4].strip()
        message = line[4:].strip()
        labels.append(label)
        sms_messages.append(message)


df = pd.DataFrame({'Label': labels, 'SMS': sms_messages})


print(df)


     Label                                                SMS
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5569  spam  This is the 2nd time we have tried 2 contact u...
5570   ham               Will ü b going to esplanade fr home?
5571   ham  Pity, * was in mood for that. So...any other s...
5572   ham  The guy did some bitching but I acted like i'd...
5573   ham                         Rofl. Its true to its name

[5574 rows x 2 columns]


In [101]:

label_mapping = {'ham': 0, 'spam': 1}


df['Label'] = df['Label'].map(label_mapping)




In [102]:
print(df)

      Label                                                SMS
0         0  Go until jurong point, crazy.. Available only ...
1         0                      Ok lar... Joking wif u oni...
2         1  Free entry in 2 a wkly comp to win FA Cup fina...
3         0  U dun say so early hor... U c already then say...
4         0  Nah I don't think he goes to usf, he lives aro...
...     ...                                                ...
5569      1  This is the 2nd time we have tried 2 contact u...
5570      0               Will ü b going to esplanade fr home?
5571      0  Pity, * was in mood for that. So...any other s...
5572      0  The guy did some bitching but I acted like i'd...
5573      0                         Rofl. Its true to its name

[5574 rows x 2 columns]


In [127]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


def preprocess_text(text):
    text = text.lower()
    
    words = word_tokenize(text)
    words = [word for word in words if word not in stopwords.words('english')]
    return ' '.join(words)

df['processed_text'] = df['SMS'].apply(preprocess_text)


def load_glove_model(file):
    print("Loading Glove Model")
    glove_model = {}
    with open(file, 'r', encoding='utf8') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array([float(val) for val in split_line[1:]])
            glove_model[word] = embedding
    print(f"{len(glove_model)} words loaded!")
    return glove_model

glove_path = r'C:\Users\acer\Downloads\glove.6B\glove.6B.100d.txt'  # Update with your path
glove_model = load_glove_model(glove_path) 





Loading Glove Model
400000 words loaded!


In [128]:


def text_to_glove_vector(text, glove_model):
    words = word_tokenize(text.lower())
    word_vectors = [glove_model[word] for word in words if word in glove_model]
    
    if not word_vectors:
        return np.zeros(100)  
    else:
        return np.mean(word_vectors, axis=0)


df['glove_vector'] = df['processed_text'].apply(lambda x: text_to_glove_vector(x, glove_model))

In [129]:
from sklearn.model_selection import train_test_split

X = np.stack(df['glove_vector'].values)  
y = df['Label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [130]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC


parameters = {'C': [1, 10, 100], 'gamma': [0.01, 0.001, 0.0001]}
svm_model = SVC()
grid_search = GridSearchCV(svm_model, parameters, cv=3)
grid_search.fit(X_train, y_train)


best_model = grid_search.best_estimator_



from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report, accuracy_score 


y_scores = best_model.decision_function(X_test)
precisions, recalls, thresholds = precision_recall_curve(y_test, y_scores)


def adjusted_predict(model, X, threshold=0):
    scores = model.decision_function(X)
    return [1 if score > threshold else 0 for score in scores]


adjusted_pred = adjusted_predict(best_model, X_test, threshold=-0.5)  # Adjust threshold as needed
print("Adjusted Accuracy:", accuracy_score(y_test, adjusted_pred))
print(classification_report(y_test, adjusted_pred))




Adjusted Accuracy: 0.9617453676031081
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1447
           1       0.82      0.92      0.87       226

    accuracy                           0.96      1673
   macro avg       0.90      0.94      0.92      1673
weighted avg       0.96      0.96      0.96      1673



In [131]:

sensitive_keywords = ["password", "bank account", "credit card", "social security", 
                      "login details", "pin", "confirm password", "verify account"]

def contains_sensitive_info(sms, keywords):
    sms_lower = sms.lower()
    return any(keyword in sms_lower for keyword in keywords)

In [132]:
from textblob import TextBlob

def get_sentiment(sms):
    analysis = TextBlob(sms)
    return analysis.sentiment.polarity  


In [153]:
def predict_sms_with_threshold(sms, glove_model, model, threshold=0):
    if contains_sensitive_info(sms, sensitive_keywords):
        return 'Spam'
    
    sentiment_score = get_sentiment(sms)
    if sentiment_score < -0.5:  
        return 'Spam (Negative Sentiment)'
    
    sms_vector = text_to_glove_vector(sms, glove_model)
    score = model.decision_function([sms_vector])
    prediction = 1 if score > threshold else 0
    return 'Spam' if prediction == 1 else 'Ham'


sms = "Congratulations! You've been selected to win a free iPhone! Click the link to claim your prize now!"
threshold = -0.5 
prediction = predict_sms_with_threshold(sms, glove_model, best_model, threshold)
print(f"The message is classified as: {prediction}")


The message is classified as: Spam
