In [2]:
# ThreatScan Project: Spam, Phishing, and Scam Detection

In [3]:
# First, install all the necessary libraries for the project
# Run this cell once to make sure your environment is set up
!pip install pandas numpy scikit-learn nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.9.1



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
## Part 1: Message Spam Detection
##This section loads the `spam.csv` dataset, trains a Naïve Bayes classifier to identify spam messages, and provides a function to test new messages.

In [5]:
# Import libraries for message spam detection
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

print("Libraries for message spam detection imported.")

Libraries for message spam detection imported.


In [15]:
# Load the dataset and preprocess the text data
try:
    # 1. Load the CSV without a header
    df_msg = pd.read_csv('spam.csv', encoding='latin-1', header=None)

    # 2. Select the first two columns
    df_msg = df_msg.iloc[:, :2]
    
    # 3. Rename the columns
    df_msg.columns = ['label', 'message']

    # 4. Map the values in the 'label' column
    df_msg['label'] = df_msg['label'].map({'ham': 0, 'spam': 1})

    # 5. Drop any rows where the label is now NaN (THIS IS THE FIX)
    df_msg.dropna(inplace=True)

    print("Spam dataset loaded and cleaned successfully.")
    display(df_msg.head())

except FileNotFoundError:
    print("Error: 'spam.csv' not found. Make sure it's in the same folder as this notebook.")
except Exception as e:
    print(f"An error occurred: {e}")

Spam dataset loaded and cleaned successfully.


Unnamed: 0,label,message
1,0.0,"Go until jurong point, crazy.. Available only ..."
2,0.0,Ok lar... Joking wif u oni...
3,1.0,Free entry in 2 a wkly comp to win FA Cup fina...
4,0.0,U dun say so early hor... U c already then say...
5,0.0,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
# Split data, vectorize text, and train the model
if 'df_msg' in locals():
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(df_msg['message'], df_msg['label'], test_size=0.2, random_state=42)

    # Vectorize the text data using CountVectorizer
    vectorizer = CountVectorizer()
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Initialize and train the Naive Bayes model
    spam_model = MultinomialNB()
    spam_model.fit(X_train_vec, y_train)

    # Make predictions and check the model's accuracy
    predictions = spam_model.predict(X_test_vec)
    print(f"Message Spam Model Accuracy: {accuracy_score(y_test, predictions):.2%}")

Message Spam Model Accuracy: 98.39%


In [17]:
# Create a function to predict any new message
def predict_message(message):
    """Takes a message string and predicts if it's Spam or Not Spam."""
    if 'spam_model' in locals():
        message_vec = vectorizer.transform([message])
        prediction = spam_model.predict(message_vec)
        return "Spam" if prediction[0] == 1 else "Not Spam"
    else:
        return "Model not trained yet."

# --- Test the message prediction function ---
test_message_1 = "Congratulations! You've won a $1000 Walmart gift card. Click here to claim."
print(f"The message '{test_message_1}' is: {predict_message(test_message_1)}")

test_message_2 = "Hey, are we still on for the meeting tomorrow at 2 PM?"
print(f"The message '{test_message_2}' is: {predict_message(test_message_2)}")

The message 'Congratulations! You've won a $1000 Walmart gift card. Click here to claim.' is: Model not trained yet.
The message 'Hey, are we still on for the meeting tomorrow at 2 PM?' is: Model not trained yet.


In [18]:
## Part 2: Phishing URL Detection

This section is for the logic from your `url_safety_checker.py` file. You will need to load the `malicious_phish.csv` dataset and copy your feature extraction and model training code here.

SyntaxError: invalid syntax (2094872278.py, line 3)

In [19]:
# --- PASTE YOUR URL DETECTION CODE FROM YOUR .py FILE HERE ---
# Example:
# 1. Load the 'malicious_phish.csv' dataset using pandas.
# 2. Define and apply your feature extraction functions.
# 3. Train your model.
# 4. Create your prediction function.

print("Placeholder: Add your URL detection code here.")

Placeholder: Add your URL detection code here.
