<a href="https://colab.research.google.com/github/jagdaleyash/AIphishingdet/blob/main/AI_generated_phishing_detection_tool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize


# Download NLTK resources (only needed once)
nltk.download('punkt')
nltk.download('stopwords')

# Load the data
data = pd.read_csv('Phishing_paper1.csv', header=None, names=['url', 'label'])

# Drop the first row as it contains column names
data = data.drop(data.index[0])

# Remove rows with missing values
data = data.dropna()

# Convert label column to numeric values
data['label'] = pd.to_numeric(data['label'])

# Replace NaN values with empty strings
data['url'].fillna('', inplace=True)

# Convert url column to string
data['url'] = data['url'].astype(str)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  data = pd.read_csv('Phishing_paper1.csv', header=None, names=['url', 'label'])


In [23]:
# Define function to clean url and detect AI-generated phishing links
def clean_url(url):
    url = url.lower()
    url = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))','URL',url)
    url = re.sub(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', 'IPADDRESS', url)
    url = re.sub(r'[^\w\s]','',url)
    url = re.sub(r'\s+', ' ', url)

    # AI detection logic
    ai_keywords = ['ai', 'deepfake', 'neural', 'model', 'generated', 'automated']
    if any(keyword in url for keyword in ai_keywords):
        url = re.sub(r'ai|deepfake|neural|model|generated|automated', 'AIKEYWORD', url)

    return url

def style_ai_detection(email_content):
    # Placeholder example: Analyze the email content's use of punctuation and capitalization
    # If the content exhibits very formal, overly precise punctuation and capitalization patterns, it might indicate AI
    # Note: A more sophisticated analysis would involve training a model on real AI-generated content.

    # Sample patterns to detect: excessive use of exclamation marks, formal capitalization of every word, etc.
    ai_patterns = [r'\!\!+', r'^[A-Z][a-z]*\s' * len(email_content.split()) + r'[\.!?]$']
    ai_detected = any(re.search(pattern, email_content) for pattern in ai_patterns)

    return ai_detected

def content_source_analysis(email_content):
    # Placeholder example: Check if the email content contains links from known reliable sources
    # AI-generated content might frequently use obscure or unverified sources

    # Sample list of reputable domains
    reputable_domains = ['nytimes.com', 'bbc.com', 'wikipedia.org']
    links = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', email_content)

    ai_detected = any(domain not in link for link in links for domain in reputable_domains)

    return ai_detected

def linguistic_analysis(email_content):
    # Placeholder example: Check if the email content contains overly complex sentence structures
    # AI-generated content might overuse complex and unnatural sentence structures

    # Sample indicators: Long sentences, many clauses, excessive semicolons
    sentence_tokens = sent_tokenize(email_content)
    complex_sentence_threshold = 0.3
    complex_sentences = [sentence for sentence in sentence_tokens if len(sentence.split()) > 20]

    ai_detected = len(complex_sentences) / len(sentence_tokens) > complex_sentence_threshold

    return ai_detected


# Define function to perform AI content detection
def detect_ai_content(email_content):
    # Preprocess the email content
    email_content = clean_url(email_content)
    email_tokens = tokenize_url(email_content)
    email_tokens = remove_stop_words(email_tokens)
    email_content = ' '.join(email_tokens)

    # Check for AI patterns using keywords
    ai_keywords = ['ai', 'deepfake', 'neural', 'model', 'generated', 'automated']
    ai_presence = any(keyword in email_content for keyword in ai_keywords)

    # Additional AI detection strategies
    style_ai_detection_result = style_ai_detection(email_content)  # Replace with your style analysis function
    content_source_analysis_result = content_source_analysis(email_content)  # Replace with your content source analysis function
    linguistic_analysis_result = linguistic_analysis(email_content)  # Replace with your linguistic analysis function

    # Combine results from different detection strategies
    ai_detected = ai_presence or style_ai_detection_result or content_source_analysis_result or linguistic_analysis_result

    return ai_detected

data['url'] = data['url'].apply(clean_url)

# Tokenize the urls
def tokenize_url(url):
    tokens = word_tokenize(url)
    return tokens

data['tokens'] = data['url'].apply(tokenize_url)

# Remove stop words
stop_words = set(stopwords.words('english'))

def remove_stop_words(tokens):
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return filtered_tokens

data['tokens'] = data['tokens'].apply(remove_stop_words)

from sklearn.feature_extraction.text import CountVectorizer

# create CountVectorizer object
vectorizer = CountVectorizer()

# join the tokens into a single string
data['tokens'] = data['tokens'].apply(lambda x: ' '.join(x))

# fit and transform the tokenized URLs
X = vectorizer.fit_transform(data['tokens'])

# print the shape of the vectorized URLs
print('Vectorized URLs shape:', X.shape)

Vectorized URLs shape: (133803, 1787)


In [24]:
from sklearn.model_selection import train_test_split

# create training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, data['label'], test_size=0.3, random_state=42)

# print the shape of the training and testing sets
print('Training set shape:', X_train.shape, y_train.shape)
print('Testing set shape:', X_test.shape, y_test.shape)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# create logistic regression model with a high maximum number of iterations
logreg = LogisticRegression(max_iter=10000)

# fit the model to the training data
logreg.fit(X_train, y_train)

# predict on the testing data
y_pred = logreg.predict(X_test)

# calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)

# print accuracy score
print('Accuracy:', accuracy)

# create logistic regression model with a high maximum number of iterations
logreg = LogisticRegression(max_iter=10000)

# fit the model to the entire preprocessed dataset
logreg.fit(X_train, y_train)

Training set shape: (93662, 1787) (93662,)
Testing set shape: (40141, 1787) (40141,)
Accuracy: 0.9374704167808475


In [25]:
# preprocess the email
email = 'http://example.com'

email = clean_url(email)
tokens = tokenize_url(email)
tokens = remove_stop_words(tokens)
email = ' '.join(tokens)

# vectorize the preprocessed email
email_vector = vectorizer.transform([email])

# predict whether the email is phishing or not using the trained model
prediction = logreg.predict(email_vector)

if prediction == 1:
    print('This email is a phishing email')
else:
    print('This email is not a phishing email')

from sklearn.metrics import classification_report

This email is not a phishing email


In [26]:
# compute classification report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97     37631
         1.0       0.00      0.00      0.00      2510

    accuracy                           0.94     40141
   macro avg       0.47      0.50      0.48     40141
weighted avg       0.88      0.94      0.91     40141



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
# Example email content
email_content = "Dear customer, we have detected suspicious activity on your account. Please click on the link to verify your account: http://example.com/verify?id=1kjjk23"

# Detect AI content
ai_detected = detect_ai_content(email_content)

if ai_detected:
    print("The email content indicates AI-generated.")
else:
    print("The email content does not indicate AI-generated.")


The email content does not indicate AI-generated.


In [28]:
# example email
email = "Dear customer, we have detected suspicious activity on your account. Please click on the link to verify your account: http://example.com/verify?id=1kjjk23"

# preprocess the email
email = clean_url(email)
tokens = tokenize_url(email)
tokens = remove_stop_words(tokens)
email = ' '.join(tokens)

# vectorize the email using the trained vectorizer
email_vector = vectorizer.transform([email])

# predict whether the email is phishing or not using the trained model
prediction = logreg.predict(email_vector)

if prediction == 1:
    print("The email is a phishing email.")
else:
    print("The email is not a phishing email.")

The email is not a phishing email.
