In [40]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import pickle

# Load the e-mail dataset from the hard-coded local CSV path.
df = pd.read_csv(r"C:\Users\admin\Downloads\CEAS_08.csv\File.csv")

# Bare inspection expressions: their results are discarded unless they are
# the last expression of an interactive cell.
df.head()
df['label'].value_counts()

# Build one text field per e-mail from subject and body; missing values are
# replaced by empty strings so the concatenation never yields NaN.
subject_part = df['subject'].fillna('')
body_part = df['body'].fillna('')
df['text'] = subject_part + ' ' + body_part

# Patterns used by clean_text, compiled once instead of on every call.
# URL matching ignores case so "HTTP://..." and "WWW...." are stripped too;
# a separate "https" alternative is unnecessary because "http\S+" covers it.
_URL_PATTERN = re.compile(r'http\S+|www\S+', re.IGNORECASE)
_NON_ALNUM_PATTERN = re.compile(r'[^a-zA-Z0-9\s]')


def clean_text(text):
    """Normalise raw e-mail text before vectorisation.

    Steps, in order:
      1. Remove every whitespace-delimited run that starts with ``http`` or
         ``www`` (case-insensitive), i.e. URLs.
      2. Remove every character that is not an ASCII letter, a digit or
         whitespace. Characters are deleted, not replaced by a space.
      3. Lower-case the result.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The cleaned, lower-cased string. Whitespace around removed URLs is
        kept, so consecutive spaces may remain.

    Raises:
        TypeError: If ``text`` is not a string (raised by ``re``).
    """
    text = _URL_PATTERN.sub('', text)
    text = _NON_ALNUM_PATTERN.sub('', text)
    return text.lower()

# Run clean_text over every combined subject/body string.
df['clean_text'] = df['text'].apply(clean_text)

# Bare inspection expression: result is discarded unless it is the last
# expression of an interactive cell.
df[['clean_text', 'label']].head()

# Model inputs are the cleaned texts; targets are the 'label' column.
X, y = df['clean_text'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



# TF-IDF features limited to 5000 terms, with English stop words removed.
# The vocabulary is learned from the training split only; the test split is
# merely transformed with it.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Logistic regression with the library's default hyper-parameters.
# fit() returns the estimator itself, so `model` is the fitted classifier.
model = LogisticRegression().fit(X_train_vec, y_train)



# Persist the fitted classifier and the fitted vectorizer as pickle files in
# the current working directory (model first, vectorizer second).
for artifact_path, artifact in (
    ("phishing_model.pkl", model),
    ("tfidf_vectorizer.pkl", vectorizer),
):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)


# Predict labels for the held-out split.
y_pred = model.predict(X_test_vec)

# Overall accuracy on the held-out split.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Per-class precision / recall / F1 summary.
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)





# Vocabulary terms, positionally aligned with the model's coefficient vector.
feature_names = vectorizer.get_feature_names_out()

# First (and, for a two-class model, only) row of coefficients.
coefs = model.coef_[0]

# argsort is ascending, so the last ten indices are the ten largest
# coefficients, listed from the smallest of them to the largest.
top_phishing = np.argsort(coefs)[-10:]

# Heading (Arabic): "The words most associated with phishing e-mails".
# Assumes label 1 marks phishing, as predict_email does.
print("🔴 أكثر الكلمات المرتبطة بالإيميلات التصيّدية:")
for word, weight in zip(feature_names[top_phishing], coefs[top_phishing]):
    print(word, ":", weight)



def predict_email(text):
    """Classify one raw e-mail text with the trained model.

    The text is normalised with ``clean_text``, transformed with the
    module-level ``vectorizer`` and passed to the module-level ``model``.

    Args:
        text: Raw e-mail text (subject and/or body) as a string.

    Returns:
        ``"Phishing ⚠️"`` when the predicted label equals 1, otherwise
        ``"Legit ✅"``.
    """
    features = vectorizer.transform([clean_text(text)])
    label = model.predict(features)[0]
    if label == 1:
        return "Phishing ⚠️"
    return "Legit ✅"

# Quick manual check of the prediction helper on a sample message.
email = "Please update your password by clicking this secure link now!"
verdict = predict_email(email)
print(verdict)



Accuracy: 0.9934874217852127

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      3490
           1       0.99      1.00      0.99      4341

    accuracy                           0.99      7831
   macro avg       0.99      0.99      0.99      7831
weighted avg       0.99      0.99      0.99      7831

