In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import joblib

In [17]:
df = pd.read_csv('CEAS_08.csv')


In [18]:
df.shape



(39154, 7)

In [19]:
# Fill NaN values with an empty string
df['body'].fillna('', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['body'].fillna('', inplace=True)


In [20]:
df.loc[df['Category'] == 'spam', 'Category',] = 1
df.loc[df['Category'] == 'ham', 'Category',] = 0


KeyError: 'Category'

In [21]:
X = df['body']
Y = df['label']

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)


In [23]:
feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase=True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')


Model training

In [24]:
model = LogisticRegression()
model.fit(X_train_features, Y_train)

In [25]:
from sklearn.metrics import accuracy_score, classification_report

prediction = model.predict(X_test_features)
accuuracy_on_test = accuracy_score(Y_test, prediction)

In [26]:

print(accuuracy_on_test)

0.993359724173158


In [27]:
import joblib

joblib.dump(model, 'phishing_model.pkl')
joblib.dump(feature_extraction, 'vectorizer.pkl')


['vectorizer.pkl']

In [29]:
# Load model and vectorizer
model = joblib.load('phishing_model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

# Sample data for testing
sample_texts = ["This is a phishing email. Click here to win a million dollars.", 
                "Meeting at 10am tomorrow in the office."]
sample_features = vectorizer.transform(sample_texts)
predictions = model.predict(sample_features)

for text, prediction in zip(sample_texts, predictions):
    print(f'Email: {text}\nPrediction: {"Phishing" if prediction == 1 else "Safe"}\n')


Email: This is a phishing email. Click here to win a million dollars.
Prediction: Phishing

Email: Meeting at 10am tomorrow in the office.
Prediction: Phishing

