# In [None]: (Jupyter notebook cell marker)
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
# Read the labelled e-mail corpus from disk
df = pd.read_csv('mail_data.csv')

# Summarise the dataset: row count, column names, missing values per column
row_count = df.shape[0]
column_names = df.columns.tolist()
print(f"Number of instances (emails): {row_count}")
print(f"Attributes: {column_names}")
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Fetch the NLTK corpora needed by the text-cleaning step
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Shared helpers read by preprocess_text: the English stop-word set and
# a WordNet lemmatizer instance
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    """Normalise a raw message into a space-separated string of lemmas.

    HTML-like tags are stripped, every run of non-word characters is
    replaced by a single space (letters, digits and underscores are kept),
    and the text is lower-cased and split on whitespace.  Tokens found in
    the module-level ``stop_words`` set are dropped and the remaining
    tokens are passed through the module-level ``lemmatizer``.
    """
    without_tags = re.sub(r'<.*?>', '', text)
    cleaned = re.sub(r'\W+', ' ', without_tags).lower()

    kept = []
    for token in cleaned.split():
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)
# Clean every message and encode the target: 'spam' -> 1, anything else -> 0
df['processed_text'] = df['Message'].apply(preprocess_text)
df['label'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)
# Display the first few rows of the processed dataset
print(df.head())

# Split the cleaned text BEFORE vectorising.  Fitting TF-IDF on the whole
# corpus would let the vocabulary and IDF weights be learned from the test
# rows (data leakage) and make the hold-out metrics optimistic.  The row
# partition depends only on the number of samples and random_state, so it
# is the same partition as splitting the vectorised matrix.
y = df['label']
text_train, text_test, y_train, y_test = train_test_split(
    df['processed_text'], y, test_size=0.2, random_state=42
)

# Feature Extraction: fit on the training messages only, then reuse the
# fitted vocabulary/IDF weights for the test messages
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus matrix in the training feature space, kept for the shape report
X = vectorizer.transform(df['processed_text'])
print(X.shape)
# Hyperparameter search space for each classifier, keyed by the display
# name that is also used to look the classifier up.
param_grids = {}

param_grids["Logistic Regression"] = dict(
    C=[0.01, 0.1, 1, 10, 100],
    solver=['newton-cg', 'lbfgs', 'liblinear'],
)

param_grids["Naive Bayes"] = dict(
    alpha=[0.01, 0.1, 1, 10],
)

param_grids["Support Vector Machine"] = dict(
    C=[0.01, 0.1, 1, 10, 100],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
)

param_grids["Decision Tree"] = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20, 30],
)

param_grids["Random Forest"] = dict(
    n_estimators=[50, 100],
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20],
)

param_grids["k-Nearest Neighbors"] = dict(
    n_neighbors=[3, 5, 7],
    weights=['uniform', 'distance'],
)

param_grids["Gradient Boosting"] = dict(
    n_estimators=[50, 100],
    learning_rate=[0.01, 0.1],
    max_depth=[3, 4],
)
# Define a dictionary of classifiers, keyed by the display name that is
# also used to look up each model's hyperparameter grid.
#
# Every estimator with a stochastic component gets a fixed random_state
# (the same seed used for the train/test split) so that refitting the same
# model in later steps reproduces the same scores instead of a different
# "before optimization" number on every run.
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": MultinomialNB(),
    # probability=True enables predict_proba; its internal calibration is
    # randomised, hence the seed.
    "Support Vector Machine": SVC(probability=True, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "k-Nearest Neighbors": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}
# Evaluate and optimize each classifier
def _positive_class_scores(model, features, hard_predictions):
    """Return one continuous score per row for the positive (spam) class.

    Prefers class probabilities, then the decision-function margin.  When
    the model offers neither, the hard 0/1 predictions are used so that
    ROC-AUC still reflects the model instead of a constant score.
    """
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(features)[:, 1]
    if hasattr(model, 'decision_function'):
        return model.decision_function(features)
    return hard_predictions


def _print_test_metrics(model):
    """Print hold-out metrics for an already fitted *model*.

    Reads the module-level ``X_test`` / ``y_test`` split.
    """
    y_pred = model.predict(X_test)
    y_prob = _positive_class_scores(model, X_test, y_pred)

    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
    print(f"AUC-ROC: {roc_auc_score(y_test, y_prob):.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))


best_estimators = {}

for name, clf in classifiers.items():
    # Baseline: default hyperparameters
    print(f"\n{name} - Before Optimization")
    clf.fit(X_train, y_train)
    _print_test_metrics(clf)

    # Optimize with GridSearchCV (5-fold CV on the training split only)
    print(f"\n{name} - After Optimization")
    param_grid = param_grids[name]
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print("Best parameters:", grid_search.best_params_)
    print("Best cross-validation score:", grid_search.best_score_)

    # With the default refit=True, best_estimator_ has already been refitted
    # on the whole training split, so it is evaluated as-is.
    best_clf = grid_search.best_estimator_
    best_estimators[name] = best_clf
    _print_test_metrics(best_clf)
import matplotlib.pyplot as plt

# Fit each untuned classifier and record its hold-out accuracy
classifier_names = []
accuracies = []
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    classifier_names.append(name)
    accuracies.append(accuracy)

# Performance metrics, one entry per classifier, as a dict and a DataFrame
metrics = {"Classifier": classifier_names, "Accuracy": accuracies}
metrics_df = pd.DataFrame(metrics)

# Bar chart of the accuracies
plt.figure(figsize=(10, 6))
bars = plt.bar(metrics_df["Classifier"], metrics_df["Accuracy"], color='b')
plt.title('Classifier Accuracy Before Optimization')
plt.xlabel('Classifiers')
plt.ylabel('Accuracy')
plt.xticks(rotation=45, ha="right")

# Write each bar's value just above it (3 points vertical offset)
for bar in bars:
    height = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2
    plt.annotate(
        f'{height:.4f}',
        xy=(x_center, height),
        xytext=(0, 3),
        textcoords="offset points",
        ha='center',
        va='bottom',
    )

plt.tight_layout()
plt.show()
# Per-classifier hold-out accuracy, with default and with tuned hyperparameters
model_names = []
before_scores = []
after_scores = []

# Tuned estimator for every classifier, keyed by display name
best_estimators = {}

for name, clf in classifiers.items():
    model_names.append(name)

    # Accuracy with default hyperparameters
    clf.fit(X_train, y_train)
    accuracy_before = accuracy_score(y_test, clf.predict(X_test))
    before_scores.append(accuracy_before)

    # Grid search over this classifier's parameter grid
    grid_search = GridSearchCV(
        clf, param_grids[name], cv=5, scoring='accuracy', n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    best_clf = grid_search.best_estimator_
    best_estimators[name] = best_clf

    # Accuracy of the tuned estimator
    accuracy_after = accuracy_score(y_test, best_clf.predict(X_test))
    after_scores.append(accuracy_after)

metrics = {
    "Classifier": model_names,
    "Before Optimization": before_scores,
    "After Optimization": after_scores,
}
metrics_df = pd.DataFrame(metrics)

# One two-bar figure per classifier: before vs. after optimization
for classifier, before, after in metrics_df.itertuples(index=False, name=None):
    plt.figure(figsize=(6, 4))
    bars = plt.bar(
        ["Before Optimization", "After Optimization"],
        [before, after],
        color=['b', 'g'],
    )
    plt.title(f'{classifier} Accuracy Before and After Optimization')
    plt.xlabel('Optimization Status')
    plt.ylabel('Accuracy')

    # Write each bar's value just above it (3 points vertical offset)
    for bar in bars:
        height = bar.get_height()
        x_center = bar.get_x() + bar.get_width() / 2
        plt.annotate(
            f'{height:.4f}',
            xy=(x_center, height),
            xytext=(0, 3),
            textcoords="offset points",
            ha='center',
            va='bottom',
        )

    plt.tight_layout()
    plt.show()
# Requires the third-party package joblib (install once from a shell with:
# pip install joblib).  The notebook shell escape "!pip install joblib" is
# not valid Python, so it is kept here only as a comment.
import joblib
from sklearn.ensemble import RandomForestClassifier

# Refit the vectorizer on every processed message so the saved model and
# vectorizer are built from the whole dataset
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['processed_text'])
y = df['label']

random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(X, y)

# Save the trained model
joblib.dump(random_forest_model, 'random_forest_model.pkl')
# Save the vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')
# Function to check if the email is spam
def check_spam(email_text):
    """Return whether the random forest labels *email_text* as spam.

    The text is cleaned with ``preprocess_text``, turned into features by
    the module-level ``vectorizer`` and classified by the module-level
    ``random_forest_model``; the result is truthy when the predicted label
    equals 1.
    """
    features = vectorizer.transform([preprocess_text(email_text)])
    predicted_label = random_forest_model.predict(features)[0]
    return predicted_label == 1

# Interactive check: classify a single message typed by the user with the
# RandomForestClassifier
input_mail = input("Enter your mail content: ")
is_spam = check_spam(input_mail)

verdict = "It is a spam mail" if is_spam else "It is not a spam mail"
print(verdict)
# Requires the third-party package termcolor (install once from a shell with: pip install termcolor)
import imaplib
import email
from email.header import decode_header
import getpass
import joblib
import re
from termcolor import colored

# Load the pre-trained Random Forest model and the vectorizer.
# NOTE: joblib.load unpickles arbitrary Python objects, so only load files
# that come from a trusted source (here: the .pkl files written by the
# training step of this script).
try:
    random_forest_model = joblib.load('random_forest_model.pkl')
    vectorizer = joblib.load('vectorizer.pkl')
except FileNotFoundError as e:
    print(f"Model or vectorizer file not found: {e}")
    # exit() is an interactive-shell helper injected by the site module and
    # is not guaranteed to exist; raise SystemExit directly instead.
    raise SystemExit(1)

# Function to preprocess the email text for spam detection
def preprocess_text(text):
    """Clean an e-mail body the same way the training messages were cleaned.

    The saved vectorizer was fitted on text that had HTML-like tags
    stripped, runs of non-word characters replaced by a single space,
    lower-casing applied, stop words removed and every token lemmatized.
    Incoming mail has to go through the same steps; otherwise its tokens do
    not line up with the learned vocabulary (for example, deleting
    punctuation outright glues "free!!!call" into the unseen token
    "freecall").

    Relies on the module-level ``stop_words`` set and ``lemmatizer``.
    """
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Replace runs of non-word characters with one space (digits are kept)
    text = re.sub(r'\W+', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Drop stop words and lemmatize what remains
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(words)

# Function to check if the email is spam
def check_spam(email_text):
    """Classify *email_text* and report whether the predicted label is 1.

    Uses ``preprocess_text`` for cleaning, the loaded ``vectorizer`` for
    feature extraction and the loaded ``random_forest_model`` for the
    prediction.
    """
    cleaned_mail = preprocess_text(email_text)
    feature_matrix = vectorizer.transform([cleaned_mail])
    labels = random_forest_model.predict(feature_matrix)
    first_label = labels[0]
    return first_label == 1

def _decode_subject(msg):
    """Return the Subject header of *msg* as text ("" when it is missing)."""
    raw_subject = msg["Subject"]
    if raw_subject is None:
        return ""
    pieces = []
    # A header may consist of several differently encoded fragments
    for fragment, charset in decode_header(raw_subject):
        if isinstance(fragment, bytes):
            try:
                fragment = fragment.decode(charset or 'utf-8', errors='replace')
            except LookupError:
                # Charset name unknown to Python: fall back to UTF-8
                fragment = fragment.decode('utf-8', errors='replace')
        pieces.append(fragment)
    return "".join(pieces)


def _decode_part_text(part):
    """Return the decoded payload of one message part as text ("" if empty)."""
    payload = part.get_payload(decode=True)
    if not payload:
        return ""
    charset = part.get_content_charset() or 'utf-8'
    try:
        return payload.decode(charset, errors='replace')
    except LookupError:
        # Charset name unknown to Python: fall back to UTF-8
        return payload.decode('utf-8', errors='replace')


def _extract_plain_text_body(msg):
    """Return the body text of *msg*.

    For multipart messages, the text/plain parts that are not attachments
    are concatenated, each followed by a newline.  For single-part messages
    the payload is returned whatever its content type.
    """
    if not msg.is_multipart():
        return _decode_part_text(msg)

    chunks = []
    for part in msg.walk():
        if part.get_content_type() != "text/plain":
            continue
        if "attachment" in str(part.get("Content-Disposition")):
            continue
        text = _decode_part_text(part)
        if text:
            chunks.append(text + "\n")
    return "".join(chunks)


# Function to fetch emails from Gmail using IMAP
def fetch_emails(username, password, count=5):
    """Print subject, body and spam verdict for the latest inbox messages.

    Logs in to ``imap.gmail.com`` over SSL, selects the inbox, and walks the
    last ``count`` ids returned by an ``ALL`` search, newest first.  Each
    message body is classified with ``check_spam`` and the verdict is
    printed in red (spam) or green (not spam).

    Args:
        username: Account name used for the IMAP login.
        password: Password used for the IMAP login.
        count: Maximum number of messages to process; values <= 0 process
            nothing.

    Returns:
        None.  Failures are reported on stdout rather than raised.
    """
    try:
        # The context manager logs out on every exit path, including errors
        with imaplib.IMAP4_SSL('imap.gmail.com') as mail:
            mail.login(username, password)

            # Select the inbox
            mail.select('inbox')

            # Search for all emails
            status, data = mail.search(None, 'ALL')
            if status != 'OK':
                print(f"Error fetching emails: search returned {status}")
                return
            mail_ids = data[0].split()

            # Last `count` ids, newest first
            newest_first = mail_ids[::-1][:max(count, 0)]
            for position, mail_id in enumerate(newest_first, start=1):
                status, data = mail.fetch(mail_id, '(RFC822)')
                if status != 'OK':
                    continue
                for response_part in data:
                    if not isinstance(response_part, tuple):
                        continue
                    msg = email.message_from_bytes(response_part[1])

                    subject = _decode_subject(msg)
                    print(f"Email {position}: Subject - {subject}")

                    body = _extract_plain_text_body(msg)
                    print(f"Body: {body.strip()}")

                    is_spam = check_spam(body)
                    spam_status = "Spam" if is_spam else "Not Spam"
                    color = "red" if is_spam else "green"
                    print(colored(f"Spam Status: {spam_status}", color))
    except Exception as e:
        # Top-level boundary of an interactive tool: report the failure
        # instead of crashing with a traceback.
        print(f"Error fetching emails: {e}")
def get_credentials():
    """Prompt for the Gmail address and password.

    The password is read with ``getpass`` so it is not echoed.  Returns an
    ``(email_address, password)`` tuple.
    """
    address = input("Enter your Gmail email address: ")
    secret = getpass.getpass("Enter your password: ")
    return (address, secret)
def main():
    """Ask for credentials, then check the five latest inbox messages."""
    credentials = get_credentials()
    fetch_emails(*credentials, count=5)

# Run the interactive Gmail spam check only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


