In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset
email_data = pd.read_csv('emails.csv')

# Drop the "Email No." column if it exists, as it's not needed.
# errors='ignore' turns this into a no-op (rather than a KeyError) when the
# file has no such column, which is what "if it exists" promises.
email_data = email_data.drop(columns=['Email No.'], errors='ignore')

# Assuming the last column is the target (adjust this if needed)
X = email_data.iloc[:, :-1]  # All columns except the last one
y = email_data.iloc[:, -1]   # The last column as the target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an SVM model with a linear kernel
svm = SVC(kernel='linear', C=1)
svm.fit(X_train, y_train)

# Evaluate the model
y_pred = svm.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Classify new email.
# Only the columns with a non-zero value need to be listed here; every other
# training column is filled with 0 below.
new_email_counts = {
    'the': 4,  # Example numerical feature values
    'to': 3,
    'ect': 1,
    # Add further columns from your dataset as needed
}

# Fail loudly on a misspelled column name instead of silently discarding it
# during the reindex.
unknown_columns = sorted(set(new_email_counts) - set(X.columns))
if unknown_columns:
    raise ValueError(f"Unknown feature columns for new email: {unknown_columns}")

# The model must be given exactly the columns it was trained on, in the same
# order; reindex builds that layout and zero-fills the unspecified columns.
new_email = pd.DataFrame([new_email_counts]).reindex(columns=X.columns, fill_value=0)

# Predict the label
new_email_label = svm.predict(new_email)
print("Classification result:", new_email_label)


In [7]:
import os
import shutil
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Create a simple dataset
def create_email_dataset(base_dir='emails'):
    """Write a tiny spam/ham text corpus to disk, one file per email.

    The layout is ``<base_dir>/<label>/<label>_<i>.txt`` with ``label`` being
    ``spam`` or ``ham``, i.e. one sub-directory per class.

    Args:
        base_dir: Root directory of the corpus. Defaults to ``'emails'``.
            WARNING: if this directory already exists it is deleted
            recursively before the corpus is rebuilt.

    Returns:
        None. The function is called for its filesystem side effects.
    """
    samples = {
        'spam': [
            "Win a million dollars now!",
            "You have won a free trip to Hawaii.",
            "Get rich quick with this one simple trick.",
        ],
        'ham': [
            "Hey, are we still meeting for lunch tomorrow?",
            "Here is the report you asked for.",
            "Don't forget the meeting at 3 PM.",
        ],
    }

    # Clear old data if it exists so stale files never end up in the corpus
    if os.path.exists(base_dir):
        shutil.rmtree(base_dir)

    for label, emails in samples.items():
        label_dir = os.path.join(base_dir, label)
        os.makedirs(label_dir)
        for i, email in enumerate(emails):
            file_path = os.path.join(label_dir, f'{label}_{i}.txt')
            # Write UTF-8 explicitly rather than the platform default, so the
            # on-disk encoding does not depend on the machine's locale.
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(email)

# Step 2: Create the dataset
create_email_dataset()

# Step 3: Load the dataset (one sub-directory per class; the sub-directory
# names become email_data.target_names)
email_data = datasets.load_files('emails', encoding='utf-8', decode_error='ignore')
X, y = email_data.data, email_data.target

# Look the spam label up by name instead of assuming it is encoded as 1.
spam_label = email_data.target_names.index('spam')

# Step 4: Split the raw texts into training and testing sets.
# The split happens BEFORE vectorizing so the test emails cannot influence the
# TF-IDF vocabulary or IDF weights. stratify=y keeps both classes in the
# training set, which the tiny corpus would otherwise not guarantee.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 5: Preprocess the data using TF-IDF, fitted on the training texts only
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Step 6: Train an SVM model with a linear kernel
svm = SVC(kernel='linear', C=1)
svm.fit(X_train, y_train)

# Step 7: Evaluate the model
y_pred = svm.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Step 8: Classify a new email
new_email = ["This is a free vacation offer!"]
new_email_tfidf = vectorizer.transform(new_email)
new_email_label = svm.predict(new_email_tfidf)
print("Classification result:", "Spam" if new_email_label[0] == spam_label else "Not Spam")


Accuracy: 1.0
Classification result: Spam
