<a href="https://colab.research.google.com/github/imbukwa1/AI-Based-Tomato-Health-Monitoring-For-Farmers/blob/main/email_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score


In [None]:
# Source archive for the SMS Spam Collection dataset (UCI ML repository)
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases"
    "/00228/smsspamcollection.zip"
)


In [None]:
import os

# Early load of the tab-separated dataset into `df` (columns: label, message).
# The archive is only downloaded and extracted by a later cell, so on a fresh
# kernel the file is not on disk yet and an unconditional read would raise
# FileNotFoundError and break "Restart & Run All". Skip the read in that case;
# the load that follows the download cell populates `df`.
if os.path.exists("SMSSpamCollection"):
    df = pd.read_csv("SMSSpamCollection", sep='\t', header=None, names=["label", "message"])


In [None]:
import io
import os
import zipfile

import requests

# Fetch and unpack the dataset archive only when the extracted file is not
# already on disk, so re-running the notebook does not download it again.
if not os.path.exists("SMSSpamCollection"):
    # A timeout keeps the cell from hanging forever on a stalled connection
    # (requests applies no timeout unless one is passed explicitly).
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Raise an exception for bad status codes

    # Extract the archive contents into the current working directory
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        z.extractall()

In [None]:
# Parse the extracted tab-separated file (read without a header row) into a
# two-column frame, then preview the first rows.
# NOTE(review): default quote handling is in effect here; if any message
# contains an unbalanced quote character, rows may be merged — verify the row
# count against the dataset's documentation.
df = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "message"],
)
display(df.head())

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Class balance: how many messages carry each label
df.loc[:, "label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,4825
spam,747


In [None]:
# Encode the string labels as integers (ham -> 0, spam -> 1).
# The column is overwritten in place, so the encoding is applied only while it
# still holds the raw strings: re-running this cell would otherwise map the
# already-encoded 0/1 values to NaN.
label_codes = {'ham': 0, 'spam': 1}
if not pd.api.types.is_numeric_dtype(df['label']):
    encoded_labels = df['label'].map(label_codes)
    if encoded_labels.isna().any():
        # Fail loudly instead of silently passing NaN targets to the models.
        unknown = sorted(set(df.loc[encoded_labels.isna(), 'label'].astype(str)))
        raise ValueError(f"Unexpected label values: {unknown}")
    df['label'] = encoded_labels.astype('int64')

# Hold out 20% of the messages for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'], df['label'], test_size=0.2, random_state=42
)

In [None]:
 #Vectorize text (TF-IDF)
# Drop English stop words, learn the vocabulary and IDF weights from the
# training messages only, then reuse that fitted vectorizer on the test
# messages so both matrices share the same feature space.
vectorizer = TfidfVectorizer(stop_words="english")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Naive Bayes: fit a multinomial model on the TF-IDF training matrix
clf = MultinomialNB().fit(X_train_tfidf, y_train)

# Score the held-out messages: overall accuracy plus per-class metrics
y_pred = clf.predict(X_test_tfidf)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.97847533632287

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.84      0.91       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
# Train a decision tree.
# The tree's split selection involves randomness, so a fixed seed (the same
# value used for the train/test split) is needed for the reported metrics to be
# reproducible from one run to the next.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_tfidf, y_train)

# Evaluate on the held-out messages
y_pred = clf.predict(X_test_tfidf)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9704035874439462

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98       966
           1       0.91      0.87      0.89       149

    accuracy                           0.97      1115
   macro avg       0.94      0.93      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [None]:
# Support vector machine with scikit-learn's default settings
clf = SVC().fit(X_train_tfidf, y_train)

# Score the held-out messages: overall accuracy plus per-class metrics
y_pred = clf.predict(X_test_tfidf)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))




Accuracy: 0.9847533632286996

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.89      0.94       149

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.97      1115
weighted avg       0.99      0.98      0.98      1115



In [None]:
# Train a random forest classifier.
# Bootstrapping and feature sub-sampling are random, so a fixed seed (the same
# value used for the train/test split) is needed for the reported metrics to be
# reproducible from one run to the next.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_tfidf, y_train)

# Evaluate on the held-out messages
y_pred = clf.predict(X_test_tfidf)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.97847533632287

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.84      0.91       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
# k-nearest neighbours with scikit-learn's default settings
clf = KNeighborsClassifier().fit(X_train_tfidf, y_train)

# Score the held-out messages: overall accuracy plus per-class metrics
y_pred = clf.predict(X_test_tfidf)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.9130044843049328

Classification Report:
               precision    recall  f1-score   support

           0       0.91      1.00      0.95       966
           1       1.00      0.35      0.52       149

    accuracy                           0.91      1115
   macro avg       0.95      0.67      0.73      1115
weighted avg       0.92      0.91      0.89      1115



In [None]:
# Logistic regression with scikit-learn's default settings
clf = LogisticRegression().fit(X_train_tfidf, y_train)

# Score the held-out messages: overall accuracy plus per-class metrics
y_pred = clf.predict(X_test_tfidf)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.9695067264573991

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [None]:
# Compare all classifiers on the same TF-IDF features.
# Every model is fitted here from a fresh instance, so this cell does not
# depend on whichever estimator `clf` referred to last.
models = {
    'Naive Bayes': MultinomialNB(),
    # Seeded so the tree-based results are reproducible between runs
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Support Vector Machines': SVC(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'K-Nearest Neighbours': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression()
}

# Collect one record per model and build the DataFrame once at the end.
# Concatenating onto an empty frame inside the loop is quadratic, triggers a
# pandas FutureWarning, and leaves the metric columns with object dtype.
metric_rows = []
for model_name, model in models.items():
    # Train the model and predict on the test set
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    # Precision / recall / F1 are reported for the positive class (spam = 1)
    report = classification_report(y_test, y_pred, output_dict=True)
    spam_scores = report['1']

    metric_rows.append({
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': spam_scores['precision'],
        'Recall': spam_scores['recall'],
        'F1-Score': spam_scores['f1-score'],
    })

metrics_df = pd.DataFrame(
    metric_rows,
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'],
)

# Display the comparison table
display(metrics_df)

# Identify the best performing model (highest accuracy)
best_model = metrics_df.loc[metrics_df['Accuracy'].idxmax()]
print("\nBest Performing Model:")
print(best_model)

# Identify the worst performing model (lowest accuracy)
worst_model = metrics_df.loc[metrics_df['Accuracy'].idxmin()]
print("\nWorst Performing Model:")
print(worst_model)

  metrics_df = pd.concat([metrics_df, pd.DataFrame([{


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,Naive Bayes,0.978475,1.0,0.838926,0.912409
1,Decision Tree,0.966816,0.91791,0.825503,0.869258
2,Support Vector Machines,0.984753,1.0,0.885906,0.939502
3,Random Forest,0.977578,1.0,0.832215,0.908425
4,K-Nearest Neighbours,0.913004,1.0,0.348993,0.517413
5,Logistic Regression,0.969507,1.0,0.771812,0.871212



Best Performing Model:
Model        Support Vector Machines
Accuracy                    0.984753
Precision                        1.0
Recall                      0.885906
F1-Score                    0.939502
Name: 2, dtype: object

Worst Performing Model:
Model        K-Nearest Neighbours
Accuracy                 0.913004
Precision                     1.0
Recall                   0.348993
F1-Score                 0.517413
Name: 4, dtype: object


Summary