In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
import joblib

# Fetch the NLTK resources used below: the English stop-word list, the
# tokenizer models behind word_tokenize, and the WordNet data used by
# WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load word_tokenize's models from 'punkt_tab' rather than
# 'punkt'; without it tokenization fails with a LookupError. Both are requested
# so the notebook works across NLTK versions (on releases that do not know the
# package, the download is expected to just report it as unavailable).
nltk.download('punkt_tab')
nltk.download('wordnet')

# Load your dataset
df = pd.read_csv('Training_data.csv')

# Check the structure of your dataset
print(df.head())

# Preprocessing: 'msg' is the model input, 'label' is the prediction target
X = df['msg']
y = df['label']

# Split the data into training and testing (10% held out, fixed seed so the
# split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Shared helpers consumed by preprocess_text
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize one message into a space-separated string of processed tokens.

    The text is split with ``word_tokenize``; tokens whose lowercase form is in
    ``stop_words`` are dropped; each remaining token is passed through
    ``stemmer.stem`` and then ``lemmatizer.lemmatize``.

    Args:
        text: The raw message. Missing values (``None`` or NaN, which is what
            pandas produces for an empty CSV cell) yield an empty string; any
            other non-string value is converted with ``str`` first.

    Returns:
        The processed tokens joined by single spaces ('' when nothing is left).
    """
    if not isinstance(text, str):
        # word_tokenize only accepts strings; a single missing cell would
        # otherwise abort preprocessing of the whole column.
        if pd.isna(text):
            return ''
        text = str(text)

    words = word_tokenize(text)
    # The stop-word test is case-insensitive; the surviving token keeps its
    # original form when handed to the stemmer.
    filtered_words = [word for word in words if word.lower() not in stop_words]
    stemmed_words = [stemmer.stem(word) for word in filtered_words]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in stemmed_words]
    return ' '.join(lemmatized_words)

# Run every training and held-out message through preprocess_text.
X_train_preprocessed = list(map(preprocess_text, X_train))
X_test_preprocessed = list(map(preprocess_text, X_test))

# Fit the TF-IDF vectorizer (at most 16000 features) on the training messages
# only, then reuse the fitted vectorizer to transform the test messages.
tfidf_vectorizer = TfidfVectorizer(max_features=16000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_preprocessed)
X_test_tfidf = tfidf_vectorizer.transform(X_test_preprocessed)

# Write the fitted vectorizer to disk.
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')


ModuleNotFoundError: No module named 'pandas'

In [3]:
from sklearn.svm import SVC

# Fit a linear-kernel support vector classifier on the TF-IDF training matrix.
Svm = SVC(kernel='linear').fit(X_train_tfidf, y_train)

# Predict the held-out messages and compute the evaluation metrics.
y_pred = Svm.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(report)

# Write the fitted classifier to disk.
joblib.dump(Svm, 'svm_model.pkl')

ModuleNotFoundError: No module named 'sklearn'

In [10]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix.
Nv = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict the held-out messages.
y_pred = Nv.predict(X_test_tfidf)

# Overall accuracy plus the per-class precision/recall/F1 breakdown.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(report)

# Write the fitted classifier to disk (only the model is saved in this cell).
joblib.dump(Nv, 'naive_bayes_model.pkl')


Accuracy: 0.97
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       491
        spam       1.00      0.79      0.88        67

    accuracy                           0.97       558
   macro avg       0.99      0.90      0.93       558
weighted avg       0.98      0.97      0.97       558



['naive_bayes_model.pkl']

In [16]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the TF-IDF training matrix. The seed is fixed
# (same value as the train/test split and the random forest) because the tree
# builder breaks ties between candidate splits randomly; without it the fitted
# tree and the metrics below can differ from one run to the next.
Dt = DecisionTreeClassifier(random_state=42)
Dt.fit(X_train_tfidf, y_train)

# Predict the held-out messages.
y_pred = Dt.predict(X_test_tfidf)

# Overall accuracy on the test split.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision/recall/F1 breakdown.
report = classification_report(y_test, y_pred)
print(report)

# Write the fitted classifier to disk.
joblib.dump(Dt, 'decision_tree_model.pkl')


Accuracy: 0.93
              precision    recall  f1-score   support

         ham       0.93      1.00      0.96       491
        spam       1.00      0.43      0.60        67

    accuracy                           0.93       558
   macro avg       0.96      0.72      0.78       558
weighted avg       0.94      0.93      0.92       558



['decision_tree_model.pkl']

In [15]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier (library defaults) on the TF-IDF
# training matrix.
knn = KNeighborsClassifier().fit(X_train_tfidf, y_train)

# Predict the held-out messages and compute the evaluation metrics.
y_pred = knn.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(report)

# Write the fitted classifier to disk.
joblib.dump(knn, 'knn_model.pkl')


Accuracy: 0.93
              precision    recall  f1-score   support

         ham       0.93      1.00      0.96       491
        spam       1.00      0.43      0.60        67

    accuracy                           0.93       558
   macro avg       0.96      0.72      0.78       558
weighted avg       0.94      0.93      0.92       558



['knn_model.pkl']

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest with a fixed seed on the TF-IDF training matrix.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_tfidf, y_train
)

# Predict the held-out messages and compute the evaluation metrics.
y_pred = rf_classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(report)

# Write the fitted random forest to disk (only the model is saved in this cell).
joblib.dump(rf_classifier, 'rf_model.pkl')


Accuracy: 0.99
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       491
        spam       0.98      0.90      0.94        67

    accuracy                           0.99       558
   macro avg       0.98      0.95      0.96       558
weighted avg       0.99      0.99      0.99       558



['rf_model.pkl']