In [1]:
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk


# Fetch the NLTK resources this notebook relies on: the English stop-word
# list, the 'punkt' tokenizer data and the WordNet corpus.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Restore the persisted TF-IDF vectorizer (it is only used via .transform below,
# so it is presumably already fitted - confirm against the training notebook).
# NOTE(review): joblib.load unpickles the file; only load trusted artifacts.
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Read the independent evaluation set from CSV.
test_file_path = 'Independent_dataset.csv'
df_test = pd.read_csv(test_file_path)

# Module-level helpers consumed by preprocess_text.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Turn one raw message into a space-separated string of processed tokens.

    The message is tokenised with ``word_tokenize``. Tokens whose lowercase
    form appears in ``stop_words`` are discarded; every remaining token is
    stemmed with ``stemmer`` and the resulting stem is passed through
    ``lemmatizer``.

    Args:
        text: The raw message string to process.

    Returns:
        The processed tokens joined with single spaces.
    """
    normalised_tokens = []
    for token in word_tokenize(text):
        # Stop-word membership is checked case-insensitively.
        if token.lower() in stop_words:
            continue
        # Stem first, then lemmatize the stem (same order as the list pipeline).
        normalised_tokens.append(lemmatizer.lemmatize(stemmer.stem(token)))
    return ' '.join(normalised_tokens)

# Message text to classify and the ground-truth labels to score against.
X_test = df_test['msg']
true_labels = df_test['label']

# Apply preprocess_text to every message, keeping the result as a plain list.
X_test_preprocessed = list(map(preprocess_text, X_test))

# Convert the preprocessed messages into TF-IDF vectors with the loaded vectorizer.
X_test_tfidf = tfidf_vectorizer.transform(X_test_preprocessed)




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
# Load the saved SVM classifier and predict labels for the TF-IDF test matrix.
svm_model = joblib.load('svm_model.pkl')
svm_predictions = svm_model.predict(X_test_tfidf)

accuracy = accuracy_score(true_labels, svm_predictions)
# Precision, recall and F1 are each computed with average='weighted'.
precision, recall, f1 = (
    metric(true_labels, svm_predictions, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Emit the four scores, one per line, rounded to two decimals.
print(
    f'Accuracy on the independent dataset: {accuracy:.2f}',
    f'Precision on the independent dataset: {precision:.2f}',
    f'Recall on the independent dataset: {recall:.2f}',
    f'F1 score on the independent dataset: {f1:.2f}',
    sep='\n',
)

Accuracy on the independent dataset: 0.64
Precision on the independent dataset: 0.82
Recall on the independent dataset: 0.64
F1 score on the independent dataset: 0.62


In [None]:
# Load the saved Naive Bayes classifier and predict on the TF-IDF test matrix.
naive_bayes_model = joblib.load('naive_bayes_model.pkl')
nv_predictions = naive_bayes_model.predict(X_test_tfidf)

# Every metric compares the same (truth, prediction) pair.
nb_eval_args = (true_labels, nv_predictions)
accuracy = accuracy_score(*nb_eval_args)
precision = precision_score(*nb_eval_args, average='weighted')
recall = recall_score(*nb_eval_args, average='weighted')
f1 = f1_score(*nb_eval_args, average='weighted')

# Report the scores, one per line, rounded to two decimals.
nb_report_lines = [
    f'Accuracy on the independent dataset: {accuracy:.2f}',
    f'Precision on the independent dataset: {precision:.2f}',
    f'Recall on the independent dataset: {recall:.2f}',
    f'F1 score on the independent dataset: {f1:.2f}',
]
print('\n'.join(nb_report_lines))




Accuracy on the independent dataset: 0.45
Precision on the independent dataset: 0.78
Recall on the independent dataset: 0.45
F1 score on the independent dataset: 0.37


In [4]:
# Load the saved decision-tree classifier and predict on the TF-IDF test matrix.
decision_tree_model = joblib.load('decision_tree_model.pkl')
dt_predictions = decision_tree_model.predict(X_test_tfidf)

accuracy = accuracy_score(true_labels, dt_predictions)
# zero_division=0: a label that is never predicted contributes a precision
# of 0 without raising an undefined-metric warning.
precision = precision_score(
    true_labels,
    dt_predictions,
    average='weighted',
    zero_division=0,
)
recall = recall_score(true_labels, dt_predictions, average='weighted')
f1 = f1_score(true_labels, dt_predictions, average='weighted')

# Print each score on its own line, rounded to two decimals.
for report_line in (
    f'Accuracy on the independent dataset: {accuracy:.2f}',
    f'Precision on the independent dataset: {precision:.2f}',
    f'Recall on the independent dataset: {recall:.2f}',
    f'F1 score on the independent dataset: {f1:.2f}',
):
    print(report_line)

Accuracy on the independent dataset: 0.36
Precision on the independent dataset: 0.13
Recall on the independent dataset: 0.36
F1 score on the independent dataset: 0.19


In [5]:
# Load the saved KNN classifier and predict on the TF-IDF test matrix.
knn_model = joblib.load('knn_model.pkl')
knn_predictions = knn_model.predict(X_test_tfidf)


# All four metrics must be scored on knn_predictions. Precision previously
# received dt_predictions (copied from the decision-tree cell), so it reported
# the decision tree's precision and required that cell to have been run first.
accuracy = accuracy_score(true_labels, knn_predictions)
# zero_division=0: a label that is never predicted contributes a precision
# of 0 without raising an undefined-metric warning.
precision = precision_score(true_labels, knn_predictions, average='weighted', zero_division=0)
recall = recall_score(true_labels, knn_predictions, average='weighted')
f1 = f1_score(true_labels, knn_predictions, average='weighted')

# Report the scores rounded to two decimals. Re-run this cell to refresh any
# saved output that was produced before the precision fix.
print(f'Accuracy on the test dataset: {accuracy:.2f}')
print(f'Precision on the test dataset: {precision:.2f}')
print(f'Recall on the test dataset: {recall:.2f}')
print(f'F1 score on the test dataset: {f1:.2f}')

Accuracy on the test dataset: 0.36
Precision on the test dataset: 0.13
Recall on the test dataset: 0.36
F1 score on the test dataset: 0.19


In [6]:
# Load the saved random-forest classifier and predict on the TF-IDF test matrix.
rf_model = joblib.load('rf_model.pkl')
rf_predictions = rf_model.predict(X_test_tfidf)

accuracy = accuracy_score(y_true=true_labels, y_pred=rf_predictions)
# zero_division=0: a label that is never predicted contributes a precision
# of 0 without raising an undefined-metric warning.
precision = precision_score(
    y_true=true_labels,
    y_pred=rf_predictions,
    average='weighted',
    zero_division=0,
)
recall = recall_score(y_true=true_labels, y_pred=rf_predictions, average='weighted')
f1 = f1_score(y_true=true_labels, y_pred=rf_predictions, average='weighted')

# Print the four scores, one per line, rounded to two decimals.
print(
    f'Accuracy on the test dataset: {accuracy:.2f}',
    f'Precision on the test dataset: {precision:.2f}',
    f'Recall on the test dataset: {recall:.2f}',
    f'F1 score on the test dataset: {f1:.2f}',
    sep='\n',
)

Accuracy on the test dataset: 0.36
Precision on the test dataset: 0.13
Recall on the test dataset: 0.36
F1 score on the test dataset: 0.19
