In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Helper functions to load text and labels
def load_text_file(file_path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Leading and trailing whitespace (including the line terminator) is
    removed from every line. Blank lines are kept as empty strings, so the
    position of an item in the result matches its line position in the file.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return list(map(str.strip, handle))

def load_label_file(file_path):
    """Read a UTF-8 file holding one integer label per line.

    Whitespace around each label is ignored. Whitespace-only lines (for
    example a stray empty line at the end of the file) are skipped instead of
    crashing the whole load with ``int('')``.

    Args:
        file_path: Path of the label file to read.

    Returns:
        A 1-D NumPy integer array with one entry per non-blank line. An empty
        file yields an empty integer array.

    Raises:
        ValueError: If a non-blank line is not a valid integer. The message
            names the file and the 1-based line number.
    """
    labels = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            token = line.strip()
            if not token:
                # Blank line carries no label; skipping it avoids int('').
                continue
            try:
                labels.append(int(token))
            except ValueError as err:
                raise ValueError(
                    f"{file_path}:{line_number}: invalid label {token!r}"
                ) from err
    # Explicit dtype keeps an empty result integer-typed (NumPy would
    # otherwise default an empty list to float64).
    return np.array(labels, dtype=int)

# Dataset locations: one (text, labels) file pair per split
train_text_path, train_labels_path = (
    '/Users/xuqianlong/Downloads/train_text.txt',
    '/Users/xuqianlong/Downloads/train_labels.txt',
)
test_text_path, test_labels_path = (
    '/Users/xuqianlong/Downloads/test_text.txt',
    '/Users/xuqianlong/Downloads/test_labels.txt',
)
val_text_path, val_labels_path = (
    '/Users/xuqianlong/Downloads/val_text.txt',
    '/Users/xuqianlong/Downloads/val_labels.txt',
)

# Read every split into memory (text as a list of lines, labels as an array)
train_text, train_labels = (
    load_text_file(train_text_path),
    load_label_file(train_labels_path),
)
test_text, test_labels = (
    load_text_file(test_text_path),
    load_label_file(test_labels_path),
)
val_text, val_labels = (
    load_text_file(val_text_path),
    load_label_file(val_labels_path),
)

# Turn the raw text into token-count features. The vectorizer is fitted on
# the training text only and then reused as-is for the held-out splits.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_text)
X_test, X_val = (
    vectorizer.transform(corpus) for corpus in (test_text, val_text)
)

# Map the raw labels to encoded targets, again fitting on the training split
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_labels)
y_test, y_val = map(label_encoder.transform, (test_labels, val_labels))

# Build the three models; random_state is fixed where the estimator takes one
rf_classifier = RandomForestClassifier(random_state=42)
nb_classifier = MultinomialNB()
tree_classifier = DecisionTreeClassifier(random_state=42)

# Random Forest: fit on the training split, then predict validation and test
rf_classifier.fit(X_train, y_train)
rf_val_predictions, rf_test_predictions = (
    rf_classifier.predict(features) for features in (X_val, X_test)
)

# Naive Bayes: fit on the training split, then predict validation and test
nb_classifier.fit(X_train, y_train)
nb_val_predictions, nb_test_predictions = (
    nb_classifier.predict(features) for features in (X_val, X_test)
)

# Decision Tree: fit on the training split, then predict validation and test
tree_classifier.fit(X_train, y_train)
tree_val_predictions, tree_test_predictions = (
    tree_classifier.predict(features) for features in (X_val, X_test)
)

# Calculate accuracy and loss for validation set.
# NOTE: log_loss infers the class set from y_true when `labels` is omitted.
# If a split is missing one of the training classes, that inferred set no
# longer matches the columns returned by predict_proba and the call raises.
# Passing each model's classes_ pins the column-to-label mapping explicitly;
# when every class is present the result is unchanged.
rf_val_accuracy = accuracy_score(y_val, rf_val_predictions)
rf_val_loss = log_loss(
    y_val, rf_classifier.predict_proba(X_val), labels=rf_classifier.classes_
)
nb_val_accuracy = accuracy_score(y_val, nb_val_predictions)
nb_val_loss = log_loss(
    y_val, nb_classifier.predict_proba(X_val), labels=nb_classifier.classes_
)
tree_val_accuracy = accuracy_score(y_val, tree_val_predictions)
tree_val_loss = log_loss(
    y_val, tree_classifier.predict_proba(X_val), labels=tree_classifier.classes_
)

# Calculate accuracy and loss for test set (same explicit `labels` handling)
rf_test_accuracy = accuracy_score(y_test, rf_test_predictions)
rf_test_loss = log_loss(
    y_test, rf_classifier.predict_proba(X_test), labels=rf_classifier.classes_
)
nb_test_accuracy = accuracy_score(y_test, nb_test_predictions)
nb_test_loss = log_loss(
    y_test, nb_classifier.predict_proba(X_test), labels=nb_classifier.classes_
)
tree_test_accuracy = accuracy_score(y_test, tree_test_predictions)
tree_test_loss = log_loss(
    y_test, tree_classifier.predict_proba(X_test), labels=tree_classifier.classes_
)

# Collect the four metrics of each model under a shared, ordered set of keys
_METRIC_NAMES = (
    'Validation Accuracy',
    'Validation Loss',
    'Test Accuracy',
    'Test Loss',
)

results = {
    'Random Forest': dict(zip(
        _METRIC_NAMES,
        (rf_val_accuracy, rf_val_loss, rf_test_accuracy, rf_test_loss),
    )),
    'Naive Bayes': dict(zip(
        _METRIC_NAMES,
        (nb_val_accuracy, nb_val_loss, nb_test_accuracy, nb_test_loss),
    )),
    'Decision Tree': dict(zip(
        _METRIC_NAMES,
        (tree_val_accuracy, tree_val_loss, tree_test_accuracy, tree_test_loss),
    )),
}

# Show the nested summary dict
print(results)




{'Random Forest': {'Validation Accuracy': 0.626, 'Validation Loss': 0.8246050087879879, 'Test Accuracy': 0.5376098990556822, 'Test Loss': 1.0349028877597215}, 'Naive Bayes': {'Validation Accuracy': 0.622, 'Validation Loss': 1.0731359562009635, 'Test Accuracy': 0.5814881146206448, 'Test Loss': 1.0224157448034774}, 'Decision Tree': {'Validation Accuracy': 0.558, 'Validation Loss': 15.931785212616285, 'Test Accuracy': 0.511234125691957, 'Test Loss': 17.617020615648034}}


