In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
from xgboost import XGBClassifier

def getAndTrainModel(X_train, y_train):
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')  # Use 'logloss' for binary classification
    model.fit(X_train, y_train)
    return model 

In [6]:
# Loading data from a CSV file
df = pd.read_csv('balanced_rt_reviews_20240207-193333.csv')  # Adjust the filename to your actual file path

# Count the number of NaN or empty strings in 'review_detail' for the entire dataset
missing_or_empty_count = df['review_detail'].isna().sum() + (df['review_detail'] == '').sum()

# Print the count
print("Total missing or empty 'review_detail' values in the dataset:", missing_or_empty_count)

df = df.dropna(subset=['review_detail'])

top_critic_counts = df['top_critic'].value_counts()

print(top_critic_counts)

Total missing or empty 'review_detail' values in the dataset: 7
top_critic
True     32998
False    32995
Name: count, dtype: int64


In [7]:
texts = df['review_detail'].values
top_critics = df['top_critic'].astype(int).values  # Convert boolean to integers (True/False to 1/0)

# Vectorizing text data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, top_critics, test_size=0.2, random_state=42)

# Training a model with XGBoost
model = getAndTrainModel(X_train=X_train, y_train=y_train)

In [8]:
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
import numpy as np

# Predicting
predictions = model.predict(X_test)

# Evaluating with accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Calculate and print MAE
ra = roc_auc_score(y_test, predictions)
print("ROC AUC score:", ra)


# Generating and printing the classification report
report = classification_report(y_test, predictions, target_names=["False","True"])
print("Classification Report:")
print(report)

Accuracy: 0.5635275399651489
ROC AUC score: 0.5643778699941722
Classification Report:
              precision    recall  f1-score   support

       False       0.58      0.47      0.52      6660
        True       0.55      0.66      0.60      6539

    accuracy                           0.56     13199
   macro avg       0.57      0.56      0.56     13199
weighted avg       0.57      0.56      0.56     13199

