In [1]:
import os
import sys
from pathlib import Path

# Resolve the project root (the parent of the notebook's directory) only once per
# kernel: after the chdir below, re-running this cell would otherwise climb one
# more directory up. The variable name `workding_dir` (sic) is kept because the
# guard looks it up by that exact string.
if "workding_dir" not in locals():
    workding_dir = str(Path.cwd().parent)
os.chdir(workding_dir)
# Re-running the cell must not stack duplicate entries on sys.path.
if workding_dir not in sys.path:
    sys.path.append(workding_dir)
print("working dir:", workding_dir)

working dir: /home/inflaton/code/CrediNews


## Necessary imports

# Data Preparation (Loading CSV)

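Load the pre-split `csv` files with the Hugging Face `datasets` library, then convert them into pandas DataFrames
- The original `train` and `test` splits are concatenated into the `data` DataFrame (stemming has already been performed on the text to reduce processing time).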

In [2]:
from datasets import load_dataset, concatenate_datasets, Dataset

# Load every CSV-backed split into a single DatasetDict. The "train" and
# "rewritten_train" splits are each assembled from four shard files; the two
# test splits come from one file each.
datasets = load_dataset(
    "csv",
    data_files={
        "train": [
            "dataset/train_data_1.csv",
            "dataset/train_data_2.csv",
            "dataset/train_data_3.csv",
            "dataset/train_data_4.csv",
        ],
        "test": "dataset/test_data.csv",
        "rewritten_train": [
            "dataset/rewritten_train_data_1.csv",
            "dataset/rewritten_train_data_2.csv",
            "dataset/rewritten_train_data_3.csv",
            "dataset/rewritten_train_data_4.csv",
        ],
        "rewritten_test": "dataset/rewritten_test_data.csv",
    },
)
# Bare expression: displays the split names, columns and row counts.
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 54441
    })
    test: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 6050
    })
    rewritten_train: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 54441
    })
    rewritten_test: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 6050
    })
})

In [3]:
import pandas as pd

# Materialise the original splits as pandas DataFrames.
# Note that `val_data` is the "test" split of the DatasetDict.
train_data = datasets["train"].to_pandas()
val_data = datasets["test"].to_pandas()
# `data` stacks train and test together; it is the frame later passed to do_grid_search.
# NOTE(review): hyperparameters are therefore selected on rows that are also used
# as the held-out set for the reported scores — confirm this is intended.
data = pd.concat([train_data, val_data], ignore_index=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60491 entries, 0 to 60490
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   label                   60491 non-null  int64 
 1   full_content            60491 non-null  object
 2   processed_full_content  60491 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


### Create model function

In [4]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import (
    StratifiedKFold,
    GridSearchCV,
    RandomizedSearchCV,
    train_test_split,
)
from cuml.ensemble import RandomForestClassifier  # Add this
from tqdm import tqdm
import numpy as np
import random


def create_rf_model(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    bootstrap=True,
):
    """Return an unfitted RandomForestClassifier configured with the given settings.

    Every argument is forwarded unchanged to the classifier constructor under the
    same keyword name. Two further options are always fixed by this factory:
    ``random_state=42`` and ``n_streams=1``.

    NOTE(review): the classifier is imported from ``cuml.ensemble``; confirm that
    it accepts the ``max_depth=None`` default used here.

    Returns:
        The configured classifier instance; ``fit`` has not been called on it.
    """
    tunable = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf,
        "max_features": max_features,
        "bootstrap": bootstrap,
    }
    # Pinned seed and a single stream, kept constant for reproducibility.
    return RandomForestClassifier(**tunable, random_state=42, n_streams=1)

### Train model

In [5]:
def train_model(
    train_data,
    val_data,
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    bootstrap=True,
    grid_search=False,
    vocab_size=3000,
):
    """Fit a random forest on TF-IDF features and score it on ``val_data``.

    The TF-IDF vectorizer is fitted on ``train_data`` only and then applied to
    ``val_data``. Both matrices are densified and cast to float32 before being
    handed to the model.

    Args:
        train_data: DataFrame with ``processed_full_content`` and ``label`` columns.
        val_data: DataFrame with the same two columns, used for scoring.
        n_estimators, max_depth, min_samples_split, min_samples_leaf,
        max_features, bootstrap: forwarded by keyword to ``create_rf_model``.
        grid_search: when true, return the metrics dict instead of the model.
        vocab_size: ``max_features`` for the TF-IDF vectorizer. ``evaluate_model``
            rebuilds its vectorizer with 3000 features, so keep the default for
            any model that will be scored with it.

    Returns:
        The fitted model, or (if ``grid_search``) a dict with the keys
        ``accuracy``, ``precision``, ``recall`` and ``f1_score``.

    Side effects:
        Seeds the global ``random`` and ``numpy.random`` generators with 42 and
        prints the configuration and the metrics. The vectorizer is local to this
        call and is not returned.
    """
    seed = 42
    random.seed(seed)
    np.random.seed(seed)

    print(
        f"\n🚀 Training Random Forest with n_estimators={n_estimators}, max_depth={max_depth}, min_samples_split={min_samples_split}, min_samples_leaf={min_samples_leaf}, max_features={max_features}, bootstrap={bootstrap}"
    )

    # TF-IDF is fitted on the training texts only; validation texts are transformed.
    vectorizer = TfidfVectorizer(max_features=vocab_size)
    # Dense float32 arrays, as the original code prepared them for the model.
    X_train = (
        vectorizer.fit_transform(train_data["processed_full_content"])
        .toarray()
        .astype("float32")
    )
    X_val = (
        vectorizer.transform(val_data["processed_full_content"])
        .toarray()
        .astype("float32")
    )

    # Labels as flat NumPy arrays.
    y_train = train_data["label"].to_numpy().ravel()
    y_val = val_data["label"].to_numpy().ravel()

    # Keywords rather than positions, so a change in create_rf_model's parameter
    # order cannot silently mis-assign hyperparameters.
    model = create_rf_model(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
    )
    model.fit(X_train, y_train)

    # These metrics are computed on val_data, although the banner below says "Training".
    y_pred = model.predict(X_val)
    result = {
        "accuracy": accuracy_score(y_val, y_pred),
        "precision": precision_score(y_val, y_pred),
        "recall": recall_score(y_val, y_pred),
        "f1_score": f1_score(y_val, y_pred),
    }
    print("\n🏆 Training Results:")

    for key, value in result.items():
        print(f"🔹 {key.capitalize()}: {value:.4f}")

    return result if grid_search else model

### Evaluate model

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


def evaluate_model(model, train_data, val_data):
    """Print accuracy, precision, recall and F1 of ``model`` on ``val_data``.

    A fresh TF-IDF vectorizer (3000 features) is fitted on
    ``train_data["processed_full_content"]`` and used to transform the validation
    texts, so ``train_data`` has to be the frame the model was trained on for the
    features to line up.

    Args:
        model: fitted classifier exposing ``predict``.
        train_data: DataFrame providing the texts the vectorizer is fitted on.
        val_data: DataFrame with ``processed_full_content`` and ``label`` columns.

    Returns:
        None. The four metrics are printed with four decimals.
    """
    print("Evaluating Model")

    # Rebuild the feature extractor from the training texts.
    tfidf = TfidfVectorizer(max_features=3000)
    tfidf.fit(train_data["processed_full_content"])

    # Validation features as a dense float32 matrix, labels as a flat array.
    sparse_val = tfidf.transform(val_data["processed_full_content"])
    features = sparse_val.toarray().astype(np.float32)
    expected = val_data["label"].to_numpy().ravel()

    predicted = model.predict(features)

    # Compute every score first, then report them in a fixed order.
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
    scores = [(title, scorer(expected, predicted)) for title, scorer in scorers]
    for title, score in scores:
        print(f"{title}: {score:.4f}")

### Hyperparameter search function (randomized search)

In [7]:
def do_grid_search(data, n_iter=5, n_splits=5):
    """Run a randomized hyperparameter search for the cuML random forest.

    Despite the name, this uses ``RandomizedSearchCV``: ``n_iter`` candidates are
    sampled from the grid below rather than trying every combination.

    Args:
        data: DataFrame with ``processed_full_content`` (text) and ``label`` columns.
        n_iter: number of parameter settings sampled (lower it on out-of-memory errors).
        n_splits: number of cross-validation folds.

    Returns:
        dict: ``search.best_params_``, whose keys match ``train_model`` keywords.
    """
    from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

    param_dist = {
        "n_estimators": [50, 100, 200],  # Reduced from [100, 200, 300, 500]
        "max_depth": [10, 20, 30],  # Reduced from [None, 10, 20, 30]
        "min_samples_split": [2, 5],
        "max_features": ["sqrt"],
    }

    # TF-IDF features as a dense float32 matrix (the form the cuML model is given
    # everywhere in this notebook).
    # NOTE: the vectorizer is fitted on all rows before cross-validation, so each
    # fold's validation texts contribute to the vocabulary and IDF weights.
    vectorizer = TfidfVectorizer(max_features=3000)
    X = (
        vectorizer.fit_transform(data["processed_full_content"])
        .toarray()
        .astype("float32")
    )
    y = data["label"].to_numpy().ravel()

    # The frames this notebook passes in are whole splits concatenated in order
    # (train rows first, then test rows), and an integer `cv` never shuffles.
    # Explicit shuffled, stratified folds keep every fold a mix of all sources and
    # do not depend on the estimator being recognised as a classifier.
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # No `n_jobs`: cuML uses GPU parallelism. n_streams=1 is kept for reproducibility.
    search = RandomizedSearchCV(
        RandomForestClassifier(random_state=42, n_streams=1),
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=folds,
        scoring="f1",
        random_state=42,
    )

    search.fit(X, y)
    print("Best parameters found:", search.best_params_)
    return search.best_params_

### Run the hyperparameter search

In [8]:
%%time

# Hyperparameter search over `data` (original train and test splits stacked).
best_params = do_grid_search(data)
best_params

Best parameters found: {'n_estimators': 200, 'min_samples_split': 2, 'max_features': 'sqrt', 'max_depth': 20}
CPU times: user 51.5 s, sys: 19.4 s, total: 1min 10s
Wall time: 55.8 s


{'n_estimators': 200,
 'min_samples_split': 2,
 'max_features': 'sqrt',
 'max_depth': 20}

In [9]:
%%time

# Fit on the original train split and score on the original test split,
# using the hyperparameters returned by the search.
model = train_model(train_data, val_data, **best_params)
model


🚀 Training Random Forest with n_estimators=200, max_depth=20, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, bootstrap=True

🏆 Training Results:
🔹 Accuracy: 0.9494
🔹 Precision: 0.9415
🔹 Recall: 0.9430
🔹 F1_score: 0.9422
CPU times: user 8.93 s, sys: 1.96 s, total: 10.9 s
Wall time: 10.4 s


In [30]:
import joblib

# Persist the fitted forest only. The TF-IDF vectorizer built inside train_model
# is never returned, so it is not saved here; evaluate_model refits it from the
# training frame instead.
joblib.dump(model, "results/RF_model_original.pkl")

print("✅ Model saved successfully!")

✅ Model and vectorizer saved successfully!


In [31]:
# Reload the forest written by the previous cell. Only the model is stored in the
# file (no vectorizer). joblib.load unpickles, so use it on trusted files only.
model2 = joblib.load("results/RF_model_original.pkl")

print("✅ Model loaded successfully!")

✅ Model and vectorizer loaded successfully!


In [32]:
%%time

# In-memory model on the original test split (TF-IDF refitted on train_data).
evaluate_model(model, train_data, val_data)

Evaluating Model
Accuracy: 0.9494
Precision: 0.9415
Recall: 0.9430
F1: 0.9422
CPU times: user 6.79 s, sys: 186 ms, total: 6.97 s
Wall time: 6.81 s


In [33]:
%%time

# Same evaluation with model2, the copy reloaded from results/RF_model_original.pkl.
evaluate_model(model2, train_data, val_data)

Evaluating Model
Accuracy: 0.9494
Precision: 0.9415
Recall: 0.9430
F1: 0.9422
CPU times: user 6.68 s, sys: 106 ms, total: 6.79 s
Wall time: 6.2 s


In [13]:
# Rewritten counterparts of the test and train splits as DataFrames.
val_data_rewritten = datasets["rewritten_test"].to_pandas()
train_data_rewritten = datasets["rewritten_train"].to_pandas()
# All four frames stacked (original + rewritten, train + test). This is the input
# of the "combined" hyperparameter search; the name `data_rewritten` is rebound
# further down to a rewritten-only frame.
data_rewritten = pd.concat(
    [train_data, train_data_rewritten, val_data, val_data_rewritten], ignore_index=True
)
data_rewritten.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120982 entries, 0 to 120981
Data columns (total 3 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   label                   120982 non-null  int64 
 1   full_content            120982 non-null  object
 2   processed_full_content  120982 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.8+ MB


In [34]:
%%time

# Model trained on the original data, scored on the rewritten test split.
evaluate_model(model, train_data, val_data_rewritten)

Evaluating Model
Accuracy: 0.8491
Precision: 0.8197
Recall: 0.8398
F1: 0.8296
CPU times: user 6.51 s, sys: 174 ms, total: 6.68 s
Wall time: 6.11 s


In [35]:
%%time

# Reloaded copy of the original-data model, scored on the rewritten test split.
evaluate_model(model2, train_data, val_data_rewritten)

Evaluating Model
Accuracy: 0.8491
Precision: 0.8197
Recall: 0.8398
F1: 0.8296
CPU times: user 6.44 s, sys: 187 ms, total: 6.62 s
Wall time: 6 s


In [16]:
%%time

# Hyperparameter search over data_rewritten, which at this point holds the
# original and rewritten train and test splits stacked together.
best_params_combined = do_grid_search(data_rewritten)
best_params_combined

Best parameters found: {'n_estimators': 200, 'min_samples_split': 2, 'max_features': 'sqrt', 'max_depth': 20}
CPU times: user 1min 7s, sys: 29.6 s, total: 1min 36s
Wall time: 1min 23s


{'n_estimators': 200,
 'min_samples_split': 2,
 'max_features': 'sqrt',
 'max_depth': 20}

In [17]:
%%time

# Combined frames: original rows followed by their rewritten counterparts,
# built separately for the train and the test side.
train_data_combined = pd.concat([train_data, train_data_rewritten], ignore_index=True)
val_data_combined = pd.concat([val_data, val_data_rewritten], ignore_index=True)

CPU times: user 17.6 ms, sys: 972 μs, total: 18.6 ms
Wall time: 17.6 ms


In [18]:
# Fit on original + rewritten training rows; the printed metrics are computed on
# the combined test frame.
model_combined = train_model(
    train_data_combined, val_data_combined, **best_params_combined
)


🚀 Training Random Forest with n_estimators=200, max_depth=20, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, bootstrap=True

🏆 Training Results:
🔹 Accuracy: 0.9058
🔹 Precision: 0.9028
🔹 Recall: 0.8793
🔹 F1_score: 0.8909


In [36]:
# Persist the combined-data model (the model object only, no vectorizer).
joblib.dump(model_combined, "results/RF_model_combined.pkl")

['results/RF_model_combined.pkl']

In [20]:
# Combined-data model on the combined (original + rewritten) test frame.
evaluate_model(model_combined, train_data_combined, val_data_combined)

Evaluating Model
Accuracy: 0.9058
Precision: 0.9028
Recall: 0.8793
F1: 0.8909


In [21]:
# Combined-data model on the original test split only.
evaluate_model(model_combined, train_data_combined, val_data)

Evaluating Model
Accuracy: 0.9383
Precision: 0.9441
Recall: 0.9131
F1: 0.9284


In [22]:
# Combined-data model on the rewritten test split only.
evaluate_model(model_combined, train_data_combined, val_data_rewritten)

Evaluating Model
Accuracy: 0.8732
Precision: 0.8621
Recall: 0.8455
F1: 0.8537


In [23]:
# Rebinds `data_rewritten`: from here on it holds only the rewritten train and
# test rows (the earlier four-way stack under this name is replaced).
data_rewritten = pd.concat(
    [train_data_rewritten, val_data_rewritten], ignore_index=True
)
data_rewritten.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60491 entries, 0 to 60490
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   label                   60491 non-null  int64 
 1   full_content            60491 non-null  object
 2   processed_full_content  60491 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [24]:
%%time

# Hyperparameter search over the rewritten-only frame (rewritten train + test rows).
best_params_rewritten = do_grid_search(data_rewritten)
best_params_rewritten

Best parameters found: {'n_estimators': 200, 'min_samples_split': 2, 'max_features': 'sqrt', 'max_depth': 20}
CPU times: user 47.6 s, sys: 17.9 s, total: 1min 5s
Wall time: 54.2 s


{'n_estimators': 200,
 'min_samples_split': 2,
 'max_features': 'sqrt',
 'max_depth': 20}

In [25]:
%%time

# Fit on the rewritten train split; the printed metrics are computed on the
# rewritten test split.
model_rewritten = train_model(
    train_data_rewritten, val_data_rewritten, **best_params_rewritten
)


🚀 Training Random Forest with n_estimators=200, max_depth=20, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, bootstrap=True

🏆 Training Results:
🔹 Accuracy: 0.8714
🔹 Precision: 0.8652
🔹 Recall: 0.8364
🔹 F1_score: 0.8506
CPU times: user 7.32 s, sys: 2.06 s, total: 9.37 s
Wall time: 9.17 s


In [37]:
# Persist the rewritten-data model (the model object only, no vectorizer).
joblib.dump(model_rewritten, "results/RF_model_rewritten.pkl")

['results/RF_model_rewritten.pkl']

In [27]:
# Rewritten-data model on the combined (original + rewritten) test frame.
evaluate_model(model_rewritten, train_data_rewritten, val_data_combined)

Evaluating Model
Accuracy: 0.8785
Precision: 0.9035
Recall: 0.8087
F1: 0.8535


In [28]:
# Rewritten-data model on the rewritten test split.
evaluate_model(model_rewritten, train_data_rewritten, val_data_rewritten)

Evaluating Model
Accuracy: 0.8714
Precision: 0.8652
Recall: 0.8364
F1: 0.8506


In [29]:
# Rewritten-data model on the original test split.
evaluate_model(model_rewritten, train_data_rewritten, val_data)

Evaluating Model
Accuracy: 0.8856
Precision: 0.9486
Recall: 0.7809
F1: 0.8566
