In [10]:
import pandas as pd
import numpy as np
import pickle
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

print("âœ“ Libraries imported successfully!")

âœ“ Libraries imported successfully!


In [11]:
# Loading cleaned dataset
df = pd.read_csv('../data/processed_data.csv')
print("Dataset loaded")
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"Label distribution:\n{df['label'].value_counts()}")

Dataset loaded
Shape: (8501, 8)
Columns: ['articleID', 'domain', 'date', 'category', 'label', 'source', 'F-type', 'text']
Label distribution:
label
1    7202
0    1299
Name: count, dtype: int64


In [12]:

with open('tfidf_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

print("âœ“ Vectorizer loaded successfully!")


X = vectorizer.transform(df['text'])
y = df['label']

print(f"âœ“ Text data vectorized! Shape: {X.shape}")

âœ“ Vectorizer loaded successfully!
âœ“ Text data vectorized! Shape: (8501, 5000)


In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("âœ“ Data split completed!")
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Training labels distribution:\n{y_train.value_counts()}")

âœ“ Data split completed!
Training set: (6800, 5000)
Test set: (1701, 5000)
Training labels distribution:
label
1    5761
0    1039
Name: count, dtype: int64


In [14]:
# Initialize Random Forest
rf_model = RandomForestClassifier(
    n_estimators=100,    # Number of trees
    random_state=42,     # For reproducible results
    max_depth=10,        # Prevent overfitting
    n_jobs=-1           # Use all CPU cores
)

print("Training Random Forest model...")
rf_model.fit(X_train, y_train)
print("âœ“ Random Forest training completed!")

Training Random Forest model...
âœ“ Random Forest training completed!


In [15]:
# Make predictions
y_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)

print(f"ðŸŽ¯ RANDOM FOREST RESULTS:")
print(f"Accuracy: {rf_accuracy:.4f} ({rf_accuracy*100:.2f}%)")

print("\nðŸ“Š Detailed Classification Report:")
print(classification_report(y_test, y_pred))

# Show confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

ðŸŽ¯ RANDOM FOREST RESULTS:
Accuracy: 0.8912 (89.12%)

ðŸ“Š Detailed Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.29      0.45       260
           1       0.89      1.00      0.94      1441

    accuracy                           0.89      1701
   macro avg       0.94      0.64      0.69      1701
weighted avg       0.90      0.89      0.86      1701

Confusion Matrix:
[[  75  185]
 [   0 1441]]


In [16]:
# Save Random Forest model
joblib.dump(rf_model, 'random_forest_model.pkl')
print("âœ“ Random Forest model saved as 'random_forest_model.pkl'")

# Save results
results = {
    'y_true': y_test,
    'y_pred': y_pred,
    'accuracy': rf_accuracy
}

with open('rf_results.pkl', 'wb') as f:
    pickle.dump(results, f)

print("âœ“ Results saved!")
print("âœ… Random Forest training COMPLETE!")

âœ“ Random Forest model saved as 'random_forest_model.pkl'
âœ“ Results saved!
âœ… Random Forest training COMPLETE!
