In [None]:
import os
import sys

import joblib
import pandas as pd
from sklearn.model_selection import cross_val_score

# Make ../src importable before the project modules below are loaded
sys.path.append('../src') 

# Project modules
from data_preprocessing import load_and_preprocess_data, random_forest_feature_selection, create_specific_dataframe
from model import create_binary_classifier, train_other_classes, ClassFinalClassifier
from model_utils import generate_report, save_reports

In [None]:
def main_function(target_class, n_features_to_select, sampling_method, model_type, output_dir):
    """
    Train, evaluate and report on the two-stage classifier for one target class.

    The function loads and splits the training CSV, selects features, builds the
    per-class target, trains the two base classifiers, combines them in a
    ClassFinalClassifier, then prints accuracy / cross-validation figures and
    writes the per-row predictions and the report to ``output_dir``.

    Parameters
    ----------
    target_class : str
        Label of the class to model (e.g. 'Class_1'); also used as the model
        name in the report and in the predictions file name.
    n_features_to_select : int
        Number of features requested from random_forest_feature_selection.
    sampling_method : str
        Forwarded to create_binary_classifier.
    model_type : str
        Forwarded to train_other_classes.
    output_dir : str
        Directory that receives the predictions CSV and the report; created
        if it does not exist.

    Returns
    -------
    tuple
        ``(class_final_clf_model, selected_features)``: the fitted ensemble and
        the feature list it was trained on.
    """
    # exist_ok=True keeps this idempotent and avoids the check-then-create race
    os.makedirs(output_dir, exist_ok=True)

    # 1. Data loading and preprocessing
    train_data_raw, X, y, X_train, X_test, y_train, y_test = load_and_preprocess_data(
        '../data/raw/train.csv', target_class, test_size=0.2, random_state=42
    )

    # 2. Feature selection
    selected_features, X_selected = random_forest_feature_selection(X, y, n_features_to_select)

    # 3. Creating target class numeric data
    y_train_target = create_specific_dataframe(train_data_raw, target_class, target_column='target')

    # 4. Create the classifiers
    # NOTE(review): both classifiers (and the ensemble below) are fitted on
    # X_selected / y_train_target, which come from the full X and train_data_raw
    # rather than from X_train. Confirm the rows in X_test are not part of the
    # fitting data, otherwise the test metrics below are optimistic.
    classifier = create_binary_classifier(X_selected, y_train_target, 1, sampling_method=sampling_method)
    classifier_others = train_other_classes(X_selected, y_train_target, class_value=1, model_type=model_type)

    # 5. Model ensemble
    class_final_clf_model = ClassFinalClassifier(classifier, classifier_others)
    class_final_clf_model.fit(X_selected, y_train_target)

    # Slice the selected columns once instead of on every predict/score call
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]

    # 6. Get predictions from each model and save
    predictions_class1 = classifier.predict(X_test_selected)
    predictions_others = classifier_others.predict(X_test_selected)
    final_predictions = class_final_clf_model.predict(X_test_selected)

    predictions_df = pd.DataFrame({
        "Actual": y_test,
        "Predictions_Class1": predictions_class1,
        "Predictions_Others": predictions_others,
        "Final_Predictions": final_predictions
    })
    # Persist the per-row predictions; previously the frame was built and then dropped
    predictions_path = os.path.join(output_dir, f"{target_class}_predictions.csv")
    predictions_df.to_csv(predictions_path, index=False)

    # 7. Evaluate the model and generate report
    # Passing the model name (e.g., target_class) to generate the report
    generate_report(y_test, final_predictions, model_name=target_class)

    # 8. Check for overfitting
    # NOTE(review): scoring uses y_train / y_test while fitting used
    # y_train_target; this assumes both share the same label encoding (TODO confirm).
    train_accuracy = class_final_clf_model.score(X_train_selected, y_train)
    test_accuracy = class_final_clf_model.score(X_test_selected, y_test)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)

    print("\n")

    # 9. Cross-validation (cross_val_score comes from sklearn.model_selection)
    cv_scores = cross_val_score(class_final_clf_model, X_train_selected, y_train, cv=5, scoring='accuracy')
    print("Cross-Validation Scores:", cv_scores)
    print("Average CV Score:", cv_scores.mean())

    # 10. Save the reports
    generate_report(y_test, final_predictions, model_name=target_class, output_dir=output_dir)

    return class_final_clf_model, selected_features

In [None]:
# Per-class training strategy: how many features to keep, which resampling
# method to apply and which model type to train. Each entry is unpacked into
# the keyword arguments of main_function below.
class_strategies = {
    'Class_1': {'n_features_to_select': 15, 'sampling_method': 'smote', 'model_type': 'random_forest'},
    'Class_2': {'n_features_to_select': 15, 'sampling_method': 'under_sampling', 'model_type': 'random_forest'},
    'Class_3': {'n_features_to_select': 12, 'sampling_method': 'none', 'model_type': 'random_forest'},
    'Class_4': {'n_features_to_select': 12, 'sampling_method': 'smote', 'model_type': 'xgboost'},
    'Class_5': {'n_features_to_select': 11, 'sampling_method': 'smote', 'model_type': 'xgboost'},
    'Class_6': {'n_features_to_select': 13, 'sampling_method': 'smote', 'model_type': 'xgboost'},
    'Class_7': {'n_features_to_select': 10, 'sampling_method': 'random', 'model_type': 'random_forest'},
    'Class_8': {'n_features_to_select': 12, 'sampling_method': 'none', 'model_type': 'random_forest'},
    'Class_9': {'n_features_to_select': 15, 'sampling_method': 'under_sampling', 'model_type': 'xgboost'}
}

# Output locations. exist_ok=True makes re-running the cell safe and removes
# the window between an os.path.exists check and the os.makedirs call.
models_dir = '../models'
predictions_dir = '../predictions'
os.makedirs(models_dir, exist_ok=True)
os.makedirs(predictions_dir, exist_ok=True)

# Loop to run main_function for each target class
predictions_dict = {}  # Dictionary to store predictions (not filled in by this cell)

for target_class, strategy in class_strategies.items():
    print(f"Running main_function for {target_class}...")

    # Train the model; the returned feature list is not persisted here
    model, selected_features = main_function(
        target_class=target_class,
        n_features_to_select=strategy['n_features_to_select'],
        sampling_method=strategy['sampling_method'],
        model_type=strategy['model_type'],
        output_dir=predictions_dir
    )

    # File name pattern: <target_class>_<model_type>_model.pkl
    model_filename = f"{target_class}_{strategy['model_type']}_model.pkl"
    model_filepath = os.path.join(models_dir, model_filename)
    joblib.dump(model, model_filepath)
    print(f"Model for {target_class} has been trained and saved as {model_filepath}.\n")

In [None]:
import os
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Ignore warnings
warnings.filterwarnings('ignore')

# 1. Load models (previously saved models)
model_paths = [
    '../models/Class_1_random_forest_model.pkl',
    '../models/Class_2_random_forest_model.pkl',
    '../models/Class_3_random_forest_model.pkl',
    '../models/Class_4_xgboost_model.pkl',
    '../models/Class_5_xgboost_model.pkl',
    '../models/Class_6_xgboost_model.pkl',
    '../models/Class_7_random_forest_model.pkl',
    '../models/Class_8_random_forest_model.pkl',
    '../models/Class_9_xgboost_model.pkl'
]

# joblib.load unpickles arbitrary Python objects, so only load model files
# that were produced by this project.
models = [joblib.load(path) for path in model_paths]

# 2. Load training and test datasets
train_df = pd.read_csv('../data/raw/train.csv')
test_df = pd.read_csv('../data/raw/test.csv')

# Separate features and target variable
X_train = train_df.drop(columns=['id', 'target'])
y_train = train_df['target']
X_test = test_df.drop(columns=['id'])

# 3. Define the Voting Classifier
# Estimator names line up one-to-one with model_paths above.
estimator_names = ['rf1', 'rf2', 'rf3', 'xgb1', 'xgb2', 'xgb3', 'rf4', 'rf5', 'xgb4']
voting_clf = VotingClassifier(
    estimators=list(zip(estimator_names, models)),
    voting='soft'  # 'soft' uses probabilities
)

# 4. Train the voting classifier
# NOTE(review): VotingClassifier.fit fits clones of the estimators on
# (X_train, y_train), so the fitted state loaded from disk is not reused and
# each model is retrained on all features with the multi-class target.
# Confirm this is intended.
voting_clf.fit(X_train, y_train)

# 5. Make probability predictions on the test set
y_pred_proba = voting_clf.predict_proba(X_test)

# 6. Save results in sampleSubmission.csv format
sample_submission = pd.read_csv('../data/raw/sampleSubmission.csv')

# Create columns for each class
classes = sample_submission.columns[1:]  # First column is 'id', class names follow
submission_result = sample_submission.copy()

# Fail loudly if the model's class count does not match the submission layout,
# instead of silently dropping columns or hitting a bare IndexError.
if y_pred_proba.shape[1] != len(classes):
    raise ValueError(
        f"Expected {len(classes)} probability columns, got {y_pred_proba.shape[1]}"
    )

# Place the results correctly
# Assumes voting_clf.classes_ is ordered like the submission columns (TODO confirm).
for i, class_name in enumerate(classes):
    submission_result[class_name] = y_pred_proba[:, i]

# Save the results to ../submission folder (created on demand, like the
# models and predictions folders, so a fresh checkout does not fail here)
submission_path = '../submission/voting_classifier_probabilities.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission_result.to_csv(submission_path, index=False)

print(f"Voting Classifier Predictions with probabilities saved successfully to {submission_path}!")