In [1]:
import pandas as pd

In [2]:
# Load the dataset from the notebook's working directory. The split cell
# further down reads this frame through the module-level name `df`.
df = pd.read_csv('fairjob.csv')

In [3]:
import numpy as np
import pandas as pd
import os

def stratified_split_on_click(df, test_size=0.1, dev_size=0.1, random_state=42):
    """Split ``df`` into train, dev and test sets, stratified on the 'click' column.

    Each distinct value of 'click' is shuffled and sliced independently, so
    every split receives (up to integer truncation) the same share of each
    label. Rows are selected by position, so frames with repeated index
    labels are split without duplicating rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a 'click' column without missing values.
    test_size, dev_size : float
        Fraction of each label group assigned to the test / dev split. The
        per-group counts are truncated with ``int()``; the remainder of the
        group goes to train.
    random_state : int or None
        Seed for a private ``numpy.random.RandomState``. The process-wide
        NumPy RNG is left untouched.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, dev, test)``; rows keep their original index labels.

    Raises
    ------
    ValueError
        If 'click' is missing, contains missing values, or the requested
        fractions are negative or sum to more than 1.
    """
    # Ensure 'click' column exists in the dataframe
    if 'click' not in df.columns:
        raise ValueError("'click' column is missing from the dataframe.")

    # Negative fractions would turn the slices below into "from the end"
    # slices and leak rows between splits, so reject them up front.
    if test_size < 0 or dev_size < 0 or test_size + dev_size > 1:
        raise ValueError(
            "test_size and dev_size must be non-negative and sum to at most 1."
        )

    clicks = df['click']
    if clicks.isna().any():
        # A row without a label cannot be assigned to a stratum; failing is
        # safer than silently leaving it out of every split.
        raise ValueError("'click' column contains missing values; cannot stratify.")

    # Private generator: same legacy seeding as np.random.seed(random_state),
    # but without reseeding the global RNG for the rest of the notebook.
    rng = np.random.RandomState(random_state)

    labels = clicks.to_numpy()
    train_positions, dev_positions, test_positions = [], [], []

    # Sorted order keeps the 0-then-1 processing order for a binary target,
    # while also covering any additional label values.
    for click_value in sorted(clicks.unique()):
        positions = np.flatnonzero(labels == click_value).tolist()
        rng.shuffle(positions)  # Shuffle the positions for stratified splitting
        n_test = int(len(positions) * test_size)
        n_dev = int(len(positions) * dev_size)

        # Split data into test, dev, and train
        test_positions.extend(positions[:n_test])
        dev_positions.extend(positions[n_test:n_test + n_dev])
        train_positions.extend(positions[n_test + n_dev:])

    # Create the datasets (positional lookup, robust to duplicate labels)
    train = df.iloc[train_positions]
    dev = df.iloc[dev_positions]
    test = df.iloc[test_positions]

    # Check the distribution of 'click' in the splits
    print(f"Train click distribution: {train['click'].value_counts()}")
    print(f"Dev click distribution: {dev['click'].value_counts()}")
    print(f"Test click distribution: {test['click'].value_counts()}")

    return train, dev, test



def process_and_save_splits(df, output_dir, split_key="click", test_size=0.1, dev_size=0.1,
                            random_state=42):
    """Apply the stratified split and write each part to a CSV file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; forwarded to ``stratified_split_on_click``.
    output_dir : str
        Directory that receives the files; created if it does not exist.
    split_key : str
        Suffix used in the output file names (``train_<split_key>.csv``,
        ``dev_<split_key>.csv``, ``test_<split_key>.csv``). It only labels
        the files: the column used for stratification is decided by
        ``stratified_split_on_click``, not by this argument.
    test_size, dev_size : float
        Fractions forwarded to ``stratified_split_on_click``.
    random_state : int or None
        Seed forwarded to ``stratified_split_on_click`` so the split can be
        varied or reproduced from this entry point.

    Returns
    -------
    None
        The splits are written to disk without the index column.
    """
    train, dev, test = stratified_split_on_click(
        df, test_size=test_size, dev_size=dev_size, random_state=random_state
    )

    os.makedirs(output_dir, exist_ok=True)

    for name, dataset in (('train', train), ('dev', dev), ('test', test)):
        # os.path.join keeps the path valid whether or not output_dir ends
        # with a separator.
        dataset.to_csv(os.path.join(output_dir, f"{name}_{split_key}.csv"), index=False)

    print("Datasets saved.")
    

# Example usage
output_directory = "splits"

# `df` is the frame loaded from 'fairjob.csv' in an earlier cell.
# process_and_save_splits creates the output directory itself, so no
# separate os.makedirs call is needed before invoking it.
process_and_save_splits(df, output_directory, split_key="click")


Train click distribution: click
0    851791
1      5993
Name: count, dtype: int64
Dev click distribution: click
0    106473
1       748
Name: count, dtype: int64
Test click distribution: click
0    106473
1       748
Name: count, dtype: int64
Datasets saved.


In [4]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from multinomial_logistic_regression import MultinomialLogisticRegression
from multinomial_naive_bayes import MultinomialNaiveBayes

# Define the function to train and evaluate the models
def train_and_evaluate_models(splits_folder):
    """Fit both classifiers on the saved train split and report metrics for every split.

    Reads ``train_click.csv``, ``dev_click.csv`` and ``test_click.csv`` from
    ``splits_folder``, uses every column except 'click' as features, fits a
    MultinomialLogisticRegression and a MultinomialNaiveBayes on the train
    split, prints accuracy / precision / recall / F1 for each model on each
    split, and writes ``<split>_predictions.csv`` files (features, true
    label, and both models' predictions) back into ``splits_folder``.

    Parameters
    ----------
    splits_folder : str
        Directory holding the three split CSVs; also receives the
        prediction files.

    Returns
    -------
    None
    """
    # Split name -> CSV file holding that split.
    split_files = {
        'train': 'train_click.csv',
        'dev': 'dev_click.csv',
        'test': 'test_click.csv',
    }

    # Load each split and separate the target column from the features.
    features, targets = {}, {}
    for split_name, file_name in split_files.items():
        frame = pd.read_csv(os.path.join(splits_folder, file_name))
        features[split_name] = frame.drop(columns=['click'])
        targets[split_name] = frame['click']

    # Both models are constructed up front, then fitted one after the other.
    models = {
        'logistic': MultinomialLogisticRegression(learning_rate=0.01, iterations=1000),
        'naive_bayes': MultinomialNaiveBayes(),
    }

    # predictions[model][split] -> predicted labels for that split.
    predictions = {}
    for model_name, model in models.items():
        model.fit(features['train'], targets['train'])
        predictions[model_name] = {
            split_name: model.predict(features[split_name])
            for split_name in split_files
        }

    # One metrics entry per (model, split), keyed as "<model>_<split>".
    metrics = {
        f"{model_name}_{split_name}": calculate_metrics(targets[split_name], predicted)
        for model_name, per_split in predictions.items()
        for split_name, predicted in per_split.items()
    }

    # Report the four scores for every entry, followed by a separator line.
    report_fields = (
        ('Accuracy', 'accuracy'),
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1-Score', 'f1_score'),
    )
    for model_name, metric in metrics.items():
        for label, key in report_fields:
            print(f"{model_name} {label}: {metric[key]:.4f}")
        print("-" * 40)

    # Persist features, true labels and both models' predictions per split.
    for split_name in split_files:
        result_df = features[split_name].copy()
        result_df['click_true'] = targets[split_name]
        result_df['logistic_pred'] = predictions['logistic'][split_name]
        result_df['naive_bayes_pred'] = predictions['naive_bayes'][split_name]
        result_df.to_csv(f"{splits_folder}/{split_name}_predictions.csv", index=False)

    print("Predictions saved to output directory.")

def calculate_metrics(y_true, y_pred, average='weighted'):
    """Calculate accuracy, precision, recall, and F1-score.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.
    average : str or None
        Averaging mode forwarded to scikit-learn's ``precision_score``,
        ``recall_score`` and ``f1_score``. The default ``'weighted'`` keeps
        the previous behaviour. Weighted averaging weights each class by its
        support, so on a heavily imbalanced target the majority class
        dominates the result; pass ``'binary'`` or ``'macro'`` to see how
        the minority class is handled.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1_score'``.
        Undefined precision/recall/F1 values are reported as 0 instead of
        emitting a warning (``zero_division=0``).
    """
    # Shared keyword arguments for the three averaged scores.
    averaged_kwargs = {'average': average, 'zero_division': 0}

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, **averaged_kwargs),
        'recall': recall_score(y_true, y_pred, **averaged_kwargs),
        'f1_score': f1_score(y_true, y_pred, **averaged_kwargs),
    }

# Example usage
# The folder must already contain train_click.csv, dev_click.csv and
# test_click.csv; the *_predictions.csv files are written into it as well.
splits_folder = "splits"  # Path to your splits folder
train_and_evaluate_models(splits_folder)


  self.feature_log_probs[c, :] = np.log((X_c.sum(axis=0) + 1) / (X_c.sum() + n_features))


logistic_train Accuracy: 0.9930
logistic_train Precision: 0.9861
logistic_train Recall: 0.9930
logistic_train F1-Score: 0.9895
----------------------------------------
logistic_dev Accuracy: 0.9930
logistic_dev Precision: 0.9861
logistic_dev Recall: 0.9930
logistic_dev F1-Score: 0.9895
----------------------------------------
logistic_test Accuracy: 0.9930
logistic_test Precision: 0.9861
logistic_test Recall: 0.9930
logistic_test F1-Score: 0.9895
----------------------------------------
naive_bayes_train Accuracy: 0.9930
naive_bayes_train Precision: 0.9861
naive_bayes_train Recall: 0.9930
naive_bayes_train F1-Score: 0.9895
----------------------------------------
naive_bayes_dev Accuracy: 0.9930
naive_bayes_dev Precision: 0.9861
naive_bayes_dev Recall: 0.9930
naive_bayes_dev F1-Score: 0.9895
----------------------------------------
naive_bayes_test Accuracy: 0.9930
naive_bayes_test Precision: 0.9861
naive_bayes_test Recall: 0.9930
naive_bayes_test F1-Score: 0.9895
---------------------