In [3]:
# 24 November 2023
# CSC461 – Assignment3 – Machine Learning
# Komal Khizar
# FA20-BSE-096
# This task entails applying a Random Forest classifier to a gender prediction dataset,
# using two cross-validation techniques: Monte Carlo cross-validation and Leave P-Out cross-validation.
#  The focus is to evaluate and report the F1 scores for each method,
#  with freedom to select specific parameters for the cross-validation strategies.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, ShuffleSplit, LeavePOut
from sklearn.metrics import make_scorer, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the dataset
data = pd.read_csv('gender-prediction.csv')

# Preprocess the data: Encode categorical variables and scale data
label_encoders = {}
for column in ['beard', 'hair_length', 'scarf', 'eye_color']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])

X = data.drop('gender', axis=1)
y = data['gender']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Random Forest Classifier
rf_classifier = RandomForestClassifier()

# F1 scorer for multi-class classification
f1_scorer = make_scorer(f1_score, average='weighted')

# Monte Carlo Cross-Validation
mccv = ShuffleSplit(n_splits=100, test_size=0.3, random_state=42)
mccv_scores = cross_val_score(rf_classifier, X_scaled, y, cv=mccv, scoring=f1_scorer)

# Leave-P-Out Cross-Validation (Reduced P and Limited Iterations)
p = 2  # Smaller value of P
lpo = LeavePOut(p)
max_iterations = 100  # Limit the number of iterations for large datasets
lpo_scores = []
iteration = 0

for train_index, test_index in lpo.split(X_scaled):
    if iteration >= max_iterations:
        break
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    rf_classifier.fit(X_train, y_train)
    y_pred = rf_classifier.predict(X_test)
    lpo_scores.append(f1_score(y_test, y_pred, average='weighted'))

    iteration += 1

# Average F1 scores
mccv_avg_f1 = mccv_scores.mean()
lpo_avg_f1 = sum(lpo_scores) / len(lpo_scores)

print("Average F1 Score with Monte Carlo Cross-Validation:", mccv_avg_f1)
print("Average F1 Score with Leave-P-Out Cross-Validation:", lpo_avg_f1)


Average F1 Score with Monte Carlo Cross-Validation: 0.9742129630644399
Average F1 Score with Leave-P-Out Cross-Validation: 0.9766666666666666
