Load the required libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import FunctionTransformer

import warnings
# NOTE(review): this silences every Python warning for the rest of the session,
# including any that scikit-learn emits while fitting the models below.
warnings.filterwarnings('ignore')

- load the data,<br>
  when using pd.read_csv, make sure to specify:<br>
  na_values=['?']<br>
  as the data has many missing values coded as '?'<br>
  this setting will turn them into NaNs, which makes subsequent processing a lot easier<br>
  split train.csv into X_train and y_train, using the last feature 'Class' for y_train<br>
  split test.csv the same way

In [None]:
# Read both CSV files; every '?' entry is parsed as NaN
train_data = pd.read_csv(
    'https://raw.githubusercontent.com/bpfa/data_for_compx310_2023/main/lab10/train.csv',
    na_values=['?'],
)
test_data = pd.read_csv(
    'https://raw.githubusercontent.com/bpfa/data_for_compx310_2023/main/lab10/test.csv',
    na_values=['?'],
)

# Target is the 'Class' column; the features are all remaining columns
X_train, y_train = train_data.drop(columns='Class'), train_data['Class']
X_test, y_test = test_data.drop(columns='Class'), test_data['Class']

  this time we want to use pipelines for all processing, to make sure
  that train and test (and validation splits and cross-validation)
  all see correctly processed data in the same way

  So you will need to set up one pipeline for categorical features
  combining a SimpleImputer using a constant value of "missing"
  with a OneHotEncoder
  
  one pipeline for numeric features
  combining a SimpleImputer using the "mean" for imputation
  with a StandardScaler

  these two pipelines can be put together using a
  sklearn.compose.ColumnTransformer

  finally you will then combine this preprocessor
  with different classifiers, again using a pipeline (see below)


In [None]:

# Columns with object dtype are handled as categorical, all others as numeric
categorical_features = list(X_train.select_dtypes(include='object').columns)
numerical_features = list(X_train.select_dtypes(exclude='object').columns)

# Categorical branch: fill gaps with the constant "missing", then one-hot encode.
# handle_unknown='ignore' lets the encoder accept categories not seen during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Numeric branch: fill gaps with the column mean, then standardise
numerical_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

# Route each group of columns through its own branch
column_branches = [
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

- define a function repeated_tt(X, y, classifier, num_repeats, random_seed, val_fraction)

  This function will split X into train + val (using sklearn.model_selection.train_test_split) num_repeats times, fit the classifier on train and use accuracy_score to compute the accuracy on the val set<br>
  this function will return the mean value of all num_repeats validation accuracies
  <br>
  to be sure that you get different random splits each time, always specify
  random_state=random_seed + i, where i is in range(0, num_repeats)

In [None]:
# Helper that wraps array-like data in a pandas DataFrame
def convert_to_dataframe(X):
    """Return ``X`` wrapped in a new ``pandas.DataFrame``.

    ``X`` is handed straight to the ``pd.DataFrame`` constructor, so anything
    that constructor accepts (arrays, nested lists, dicts, DataFrames) works.
    """
    frame = pd.DataFrame(data=X)
    return frame

# define a function repeated_tt(X, y, classifier, num_repeats, random_seed, val_fraction)
def repeated_tt(X, y, classifier, num_repeats, random_seed, val_fraction):
    """Mean validation accuracy over repeated random train/validation splits.

    Parameters
    ----------
    X, y : features and labels, passed unchanged to ``train_test_split``.
    classifier : estimator (or full pipeline) exposing ``fit`` and ``predict``.
        It is fitted in place (no copy is made), so on return it is left
        fitted on the training part of the last split.
    num_repeats : int
        Number of independent splits to evaluate; must be at least 1.
    random_seed : int
        Base seed; repeat ``i`` uses ``random_state=random_seed + i``.
    val_fraction : float
        Passed as ``test_size`` to ``train_test_split``.

    Returns
    -------
    The mean of the ``num_repeats`` validation accuracies.

    Raises
    ------
    ValueError
        If ``num_repeats`` is smaller than 1 (the mean would be undefined).
    """
    if num_repeats < 1:
        raise ValueError("num_repeats must be at least 1")

    accuracy_scores = []
    for i in range(num_repeats):
        # random_seed + i yields a different but reproducible split per repeat.
        # Local names deliberately differ from the notebook-level X_train/y_train.
        X_fit, X_val, y_fit, y_val = train_test_split(
            X, y, test_size=val_fraction, random_state=random_seed + i
        )

        # No extra Pipeline wrapper or DataFrame conversion is needed: the
        # split keeps the input's type, and the classifier is fitted directly.
        classifier.fit(X_fit, y_fit)
        accuracy_scores.append(accuracy_score(y_val, classifier.predict(X_val)))

    return np.mean(accuracy_scores)


- define a function repeated_cv10(X, y, classifier, num_repeats, random_seed)<br>
  This function will use sklearn.model_selection.cross_val_score to run 10-fold cross-validation num_repeats times<br>
  cross_val_score returns 10 accuracy scores that you should average into one value<br>
  This function will return the mean accuracy over the num_repeats 10-fold cv accuracy means, ie. a mean of means<br>
  cross_val_score does NOT shuffle the data automatically, so you will need to use sklearn.model_selection.StratifiedKFold to specify data shuffling explicitly as an argument to cross_val_score(..., cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=random_seed+i), ...) where again i is in range(0, num_repeats)

In [None]:
# define a function repeated_cv10(X, y, classifier, num_repeats, random_seed)
def repeated_cv10(X, y, classifier, num_repeats, random_seed):
    """Mean accuracy over repeated, shuffled, stratified 10-fold cross-validation.

    Parameters
    ----------
    X, y : features and labels, passed unchanged to ``cross_val_score``.
    classifier : estimator (or full pipeline) to evaluate.
    num_repeats : int
        Number of 10-fold cross-validation runs; must be at least 1.
    random_seed : int
        Base seed; repeat ``i`` shuffles with ``random_state=random_seed + i``.

    Returns
    -------
    The mean over the ``num_repeats`` per-run means, i.e. a mean of means.

    Raises
    ------
    ValueError
        If ``num_repeats`` is smaller than 1 (the mean would be undefined).
    """
    if num_repeats < 1:
        raise ValueError("num_repeats must be at least 1")

    accuracy_scores = []
    for i in range(num_repeats):
        # The splitter must be rebuilt inside the loop with a per-repeat seed.
        # A single splitter seeded with random_seed would hand identical folds
        # to every repeat, making the repetition pointless for deterministic
        # classifiers.
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_seed + i)

        # cross_val_score returns one score per fold; average them per run
        scores = cross_val_score(classifier, X, y, cv=skf)
        accuracy_scores.append(np.mean(scores))

    return np.mean(accuracy_scores)


- then you will compare a diverse set of classifiers:<br>
  LogisticRegression (with default settings)<br>
  RandomForestClassifier with max_features in [5, "sqrt", 25]<br>
  GaussianNB (with default settings)<br>
  KNeighborsClassifier with n_neighbors in [1, 5, 25]<br><br>
  for these 8 classifiers, always setup a pipeline<br>
  pipe_XX = Pipeline([('preproc', my_column_transformer), ('XX', XX(...))])<br>
  where XX is the respective classifier,<br> e.g.
  pipe_rf_5 = Pipeline([('preproc', my_column_transformer), ('rf_5', RandomForestClassifier(max_features=5))])

In [None]:
# compare a diverse set of classifiers

# RandomForestClassifier is randomised; a fixed seed makes the accuracy table
# identical on every Restart & Run All instead of drifting between runs.
FOREST_SEED = 0

classifiers = {
    # LogisticRegression (with default settings)
    'Logistic Regression': LogisticRegression(),

    # RandomForestClassifier with max_features in [5, "sqrt", 25]
    'Random Forest (max_features=5)': RandomForestClassifier(max_features=5, random_state=FOREST_SEED),
    'Random Forest (max_features=sqrt)': RandomForestClassifier(max_features='sqrt', random_state=FOREST_SEED),
    'Random Forest (max_features=25)': RandomForestClassifier(max_features=25, random_state=FOREST_SEED),

    # GaussianNB (with default settings)
    'Gaussian Naive Bayes': GaussianNB(),

    # KNeighborsClassifier with n_neighbors in [1, 5, 25]
    'K-Nearest Neighbors (n_neighbors=1)': KNeighborsClassifier(n_neighbors=1),
    'K-Nearest Neighbors (n_neighbors=5)': KNeighborsClassifier(n_neighbors=5),
    'K-Nearest Neighbors (n_neighbors=25)': KNeighborsClassifier(n_neighbors=25)
}

# for these 8 classifiers, always setup a pipeline: preprocessing followed by
# the classifier. Keys look like 'pipe_logistic_regression'. Every pipeline
# references the same `preprocessor` object.
classifier_pipelines = {}
for classifier_name, classifier in classifiers.items():
    pipe_name = f'pipe_{classifier_name.lower().replace(" ", "_")}'
    classifier_pipeline = Pipeline([
        ('preproc', preprocessor),
        ('classifier', classifier)
    ])
    classifier_pipelines[pipe_name] = classifier_pipeline


> for each of the 8 classifier pipelines compute 5 different accuracy estimates:

  >> a) repeated_tt(X_train, y_train, pipe_XX, 1, YOUR_ID, 0.25)
  
  >> b) repeated_tt(X_train, y_train, pipe_XX, 100, YOUR_ID, 0.25)

  >> c) repeated_cv10(X_train, y_train, pipe_XX, 1, YOUR_ID)
  
  >> d) repeated_cv10(X_train, y_train, pipe_XX, 10, YOUR_ID)

  >> e) accuracy_score(y_test, pipe_XX.fit(X_train, y_train).predict(X_test))

  > Produce one summary table with all these 8*5=40 accuracy scores
  using one row per classifier pipeline, with 5 columns each
  in each column highlight the highest accuracy (use bold, or some colour)

In [None]:




# Base seed used for every split below
MY_ID = 1481257

# Maps pipeline name -> [a, b, c, d, e] accuracy estimates
results = {}

for pipe_name, pipeline in classifier_pipelines.items():
    # a) single train/validation split
    single_tt = repeated_tt(X_train, y_train, pipeline, 1, MY_ID, 0.25)

    # b) 100 repeated train/validation splits
    many_tt = repeated_tt(X_train, y_train, pipeline, 100, MY_ID, 0.25)

    # c) single 10-fold cross-validation (repeated_cv10 with 1 repeat)
    single_cv = repeated_cv10(X_train, y_train, pipeline, 1, MY_ID)

    # d) 10 repeated 10-fold cross-validations (repeated_cv10 with 10 repeats)
    many_cv = repeated_cv10(X_train, y_train, pipeline, 10, MY_ID)

    # e) fit on the full training set, score on the held-out test set
    test_predictions = pipeline.fit(X_train, y_train).predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)

    results[pipe_name] = [single_tt, many_tt, single_cv, many_cv, test_accuracy]

# One row per classifier pipeline, one column per estimate
estimate_labels = ['Single TT', 'Repeated(100) TT', 'Single CV', 'Repeated(10) CV', 'Test Set']
estimates_by_label = pd.DataFrame(results, index=estimate_labels)
result_df = estimates_by_label.transpose()

# Styler callback that marks the largest value(s) of a column
def highlight_max(s):
    """Return one CSS string per entry of ``s``.

    Entries equal to ``s.max()`` get ``'background-color: yellow'``; every
    other entry gets the empty string. Ties are all highlighted.
    """
    top = s.max()
    return ['background-color: yellow' if value == top else '' for value in s]

# Summary table of the 8*5=40 accuracy scores: one row per classifier pipeline,
# five columns; the styling function is applied column by column (axis=0), so
# the highest accuracy in each column is highlighted.
result_df.style.apply(highlight_max, axis=0)





Unnamed: 0,Single TT,Repeated(100) TT,Single CV,Repeated(10) CV,Test Set
pipe_logistic_regression,0.961589,0.965166,0.966854,0.966854,0.972185
pipe_random_forest_(max_features=5),0.978808,0.982543,0.9831,0.983366,0.972185
pipe_random_forest_(max_features=sqrt),0.974834,0.984013,0.984758,0.984194,0.976159
pipe_random_forest_(max_features=25),0.982781,0.987351,0.989067,0.988338,0.980132
pipe_gaussian_naive_bayes,0.340397,0.290318,0.321847,0.321847,0.356291
pipe_k-nearest_neighbors_(n_neighbors=1),0.95894,0.963007,0.96321,0.96321,0.960265
pipe_k-nearest_neighbors_(n_neighbors=5),0.965563,0.967205,0.970173,0.970173,0.970861
pipe_k-nearest_neighbors_(n_neighbors=25),0.954967,0.958437,0.959564,0.959564,0.969536


  >   Which of the four "estimators":
*   1 repeated_tt
*   100 repeated_tt
*   1 repeated_cv
*   10 repeated_cv
<br><br>
  is most reliably identifying the
  method with the best "test" accuracy?

  > **Answer:**
  >> The pipe_random_forest_(max_features=25) classifier exhibits the best accuracy scores, so I will be considering this classifier as I answer this question.

>>The "Repeated(10) CV" estimator (0.988602) has the highest accuracy, making it the most reliable in identifying the method with the best "test" accuracy for this classifier.
<br><br><br>
>>The "Single CV" estimator (0.988074) also has the second highest accuracy, which makes it quite reliable.

>>The "Repeated(100) TT" estimator (0.987245) follows at third most accurate, indicating good accuracy but slightly less reliability compared to the CV estimators.

>>The "Single TT" estimator (0.980132) is the fourth most accurate, making it less reliable than the other three but more reliable than the "Test Set" accuracy (0.978808).
<br><br>
>>In conclusion, of the four estimators, the "Repeated(10) CV" estimator is most reliably identifying the method with the best "test" accuracy.