In [1]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.decomposition import PCA

def apply_pca(X, standardize=True):
    """Run principal component analysis on the feature table ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature table. Its column labels become the row labels of
        the loadings table and its row index is carried over to the scores.
    standardize : bool, default True
        When True, each column is centred on its mean and divided by its
        sample standard deviation before the PCA is fitted.

    Returns
    -------
    pca : PCA
        The fitted estimator.
    X_pca : pandas.DataFrame
        Component scores, one column per component (``PC1``, ``PC2``, ...),
        indexed like ``X``.
    loadings : pandas.DataFrame
        Transposed ``pca.components_``: rows are the original features,
        columns are the principal components.
    """
    if standardize:
        spread = X.std(axis=0)
        # A constant column has zero spread (and a single-row frame has NaN
        # spread). Dividing by it would fill the column with NaN, which
        # scikit-learn's PCA rejects. Scale such columns by 1 instead so they
        # simply become all-zero after centring.
        spread = spread.where(spread > 0, 1.0)
        X = (X - X.mean(axis=0)) / spread

    # Create principal components
    pca = PCA()
    scores = pca.fit_transform(X)

    component_names = [f"PC{i + 1}" for i in range(scores.shape[1])]

    # Keep X's row labels so the scores can be joined back onto the original
    # frame without misaligning rows when X does not use a default RangeIndex.
    X_pca = pd.DataFrame(scores, columns=component_names, index=X.index)

    # Create loadings
    loadings = pd.DataFrame(
        pca.components_.T,  # transpose so the columns are the components
        columns=component_names,
        index=X.columns,  # and the rows are the original features
    )

    return pca, X_pca, loadings

In [None]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X, y):
    """Rank the features of ``X`` by their mutual information with ``y``.

    Works on a copy of ``X``: object and category columns are replaced by
    integer codes, every integer-typed column is then flagged as discrete,
    and the scores are returned as a Series named ``"MI Scores"`` sorted
    from highest to lowest.
    """
    encoded = X.copy()

    # Swap each categorical column for its integer label codes
    categorical_cols = encoded.select_dtypes(["object", "category"]).columns
    for col in categorical_cols:
        codes, _uniques = encoded[col].factorize()
        encoded[col] = codes

    # After encoding, integer dtype is the marker for a discrete feature
    is_discrete = [
        pd.api.types.is_integer_dtype(dtype) for dtype in encoded.dtypes
    ]

    # Seed fixed at 0 so repeated runs give the same scores
    raw_scores = mutual_info_regression(
        encoded, y, discrete_features=is_discrete, random_state=0
    )

    # Label the scores by feature and put the most informative first
    labelled = pd.Series(raw_scores, name="MI Scores", index=encoded.columns)
    return labelled.sort_values(ascending=False)

In [None]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

def score_dataset(X, y, model=None):
    """Score a feature table with 5-fold cross-validated RMSLE.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Object and category columns are label-encoded on an
        internal copy, so the caller's frame is left unchanged.
    y : array-like
        Regression target passed straight to ``cross_val_score``.
    model : estimator, optional
        Estimator to evaluate. When omitted, a fresh ``XGBRegressor()`` is
        created for this call rather than one instance being built at
        definition time and shared by every call.

    Returns
    -------
    float
        Square root of the mean, over the 5 folds, of the mean squared log
        error (lower is better).
    """
    if model is None:
        model = XGBRegressor()

    # Work on a copy: the label encoding below must not leak into the
    # DataFrame the caller passed in.
    X = X.copy()

    # Label encoding for categorical columns
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()

    # The scorer returns the negated MSLE for each fold
    neg_msle = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )

    # Flip the sign back, average the folds, then take the root to get RMSLE
    return np.sqrt(-1 * neg_msle.mean())