In [4]:
# Run this cell when executing the notebook locally.
# `path` is set to the empty string here; presumably it is a prefix for the
# location of the data files read by the import step — TODO confirm against
# Integration.importdata.
path: str = ''

In [10]:
%load_ext autoreload
%autoreload 2

import pandas as pd

from funcs.preprocessing import Preprocessing
from funcs.feature_selection import FeatureSelection
from funcs.clustering import Clustering
from funcs.integration_exploration import Integration
from funcs.integration_exploration import Exploration


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# TODO: set the seaborn visual style for matplotlib plots (no styling code follows this comment)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Integration & Exploration

In [6]:
# Column names grouped by variable kind (metric / categorical / boolean).
metricFeatures: list[str] = [
    "Application order",
    "Previous qualification score",
    "Entry score",
    "Age at enrollment",
    # first-period curricular-unit counters and grade
    "N units credited 1st period",
    "N units taken 1st period",
    "N scored units 1st period",
    "N units approved 1st period",
    "Average grade 1st period",
    "N unscored units 1st period",
    # second-period curricular-unit counters and grade
    "N units credited 2nd period",
    "N units taken 2nd period",
    "N scored units 2nd period",
    "N units approved 2nd period",
    "Average grade 2nd period",
    "N unscored units 2nd period",
    "Social Popularity",
]

categoricalFeatures: list[str] = [
    "Application mode",
    "Marital status",
    "Course",
    "Previous qualification",
    "Nationality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
]

boolFeatures: list[str] = [
    "Morning shift participation",
    "Displaced",
    "Special needs",
    "Debtor",
    "Regularized Fees",
    "Gender_Male",
    "External Funding",
    "International",
]

In [None]:
# Column names making up the "academic" feature view.
# NOTE(review): 'N units credited 1st period' is not listed here although its
# 2nd-period counterpart is — confirm whether that omission is intentional.
academicFeatures: list[str] = [
    "Previous qualification score",
    "Entry score",
    "N units taken 1st period",
    "N scored units 1st period",
    "N units approved 1st period",
    "Average grade 1st period",
    "N unscored units 1st period",
    "N units credited 2nd period",
    "N units taken 2nd period",
    "N scored units 2nd period",
    "N units approved 2nd period",
    "Average grade 2nd period",
    "N unscored units 2nd period",
]

# Column names making up the "demographic" feature view.
demographicFeatures: list[str] = [
    "Age at enrollment",
    "Marital status",
    "Course",
    "Previous qualification",
    "Nationality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Special needs",
    "International",
    "Debtor",
    "Regularized Fees",
    "Gender_Male",
]

In [None]:
def runAll(
    metricFeatures: list[str],
    categoricalFeatures: list[str],
    boolFeatures: list[str],
    academicFeatures: list[str],
    demographicFeatures: list[str],
    path: str = '',
    *,
    describeDF: bool = False,
    getPairPlot: bool = False,
    getClusters: bool = True,
    nClusters: int = 4
) -> None:
    """Run the pipeline end to end: import, describe or preprocess, plot, cluster.

    The data is loaded with ``Integration.importdata(path, False)``. When
    ``describeDF`` is true the function only calls ``Exploration.describeData``
    and returns; otherwise the data is preprocessed into an academic and a
    demographic view, and ``Clustering.somWrapper`` is run on each of them.

    Args:
        metricFeatures (list[str]): Column names forwarded as the metric
            features to the description and preprocessing steps.
        categoricalFeatures (list[str]): Column names forwarded as the
            categorical features to the description step.
        boolFeatures (list[str]): Column names forwarded as the boolean
            features to the preprocessing step.
        academicFeatures (list[str]): Column names forwarded to preprocessing
            for the academic view.
        demographicFeatures (list[str]): Column names forwarded to
            preprocessing for the demographic view.
        path (str, optional): Forwarded to ``Integration.importdata``.
            Defaults to ''.
        describeDF (bool, optional): If True, describe the imported data
            (with box plots enabled) and stop before preprocessing.
            Defaults to False.
        getPairPlot (bool, optional): If True, produce pair plots for both
            views before clustering. Defaults to False.
        getClusters (bool, optional): Forwarded to ``Clustering.somWrapper``.
            Defaults to True.
        nClusters (int, optional): Forwarded to ``Clustering.somWrapper``.
            Defaults to 4.
    """
    rawData: pd.DataFrame = Integration.importdata(path, False)

    # Description mode is exclusive: nothing past this point runs when it is on.
    if describeDF:
        Exploration.describeData(rawData, metricFeatures, categoricalFeatures, getBoxPlots=True)
        return

    # The wrapper returns three objects; only the two per-domain views are used below.
    _, academicView, demographicView = Preprocessing.preprocessingWrapper(
        rawData, metricFeatures, boolFeatures, academicFeatures, demographicFeatures
    )

    # Each view is paired with the name handed to the pair-plot helper.
    namedViews = (
        (academicView, "AcademicPairPlot"),
        (demographicView, "DemographicPairPlot"),
    )

    if getPairPlot:
        for view, plotName in namedViews:
            FeatureSelection.pairPlots(view, plotName)

    # Academic view is clustered first, then the demographic one.
    for view, _plotName in namedViews:
        Clustering.somWrapper(view, getClusters, nClusters)

In [None]:
# Run the full pipeline. runAll loads the data itself from `path`, so it takes
# the five feature lists (in the order its signature declares them) rather
# than a pre-loaded DataFrame.
runAll(
    metricFeatures,
    categoricalFeatures,
    boolFeatures,
    academicFeatures,
    demographicFeatures,
    path,
)