In [2]:
# Local runs only: execute this cell so the data-import location is the empty string.
path: str = ""

In [3]:
#! TODO: remember to remove the autoreload magics below before sharing the final notebook
%load_ext autoreload
%autoreload 2

import pandas as pd

from funcs.preparation import Preprocessing, FeatureSelection
from funcs.clustering import Clustering
from funcs.integration_exploration import Integration, Exploration

import warnings
# Install a process-wide filter that ignores every FutureWarning raised after this point
# (presumably to keep library deprecation notices out of the cell outputs).
warnings.simplefilter(action='ignore', category=FutureWarning)

# TODO: set the seaborn visual style for matplotlib here (no styling call follows this comment yet)

In [4]:
# Allow pandas to render up to 1000 rows before it truncates a displayed frame.
pd.options.display.max_rows = 1000

In [5]:
# Column-name groups handed to runAll: metric, categorical and boolean columns.
metricFeatures: list[str] = [
    "Application order",
    "Previous qualification score",
    "Entry score",
    "Age at enrollment",
    "N units credited 1st period",
    "N units taken 1st period",
    "N scored units 1st period",
    "N units approved 1st period",
    "Average grade 1st period",
    "N unscored units 1st period",
    "N units credited 2nd period",
    "N units taken 2nd period",
    "N scored units 2nd period",
    "N units approved 2nd period",
    "Average grade 2nd period",
    "N unscored units 2nd period",
    "Social Popularity",
]

categoricalFeatures: list[str] = [
    "Application mode",
    "Marital status",
    "Course",
    "Previous qualification",
    "Nationality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
]

boolFeatures: list[str] = [
    "Morning shift participation",
    "Displaced",
    "Special needs",
    "Debtor",
    "Regularized Fees",
    "Gender_Male",
    "External Funding",
    "International",
]

In [6]:
# Columns for the academic perspective (passed to runAll as academicFeatures).
# NOTE(review): several names here are not in metricFeatures, so they are presumably
# derived during preprocessing — confirm against funcs.preparation.
academicFeatures: list[str] = [
    "Entry score", "Average scored units", "Average grades",
    "Average units taken", "Average units approved", "Average units credited",
    "Average unscored units", "Success",
]

# Columns for the demographic perspective (passed to runAll as demographicFeatures).
demographicFeatures: list[str] = [
    "Age at enrollment", "Marital status", "Course",
    "Previous qualification", "Nationality",
    "Mother's qualification", "Father's qualification",
    "Mother's occupation", "Father's occupation",
    "Special needs", "International", "Debtor",
    "Regularized Fees", "Gender_Male",
]

In [7]:
def runAll(
    metricFeatures: list[str],
    categoricalFeatures: list[str],
    boolFeatures: list[str],
    academicFeatures: list[str],
    demographicFeatures: list[str],
    removedAcademicFeatures: list[str] | list = [],
    removedDemographicFeatures: list[str] | list = [],
    path: str = '',
    *,
    describeDF: bool = False,
    featureSelection: bool = False,
    getPairPlot: bool = False,
    elbowGraph: bool = True, 
    silhouetteGraph: bool = True, 
    dendrogram: bool = True,
    clusterProfiling: bool = False,
    nClustersAcademic: int = 0,
    nClustersDemographic: int = 0,
) -> None:
    """_summary_

    Args:
        metricFeatures (list[str]): _description_
        categoricalFeatures (list[str]): _description_
        boolFeatures (list[str]): _description_
        academicFeatures (list[str]): _description_
        demographicFeatures (list[str]): _description_
        path (str, optional): _description_. Defaults to ''.
        describeDF (bool, optional): _description_. Defaults to False.
        getPairPlot (bool, optional): No, just don't please. Defaults to False.
        nClustersAcademic (int, optional): _description_. Defaults to 4.
        nClustersDemographic (int, optional): _description_. Defaults to 4.
    """    
    dataDF: pd.DataFrame = Integration.importdata(path)

    if describeDF:
        Exploration.describeData(dataDF, metricFeatures, categoricalFeatures, getBoxPlots=True)
        return

    dataDF, dataAcademic, dataDemographic, scaler = Preprocessing.runPreprocessing(dataDF, metricFeatures, boolFeatures, academicFeatures, demographicFeatures, removedAcademicFeatures, removedDemographicFeatures, grouping="high")

    if featureSelection:
        if getPairPlot:
            FeatureSelection.pairPlots(dataAcademic, "Academic")
            FeatureSelection.pairPlots(dataDemographic, "Demographic")

        FeatureSelection.checkCorr(dataAcademic)
        FeatureSelection.checkCorr(dataDemographic)
        return

    if nClustersAcademic == 0:
        Clustering.somWrapper(dataAcademic, "Academic", nClustersAcademic)
        return
    
    dataAcademic = Clustering.somWrapper(dataAcademic, "Academic", nClustersAcademic)

    dataDemographic = Clustering.runKMeans(dataDemographic,nClustersDemographic, elbowGraph=elbowGraph, silhouetteGraph=silhouetteGraph, dendrogram=dendrogram)

    if clusterProfiling:
        FeatureSelection.clusterProfiles(dataAcademic, ["label"], (32, 10))
        FeatureSelection.getVariableClusterGraphs(dataAcademic)
        FeatureSelection.clusterProfiles(dataDemographic, ["label"], (32, 10))
        FeatureSelection.getVariableClusterGraphs(dataDemographic)
        return

    display(dataAcademic.groupby(['label']).describe().T)
    display(dataDemographic.groupby(['label']).describe().T)
    display(Clustering.mergePerspectives(dataDF, dataAcademic, dataDemographic, scaler))

In [8]:
# Academic columns handed to runAll as removedAcademicFeatures.
removedAcademicFeatures: list[str] = [
    "Average units taken", "Average units credited", "Average unscored units",
]

# Demographic columns handed to runAll as removedDemographicFeatures.
# NOTE(review): the "<feature>-<value>" names look like encoded category columns
# produced by preprocessing — confirm against funcs.preparation.
removedDemographicFeatures: list[str] = [
    "Special needs", "International", "Debtor", "Regularized Fees",
    "Marital status-together", "Marital status-nan",
    "Father's qualification-None",
    "Mother's occupation-Service Industry", "Mother's occupation-White collar Jobs",
    "Father's occupation-Professional Fields", "Father's occupation-Service Industry",
    "Father's occupation-Technical and Skilled Trades",
    "Father's occupation-White collar Jobs", "Father's occupation-nan",
    "Mother's qualification-None", "Mother's qualification-nan",
    "Mother's occupation-Technical and Skilled Trades", "Mother's occupation-nan",
    "Father's occupation-Security and Armed Forces",
    "Father's qualification-nan",
    "Nationality-Angolan", "Nationality-Brazilian", "Nationality-Cape Verdean",
    "Nationality-Colombian", "Nationality-Cuban", "Nationality-Dutch",
    "Nationality-English", "Nationality-German", "Nationality-Guinean",
    "Nationality-Italian", "Nationality-Lithuanian", "Nationality-Mexican",
    "Nationality-Moldova", "Nationality-Mozambican", "Nationality-Portuguese",
    "Nationality-Romanian", "Nationality-Russian", "Nationality-Santomean",
    "Nationality-Spanish", "Nationality-Turkish", "Nationality-Ukrainian",
    "Nationality-nan",
    "Course-Advertising and Marketing Management", "Course-Agronomy",
    "Course-Animation and Multimedia Design", "Course-Basic Education",
    "Course-Biofuel Production Technologies", "Course-Communication Design",
    "Course-Equinculture", "Course-Informatics Engineering",
    "Course-Journalism and Communication", "Course-Management",
    "Course-Management (evening attendance)", "Course-Nursing",
    "Course-Oral Hygiene", "Course-Social Service",
    "Course-Social Service (evening attendance)", "Course-Tourism",
    "Course-Veterinary Nursing", "Course-nan",
    "Previous qualification-Higher", "Previous qualification-Lower",
    "Previous qualification-Middle", "Previous qualification-nan",
]

In [None]:
# Final run: 3 academic and 3 demographic clusters, with every optional
# diagnostic plot and early-exit stage switched off.
runAll(
    metricFeatures=metricFeatures,
    categoricalFeatures=categoricalFeatures,
    boolFeatures=boolFeatures,
    academicFeatures=academicFeatures,
    demographicFeatures=demographicFeatures,
    removedAcademicFeatures=removedAcademicFeatures,
    removedDemographicFeatures=removedDemographicFeatures,
    path=path,
    describeDF=False, featureSelection=False, getPairPlot=False,
    elbowGraph=False, silhouetteGraph=False, dendrogram=False,
    clusterProfiling=False,
    nClustersAcademic=3, nClustersDemographic=3,
)