In [127]:
import pandas as pd
pd.set_option('display.max_rows', 1000)
from sklearn.preprocessing import MinMaxScaler
from IPython.display import Markdown

from funcs.feature_engineering import Preprocessing, FeatureSelection
from funcs.clustering import Clustering
from funcs.integration_exploration import Integration, Exploration

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import logging
logging.getLogger('matplotlib.font_manager').disabled = True

In [128]:
# Data location forwarded to runAll(path=...), which passes it on to Integration.importdata.
# NOTE(review): left as an empty string — presumably importdata then uses its own default location; confirm.
path: str = ''

In [129]:
# Column names passed to runAll as `metricFeatures` (used there for the
# descriptive summary and for preprocessing).
metricFeatures: list[str] = [
    # admission / enrollment
    'Application order',
    'Previous qualification score',
    'Entry score',
    'Age at enrollment',
    # 1st period
    'N units credited 1st period',
    'N units taken 1st period',
    'N scored units 1st period',
    'N units approved 1st period',
    'Average grade 1st period',
    'N unscored units 1st period',
    # 2nd period
    'N units credited 2nd period',
    'N units taken 2nd period',
    'N scored units 2nd period',
    'N units approved 2nd period',
    'Average grade 2nd period',
    'N unscored units 2nd period',
    # other
    'Social Popularity',
]
# Column names passed to runAll as `categoricalFeatures`; inside runAll they are
# only forwarded to Exploration.describeData.
categoricalFeatures: list[str] = [
    'Application mode',
    'Marital status',
    'Course',
    'Previous qualification',
    'Nationality',
    'Mother\'s qualification',
    'Father\'s qualification',
    'Mother\'s occupation',
    'Father\'s occupation',
]
# Column names passed to runAll as `boolFeatures` and on to Preprocessing.runPreprocessing.
boolFeatures: list[str] = [
    'Morning shift participation',
    'Displaced',
    'Special needs',
    'Debtor',
    'Regularized Fees',
    'Gender_Male',
    'External Funding',
    'International',
]

In [130]:
# Columns that make up the "Academic" clustering perspective.
# NOTE(review): apart from 'Entry score', none of these names appear in the
# metric/categorical/bool lists of this notebook — presumably they are derived
# inside Preprocessing.runPreprocessing; confirm.
academicFeatures: list[str] = [
    'Entry score',
    'Average scored units', 'Average grades',
    'Average units taken', 'Average units approved',
    'Average units credited', 'Average unscored units',
    'Success',
]

# Columns that make up the "Demographic" clustering perspective.
demographicFeatures: list[str] = [
    # also listed in metricFeatures
    'Age at enrollment',
    # also listed in categoricalFeatures
    'Marital status', 'Course', 'Previous qualification', 'Nationality',
    'Mother\'s qualification', 'Father\'s qualification',
    'Mother\'s occupation', 'Father\'s occupation',
    # also listed in boolFeatures
    'Special needs', 'International', 'Debtor', 'Regularized Fees', 'Gender_Male',
]

In [131]:
def _displayLabelSummary(clusteredDF: pd.DataFrame, fittedScaler: "MinMaxScaler") -> None:
    """Display per-cluster descriptive statistics after undoing the scaling.

    Args:
        clusteredDF (pd.DataFrame): Feature columns plus a "label" column.
        fittedScaler (MinMaxScaler): Scaler whose ``inverse_transform`` is applied to
            every column except "label".
    """
    features = clusteredDF.drop("label", axis=1)
    # inverse_transform returns a bare array, so index and column names are restored by hand.
    unscaled = pd.DataFrame(
        fittedScaler.inverse_transform(features),
        index=features.index,
        columns=features.columns,
    )
    unscaled = unscaled.join(clusteredDF["label"])
    display(unscaled.groupby(['label']).describe().T)


def runAll(
    metricFeatures: list[str],
    categoricalFeatures: list[str],
    boolFeatures: list[str],
    academicFeatures: list[str],
    demographicFeatures: list[str],
    removedAcademicFeatures: list[str] | None = None,
    removedDemographicFeatures: list[str] | None = None,
    path: str = '',
    *,
    describeDF: bool = False,
    featureSelection: bool = False,
    getPairPlot: bool = False,
    elbowGraph: bool = True,
    silhouetteGraph: bool = True,
    dendrogram: bool = True,
    clusterProfiling: bool = False,
    nClustersAcademic: int = 0,
    nClustersDemographic: int = 0,
) -> None:
    """Run the notebook pipeline up to the stage selected by the flags.

    The function loads the data and then walks through a fixed sequence of
    stages; the first stage whose condition holds displays its output and
    returns, so later stages are only reached when the earlier ones are
    switched off:

    1. ``describeDF`` -> display ``Exploration.describeData`` and stop.
    2. Preprocess with ``Preprocessing.runPreprocessing(..., grouping="high")``.
    3. ``featureSelection`` -> display correlation checks (and pair plots when
       ``getPairPlot`` is set) for both perspectives and stop.
    4. ``nClustersAcademic == 0`` -> call ``Clustering.somWrapper`` with 0
       clusters and stop; otherwise label the academic perspective.
    5. ``nClustersDemographic == 0`` -> call ``Clustering.runKMeans`` with 0
       clusters and stop; otherwise label the demographic perspective.
    6. ``clusterProfiling`` -> display cluster profiles / per-variable graphs
       for both perspectives and the merged result, then stop.
    7. Otherwise display ``describe()`` tables per cluster for both
       perspectives (in un-scaled units) and for the merged ``final_groups``.

    NOTE(review): steps 4 and 5 with 0 clusters presumably only produce the
    diagnostics needed to pick a cluster count — confirm in ``funcs.clustering``.

    Args:
        metricFeatures (list[str]): Passed to ``describeData`` and ``runPreprocessing``.
        categoricalFeatures (list[str]): Passed to ``describeData`` only.
        boolFeatures (list[str]): Passed to ``runPreprocessing``.
        academicFeatures (list[str]): Columns of the academic perspective.
        demographicFeatures (list[str]): Columns of the demographic perspective.
        removedAcademicFeatures (list[str] | None, optional): Academic columns passed to
            ``runPreprocessing`` as removals. Defaults to None, treated as an empty list.
        removedDemographicFeatures (list[str] | None, optional): Demographic columns passed
            to ``runPreprocessing`` as removals. Defaults to None, treated as an empty list.
        path (str, optional): Forwarded to ``Integration.importdata``. Defaults to ''.
        describeDF (bool, optional): Stop after the descriptive summary. Defaults to False.
        featureSelection (bool, optional): Stop after the correlation checks. Defaults to False.
        getPairPlot (bool, optional): Also show pair plots; only consulted when
            ``featureSelection`` is True. Defaults to False.
        elbowGraph (bool, optional): Forwarded to ``Clustering.runKMeans``. Defaults to True.
        silhouetteGraph (bool, optional): Forwarded to ``Clustering.runKMeans``. Defaults to True.
        dendrogram (bool, optional): Forwarded to ``Clustering.runKMeans``. Defaults to True.
        clusterProfiling (bool, optional): Show profile plots instead of ``describe()``
            tables in the final stage. Defaults to False.
        nClustersAcademic (int, optional): Cluster count handed to ``somWrapper``. Defaults to 0.
        nClustersDemographic (int, optional): Cluster count handed to ``runKMeans``. Defaults to 0.
    """
    # Fresh lists per call: a literal [] default would be one object shared by
    # every call and would leak state if anything downstream mutated it.
    if removedAcademicFeatures is None:
        removedAcademicFeatures = []
    if removedDemographicFeatures is None:
        removedDemographicFeatures = []

    dataDF: pd.DataFrame = Integration.importdata(path)
    dataAcademic: pd.DataFrame
    dataDemographic: pd.DataFrame
    scaler: MinMaxScaler
    academicScaler: MinMaxScaler
    demographicScaler: MinMaxScaler

    # Stage 1: raw-data description only.
    if describeDF:
        display(Exploration.describeData(dataDF, metricFeatures, categoricalFeatures))
        return

    # Stage 2: preprocessing; yields the full frame, both perspectives and their scalers.
    (
        dataDF,
        dataAcademic,
        dataDemographic,
        scaler,
        academicScaler,
        demographicScaler,
    ) = Preprocessing.runPreprocessing(
        dataDF,
        metricFeatures,
        boolFeatures,
        academicFeatures,
        demographicFeatures,
        removedAcademicFeatures,
        removedDemographicFeatures,
        grouping="high",
    )

    # Stage 3: feature-selection diagnostics.
    if featureSelection:
        if getPairPlot:
            display(FeatureSelection.pairPlots(dataAcademic, "Academic"))
            display(FeatureSelection.pairPlots(dataDemographic, "Demographic"))

        display(FeatureSelection.checkCorr(dataAcademic))
        display(FeatureSelection.checkCorr(dataDemographic))
        return

    # Stage 4: academic perspective. With 0 clusters the wrapper is called for
    # its own output only and its return value is discarded.
    if nClustersAcademic == 0:
        Clustering.somWrapper(dataAcademic, "Academic", nClustersAcademic)
        return

    dataAcademic = Clustering.somWrapper(dataAcademic, "Academic", nClustersAcademic)

    # Stage 5: demographic perspective, same 0-cluster convention as above.
    if nClustersDemographic == 0:
        Clustering.runKMeans(dataDemographic, nClustersDemographic, elbowGraph=elbowGraph, silhouetteGraph=silhouetteGraph, dendrogram=dendrogram)
        return

    dataDemographic = Clustering.runKMeans(dataDemographic, nClustersDemographic, elbowGraph=elbowGraph, silhouetteGraph=silhouetteGraph, dendrogram=dendrogram)

    # Stage 6: graphical cluster profiles.
    if clusterProfiling:
        display(Markdown("### Academic Perspective"))
        FeatureSelection.clusterProfiles(dataAcademic, ["label"], (32, 10))
        FeatureSelection.getVariableClusterGraphs(dataAcademic, "label")

        display(Markdown("### Demographic Perspective"))
        FeatureSelection.clusterProfiles(dataDemographic, ["label"], (32, 10))
        FeatureSelection.getVariableClusterGraphs(dataDemographic, "label")

        display(Markdown("### Final Results"))
        # NOTE(review): mergePerspectives is evaluated twice on purpose here —
        # scaleData may modify the frame it receives, so the graphs get their
        # own un-scaled merge. If scaleData is confirmed pure, merge once and reuse.
        FeatureSelection.clusterProfiles(Preprocessing.scaleData(Clustering.mergePerspectives(dataDF, dataAcademic, dataDemographic, scaler))[0], ["final_groups"], (50, 10))
        FeatureSelection.getVariableClusterGraphs((Clustering.mergePerspectives(dataDF, dataAcademic, dataDemographic, scaler)), "final_groups")
        return

    # Stage 7: tabular summaries per cluster.
    display(Markdown("### Academic Perspective"))
    _displayLabelSummary(dataAcademic, academicScaler)

    display(Markdown("### Demographic Perspective"))
    _displayLabelSummary(dataDemographic, demographicScaler)

    display(Markdown("### Final Results"))
    display(Clustering.mergePerspectives(dataDF, dataAcademic, dataDemographic, scaler).groupby(['final_groups']).describe().T)

In [132]:
# Academic-perspective columns handed to runAll as `removedAcademicFeatures`.
removedAcademicFeatures: list[str] = ['Average units taken', 'Average units credited', 'Average unscored units']

# Demographic-perspective columns handed to runAll as `removedDemographicFeatures`.
# NOTE(review): the 'Column-value' entries are presumably the encoded (dummy)
# column names produced during preprocessing, so spellings such as
# 'Course-Equinculture' must match that output exactly — confirm before editing.
removedDemographicFeatures: list[str] = [
    # plain flag columns
    'Special needs', 'International', 'Debtor', 'Regularized Fees',
    # marital status
    'Marital status-together', 'Marital status-nan',
    # parents' qualification / occupation (original ordering kept)
    'Father\'s qualification-None',
    'Mother\'s occupation-Service Industry', 'Mother\'s occupation-White collar Jobs',
    'Father\'s occupation-Professional Fields', 'Father\'s occupation-Service Industry',
    'Father\'s occupation-Technical and Skilled Trades', 'Father\'s occupation-White collar Jobs',
    'Father\'s occupation-nan',
    'Mother\'s qualification-None', 'Mother\'s qualification-nan',
    'Mother\'s occupation-Technical and Skilled Trades', 'Mother\'s occupation-nan',
    'Father\'s occupation-Security and Armed Forces',
    'Father\'s qualification-nan',
    # nationality
    'Nationality-Angolan', 'Nationality-Brazilian', 'Nationality-Cape Verdean',
    'Nationality-Colombian', 'Nationality-Cuban', 'Nationality-Dutch',
    'Nationality-English', 'Nationality-German', 'Nationality-Guinean',
    'Nationality-Italian', 'Nationality-Lithuanian', 'Nationality-Mexican',
    'Nationality-Moldova', 'Nationality-Mozambican', 'Nationality-Portuguese',
    'Nationality-Romanian', 'Nationality-Russian', 'Nationality-Santomean',
    'Nationality-Spanish', 'Nationality-Turkish', 'Nationality-Ukrainian',
    'Nationality-nan',
    # course
    'Course-Advertising and Marketing Management', 'Course-Agronomy',
    'Course-Animation and Multimedia Design', 'Course-Basic Education',
    'Course-Biofuel Production Technologies', 'Course-Communication Design',
    'Course-Equinculture', 'Course-Informatics Engineering',
    'Course-Journalism and Communication', 'Course-Management',
    'Course-Management (evening attendance)', 'Course-Nursing',
    'Course-Oral Hygiene', 'Course-Social Service',
    'Course-Social Service (evening attendance)', 'Course-Tourism',
    'Course-Veterinary Nursing', 'Course-nan',
    # previous qualification
    'Previous qualification-Higher', 'Previous qualification-Lower',
    'Previous qualification-Middle', 'Previous qualification-nan',
]

In [134]:
# Full run: both stop-early flags are off and both cluster counts are non-zero,
# so runAll proceeds to the per-cluster describe() tables.
runAll(
    # feature definitions
    metricFeatures=metricFeatures,
    categoricalFeatures=categoricalFeatures,
    boolFeatures=boolFeatures,
    academicFeatures=academicFeatures,
    demographicFeatures=demographicFeatures,
    removedAcademicFeatures=removedAcademicFeatures,
    removedDemographicFeatures=removedDemographicFeatures,
    path=path,
    # stop-early stages
    describeDF=False,
    featureSelection=False,
    getPairPlot=False,
    # k-means diagnostics
    elbowGraph=True,
    silhouetteGraph=True,
    dendrogram=True,
    # final stage
    clusterProfiling=False,
    nClustersAcademic=3,
    nClustersDemographic=3,
)



### Academic Perspective

Unnamed: 0,label,0,1,2
Entry score,count,2106.0,816.0,1323.0
Entry score,mean,128.158879,126.325245,125.426228
Entry score,std,14.107065,16.621587,13.351658
Entry score,min,95.0,95.0,95.0
Entry score,25%,119.1,115.425,117.0
Entry score,50%,127.0,124.8,124.3
Entry score,75%,135.7,136.1,133.0
Entry score,max,190.0,183.5,190.0
Average scored units,count,2106.0,816.0,1323.0
Average scored units,mean,7.98433,3.811275,9.377929


### Demographic Perspective

Unnamed: 0,label,0,1,2
Age at enrollment,count,1380.0,1666.0,1199.0
Age at enrollment,mean,25.103623,21.180672,23.271059
Age at enrollment,std,8.839155,4.8833,7.946656
Age at enrollment,min,18.0,17.0,17.0
Age at enrollment,25%,19.0,18.0,19.0
Age at enrollment,50%,21.0,19.0,20.0
Age at enrollment,75%,29.0,21.0,24.0
Age at enrollment,max,60.0,62.0,70.0
Gender_Male,count,1380.0,1666.0,1199.0
Gender_Male,mean,0.335507,0.39976,0.274395


### Final Results

Unnamed: 0,final_groups,0,1,2,3,4,5,6,7,8
Entry score,count,801.0,543.0,373.0,632.0,322.0,673.0,300.0,194.0,407.0
Entry score,mean,128.772534,125.954328,124.452547,127.193829,128.811491,128.33477,124.483,125.047423,125.614005
Entry score,std,14.612934,13.508768,12.634565,13.661685,17.311064,13.876708,15.910383,16.08858,13.758851
Entry score,min,98.1,96.0,95.0,95.0,95.0,95.0,95.0,96.0,95.0
Entry score,25%,119.4,117.55,117.1,118.075,117.925,120.0,113.625,113.35,116.15
Entry score,50%,127.5,124.6,123.3,126.3,127.3,127.0,123.35,122.8,124.8
Entry score,75%,136.3,132.95,131.8,134.5,140.0,135.0,133.225,133.3,134.1
Entry score,max,190.0,190.0,170.0,174.7,183.5,180.0,170.0,176.7,170.0
Gender_Male,count,801.0,543.0,373.0,632.0,322.0,673.0,300.0,194.0,407.0
Gender_Male,mean,0.274657,0.484346,0.343164,0.172468,0.568323,0.23477,0.52,0.474227,0.366093
