In [9]:
# Run this cell when working locally: the location prefix that is later handed
# to Integration.importdata stays empty.
path: str = ""

In [10]:
# TODO: remove the autoreload magics below before the notebook is finalised
%load_ext autoreload
%autoreload 2

import pandas as pd

from funcs.preparation import Preprocessing, FeatureSelection
from funcs.clustering import Clustering
from funcs.integration_exploration import Integration, Exploration

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# TODO: set the seaborn visual style for matplotlib here (no styling call follows this comment yet)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Allow up to 1000 rows to be rendered before pandas truncates a displayed frame.
pd.options.display.max_rows = 1000

In [12]:
# Names of the columns treated as metric (numeric) features.
metricFeatures: list[str] = [
    # application / entry
    "Application order",
    "Previous qualification score",
    "Entry score",
    "Age at enrollment",
    # 1st period
    "N units credited 1st period",
    "N units taken 1st period",
    "N scored units 1st period",
    "N units approved 1st period",
    "Average grade 1st period",
    "N unscored units 1st period",
    # 2nd period
    "N units credited 2nd period",
    "N units taken 2nd period",
    "N scored units 2nd period",
    "N units approved 2nd period",
    "Average grade 2nd period",
    "N unscored units 2nd period",
    # other
    "Social Popularity",
]
# Names of the columns treated as categorical features.
categoricalFeatures: list[str] = [
    "Application mode",
    "Marital status",
    "Course",
    "Previous qualification",
    "Nationality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
]
# Names of the columns treated as boolean (yes/no) features.
boolFeatures: list[str] = [
    "Morning shift participation",
    "Displaced",
    "Special needs",
    "Debtor",
    "Regularized Fees",
    "Gender_Male",
    "External Funding",
    "International",
]

In [13]:
# Columns that make up the academic perspective.
# NOTE(review): "Success" is not in any of the raw feature lists above; presumably
# it is created during preprocessing - confirm.
academicFeatures: list[str] = [
    "Entry score",
    # 1st period
    "N units credited 1st period",
    "N units taken 1st period",
    "N scored units 1st period",
    "N units approved 1st period",
    "Average grade 1st period",
    "N unscored units 1st period",
    # 2nd period
    "N units credited 2nd period",
    "N units taken 2nd period",
    "N scored units 2nd period",
    "N units approved 2nd period",
    "Average grade 2nd period",
    "N unscored units 2nd period",
    # outcome
    "Success",
]

# Columns that make up the demographic perspective.
demographicFeatures: list[str] = [
    "Age at enrollment",
    # categorical columns
    "Marital status",
    "Course",
    "Previous qualification",
    "Nationality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    # boolean columns
    "Special needs",
    "International",
    "Debtor",
    "Regularized Fees",
    "Gender_Male",
]

In [14]:
def runAll(
    metricFeatures: list[str],
    categoricalFeatures: list[str],
    boolFeatures: list[str],
    academicFeatures: list[str],
    demographicFeatures: list[str],
    removedAcademicFeatures: list[str] | list = [],
    removedDemographicFeatures: list[str] | list = [],
    path: str = '',
    *,
    describeDF: bool = False,
    featureSelection: bool = False,
    getPairPlot: bool = False,
    elbowGraph: bool = True, 
    silhouetteGraph: bool = True, 
    dendrogram: bool = True,
    clusterProfiling: bool = False,
    nClustersAcademic: int = 0,
    nClustersDemographic: int = 0,
) -> None:
    """_summary_

    Args:
        metricFeatures (list[str]): _description_
        categoricalFeatures (list[str]): _description_
        boolFeatures (list[str]): _description_
        academicFeatures (list[str]): _description_
        demographicFeatures (list[str]): _description_
        path (str, optional): _description_. Defaults to ''.
        describeDF (bool, optional): _description_. Defaults to False.
        getPairPlot (bool, optional): No, just don't please. Defaults to False.
        nClustersAcademic (int, optional): _description_. Defaults to 4.
        nClustersDemographic (int, optional): _description_. Defaults to 4.
    """    
    dataDF: pd.DataFrame = Integration.importdata(path, False)

    if describeDF:
        Exploration.describeData(dataDF, metricFeatures, categoricalFeatures, getBoxPlots=True)
        return

    dataDF, dataAcademic, dataDemographic, scaler = Preprocessing.runPreprocessing(dataDF, metricFeatures, boolFeatures, academicFeatures, demographicFeatures, removedAcademicFeatures, removedDemographicFeatures, grouping="high")

    if featureSelection:
        if getPairPlot:
            FeatureSelection.pairPlots(dataAcademic, "Academic")
            FeatureSelection.pairPlots(dataDemographic, "Demographic")

        FeatureSelection.checkCorr(dataAcademic)
        FeatureSelection.checkCorr(dataDemographic)
        return

    if nClustersAcademic == 0:
        Clustering.somWrapper(dataAcademic, "Academic", nClustersAcademic)
        return
    
    dataAcademic = Clustering.somWrapper(dataAcademic, "Academic", nClustersAcademic)

    dataDemographic = Clustering.runKMeans(dataDemographic,nClustersDemographic, elbowGraph=elbowGraph, silhouetteGraph=silhouetteGraph, dendrogram=dendrogram)

    if clusterProfiling:
        FeatureSelection.clusterProfiles(dataAcademic.drop("bmu", axis=1), ["label"], (32, 10))
        FeatureSelection.getVariableClusterGraphs(dataAcademic)
        FeatureSelection.clusterProfiles(dataDemographic, ["label"], (32, 10))
        FeatureSelection.getVariableClusterGraphs(dataDemographic)
        return

    display(dataAcademic.groupby(['label']).describe().T)
    display(dataDemographic.groupby(['label']).describe().T)
    display(Clustering.mergePerspectives(dataDF, dataAcademic, dataDemographic, scaler))
    

In [15]:
# Academic columns handed to preprocessing as "removed".
# NOTE(review): these names are not in academicFeatures; presumably they are
# columns engineered during preprocessing - confirm.
removedAcademicFeatures: list[str] = [
    "Average units taken",
    "Average units credited",
    "Average unscored units",
]

# Demographic columns handed to preprocessing as "removed".
# NOTE(review): the "<feature>-<level>" names look like per-level columns
# generated during preprocessing - confirm against Preprocessing.runPreprocessing.
removedDemographicFeatures: list[str] = [
    # plain columns
    "Special needs",
    "International",
    "Debtor",
    "Regularized Fees",
    # marital status / parents' qualification and occupation levels
    "Marital status-together",
    "Marital status-nan",
    "Father's qualification-None",
    "Mother's occupation-Service Industry",
    "Mother's occupation-White collar Jobs",
    "Father's occupation-Professional Fields",
    "Father's occupation-Service Industry",
    "Father's occupation-Technical and Skilled Trades",
    "Father's occupation-White collar Jobs",
    "Father's occupation-nan",
    "Mother's qualification-None",
    "Mother's qualification-nan",
    "Mother's occupation-Technical and Skilled Trades",
    "Mother's occupation-nan",
    "Father's occupation-Security and Armed Forces",
    "Father's qualification-nan",
    # nationality levels
    "Nationality-Angolan",
    "Nationality-Brazilian",
    "Nationality-Cape Verdean",
    "Nationality-Colombian",
    "Nationality-Cuban",
    "Nationality-Dutch",
    "Nationality-English",
    "Nationality-German",
    "Nationality-Guinean",
    "Nationality-Italian",
    "Nationality-Lithuanian",
    "Nationality-Mexican",
    "Nationality-Moldova",
    "Nationality-Mozambican",
    "Nationality-Portuguese",
    "Nationality-Romanian",
    "Nationality-Russian",
    "Nationality-Santomean",
    "Nationality-Spanish",
    "Nationality-Turkish",
    "Nationality-Ukrainian",
    "Nationality-nan",
    # course levels
    "Course-Advertising and Marketing Management",
    "Course-Agronomy",
    "Course-Animation and Multimedia Design",
    "Course-Basic Education",
    "Course-Biofuel Production Technologies",
    "Course-Communication Design",
    "Course-Equinculture",
    "Course-Informatics Engineering",
    "Course-Journalism and Communication",
    "Course-Management",
    "Course-Management (evening attendance)",
    "Course-Nursing",
    "Course-Oral Hygiene",
    "Course-Social Service",
    "Course-Social Service (evening attendance)",
    "Course-Tourism",
    "Course-Veterinary Nursing",
    "Course-nan",
    # previous qualification levels
    "Previous qualification-Higher",
    "Previous qualification-Lower",
    "Previous qualification-Middle",
    "Previous qualification-nan",
]

In [None]:
# Notebook-level run of import, preprocessing and demographic K-Means, leaving
# dataDF, dataAcademic, dataDemographic and scaler available as notebook variables.
# The K-Means settings are written out explicitly because nClustersDemographic,
# elbowGraph, silhouetteGraph and dendrogram exist only as runAll parameters;
# referencing them here raised NameError on a fresh kernel.
# NOTE(review): the values below mirror the runAll(...) call in this notebook -
# adjust them if this cell was meant to explore other settings.
dataDF: pd.DataFrame = Integration.importdata(path, False)
dataDF, dataAcademic, dataDemographic, scaler = Preprocessing.runPreprocessing(
    dataDF,
    metricFeatures,
    boolFeatures,
    academicFeatures,
    demographicFeatures,
    removedAcademicFeatures,
    removedDemographicFeatures,
    grouping="high",
)
dataDemographic = Clustering.runKMeans(
    dataDemographic,
    3,
    elbowGraph=False,
    silhouetteGraph=False,
    dendrogram=False,
)

In [16]:
# Full run with 3 clusters per perspective: every diagnostic plot is switched
# off, so runAll ends by displaying the summary tables.
runAll(
    metricFeatures=metricFeatures,
    categoricalFeatures=categoricalFeatures,
    boolFeatures=boolFeatures,
    academicFeatures=academicFeatures,
    demographicFeatures=demographicFeatures,
    removedAcademicFeatures=removedAcademicFeatures,
    removedDemographicFeatures=removedDemographicFeatures,
    path=path,
    # early-stop / plotting switches
    describeDF=False,
    getPairPlot=False,
    elbowGraph=False,
    silhouetteGraph=False,
    dendrogram=False,
    clusterProfiling=False,
    # cluster counts
    nClustersAcademic=3,
    nClustersDemographic=3,
)

Unnamed: 0,label,0,1,2
Entry score,count,2134.0,784.0,1327.0
Entry score,mean,0.353097,0.32134,0.318337
Entry score,std,0.149124,0.17422,0.139462
Entry score,min,0.0,0.0,0.0
Entry score,25%,0.256842,0.210526,0.23
Entry score,50%,0.339474,0.303158,0.305263
Entry score,75%,0.433684,0.411842,0.398947
Entry score,max,1.0,0.931579,1.0
Success,count,2134.0,784.0,1327.0
Success,mean,0.987816,0.084821,0.247174


Unnamed: 0,label,0,1,2
Age at enrollment,count,1380.0,1666.0,1199.0
Age at enrollment,mean,0.152899,0.078881,0.118322
Age at enrollment,std,0.166777,0.092138,0.149937
Age at enrollment,min,0.018868,0.0,0.0
Age at enrollment,25%,0.037736,0.018868,0.037736
Age at enrollment,50%,0.075472,0.037736,0.056604
Age at enrollment,75%,0.226415,0.075472,0.132075
Age at enrollment,max,0.811321,0.849057,1.0
Gender_Male,count,1380.0,1666.0,1199.0
Gender_Male,mean,0.335507,0.39976,0.274395


Unnamed: 0,final_groups,0,1,2,3,4,5,6,7,8
Entry score,count,821.0,546.0,378.0,638.0,299.0,675.0,302.0,183.0,403.0
Entry score,mean,129.308404,125.801099,124.30873,127.514577,127.651171,128.588,124.245695,124.172131,125.359801
Entry score,std,14.72516,13.276038,12.5783,13.694371,17.470476,13.870434,15.736226,16.038687,13.799125
Entry score,min,98.1,96.0,95.0,95.0,95.0,95.0,95.0,96.0,95.0
Entry score,25%,119.8,117.225,116.925,118.425,116.95,120.0,113.85,113.25,115.5
Entry score,50%,128.0,124.3,123.15,126.4,125.8,127.1,122.75,121.7,124.4
Entry score,75%,137.5,132.9,131.7,135.0,138.3,135.85,131.925,130.8,133.95
Entry score,max,190.0,190.0,170.0,174.7,183.5,180.0,170.0,176.7,170.0
Gender_Male,count,821.0,546.0,378.0,638.0,299.0,675.0,302.0,183.0,403.0
Gender_Male,mean,0.285018,0.47619,0.338624,0.178683,0.575251,0.237037,0.516556,0.47541,0.364764
