In [None]:
# Run this cell only on Google Colab; adjust `path` to match your own Drive layout.
from google.colab import drive  # type: ignore

# Mount Google Drive (forcing a remount) so the project folder is reachable from the runtime.
drive.mount("/content/drive", force_remount=True)

# Project folder inside the mounted Drive; later cells read it through `path`.
path = (
    "/content/drive/MyDrive/[02] School/[01] University/"
    "[02] Bachelor's Year 2/[02] Spring Semester/[04] Machine Learning/"
    "Colab Notebooks/ML - LGI/mlproj/Descriptive Project"
)

In [None]:
# Run this cell instead when working locally: uncomment the line below and set the project path.
# path = ''

In [None]:
# Switch the working directory to the project folder stored in `path`
# (`path` must have been defined by one of the cells above).
# NOTE(review): the relative `funcs.*` imports further down presumably resolve from here — confirm.
%cd "{path}"

In [None]:
!pip install git+https://github.com/sevamoo/SOMPY.git #egg=SOMPY
!pip install scikit-image
!pip install --upgrade scikit-learn

In [None]:
# --- Standard library ---
import logging
import warnings

# --- Third party ---
import pandas as pd
from IPython.display import Markdown
from sklearn.preprocessing import MinMaxScaler

# --- Project modules ---
from funcs.feature_engineering import Preprocessing, FeatureSelection
from funcs.clustering import Clustering
from funcs.integration_exploration import Integration, Exploration

# --- Notebook configuration ---
# Allow long tables (up to 1000 rows) to be displayed in full.
pd.set_option('display.max_rows', 1000)
# Hide FutureWarning messages raised from this point on.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Disable the matplotlib font-manager logger.
logging.getLogger('matplotlib.font_manager').disabled = True

In [None]:
# Column groups of the raw dataframe, split by how each column is treated downstream.

# Numeric (metric) columns.
metricFeatures: list[str] = [
    "Application order",
    "Previous qualification score",
    "Entry score",
    "Age at enrollment",
    "N units credited 1st period",
    "N units taken 1st period",
    "N scored units 1st period",
    "N units approved 1st period",
    "Average grade 1st period",
    "N unscored units 1st period",
    "N units credited 2nd period",
    "N units taken 2nd period",
    "N scored units 2nd period",
    "N units approved 2nd period",
    "Average grade 2nd period",
    "N unscored units 2nd period",
    "Social Popularity",
]

# Categorical columns.
categoricalFeatures: list[str] = [
    "Application mode",
    "Marital status",
    "Course",
    "Previous qualification",
    "Nationality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
]

# Boolean columns.
boolFeatures: list[str] = [
    "Morning shift participation",
    "Displaced",
    "Special needs",
    "Debtor",
    "Regularized Fees",
    "Gender_Male",
    "External Funding",
    "International",
]

In [None]:
# Features that make up the academic perspective.
academicFeatures: list[str] = [
    "Entry score",
    "Average scored units",
    "Average grades",
    "Average units taken",
    "Average units approved",
    "Average units credited",
    "Average unscored units",
    "Success",
]

# Features that make up the demographic perspective.
demographicFeatures: list[str] = [
    "Age at enrollment",
    "Marital status",
    "Course",
    "Previous qualification",
    "Nationality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Special needs",
    "International",
    "Debtor",
    "Regularized Fees",
    "Gender_Male",
]

In [None]:
def runAll(
    metricFeatures: list[str],
    categoricalFeatures: list[str],
    boolFeatures: list[str],
    academicFeatures: list[str],
    demographicFeatures: list[str],
    removedAcademicFeatures: list[str] | list = [],
    removedDemographicFeatures: list[str] | list = [],
    path: str = '',
    *,
    describeDF: bool = False,
    featureSelection: bool = False,
    getPairPlot: bool = False,
    elbowGraph: bool = True, 
    silhouetteGraph: bool = True, 
    dendrogram: bool = True,
    clusterProfiling: bool = False,
    nClustersAcademic: int = 0,
    nClustersDemographic: int = 0,
) -> None:
    """Main Function

    Args:
        metricFeatures (list[str]): Metric features of the dataframe
        categoricalFeatures (list[str]): Categorical features of the dataframe
        boolFeatures (list[str]): Boolean features of the dataframe
        academicFeatures (list[str]): Features of the academic perspective
        demographicFeatures (list[str]): Features of the demographic perspective
        removedAcademicFeatures (list[str] | list, optional): Features to be removed from the academic perspective. Defaults to [].
        removedDemographicFeatures (list[str] | list, optional): Features to be removed from the demographic perspective Defaults to [].
        path (str, optional): Path from which to get the data. Defaults to ''.
        describeDF (bool, optional): Whether or not to return information about the dataframe, if set to True, stops function after obtaining the data. Defaults to False.
        featureSelection (bool, optional): Whether or not to return feature selection information about the dataframe, if set to True, stops function after obtaining the information. Defaults to False.
        getPairPlot (bool, optional): Whether or not to return pair plots of the data when running feature selection, only relevant if featureSelection is set to True. WARNING: Takes a very long time to run. Defaults to False.
        elbowGraph (bool, optional): Whether or not to display the elbow graph of the data when obtaining kmeans graphs, only relevant when nClustersDemographic is 0. Defaults to True.
        silhouetteGraph (bool, optional): Whether or not to display the silhouette scores graph of the data when obtaining kmeans graphs, only relevant when nClustersDemographic is 0. Defaults to True.
        dendrogram (bool, optional): Whether or not to display the dendrogram of the data when obtaining kmeans graphs, only relevant when nClustersDemographic is 0. Defaults to True.
        clusterProfiling (bool, optional): Whether or not to profile the clusters, stops running the function after obtaining the information. Defaults to False.
        nClustersAcademic (int, optional): Number of clusters to run on the academic perspective. If set to 0, displays a U matrix and Component Planes. Defaults to 0.
        nClustersDemographic (int, optional): Number of clusters to run on the demographic perspective. If set to 0 displays the enabled kmeans graphs. Defaults to 0.
    """
    dataDF: pd.DataFrame = Integration.importdata(path)
    dataAcademic: pd.DataFrame
    dataDemographic: pd.DataFrame
    scaler: MinMaxScaler
    academicScaler: MinMaxScaler
    demographicScaler: MinMaxScaler

    if describeDF:
        display(Exploration.describeData(dataDF, metricFeatures, categoricalFeatures))
        return

    dataDF, dataAcademic, dataDemographic, scaler, academicScaler, demographicScaler = Preprocessing.runPreprocessing(dataDF, metricFeatures, boolFeatures, academicFeatures, demographicFeatures, removedAcademicFeatures, removedDemographicFeatures, grouping="high")

    if featureSelection:
        if getPairPlot:
            display(FeatureSelection.pairPlots(dataAcademic, "Academic"))
            display(FeatureSelection.pairPlots(dataDemographic, "Demographic"))

        display(FeatureSelection.checkCorr(dataAcademic))
        display(FeatureSelection.checkCorr(dataDemographic))
        return

    if nClustersAcademic == 0:
        Clustering.somWrapper(dataAcademic, "Academic", nClustersAcademic)
        return
    
    dataAcademic = Clustering.somWrapper(dataAcademic, "Academic", nClustersAcademic)

    if nClustersDemographic == 0:
        Clustering.runKMeans(dataDemographic,nClustersDemographic, elbowGraph=elbowGraph, silhouetteGraph=silhouetteGraph, dendrogram=dendrogram)
        return
    
    dataDemographic = Clustering.runKMeans(dataDemographic,nClustersDemographic, elbowGraph=elbowGraph, silhouetteGraph=silhouetteGraph, dendrogram=dendrogram)

    if clusterProfiling:
        display(Markdown("### Academic Perspective"))
        FeatureSelection.clusterProfiles(dataAcademic, ["label"], (32, 10))
        FeatureSelection.getVariableClusterGraphs(dataAcademic, "label")

        display(Markdown("### Demographic Perspective"))
        FeatureSelection.clusterProfiles(dataDemographic, ["label"], (32, 10))
        FeatureSelection.getVariableClusterGraphs(dataDemographic, "label")

        display(Markdown("### Final Results"))
        FeatureSelection.clusterProfiles(Preprocessing.scaleData(Clustering.mergePerspectives(dataDF, dataAcademic, dataDemographic, scaler))[0], ["final_groups"], (50, 10))
        FeatureSelection.getVariableClusterGraphs((Clustering.mergePerspectives(dataDF, dataAcademic, dataDemographic, scaler)), "final_groups")
        return

    display(Markdown("### Academic Perspective"))
    tempDF = dataAcademic.drop("label", axis=1)
    tempDF = pd.DataFrame(academicScaler.inverse_transform(tempDF), index=tempDF.index, columns=tempDF.columns)
    tempDF = tempDF.join(dataAcademic["label"])
    display(tempDF.groupby(['label']).describe().T)

    display(Markdown("### Demographic Perspective"))
    tempDF = dataDemographic.drop("label", axis=1)
    tempDF = pd.DataFrame(demographicScaler.inverse_transform(tempDF), index=tempDF.index, columns=tempDF.columns)
    tempDF = tempDF.join(dataDemographic["label"])
    display(tempDF.groupby(['label']).describe().T)
    
    display(Markdown("### Final Results"))
    display(Clustering.mergePerspectives(dataDF, dataAcademic, dataDemographic, scaler).groupby(['final_groups']).describe().T)

In [None]:
# Features dropped from the academic perspective.
removedAcademicFeatures: list[str] = [
    "Average units taken",
    "Average units credited",
    "Average unscored units",
]

# Features dropped from the demographic perspective.
# NOTE(review): the "<feature>-<value>" names presumably match columns produced when the
# categorical features are encoded during preprocessing — confirm against funcs.feature_engineering.
removedDemographicFeatures: list[str] = [
    # Plain columns.
    "Special needs",
    "International",
    "Debtor",
    "Regularized Fees",
    # Marital status and parental background, listed individually.
    "Marital status-together",
    "Marital status-nan",
    "Father's qualification-None",
    "Mother's occupation-Service Industry",
    "Mother's occupation-White collar Jobs",
    "Father's occupation-Professional Fields",
    "Father's occupation-Service Industry",
    "Father's occupation-Technical and Skilled Trades",
    "Father's occupation-White collar Jobs",
    "Father's occupation-nan",
    "Mother's qualification-None",
    "Mother's qualification-nan",
    "Mother's occupation-Technical and Skilled Trades",
    "Mother's occupation-nan",
    "Father's occupation-Security and Armed Forces",
    "Father's qualification-nan",
    # Every listed "Nationality-*" value.
    *[
        f"Nationality-{value}"
        for value in (
            "Angolan",
            "Brazilian",
            "Cape Verdean",
            "Colombian",
            "Cuban",
            "Dutch",
            "English",
            "German",
            "Guinean",
            "Italian",
            "Lithuanian",
            "Mexican",
            "Moldova",
            "Mozambican",
            "Portuguese",
            "Romanian",
            "Russian",
            "Santomean",
            "Spanish",
            "Turkish",
            "Ukrainian",
            "nan",
        )
    ],
    # Every listed "Course-*" value.
    *[
        f"Course-{value}"
        for value in (
            "Advertising and Marketing Management",
            "Agronomy",
            "Animation and Multimedia Design",
            "Basic Education",
            "Biofuel Production Technologies",
            "Communication Design",
            "Equinculture",
            "Informatics Engineering",
            "Journalism and Communication",
            "Management",
            "Management (evening attendance)",
            "Nursing",
            "Oral Hygiene",
            "Social Service",
            "Social Service (evening attendance)",
            "Tourism",
            "Veterinary Nursing",
            "nan",
        )
    ],
    # Every listed "Previous qualification-*" value.
    *[
        f"Previous qualification-{value}"
        for value in ("Higher", "Lower", "Middle", "nan")
    ],
]

In [None]:
# Full run: 3 clusters for each perspective, descriptive statistics as the final output.
runAll(
    # Column groups of the dataframe
    metricFeatures=metricFeatures,
    categoricalFeatures=categoricalFeatures,
    boolFeatures=boolFeatures,
    # Perspectives and the features dropped from each
    academicFeatures=academicFeatures,
    demographicFeatures=demographicFeatures,
    removedAcademicFeatures=removedAcademicFeatures,
    removedDemographicFeatures=removedDemographicFeatures,
    # Data location
    path=path,
    # Early-stop stages (all disabled)
    describeDF=False,
    featureSelection=False,
    getPairPlot=False,
    # KMeans graph toggles
    elbowGraph=True,
    silhouetteGraph=True,
    dendrogram=True,
    # Clustering configuration
    clusterProfiling=False,
    nClustersAcademic=3,
    nClustersDemographic=3,
)