In [1]:
# Run if using on google collab, change path if using on a copy
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
path = '/content/drive/MyDrive/[02] School/[01] University/[02] Bachelor\'s Year 2/[02] Spring Semester/[04] Machine Learning/Colab Notebooks/ML - LGI/mlproj/'

Mounted at /content/drive


In [1]:
# run if running locally
path = ''

In [40]:
#standard initial libraries
from math import ceil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency

#sk-learn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier;


#setting seaborn visual style in plt
sns.set_theme()

# Integration & Exploration

In [3]:
#import the data
traindata: pd.DataFrame = pd.read_csv(path + 'project_data/train.csv').set_index('Userid').drop('Observations', axis=1).drop_duplicates()
testdata: pd.DataFrame = pd.read_csv(path + 'project_data/test.csv').set_index('Userid').drop(['Registered', 'Observations'], axis=1)

traindata = traindata[traindata['Registered'] == 'Yes']
traindata = traindata.drop('Registered', axis=1)

In [4]:
metricFeatures: list[str] = ['Application order','Previous qualification score','Entry score','Age at enrollment','N units credited 1st period',
       'N units taken 1st period', 'N scored units 1st period',
       'N units approved 1st period', 'Average grade 1st period',
       'N unscored units 1st period', 'N units credited 2nd period',
       'N units taken 2nd period', 'N scored units 2nd period',
       'N units approved 2nd period', 'Average grade 2nd period',
       'N unscored units 2nd period','Social Popularity']
categoricalFeatures: list[str] =  ['Application mode','Marital status','Course','Previous qualification','Nationality',"Mother's qualification",
       "Father's qualification","Mother's occupation","Father's occupation"]
boolFeatures: list[str] = ['Morning shift participation','Displaced','Special needs','Debtor','Regularized Fees','Gender_Male','External Funding','International']

In [None]:
traindata.info()

In [None]:
for variable in categoricalFeatures:
  print(traindata[variable].value_counts())

In [50]:
X = traindata.drop(['Success'], axis = 1)
y = traindata['Success']

In [19]:
y.replace({'Succeeded': 2, 'Holding on': 1, 'Gave up': 0}, inplace = True)
# X_filled = X.fillna(X[metric_features].median())
# X_filled = X_filled.fillna(X_filled[categorical_features].mode().iloc[0])
# X_filled.drop(remlist, axis = 1, inplace = True)
X_filled = pd.get_dummies(X, dummy_na = False, drop_first = False)
X_train = X_filled
# X_train_filled.drop(['N units taken 2nd period', 'N units credited 2nd period', 'N units approved 2nd period'], axis = 1, inplace = True)

In [20]:
# testdata_filled = testdata.fillna(testdata[metric_features].median())
# testdata_filled = testdata_filled.fillna(testdata_filled[categorical_features].mode().iloc[0])
# testdata_filled.drop(remlist, axis = 1, inplace = True)
testdata_filled = pd.get_dummies(testdata, dummy_na=False, drop_first = False)
testdata = testdata_filled
# testdata_filled.drop(['N units taken 2nd period', 'N units credited 2nd period', 'N units approved 2nd period'], axis = 1, inplace = True)

In [None]:
X_train.isna().sum()

In [None]:
for i, col in enumerate(metric_features):
  plt.figure(i)
  sns.boxplot(x=col, data=X_train)

# Preprocessing

## Get Dummy variables

## Missing Values

In [None]:
X_train_filled = X_train.fillna(X_train[metric_features].median())

In [None]:
X_train_filled = X_train_filled.fillna(X_train_filled[categorical_features].mode().iloc[0])

In [None]:
for i, col in enumerate(metric_features):
  plt.figure(i)
  sns.boxplot(x=col, data=X_train_filled)

In [None]:
X_train_filled.isna().sum()

# Feature Selection

In [None]:
univariatecheck = X_train[metric_features].var()

In [None]:
for i in univariatecheck:
  print(np.round(i,5))

In [None]:
def cor_heatmap(cor):
    plt.figure(figsize=(12,10))
    sns.heatmap(data = cor, annot = True, cmap = plt.cm.Reds, fmt='.1')
    plt.show()

In [None]:
cor_spearman = pd.concat([X_train[metric_features], y_train], axis=1).corr(method ='spearman')
cor_heatmap(cor_spearman)

In [None]:
cor_kendall = pd.concat([X_train[metric_features], y_train], axis=1).corr(method ='kendall')
cor_heatmap(cor_kendall)

In [None]:
anova = SelectKBest(f_classif, k=5)

In [None]:
X_anova = anova.fit_transform(X_train_filled[metric_features], y_train)

In [None]:
selected_features = pd.Series(anova.get_support(), index = X_train_filled[metric_features].columns)
selected_features

In [None]:
cat_spearman = pd.concat([X_train[[s for s in X_train.columns if s.startswith("Mother's qualification")]], y_train], axis = 1).corr(method = 'spearman')
cor_heatmap(cat_spearman)

In [None]:
cat_spearman = pd.concat([X_train[[s for s in X_train.columns if s.startswith("Father's qualification")]], y_train], axis = 1).corr(method = 'spearman')
cor_heatmap(cat_spearman)

In [None]:
cat_spearman = pd.concat([X_train[[s for s in X_train.columns if s.startswith("Marital status")]], y_train], axis = 1).corr(method = 'spearman')
cor_heatmap(cat_spearman)

In [None]:
cat_spearman = pd.concat([X_train[[s for s in X_train.columns if s.startswith("Course")]], y_train], axis = 1).corr(method = 'spearman')
cor_heatmap(cat_spearman)

In [None]:
cat_spearman = pd.concat([X_train[[s for s in X_train.columns if s.startswith("Previous qualification")]], y_train], axis = 1).corr(method = 'spearman')
cor_heatmap(cat_spearman)

In [None]:
cat_spearman = pd.concat([X_train[[s for s in X_train.columns if s.startswith("Mother's occupation")]], y_train], axis = 1).corr(method = 'spearman')
cor_heatmap(cat_spearman)

In [None]:
cat_spearman = pd.concat([X_train[[s for s in X_train.columns if s.startswith("Father's occupation")]], y_train], axis = 1).corr(method = 'spearman')
cor_heatmap(cat_spearman)

In [None]:
cat_spearman = pd.concat([X_train[[s for s in X_train.columns if s.startswith("Nationality")]], y_train], axis = 1).corr(method = 'spearman')
cor_heatmap(cat_spearman)

In [None]:
cat_spearman = pd.concat([X_train[["External Funding", "Application mode", "Morning shift participation", "Displaced", "Special needs", "Debtor", "Regularized Fees", "Gender_Male", "International"]], y_train], axis = 1).corr(method = 'spearman')
cor_heatmap(cat_spearman)

In [None]:
ls = X_train.columns
for i in ls:
    print(i)

| Predictor                    | Variance | Spearman | Kendall  | ANOVA   | Solution |
|------------------------------|----------|----------|----------|---------|----------|
| Application order            | Keep     | Keep     | Keep     | Discard | Keep     |
| Previous qualification score | Keep     | Keep     | Keep     | Discard | Keep     |
| Entry score                  | Keep     | Keep     | Keep     | Discard | Keep     |
| Age at enrollment            | Keep     | Keep     | Keep     | Keep    | Keep     |
| N units credited 1st period  | Keep     | Keep     | Keep     | Discard | Keep     |
| N units taken 1st period     | Keep     | Keep     | Keep     | Discard | Keep     |
| N scored units 1st period    | Keep     | Keep     | Keep     | Discard | Keep     |
| N units approved 1st period  | Keep     | Keep     | Keep     | Keep    | Keep     |
| Average grade 1st period     | Keep     | Keep     | Keep     | Keep    | Keep     |
| N unscored units 1st period  | Keep     | Keep     | Keep     | Discard | Keep     |
| N units credited 2nd period  | Keep     | Discard  | Discard  | Discard | Discard  |
| N units taken 2nd period     | Keep     | Discard  | Discard  | Discard | Discard  |
| N scored units 2nd period    | Keep     | Keep     | Keep     | Discard | Keep     |
| N units approved 2nd period  | Keep     | Discard  | Discard? | Keep    | Test     |
| Average grade 2nd period     | Keep     | Discard? | Keep     | Keep    | Test     |
| N unscored units 2nd period  | Keep     | Keep     | Keep     | Discard | Keep     |
| Social Popularity            | Keep     | Keep     | Keep     | Discard | Keep     |

# Model

In [None]:
parameter_space_lr = {
    'penalty': ["l2", "elasticnet", None],
    'solver': ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"],
    'class_weight': [None, 'balanced'],
    'C' : [0.2, 0.5, 0.7, 1], # inverse regularization parameter
}

In [None]:
#running this will give some warnings when convergence is not reached- run the code below to have them not show up
import warnings
warnings.filterwarnings('ignore')

# clf = GridSearchCV(lr, parameter_space_lr, scoring = 'f1', cv = 10,)
# clf.fit(X_train,y_train)

# Best parameter set
print('------------------------------------------------------------------------------------------------------------------------')
# print('Best parameters found:\n', clf.best_params_)
print('------------------------------------------------------------------------------------------------------------------------')


In [22]:
model = RandomForestClassifier(800)
model.fit(X_train, y)

In [23]:
test_pred = model.predict(testdata) #test preds

In [None]:
index = testdata.index.T
answer = pd.DataFrame([index, test_pred]).T
answer.columns = ['Userid','Success']
answer.replace({'Success': {0: 'Gave Up', 1: 'Holding on', 2: 'Succeeded'}}, inplace = True)
answer.set_index('Userid').to_csv('answer.csv')

---

In [32]:
#import the data and drop remove useless stuff
traindata: pd.DataFrame = pd.read_csv(path + 'project_data/train.csv').set_index('Userid').drop('Observations', axis=1).drop_duplicates()
testdata: pd.DataFrame = pd.read_csv(path + 'project_data/test.csv').set_index('Userid').drop(['Registered', 'Observations'], axis=1)

traindata = traindata[traindata['Registered'] == 'Yes']
traindata = traindata.drop('Registered', axis=1)

In [33]:
X: pd.DataFrame = traindata.drop(['Success'], axis = 1)
y: pd.Series = traindata['Success']

In [34]:
def fillNa(data: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values

    Args:
        data (`pd.DataFrame`): Dataframe to be treated

    Returns:
        `pd.DataFrame`: Treated dataframe
    """    

    # on all of these features, if a value were to be different than 0, then it would not be missing
    ifNaThen0: tuple[str,...] = (
        "N units credited 1st period",
        "N unscored units 1st period",
        "N units approved 1st period",
        "N units credited 2nd period",
        "N unscored units 2nd period",
        "N units approved 2nd period"
    )

    # these features are filled differently, basically incoherence checking, but filling the Na on 'N units approved 1st/2nd period' is needed beforehand, more info below
    checkAfterVars: list[list[str,str]] = [
        ["N units taken 1st period", "N units approved 1st period"],
        ["N units taken 2nd period", "N units approved 2nd period"]
    ] 

    for var in metricFeatures:
        if var == (checkAfterVars[0][0] or checkAfterVars[1][1]): continue # skip current iteration

        if var in ifNaThen0:
            data[var] = data[var].fillna(0) # fill the ifNaThen0 vars with well, 0s
        else:    
            data[var] = data[var].fillna(data[var].median()) # fill everything else with the median of the values of the feature

    for varList in checkAfterVars:
        # search for Na values on N units taken and replace by the equivalent value on N units approved
        data.loc[data[varList[0]].isna(), varList[0]] = data[varList[1]]
        # search for values on N units taken that are smaller than the equivalent on N units approved, replace by the equivalent value on N units approved
        data.loc[data[varList[0]] < data[varList[1]], varList[0]] = data[varList[1]]

    for var in boolFeatures:
        if var == "Regularized Fees":
            data[var] = data[var].fillna(1) # if nothing is said about the fees, we can assume they have been paid
        else:
            data[var] = data[var].fillna(0) # here is like the ifNaThen0 situation, if the values were to not be 0, they would have been declared

    return data

In [35]:
def removeOutliers(dataX: pd.DataFrame, datay: pd.DataFrame) -> pd.DataFrame:
    """Removes outliers and fixes any negative number incoherences on the selected variables from the dataframe

    Args:
        data (`pd.DataFrame`): Dataframe to be treated

    Returns:
        `pd.DataFrame`: Treated dataframe
    """    

    toBeTreated: dict[str, dict[str, float | None]] = {
        'Previous qualification score': {"lower": 0, "upper": None},
        'Entry score': {"lower": 0, "upper": None},
        'N units credited 1st period': {"lower": 0, "upper": 15},
        'N units taken 1st period': {"lower": 0, "upper": 20},
        'N scored units 1st period': {"lower": 0, "upper": 25},
        'N units approved 1st period': {"lower": 0, "upper": 20},
        'Average grade 1st period': {"lower": 0, "upper": None},
        'N units credited 2nd period': {"lower": 0, "upper": 14},
        'N units taken 2nd period': {"lower": 0, "upper": 15},
        'N scored units 2nd period': {"lower": 0, "upper": 25},
        'N units approved 2nd period': {"lower": 0, "upper": 15},
        'Average grade 2nd period': {"lower": 0, "upper": None}
    }
    
    for var in toBeTreated:
        if toBeTreated[var]["lower"] is not None:
            toRemove: list = list(dataX.loc[dataX[var] < toBeTreated[var]["lower"], var].index)
        if toBeTreated[var]["upper"] is not None:
            toRemove.extend(list(dataX.loc[dataX[var] > toBeTreated[var]["upper"], var].index))
        dataX.drop(toRemove, axis=0, inplace=True)
        datay.drop(toRemove, axis=0, inplace=True)

    return dataX, datay

In [None]:
fillNa(X)
fillNa(testdata)
removeOutliers(X, y)

In [37]:
# replace values on columns that have lots of different values that can be grouped together to reduce the total number of dummies created after

for col in ["Mother's qualification",  "Father's qualification", "Previous qualification"]:
    X.replace(regex={col: {r"(?i)^no school.*$": '0',
                           r"(?i)^[0-4][a-z]{2} grade.*$": '1', 
                           r"(?i)^[5-9]th grade.*$": '2', 
                           r"(?i)^1[0-2]th grade.*$": '3', 
                           r"(?i)^incomplete bachelor.*$": '4', 
                           r"(?i)^bachelor degree.*$": '5',
                           r"(?i)^post-grad.*$": '6',
                           r"(?i)^master degree.*$": '7',
                           r"(?i)^phd.*$": '8',}}, inplace=True)
    
    testdata.replace(regex={col: {r"(?i)^no school.*$": '0',
                           r"(?i)^[0-4][a-z]{2} grade.*$": '1', 
                           r"(?i)^[5-9]th grade.*$": '2', 
                           r"(?i)^1[0-2]th grade.*$": '3', 
                           r"(?i)^incomplete bachelor.*$": '4', 
                           r"(?i)^bachelor degree.*$": '5',
                           r"(?i)^post-grad.*$": '6',
                           r"(?i)^master degree.*$": '7',
                           r"(?i)^phd.*$": '8',}}, inplace=True)

for col in ["Mother's occupation", "Father's occupation"]:
    X.replace(to_replace={col: ["Superior-level Professional", "Intermediate-level Professional", "Politician/CEO", "Teacher", "Information Technology Specialist"]}, value="Professional Fields", inplace=True)
    X.replace(to_replace={col: ["Skilled construction workers", "Assembly Worker", "Factory worker", "Lab Technocian"]}, value="Technical and Skilled Trades", inplace=True)
    X.replace(to_replace={col: ["Administrative Staff", "Office worker", "Accounting operator"]}, value="White collar Jobs", inplace=True)
    X.replace(to_replace={col: ["Restaurant worker", "Personal care worker", "Seller", "Cleaning worker"]}, value="Service Industry", inplace=True)
    X.replace(to_replace={col: ["Private Security", "Armed Forces"]}, value="Security and Armed Forces", inplace=True)
    X.replace(to_replace={col: ["Unskilled Worker", "Other", "Student", "Artist"]}, value="Recreational or unskilled", inplace=True)
    X.replace(to_replace={col: ["Engineer", "Scientist", "Health professional"]}, value="STEM Jobs", inplace=True)

    testdata.replace(to_replace={col: ["Superior-level Professional", "Intermediate-level Professional", "Politician/CEO", "Teacher", "Information Technology Specialist"]}, value="Professional Fields", inplace=True)
    testdata.replace(to_replace={col: ["Skilled construction workers", "Assembly Worker", "Factory worker", "Lab Technocian"]}, value="Technical and Skilled Trades", inplace=True)
    testdata.replace(to_replace={col: ["Administrative Staff", "Office worker", "Accounting operator"]}, value="White collar Jobs", inplace=True)
    testdata.replace(to_replace={col: ["Restaurant worker", "Personal care worker", "Seller", "Cleaning worker"]}, value="Service Industry", inplace=True)
    testdata.replace(to_replace={col: ["Private Security", "Armed Forces"]}, value="Security and Armed Forces", inplace=True)
    testdata.replace(to_replace={col: ["Unskilled Worker", "Other", "Student", "Artist"]}, value="Recreational or unskilled", inplace=True)
    testdata.replace(to_replace={col: ["Engineer", "Scientist", "Health professional"]}, value="STEM Jobs", inplace=True)

X.replace(to_replace={"Marital status": {"facto union": "married",
                            "legally separated": "divorced",
                            # "widower": "single" reduces performance
                            }}, inplace=True)

testdata.replace(to_replace={"Marital status": {"facto union": "married",
                            "legally separated": "divorced",
                            # "widower": "single" reduces performance
                            }}, inplace=True)

In [38]:
# get dummies, add 0s to columns that are in test but not in train, and vice-versa, reorder the columns so that they are in the same order

X: pd.DataFrame = pd.get_dummies(data=X, prefix_sep="-", dummy_na=True, drop_first=False)
testdata: pd.DataFrame = pd.get_dummies(data=testdata, prefix_sep="-", dummy_na=True, drop_first=False)

X[(coldiff1 := list(set(testdata.columns) - set(X.columns)))] = 0
testdata[(coldiff2 := list(set(X.columns) - set(testdata.columns)))] = 0

testdata = testdata.reindex(columns=X.columns)

# dont forget lambda here

In [28]:
X, testdata = list(map(lambda x: fillNa(x), [X, testdata]))

In [44]:
X_train: pd.DataFrame 
X_val: pd.DataFrame 
y_train: pd.DataFrame 
y_val: pd.DataFrame 
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.2, random_state=15)

In [None]:
scaler = MinMaxScaler()
scaler.fit(X_train)

X_trainScaled = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns, index = X_train.index)
X_valScaled = pd.DataFrame(scaler.transform(X_val), columns = X_val.columns, index = X_val.index)
testdataScaled = pd.DataFrame(scaler.transform(testdata), columns = testdata.columns, index = testdata.index)

In [46]:
model = RandomForestClassifier(800)
parameter_space = {
    "n_estimators": [100,200,300,400,500,600,700,800,900,1000]
    }

gs = GridSearchCV(model, parameter_space, scoring = 'f1_weighted', cv = 10, verbose=3)
gs.fit(X_train,y_train)

print(gs.best_params_)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV 1/10] END .................n_estimators=100;, score=0.718 total time=   0.5s
[CV 2/10] END .................n_estimators=100;, score=0.763 total time=   0.6s
[CV 3/10] END .................n_estimators=100;, score=0.750 total time=   0.5s
[CV 4/10] END .................n_estimators=100;, score=0.708 total time=   0.5s
[CV 5/10] END .................n_estimators=100;, score=0.758 total time=   0.5s
[CV 6/10] END .................n_estimators=100;, score=0.703 total time=   0.4s
[CV 7/10] END .................n_estimators=100;, score=0.708 total time=   0.5s
[CV 8/10] END .................n_estimators=100;, score=0.739 total time=   0.6s
[CV 9/10] END .................n_estimators=100;, score=0.717 total time=   0.4s
[CV 10/10] END ................n_estimators=100;, score=0.714 total time=   0.4s
[CV 1/10] END .................n_estimators=200;, score=0.734 total time=   1.0s
[CV 2/10] END .................n_estimators=20

KeyboardInterrupt: 

In [None]:
# Run a model to get the features with least importance
model: RandomForestClassifier = RandomForestClassifier(800)
model.fit(X, y)

predictions = model.predict(testdata)

for x, y in zip(list(X.columns), model.feature_importances_):
    if y*100 < 0.5:
        print(f"{x}: {y*100:.2f}")

In [30]:
# Get predictions
model: RandomForestClassifier = RandomForestClassifier(800, random_state=15)
model.fit(X, y)

predictions = model.predict(testdata)

for x, y in zip(list(X.columns), model.feature_importances_):
    if y*100 < 0.5:
        print(f"{x}: {y*100:.2f}")

outputData = pd.DataFrame([testdata.index, predictions]).T;
outputData.columns = ["Userid", "Success"];

outputData.to_csv("./answer.csv", index=False);

Morning shift participation: 0.39
Special needs: 0.10
International: 0.18
Marital status-divorced: 0.13
Marital status-married: 0.28
Marital status-single: 0.31
Marital status-widower: 0.03
Marital status-nan: 0.00
Course-Advertising and Marketing Management: 0.36
Course-Agronomy: 0.20
Course-Animation and Multimedia Design: 0.47
Course-Biofuel Production Technologies: 0.04
Course-Communication Design: 0.28
Course-Equinculture: 0.25
Course-Informatics Engineering: 0.42
Course-Journalism and Communication: 0.36
Course-Management: 0.43
Course-Management (evening attendance): 0.28
Course-Oral Hygiene: 0.14
Course-Social Service (evening attendance): 0.20
Course-Tourism: 0.33
Course-Veterinary Nursing: 0.43
Course-nan: 0.00
Previous qualification-2: 0.19
Previous qualification-3: 0.39
Previous qualification-4: 0.03
Previous qualification-5: 0.19
Previous qualification-6: 0.06
Previous qualification-7: 0.01
Previous qualification-nan: 0.01
Nationality-Angolan: 0.02
Nationality-Brazilian: 0.

In [None]:
# Split

In [None]:
import math, sys, numpy as np, pandas as pd
from typing import Literal;
from sklearn import preprocessing;
from sklearn.ensemble import RandomForestClassifier;

def encodeCols(trainData: pd.DataFrame, testData: pd.DataFrame, columns: list[str], /):
  """Encode categorical columns to numerical.

  Args:
      trainData (pd.DataFrame): The train dataframe where conversion is to be performed.
      testData (pd.DataFrame): The test dataframe where conversion is to be performed.
      columns (list[str]): The columns to be converted.
  """;

  for (col) in columns:
    le = preprocessing.LabelEncoder();
    dfCombined = le.fit_transform(pd.concat([trainData[col], testData[col]], axis=0));

    trainLength = trainData[col].__len__();

    # dfCombined.

    trainData[col] = dfCombined[:trainLength];
    testData[col] = dfCombined[trainLength:];
  # end for
# end encodeCols

def importData(trainPath: str = "./data/train.csv", testPath: str = "./data/test.csv") -> tuple[ tuple[pd.DataFrame, pd.Series], pd.DataFrame ]:
  """Imports the training and testing dataset and returns a training feature/target tuple and the testing features.

  Args:
    `trainPath` (`str`): the path (relative or absolute) to the training data csv.
    `testPath` (`str`): the path (relative or absolute) to the testing data csv.

  Returns:
    `tuple[tuple[pd.DataFrame, pd.Series], pd.DataFrame]` - `( (trainX, trainY), testX )`
  """;

  userIdArray: list[Literal["Userid"]] = ["Userid"];

  trainData: pd.DataFrame = pd.read_csv(trainPath, sep=",").drop("Observations", axis=1).drop_duplicates().set_index("Userid");
  # pd.DataFrame.it.drop_duplicates() only removes rows where all values are the same; does not drop if different values but same userId.

  testX: pd.DataFrame = pd.read_csv(testPath, sep=",").drop("Observations", axis=1).set_index(userIdArray);
  if (testX.duplicates().__len__() > 0): raise ValueError("Test data contains duplicate user IDss.");

  trainY: pd.Series = trainData["Success"];
  trainX: pd.Series = trainData.drop(["Success"], axis = 1);

  del trainData, userIdArray;

  return (
    (trainX, trainY),
    testX
  );
#end importData

def main() -> int:
  userIdArray: list[Literal["Userid"]] = ["Userid"];
  trainData: pd.DataFrame = pd.read_csv(path + "project_data/train.csv", sep=",").drop("Observations", axis=1).set_index(userIdArray);
  testData: pd.DataFrame = pd.read_csv(path + "project_data/test.csv", sep=",").drop("Observations", axis=1).set_index(userIdArray);

  del userIdArray;

  # trainData.index = testData.index = ["Userid"];

  # trainData.groupby("Userid");

  # rows without a "registered" value in the dataset are garbage (userId & outcome only, no use for ML algo).
  trainData.drop(trainData[trainData["Registered"] != "Yes"].index, inplace=True);

  trainData = trainData.drop("Registered", axis=1);
  testData = testData.drop("Registered", axis=1);


  encodeCols(trainData, testData, [
    "Marital status",
    "Course",
    "Previous qualification",
    "Nationality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation"
  ]);

  # drop = [
  #   "Nationality",
  #   "International",
  #   "Special needs",
  #   "Displaced",
  #   "Morning shift participation",
  #   "Previous qualification",
  #   "Marital status",
  #   'Application mode',
  #   'Application order',
  #   'Course',
  #   'Previous qualification score',
  #   "Mother's qualification",
  #   "Father's qualification",
  #   "Mother's occupation",
  #   "Father's occupation",
  #   #'Entry score',
  #   'Debtor',
  #   'Regularized Fees',
  #   'Gender_Male',
  #   'External Funding',
  #   'Age at enrollment',
  #   'N units credited 1st period',
  #   'N units taken 1st period',
  #   'N scored units 1st period',
  #   #'N units approved 1st period',
  #   #'Average grade 1st period',
  #   'N unscored units 1st period',
  #   'N units credited 2nd period',
  #   'N units taken 2nd period',
  #   'N scored units 2nd period',
  #   #'N units approved 2nd period',
  #   #'Average grade 2nd period',
  #   'N unscored units 2nd period',
  #   #'Social Popularity'
  # ];

  # trainData = trainData.drop(drop, axis=1);
  # testData = testData.drop(drop, axis=1);


  trainX: pd.DataFrame = trainData.drop("Success", axis=1);
  trainY: pd.Series = trainData["Success"];

  model: RandomForestClassifier = RandomForestClassifier(800);
  model.fit(trainX, trainY);

  predictions = model.predict(testData);

  for x, y in zip(list(trainX.columns), model.feature_importances_):
    print(f"{x}: {y*100:.2f}");

  outputData = pd.DataFrame([testData.index, predictions]).T;
  outputData.columns = ["Userid", "Success"];

  outputData.to_csv("./answer.csv", index=False);

  return 0;
# end main
main()