In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import normalize


import warnings
warnings.filterwarnings("ignore")

In [None]:
# Central notebook configuration: input path, column groupings, the generation
# thresholds that define the train/test split, and the random seed.
params = {
    # Path of the ML-ready CSV file.
    'for_ml': 'processed/ambitus_0_15_ml_ready_24_07_2025.csv',
    # Presumably the string-typed (categorical) columns — TODO confirm against usage.
    'str_cols': ['Group', 'Gender', 'Season'],
    # Columns excluded from normalization and dropped from the feature matrix.
    'not_for_ml': ['Group', 'Gender', 'Season', 'NR', 'Year', 'Paradigm', 'id', 'Generation'],
    # Presumably the columns left out of correlation analysis — TODO confirm.
    # ('Group' used to be listed twice here; the duplicate has been removed.)
    'not_corr_cols': ['Group', 'Gender', 'Season', 'NR', 'Year', 'Paradigm', 'id', 'Generation'],
    # Generations strictly above this value form the test set.
    'generation_cut': 13,
    # Generations at or below this value are left out of the training set.
    'generation_under_cut': 6,
    # Random state passed to the estimator for reproducibility.
    'seed': 42,
}

In [None]:
# Normalizing the data, one generation at a time.
# NOTE(review): `data` is not defined in the cells visible here — presumably it is
# loaded from params['for_ml'] in another cell; confirm the notebook survives
# Restart Kernel -> Run All.
# NOTE(review): sklearn's `normalize` defaults to axis=1, i.e. every *row* is scaled
# to unit L2 norm, which gives the same values with or without the per-generation
# grouping. If per-feature scaling within a generation was intended, pass axis=0
# or use a scaler instead — confirm the intent before changing it.

# Column dtypes are identical for every generation slice, so the numeric feature
# columns are selected once instead of on every loop iteration.
numeric_cols = [
    col for col in data.columns
    if col not in params['not_for_ml'] and data[col].dtype in ['float64', 'int64']
]

data_normalized = data.copy()
# The normalized values are floats. Cast the target columns up front so int64
# columns are not implicitly upcast on assignment (deprecated in recent pandas;
# the warning would be hidden here by warnings.filterwarnings("ignore")).
data_normalized[numeric_cols] = data_normalized[numeric_cols].astype('float64')

for gen in data['Generation'].unique():
    gen_mask = data['Generation'] == gen
    gen_data = data[gen_mask]
    data_normalized.loc[gen_mask, numeric_cols] = normalize(gen_data[numeric_cols])

# Show a preview only rather than dumping the whole frame.
data_normalized.head()

In [None]:
# Creating train and test sets for machine learning, split by generation:
#   train -> generation_under_cut < Generation <= generation_cut
#   test  -> Generation > generation_cut
# Note: the split is taken from `data`, not from `data_normalized`.
generation = data['Generation']
lower_bound = params['generation_under_cut']
upper_bound = params['generation_cut']

train_rows = generation.le(upper_bound) & generation.gt(lower_bound)
test_rows = generation.gt(upper_bound)

train_part = data.loc[train_rows]
test_part = data.loc[test_rows]

# Features: everything except the identifier/label columns; target: 'Group'.
X_train = train_part.drop(columns=params['not_for_ml'])
X_test = test_part.drop(columns=params['not_for_ml'])
y_train = train_part['Group']
y_test = test_part['Group']

In [None]:
# Plotting the label distribution in the train and test sets, side by side.
# (Previously only the train set was drawn although the cell describes both.)
fig, (train_ax, test_ax) = plt.subplots(1, 2, figsize=(12, 4))

sns.histplot(data=pd.DataFrame(y_train), x='Group', multiple='dodge', shrink=0.8, ax=train_ax)
train_ax.set_title('Distribution of Group at Train Set', fontsize=16)

sns.histplot(data=pd.DataFrame(y_test), x='Group', multiple='dodge', shrink=0.8, ax=test_ax)
test_ax.set_title('Distribution of Group at Test Set', fontsize=16)

fig.tight_layout()
plt.show()


In [None]:
# Creating the pipeline: a single depth-limited decision tree, fitted on the
# training generations and evaluated on the held-out generations.
pipeline = Pipeline([('tree', DecisionTreeClassifier(random_state=params['seed'], max_depth=5, min_samples_leaf=6))])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

# Plotting the confusion matrix.
# The label order is fixed explicitly (sorted union of true and predicted labels,
# matching confusion_matrix's default) so the matrix and the tick labels agree.
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots()
# fmt='d' prints integer counts; the default '.2g' switches to scientific
# notation for counts of 100 or more.
sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion Matrix', fontsize=16)
plt.show()

In [None]:
# Feature importance from the Decision Tree: rank the features by importance,
# drop the (near-)zero ones and keep at most the ten strongest.
importances = pipeline.named_steps['tree'].feature_importances_
feature_names = X_train.columns

ranked = pd.DataFrame({'Feature': feature_names, 'Importance': importances}).sort_values(
    by='Importance', ascending=False
)
# Filter out low importance features, then take the top ten of what remains.
importance_df = ranked[ranked['Importance'] > 0.00001].head(10)

fig, ax = plt.subplots()
sns.barplot(data=importance_df, x='Importance', y='Feature', orient='h', ax=ax)
ax.set_title('Feature Importance', fontsize=16)
plt.show()
