# In [None]:
import pandas as pd

def time_to_minutes(t):
    """Convert an ``H:M:S`` time string to a number of minutes.

    Args:
        t: A string made of exactly three colon-separated integer fields
            (hours, minutes, seconds), e.g. ``"1:30:30"``.

    Returns:
        ``h * 60 + m + s / 60`` as a float, or ``None`` when ``t`` cannot be
        parsed that way (wrong number of fields, a non-integer field, or a
        value that is not a string, such as ``None`` or NaN).
    """
    try:
        h, m, s = map(int, t.split(':'))
        return h * 60 + m + s / 60
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Only swallow failures caused by bad input, so KeyboardInterrupt and
        # SystemExit still propagate:
        #   AttributeError - t has no .split (None, NaN, numbers)
        #   TypeError      - .split rejects a str separator (e.g. bytes)
        #   ValueError     - wrong field count or a non-integer field
        #   OverflowError  - seconds too large to divide into a float
        return None

# Convert every 'Mark' value to minutes with time_to_minutes; values it cannot
# parse come back as None and therefore end up missing in 'Mark_numeric'.
# NOTE(review): `df` must already be defined before this point - confirm.
df['Mark_numeric'] = df['Mark'].apply(time_to_minutes)
# Keep only rows that have both a parsed time and an age.
df = df.dropna(subset=['Mark_numeric', 'Age'])


# Linear regression
import statsmodels.formula.api as smf

# Ordinary least squares fit of Mark_numeric on Age, with Sex and COVID_STATUS
# entered as categorical predictors via C(...); the fitted summary is printed.
model = smf.ols("Mark_numeric ~ Age + C(Sex) + C(COVID_STATUS)", data=df).fit()
print(model.summary())



# Two-sample t-test on Mark_numeric: 'Before COVID' rows vs. 'After COVID' rows
from scipy.stats import ttest_ind

# Pick each period's values with a single label-based lookup.
pre_covid = df.loc[df['COVID_STATUS'] == 'Before COVID', 'Mark_numeric']
post_covid = df.loc[df['COVID_STATUS'] == 'After COVID', 'Mark_numeric']

# nan_policy='omit' leaves NaN values out of the computation.
stat, p = ttest_ind(pre_covid, post_covid, nan_policy='omit')
print(f"T-test (Pre-COVID vs Post-COVID) p-value: {p:.4f}")

# Report the verdict at the 5% significance level.
print(
    "Υπάρχει στατιστικά σημαντική διαφορά πριν και μετά τον COVID."
    if p < 0.05
    else "Δεν υπάρχει στατιστικά σημαντική διαφορά πριν και μετά τον COVID."
)

# One-way ANOVA across COVID_STATUS groups, then Tukey's HSD pairwise comparison
from scipy.stats import f_oneway

# One sample of non-missing Mark_numeric values per COVID_STATUS level.
groups = [
    marks.dropna()
    for _, marks in df.groupby('COVID_STATUS')['Mark_numeric']
]
stat, p = f_oneway(*groups)
print(f"ANOVA (COVID periods) p-value: {p:.4f}")

from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Use complete cases only: rows with both the response and the group label.
df_clean = df.dropna(subset=['Mark_numeric', 'COVID_STATUS'])

tukey = pairwise_tukeyhsd(
    endog=df_clean['Mark_numeric'],
    groups=df_clean['COVID_STATUS'],
    alpha=0.05,
)
print(tukey)


# Random Forest + feature importance

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Complete cases of the modelling columns, with Sex and COVID_STATUS one-hot
# encoded; drop_first=True omits one dummy column per categorical variable.
df_ml = pd.get_dummies(
    df[['Age', 'Sex', 'COVID_STATUS', 'Mark_numeric']].dropna(),
    columns=['Sex', 'COVID_STATUS'],
    drop_first=True,
)

# Target is Mark_numeric; every remaining column is a feature.
y = df_ml['Mark_numeric']
X = df_ml.drop(columns='Mark_numeric')

# Hold out 20% of the rows for evaluation; the seed is fixed at 0.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)

# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)

print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f} minutes")
print(f"R² score: {r2_score(y_test, y_pred):.2f}")

import matplotlib.pyplot as plt
import pandas as pd

# Importance scores of the fitted model, paired positionally with X's columns.
importances = model.feature_importances_
feature_names = X.columns

# Tabulate the scores and rank them, most important first.
feat_imp_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

print(feat_imp_df)

# Bar chart of the ranked importances.
plt.figure(figsize=(8, 5))
plt.bar(feat_imp_df['Feature'], feat_imp_df['Importance'], color='skyblue')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance στο Random Forest Model')
plt.xticks(rotation=45)
plt.show()





