In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Base table: a later cell takes ID, FIRST/MID/LAST, SLOPE1/SLOPE2, AGE and GROUP from it.
df = pd.read_csv('SUB_WEEKLY_FINAL_1.csv')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Show every column when a frame is previewed.
pd.set_option("display.max_columns", None)

# Balance outcomes, with the 'Part' column exposed under the name 'ID'.
df1 = pd.read_csv('BalanceOutcomes.csv').rename(columns={'Part': 'ID'})
df1.head(5)

In [None]:
import pandas as pd
from functools import reduce

# Build a wide table of 'PS R' values: one row per ID and one column per
# (week, Task, Qual) combination, named PSR_W{week}_{Task}_{Qual}.

# Strip surrounding whitespace so labels that differ only by padding
# end up in the same pivot column; make ID an integer key.
df1['Task'] = df1['Task'].str.strip()
df1['Qual'] = df1['Qual'].str.strip()
df1['ID'] = df1['ID'].astype(int)

# Weeks to extract
weeks_to_extract = [0, 6, 12, 16]

# One pivoted frame per week
pivoted_psr = []

for week in weeks_to_extract:
    week_rows = df1[df1['Week'] == week]

    # One row per ID, one column per Task+Qual. pivot_table aggregates with
    # the mean by default, so repeated (ID, Task, Qual) rows are averaged.
    pivot = week_rows.pivot_table(index='ID', columns=['Task', 'Qual'], values='PS R')

    # Flatten the (Task, Qual) column index into week-tagged names
    pivot.columns = [f'PSR_W{week}_{task}_{qual}' for task, qual in pivot.columns]

    pivoted_psr.append(pivot)

# Outer-join the weekly frames on ID so IDs missing in some weeks are kept
# (their missing weeks become NaN), then expose ID as a regular column.
psr_combined = reduce(
    lambda left, right: pd.merge(left, right, left_index=True, right_index=True, how='outer'),
    pivoted_psr
).reset_index()

# Print the column names first: only the last expression of a cell is
# rendered, so the preview has to come last to be displayed at all.
print(list(psr_combined))
psr_combined.head()

In [None]:
# Show every column when a frame is previewed.
pd.set_option("display.max_columns", None)

# Gait outcomes, with the 'Part' column exposed under the name 'ID'.
df3 = pd.read_csv('GaitOutcomes.csv').rename(columns={'Part': 'ID'})
df3.head(5)

In [None]:
import pandas as pd
from functools import reduce

# Build a wide gait table: one row per ID and one column per
# (variable, week, Task, Qual), named {Variable}_W{week}_{Task}_{Qual}
# with spaces, '-' and '/' removed from the variable name.

# Strip surrounding whitespace from the label columns; make ID an integer key.
df3['Task'] = df3['Task'].str.strip()
df3['Qual'] = df3['Qual'].str.strip()
df3['ID'] = df3['ID'].astype(int)

# Weeks and gait variables to extract
weeks_to_extract = [0, 6, 12, 16]

gait_variables = [
    'Gait symmetry', 'Step length', 'Step length left', 'Step length right',
    'Step time', 'Step time left', 'Step time right',
    'Step length var', 'Step time var',
    'Step length asym', 'Step time asym',
    'Step velocity',
    'Step count lap 1', 'Step count lap 2', 'Step count lap 3', 'Step count lap 4'
]

# The week filter does not depend on the variable, so do it once per week
# instead of once per (variable, week) pair.
rows_by_week = {week: df3[df3['Week'] == week] for week in weeks_to_extract}

# One merged frame per gait variable
all_pivoted = []

for var in gait_variables:
    # Column-name prefix for this variable, e.g. 'Gait symmetry' -> 'Gaitsymmetry'
    prefix = var.replace(" ", "").replace("-", "").replace("/", "")

    pivoted_list = []
    for week in weeks_to_extract:
        # pivot_table aggregates with the mean by default, so repeated
        # (ID, Task, Qual) rows within a week are averaged.
        pivot = rows_by_week[week].pivot_table(index='ID', columns=['Task', 'Qual'], values=var)

        # Flatten the (Task, Qual) column index into variable/week-tagged names
        pivot.columns = [f'{prefix}_W{week}_{task}_{qual}' for task, qual in pivot.columns]
        pivoted_list.append(pivot)

    # Outer-join this variable's weekly frames on ID
    merged_var = reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True, how='outer'), pivoted_list)

    all_pivoted.append(merged_var)

# Outer-join all variables on ID, then expose ID as a regular column
final_combined_gait = reduce(
    lambda left, right: pd.merge(left, right, left_index=True, right_index=True, how='outer'),
    all_pivoted
).reset_index()

# Preview the result
final_combined_gait.head()

In [None]:
# Assemble the modelling table: base columns from df, then the PSR and gait
# wide tables attached by ID. Left joins keep every row of df; IDs without
# PSR or gait data get NaN in those columns.

# Step 1: base columns + PSR columns
base_cols = ['ID', 'FIRST', 'MID', 'LAST', 'SLOPE1', 'SLOPE2', 'AGE', 'GROUP']
merged1 = df[base_cols].merge(psr_combined, on='ID', how='left')

# Step 2: add the gait columns
final_merged = merged1.merge(final_combined_gait, on='ID', how='left')

# Print the diagnostics first: only the last expression of a cell is
# rendered, so the preview has to come last to be displayed at all.
print("✅ Final merged shape:", final_merged.shape)
print(list(final_merged))
final_merged.head()

In [None]:
# Build the feature matrix X and the target y for the regression model.

# Dependent variable. Change it HERE only: the feature filters, the row
# selection and the X/y split below all follow this single name.
target_col = 'Gaitsymmetry_W12_Walk_HT'

# Only columns measured at these weeks are used as predictors.
# NOTE(review): '_W12_' predictors come from the same week as the current
# target -- confirm that using same-week measurements is intended.
week_tags = ['_W0_', '_W6_', '_W12_']

all_cols = final_merged.columns

# Step 1: Gait symmetry features (never the target itself)
gait_cols = [col for col in all_cols
             if col.startswith('Gaitsymmetry_')
             and any(tag in col for tag in week_tags)
             and col != target_col]

# Step 2: Step velocity features. The target is excluded here as well, so
# choosing a Stepvelocity_ column as target cannot leak it into X.
velocity_cols = [col for col in all_cols
                 if col.startswith('Stepvelocity_')
                 and any(tag in col for tag in week_tags)
                 and col != target_col]

# Step 3: Other predictors
other_predictors = ['FIRST', 'SLOPE1', 'MID', 'SLOPE2', 'LAST', 'AGE', 'GROUP']

# Combine all predictors
predictors = gait_cols + velocity_cols + other_predictors

# Step 4: Final dataset -- keep only rows with no missing predictor or target
df_gait = final_merged[predictors + [target_col]].dropna()

# One-hot encode AGE and GROUP, dropping the first level of each
df_encoded = pd.get_dummies(df_gait, columns=['AGE', 'GROUP'], drop_first=True)

# X and y
X = df_encoded.drop(columns=[target_col])
y = df_encoded[target_col]


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Tune a random forest with a 5-fold grid search on the training split and
# report its performance on the held-out split.

# Step 3: Train-test split (30% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 4: Grid Search
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Step 5: Best model (refit on the whole training split), scored on the test split
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

print("🔍 Best Parameters:", grid_search.best_params_)
print(f"✅ R²: {r2_score(y_test, y_pred):.4f}")
print(f"✅ RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")

# Step 6: Cross-validation scores of the selected configuration.
# The grid search already scored these hyper-parameters on the same 5
# unshuffled folds with the same seed, so the per-fold R² values are read
# from cv_results_ instead of refitting five more forests.
# These folds were also used to pick the hyper-parameters, so the scores
# are optimistic; the held-out test score above is the unbiased figure.
cv_scores = np.array([
    grid_search.cv_results_[f'split{fold}_test_score'][grid_search.best_index_]
    for fold in range(grid_search.n_splits_)
])
print("📊 Cross-Validated R² Scores:", np.round(cv_scores, 4))
print(f"✅ Average CV R²: {np.mean(cv_scores):.4f}")

In [None]:
# Rank the model's features by importance and chart the 20 largest.
feat_names = X.columns
importances = best_rf.feature_importances_

feat_imp_df = (
    pd.DataFrame({'Feature': feat_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bars, most important feature at the top
top_features = feat_imp_df.head(20)
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features['Feature'], top_features['Importance'])
ax.invert_yaxis()
ax.set_title('Top 20 Feature Importances - Random Forest')
ax.set_xlabel('Importance')
fig.tight_layout()
ax.grid(True)
plt.show()


In [None]:
import scipy.stats as stats

# Residual diagnostics for the test-set predictions.
residuals = y_test - y_pred

# Residuals vs predicted values, with a zero reference line
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(y_pred, residuals, alpha=0.7)
ax.axhline(0, color='red', linestyle='--')
ax.set_xlabel("Predicted")
ax.set_ylabel("Residuals")
ax.set_title("Residuals vs Predicted")
ax.grid(True)
fig.tight_layout()
plt.show()

# Histogram of the residuals
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(residuals, bins=30, edgecolor='k', alpha=0.7)
ax.set_title("Histogram of Residuals")
ax.set_xlabel("Residual")
fig.tight_layout()
plt.show()

# QQ plot of the residuals against a normal distribution
plt.figure(figsize=(6, 6))
stats.probplot(residuals, dist="norm", plot=plt)
plt.title("QQ Plot")
plt.tight_layout()
plt.show()
