In [120]:
import pandas as pd
import os


In [121]:
import os
import pandas as pd

def load_data_from_folders(folder_names, participant_file_prefix, ecg_file_name, eda_file_name):
    all_participants = []
    all_ecg = []
    all_eda = []

    for folder in folder_names:
        # Load participant data
        participant_files = [f for f in os.listdir(folder) if f.startswith(participant_file_prefix)]
        for file in participant_files:
            df_participant = pd.read_csv(os.path.join(folder, file))
            all_participants.append(df_participant)

        # Load ECG data
        ecg_file_path = os.path.join(folder, ecg_file_name)
        if os.path.exists(ecg_file_path):
            df_ecg = pd.read_csv(ecg_file_path)
            all_ecg.append(df_ecg)

        # Load EDA data
        eda_file_path = os.path.join(folder, eda_file_name)
        if os.path.exists(eda_file_path):
            df_eda = pd.read_csv(eda_file_path)
            all_eda.append(df_eda)

    # Concatenate all dataframes, ensuring consistent columns
    df_participants_combined = pd.concat(all_participants, ignore_index=True).reindex(columns=all_participants[0].columns)
    df_ecg_combined = pd.concat(all_ecg, ignore_index=True).reindex(columns=all_ecg[0].columns)
    df_eda_combined = pd.concat(all_eda, ignore_index=True).reindex(columns=all_eda[0].columns)

    return df_participants_combined, df_ecg_combined, df_eda_combined

folder_names = ['1', '2', '3', '4', '5', '6', '7']
participant_file_prefix = 'all_apps_wide'
ecg_file_name = 'ecg_results.csv'
eda_file_name = 'eda_results.csv'

df_participants_combined, df_ecg_combined, df_eda_combined = load_data_from_folders(folder_names, participant_file_prefix, ecg_file_name, eda_file_name)

participant_id = 'ae2kja5u'
windows_to_cut = ['Window 4', 'Window 5', 'Window 6', 'Window 7', 'Window 8', 'Window 9', 'Window 10']

df_ecg_combined = df_ecg_combined[~((df_ecg_combined['Participant'] == participant_id) & (df_ecg_combined['Type'].isin(windows_to_cut)))]

df_ecg_combined
print(f'Combined participant dataframe shape: {df_participants_combined.shape}')
print(f'Combined ECG dataframe shape: {df_ecg_combined.shape}')
print(f'Combined EDA dataframe shape: {df_eda_combined.shape}')


Combined participant dataframe shape: (53, 1949)
Combined ECG dataframe shape: (280, 74)
Combined EDA dataframe shape: (280, 6)


In [122]:
df_participants_combined

Unnamed: 0,participant.id_in_session,participant.code,participant.label,participant._is_bot,participant._index_in_pages,participant._max_page_index,participant._current_app_name,participant._current_page_name,participant.time_started_utc,participant.visited,...,debriefing.1.player.maia31,debriefing.1.player.maia32,debriefing.1.player.maia33,debriefing.1.player.maia34,debriefing.1.player.maia35,debriefing.1.player.maia36,debriefing.1.player.maia37,debriefing.1.player.maia38c,debriefing.1.group.id_in_subsession,debriefing.1.subsession.round_number
0,1,avanznds,,0,507,507,debriefing,ThankYou,2024-05-07 11:17:15.193596,1,...,6,4,4,4,6,6,6,6,1,1
1,2,ae2kja5u,,0,507,507,debriefing,ThankYou,2024-05-07 11:20:19.344607,1,...,2,5,3,4,5,6,6,6,1,1
2,3,zzq192rp,,0,507,507,debriefing,ThankYou,2024-05-07 11:21:55.236415,1,...,2,2,1,1,2,2,4,6,1,1
3,4,vrc5kdqt,,0,507,507,debriefing,ThankYou,2024-05-07 11:22:25.295811,1,...,4,5,5,4,4,4,4,6,1,1
4,5,4pzbw8ti,,0,507,507,debriefing,ThankYou,2024-05-07 11:22:47.564184,1,...,6,6,5,6,6,6,6,6,1,1
5,6,pr8gga0e,,0,507,507,debriefing,ThankYou,2024-05-07 11:23:28.793476,1,...,2,2,2,4,4,5,4,6,1,1
6,7,05qhlznq,,0,507,507,debriefing,ThankYou,2024-05-07 11:24:03.414831,1,...,1,1,1,1,4,4,5,6,1,1
7,8,3dvp3eki,,0,507,507,debriefing,ThankYou,2024-05-07 11:24:30.438646,1,...,4,4,3,4,4,4,4,6,1,1
8,1,s718hh4o,,0,505,505,debriefing,ThankYou,2024-03-12 12:27:46.807348,1,...,2,3,1,5,4,6,6,6,1,1
9,2,d6trggw6,,0,505,505,debriefing,ThankYou,2024-03-12 12:28:15.549715,1,...,6,6,1,6,6,6,6,6,1,1


In [123]:
df_participants_combined['stockmarket.40.player.final_score']

0     291.06
1     329.92
2     267.02
3     245.93
4     414.34
5     349.12
6     291.75
7     283.39
8     538.40
9     203.65
10    387.87
11    245.96
12    116.94
13    390.65
14    557.13
15    150.97
16    173.90
17    189.45
18    258.93
19    376.52
20    416.69
21    220.26
22    344.87
23    278.67
24    266.75
25    324.94
26    352.67
27    347.91
28    259.17
29    329.26
30    232.34
31    364.12
32    232.73
33    227.49
34    281.33
35    237.10
36    251.50
37    207.88
38    169.72
39    527.41
40    176.91
41    449.99
42    221.17
43    214.38
44    338.56
45    205.48
46    202.19
47    280.70
48    250.69
49    238.95
50    377.75
51    186.24
52    287.84
Name: stockmarket.40.player.final_score, dtype: float64

In [124]:
df_participants_combined

Unnamed: 0,participant.id_in_session,participant.code,participant.label,participant._is_bot,participant._index_in_pages,participant._max_page_index,participant._current_app_name,participant._current_page_name,participant.time_started_utc,participant.visited,...,debriefing.1.player.maia31,debriefing.1.player.maia32,debriefing.1.player.maia33,debriefing.1.player.maia34,debriefing.1.player.maia35,debriefing.1.player.maia36,debriefing.1.player.maia37,debriefing.1.player.maia38c,debriefing.1.group.id_in_subsession,debriefing.1.subsession.round_number
0,1,avanznds,,0,507,507,debriefing,ThankYou,2024-05-07 11:17:15.193596,1,...,6,4,4,4,6,6,6,6,1,1
1,2,ae2kja5u,,0,507,507,debriefing,ThankYou,2024-05-07 11:20:19.344607,1,...,2,5,3,4,5,6,6,6,1,1
2,3,zzq192rp,,0,507,507,debriefing,ThankYou,2024-05-07 11:21:55.236415,1,...,2,2,1,1,2,2,4,6,1,1
3,4,vrc5kdqt,,0,507,507,debriefing,ThankYou,2024-05-07 11:22:25.295811,1,...,4,5,5,4,4,4,4,6,1,1
4,5,4pzbw8ti,,0,507,507,debriefing,ThankYou,2024-05-07 11:22:47.564184,1,...,6,6,5,6,6,6,6,6,1,1
5,6,pr8gga0e,,0,507,507,debriefing,ThankYou,2024-05-07 11:23:28.793476,1,...,2,2,2,4,4,5,4,6,1,1
6,7,05qhlznq,,0,507,507,debriefing,ThankYou,2024-05-07 11:24:03.414831,1,...,1,1,1,1,4,4,5,6,1,1
7,8,3dvp3eki,,0,507,507,debriefing,ThankYou,2024-05-07 11:24:30.438646,1,...,4,4,3,4,4,4,4,6,1,1
8,1,s718hh4o,,0,505,505,debriefing,ThankYou,2024-03-12 12:27:46.807348,1,...,2,3,1,5,4,6,6,6,1,1
9,2,d6trggw6,,0,505,505,debriefing,ThankYou,2024-03-12 12:28:15.549715,1,...,6,6,1,6,6,6,6,6,1,1


In [125]:
def columns_with_nonzero_non_nan_values(df):
    valid_columns = []
    for col in df.columns:
        if df[col].dropna().ne(0).any():
            valid_columns.append(col)
    return valid_columns

valid_columns = columns_with_nonzero_non_nan_values(df_ecg_combined)
df_ecg_combined = df_ecg_combined[valid_columns]

In [126]:
valid_columns = columns_with_nonzero_non_nan_values(df_ecg_combined)

nan_percentage = df_ecg_combined[valid_columns].isna().mean() * 100

nan_percentage_sorted = nan_percentage.sort_values(ascending=False)

df_ecg_combined.fillna(method='bfill', inplace=True)
nan_percentage = df_ecg_combined[valid_columns].isna().mean() * 100
nan_percentage_sorted

Comparison_HRV_LnHF       13.571429
Comparison_HRV_Prc20NN    13.571429
Comparison_HRV_HFn        13.571429
Comparison_HRV_MeanNN     13.571429
Comparison_HRV_SDNN       13.571429
                            ...    
HRV_MaxNN                  0.000000
HRV_MinNN                  0.000000
HRV_pNN20                  0.000000
HRV_pNN50                  0.000000
Session                    0.000000
Length: 64, dtype: float64

### MERGING

In [127]:
df_ecg_combined['Participant'] = df_ecg_combined['Participant'].astype(str)
df_eda_combined['Participant'] = df_eda_combined['Participant'].astype(str)
df_participants_combined['participant.code'] = df_participants_combined['participant.code'].astype(str)


In [128]:
df_ecg_eda = pd.merge(df_ecg_combined, df_eda_combined, on=['Participant', 'Type', 'Group'], suffixes=('_ecg', '_eda'))
df_ecg_eda
df_ecg_eda.to_csv('df_ecg_eda.csv', index=False)


In [129]:
features = [col for col in df_ecg_eda.columns if col not in ['Session_ecg', 'Session_eda', 'Participant', 'Group', 'Type']]

In [130]:
participants = df_ecg_eda['Participant'].unique()
pivoted_df = df_ecg_eda.pivot(index='Participant', columns='Type', values=features)
pivoted_df

Unnamed: 0_level_0,Session,Session,Session,Session,Session,Session,Session,Session,Session,Session,...,Tonic_Mean,Tonic_Mean,Tonic_Mean,Tonic_Mean,Tonic_Mean,Tonic_Mean,Tonic_Mean,Tonic_Mean,Tonic_Mean,Tonic_Mean
Type,Baseline,Window 1,Window 10,Window 2,Window 3,Window 4,Window 5,Window 6,Window 7,Window 8,...,Window 1,Window 10,Window 2,Window 3,Window 4,Window 5,Window 6,Window 7,Window 8,Window 9
Participant,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0qmpak73,2024-05-14 11:27:08.614903,2024-05-14 11:27:08.614903,,2024-05-14 11:27:08.614903,2024-05-14 11:27:08.614903,2024-05-14 11:27:08.614903,2024-05-14 11:27:08.614903,2024-05-14 11:27:08.614903,2024-05-14 11:27:08.614903,2024-05-14 11:27:08.614903,...,65533.3,,65533.5,65533.4,65533.2,65533.2,65533.3,65533.1,65533.3,
12jbb73p,2024-03-19 09:43:32.701677,2024-03-19 09:43:32.701677,,2024-03-19 09:43:32.701677,2024-03-19 09:43:32.701677,2024-03-19 09:43:32.701677,,,,,...,851.493,,381.259,168.157,267.522,,,,,
1qwm4iwc,2024-03-13 14:06:35.894375,2024-03-13 14:06:35.894375,,2024-03-13 14:06:35.894375,2024-03-13 14:06:35.894375,2024-03-13 14:06:35.894375,2024-03-13 14:06:35.894375,2024-03-13 14:06:35.894375,,,...,3346.05,,3401.68,4124.02,4755.39,5528.42,5242.02,,,
3dvp3eki,2024-05-07 11:24:30.438646,2024-05-07 11:24:30.438646,2024-05-07 11:24:30.438646,2024-05-07 11:24:30.438646,2024-05-07 11:24:30.438646,2024-05-07 11:24:30.438646,2024-05-07 11:24:30.438646,2024-05-07 11:24:30.438646,2024-05-07 11:24:30.438646,2024-05-07 11:24:30.438646,...,53935.0,48399.2,40354.7,42373.9,53207.0,48778.5,51824.2,49711.7,45641.8,46769.8
43kikdm9,2024-03-19 09:45:10.150477,2024-03-19 09:45:10.150477,,2024-03-19 09:45:10.150477,2024-03-19 09:45:10.150477,2024-03-19 09:45:10.150477,,,,,...,65532.7,,65532.8,65532.8,65532.7,,,,,
4pzbw8ti,2024-05-07 11:22:47.564184,2024-05-07 11:22:47.564184,2024-05-07 11:22:47.564184,2024-05-07 11:22:47.564184,2024-05-07 11:22:47.564184,2024-05-07 11:22:47.564184,2024-05-07 11:22:47.564184,2024-05-07 11:22:47.564184,2024-05-07 11:22:47.564184,2024-05-07 11:22:47.564184,...,1829.96,1695.82,1874.74,1741.27,1742.2,1803.81,1894.89,1809.72,1852.82,1737.36
65l2486h,2024-03-19 13:32:25.290483,2024-03-19 13:32:25.290483,,2024-03-19 13:32:25.290483,2024-03-19 13:32:25.290483,2024-03-19 13:32:25.290483,2024-03-19 13:32:25.290483,,,,...,435.64,,159.429,151.393,128.091,208.635,,,,
8ca9ujvu,2024-03-19 09:42:20.689432,2024-03-19 09:42:20.689432,,2024-03-19 09:42:20.689432,2024-03-19 09:42:20.689432,2024-03-19 09:42:20.689432,,,,,...,9076.7,,9688.85,8057.68,6679.84,,,,,
8peubku6,2024-05-14 11:26:08.329212,2024-05-14 11:26:08.329212,,2024-05-14 11:26:08.329212,2024-05-14 11:26:08.329212,2024-05-14 11:26:08.329212,2024-05-14 11:26:08.329212,2024-05-14 11:26:08.329212,2024-05-14 11:26:08.329212,2024-05-14 11:26:08.329212,...,32620.6,,29342.8,34967.0,29658.8,28147.3,31254.1,27527.7,26026.9,25827.1
9csokkab,2024-05-14 11:26:40.094803,2024-05-14 11:26:40.094803,,2024-05-14 11:26:40.094803,2024-05-14 11:26:40.094803,2024-05-14 11:26:40.094803,2024-05-14 11:26:40.094803,2024-05-14 11:26:40.094803,2024-05-14 11:26:40.094803,2024-05-14 11:26:40.094803,...,24890.0,,31101.6,45684.3,33441.4,22922.9,18985.4,16992.6,20173.6,


In [131]:
df_ecg_combined['Window_Number'] = df_ecg_combined['Type'].str.extract('(\d+)').astype(float)
highest_window_per_participant = df_ecg_combined.groupby('Participant')['Window_Number'].max().reset_index()
highest_window_per_participant

Unnamed: 0,Participant,Window_Number
0,0qmpak73,8.0
1,12jbb73p,4.0
2,1qwm4iwc,6.0
3,3dvp3eki,10.0
4,43kikdm9,4.0
5,4pzbw8ti,10.0
6,65l2486h,5.0
7,8ca9ujvu,4.0
8,8peubku6,9.0
9,9csokkab,8.0


In [132]:
pivoted_df.columns = ['_'.join(col).strip() for col in pivoted_df.columns.values]

In [133]:
pivoted_df.reset_index(inplace=True)

In [134]:
columns_to_keep = [col for col in pivoted_df.columns if not any(w in col for w in ['Window 7', 'Window 8', 'Window 9', 'Window 10'])]

filtered_df = pivoted_df[columns_to_keep].copy()
filtered_df

Unnamed: 0,Participant,Session_Baseline,Session_Window 1,Session_Window 2,Session_Window 3,Session_Window 4,Session_Window 5,Session_Window 6,HRV_MeanNN_Baseline,HRV_MeanNN_Window 1,...,Phasic_Mean_Window 4,Phasic_Mean_Window 5,Phasic_Mean_Window 6,Tonic_Mean_Baseline,Tonic_Mean_Window 1,Tonic_Mean_Window 2,Tonic_Mean_Window 3,Tonic_Mean_Window 4,Tonic_Mean_Window 5,Tonic_Mean_Window 6
0,0qmpak73,2024-05-14 11:27:08.614903,2024-05-14 11:27:08.614903,2024-05-14 11:27:08.614903,2024-05-14 11:27:08.614903,2024-05-14 11:27:08.614903,2024-05-14 11:27:08.614903,2024-05-14 11:27:08.614903,664.885,641.846,...,-0.000292916,0.0016698,-0.000869094,65533.3,65533.3,65533.5,65533.4,65533.2,65533.2,65533.3
1,12jbb73p,2024-03-19 09:43:32.701677,2024-03-19 09:43:32.701677,2024-03-19 09:43:32.701677,2024-03-19 09:43:32.701677,2024-03-19 09:43:32.701677,,,795.191,788.726,...,-0.0704561,,,916.24,851.493,381.259,168.157,267.522,,
2,1qwm4iwc,2024-03-13 14:06:35.894375,2024-03-13 14:06:35.894375,2024-03-13 14:06:35.894375,2024-03-13 14:06:35.894375,2024-03-13 14:06:35.894375,2024-03-13 14:06:35.894375,2024-03-13 14:06:35.894375,739.69,734.918,...,-3.10444,3.67587,-0.472541,2547.61,3346.05,3401.68,4124.02,4755.39,5528.42,5242.02
3,3dvp3eki,2024-05-07 11:24:30.438646,2024-05-07 11:24:30.438646,2024-05-07 11:24:30.438646,2024-05-07 11:24:30.438646,2024-05-07 11:24:30.438646,2024-05-07 11:24:30.438646,2024-05-07 11:24:30.438646,773.199,743.996,...,5.37484,-6.42994,0.32342,37936.9,53935.0,40354.7,42373.9,53207.0,48778.5,51824.2
4,43kikdm9,2024-03-19 09:45:10.150477,2024-03-19 09:45:10.150477,2024-03-19 09:45:10.150477,2024-03-19 09:45:10.150477,2024-03-19 09:45:10.150477,,,805.892,801.951,...,-0.00225769,,,65532.7,65532.7,65532.8,65532.8,65532.7,,
5,4pzbw8ti,2024-05-07 11:22:47.564184,2024-05-07 11:22:47.564184,2024-05-07 11:22:47.564184,2024-05-07 11:22:47.564184,2024-05-07 11:22:47.564184,2024-05-07 11:22:47.564184,2024-05-07 11:22:47.564184,889.736,838.873,...,0.937894,0.065888,0.314635,1343.81,1829.96,1874.74,1741.27,1742.2,1803.81,1894.89
6,65l2486h,2024-03-19 13:32:25.290483,2024-03-19 13:32:25.290483,2024-03-19 13:32:25.290483,2024-03-19 13:32:25.290483,2024-03-19 13:32:25.290483,2024-03-19 13:32:25.290483,,701.486,713.243,...,0.00616053,-0.00993486,,825.234,435.64,159.429,151.393,128.091,208.635,
7,8ca9ujvu,2024-03-19 09:42:20.689432,2024-03-19 09:42:20.689432,2024-03-19 09:42:20.689432,2024-03-19 09:42:20.689432,2024-03-19 09:42:20.689432,,,787.626,713.733,...,0.206959,,,6292.65,9076.7,9688.85,8057.68,6679.84,,
8,8peubku6,2024-05-14 11:26:08.329212,2024-05-14 11:26:08.329212,2024-05-14 11:26:08.329212,2024-05-14 11:26:08.329212,2024-05-14 11:26:08.329212,2024-05-14 11:26:08.329212,2024-05-14 11:26:08.329212,1016.3,976.13,...,3.89176,-1.28128,-2.84965,28540.6,32620.6,29342.8,34967.0,29658.8,28147.3,31254.1
9,9csokkab,2024-05-14 11:26:40.094803,2024-05-14 11:26:40.094803,2024-05-14 11:26:40.094803,2024-05-14 11:26:40.094803,2024-05-14 11:26:40.094803,2024-05-14 11:26:40.094803,2024-05-14 11:26:40.094803,666.067,686.662,...,-0.715793,-0.467796,0.220361,33892.5,24890.0,31101.6,45684.3,33441.4,22922.9,18985.4


In [135]:
filtered_df = filtered_df.drop(columns=['Session_Baseline', 'Session_Window 1',
       'Session_Window 2', 'Session_Window 3', 'Session_Window 4',
       'Session_Window 5', 'Session_Window 6', 'Session_Time_Window 1','Session_Time_Baseline', 'Session_Time_Window 2', 'Session_Time_Window 3', 'Session_Time_Window 4', 'Session_Time_Window 5', 'Session_Time_Window 6' ])

In [136]:
filtered_df = filtered_df.fillna(method='ffill')

In [137]:
df_participants_combined_new = df_participants_combined[['participant.code','stockmarket.40.player.final_score']]

In [138]:
participants_in_filtered_not_in_combined = set(filtered_df['Participant']) - set(df_participants_combined_new['participant.code'])

participants_in_combined_not_in_filtered = set(df_participants_combined_new['participant.code']) - set(filtered_df['Participant'])

missing_participants = {
    "in_filtered_not_in_combined": participants_in_filtered_not_in_combined,
    "in_combined_not_in_filtered": participants_in_combined_not_in_filtered
}

missing_participants

{'in_filtered_not_in_combined': set(),
 'in_combined_not_in_filtered': {'05qhlznq',
  '1pvmqdfo',
  '5h7twk6a',
  '817n2sy5',
  'avanznds',
  'cmernh12',
  'enkeak0f',
  'h7gbsrk6',
  'pej4oig9',
  'pnvum9qr',
  'vrc5kdqt',
  'wqzbi8s4',
  'wu7bftuk',
  'xvr7iis7',
  'yu1cizf1'}}

In [139]:
merged_df = filtered_df.merge(df_participants_combined_new, left_on='Participant', right_on='participant.code', how='left')
merged_df.drop(columns=['participant.code'], inplace=True)
merged_df.columns

Index(['Participant', 'HRV_MeanNN_Baseline', 'HRV_MeanNN_Window 1',
       'HRV_MeanNN_Window 2', 'HRV_MeanNN_Window 3', 'HRV_MeanNN_Window 4',
       'HRV_MeanNN_Window 5', 'HRV_MeanNN_Window 6', 'HRV_SDNN_Baseline',
       'HRV_SDNN_Window 1',
       ...
       'Phasic_Mean_Window 5', 'Phasic_Mean_Window 6', 'Tonic_Mean_Baseline',
       'Tonic_Mean_Window 1', 'Tonic_Mean_Window 2', 'Tonic_Mean_Window 3',
       'Tonic_Mean_Window 4', 'Tonic_Mean_Window 5', 'Tonic_Mean_Window 6',
       'stockmarket.40.player.final_score'],
      dtype='object', length=436)

In [140]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
from sklearn.decomposition import PCA


In [141]:
def evaluate_model(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    return mse, mae, rmse, r2

In [142]:
X = merged_df.drop(columns=['stockmarket.40.player.final_score', 'Participant'])
y = merged_df['stockmarket.40.player.final_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### RandomForestRegressor

In [143]:
from sklearn.model_selection import GridSearchCV


preprocessing_pipeline = Pipeline([
   # ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

X_train_processed = preprocessing_pipeline.fit_transform(X_train)
X_test_processed = preprocessing_pipeline.transform(X_test)

ridge_params = {'alpha': [0.1, 1.0, 10.0, 100.0]}

ridge_grid_search = GridSearchCV(Ridge(), ridge_params, cv=5, scoring='neg_mean_squared_error')
ridge_grid_search.fit(X_train_processed, y_train)

best_ridge_model = ridge_grid_search.best_estimator_

rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10]
}

rf_grid_search = GridSearchCV(RandomForestRegressor(random_state=42), rf_params, cv=5, scoring='neg_mean_squared_error')
rf_grid_search.fit(X_train_processed, y_train)

best_rf_model = rf_grid_search.best_estimator_

best_ridge_train_predictions = best_ridge_model.predict(X_train_processed)
best_ridge_test_predictions = best_ridge_model.predict(X_test_processed)

best_ridge_train_metrics = evaluate_model(y_train, best_ridge_train_predictions)
best_ridge_test_metrics = evaluate_model(y_test, best_ridge_test_predictions)

best_rf_train_predictions = best_rf_model.predict(X_train_processed)
best_rf_test_predictions = best_rf_model.predict(X_test_processed)

best_rf_train_metrics = evaluate_model(y_train, best_rf_train_predictions)
best_rf_test_metrics = evaluate_model(y_test, best_rf_test_predictions)

best_ridge_train_metrics, best_ridge_test_metrics, best_rf_train_metrics, best_rf_test_metrics

((833.9354764362889, 20.99282082595111, 28.877941000637303, 0.866128962380444),
 (14614.711031666571,
  87.84347769946348,
  120.89131909143258,
  0.008869304276499568),
 (1756.1482072134143,
  29.131594341818715,
  41.90642202829316,
  0.7180868432195261),
 (17867.0880224025,
  101.12954203882272,
  133.66782717768137,
  -0.2116982226898967))

### GradientBoostingRegressor

In [144]:
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline

# Define a pipeline with PCA and scaling
preprocessing_pipeline = Pipeline([
    #('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=30))
])

X_train_processed = preprocessing_pipeline.fit_transform(X_train)
X_test_processed = preprocessing_pipeline.transform(X_test)

lasso_params = {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}

lasso_grid_search = GridSearchCV(Lasso(), lasso_params, cv=5, scoring='neg_mean_squared_error')
lasso_grid_search.fit(X_train_processed, y_train)

best_lasso_model = lasso_grid_search.best_estimator_

gbr_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

gbr_grid_search = GridSearchCV(GradientBoostingRegressor(random_state=42), gbr_params, cv=5, scoring='neg_mean_squared_error')
gbr_grid_search.fit(X_train_processed, y_train)

best_gbr_model = gbr_grid_search.best_estimator_

def evaluate_model_cv(model, X, y):
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    cv_mse = -cv_scores.mean()
    cv_rmse = np.sqrt(cv_mse)
    return cv_mse, cv_rmse

best_lasso_cv_mse, best_lasso_cv_rmse = evaluate_model_cv(best_lasso_model, X_train_processed, y_train)

best_gbr_cv_mse, best_gbr_cv_rmse = evaluate_model_cv(best_gbr_model, X_train_processed, y_train)

best_lasso_cv_mse, best_lasso_cv_rmse, best_gbr_cv_mse, best_gbr_cv_rmse

best_lasso_test_predictions = best_lasso_model.predict(X_test_processed)
best_gbr_test_predictions = best_gbr_model.predict(X_test_processed)

best_lasso_test_metrics = evaluate_model(y_test, best_lasso_test_predictions)
best_gbr_test_metrics = evaluate_model(y_test, best_gbr_test_predictions)

best_lasso_test_metrics, best_gbr_test_metrics


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


((17799.23055704043,
  107.72555417257196,
  133.41375700069477,
  -0.20709631049959665),
 (19765.956757318912,
  104.20501921596735,
  140.59145335801503,
  -0.3404744322398088))