In [11]:
import os
import glob
import pandas as pd
import numpy as np 
import pingouin as pg 
import matplotlib.pyplot as plt
import warnings
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import r2_score, mean_squared_error as MSE
from sklearn.preprocessing import MinMaxScaler, StandardScaler 




Model

RandomForest Regression  
Decision Tree Regression  
KNN Regression  
SVR  
Linear Regression (LR)  
Lasso Regression  
Ridge Regression  
Logistic Regression  


In [17]:

# Define the models.
# RandomForest, GradientBoosting and DecisionTree (whose grid includes
# splitter='random') draw random numbers while fitting. Without a fixed seed every
# re-run of the grid searches can report different "best" parameters and scores.
RANDOM_STATE = 42  # same value the train/test splits in this notebook use

models = {
    'RandomForest': RandomForestRegressor(random_state=RANDOM_STATE),
    'GradientBoosting': GradientBoostingRegressor(random_state=RANDOM_STATE),
    'DecisionTree': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'KNN': KNeighborsRegressor(),
    'SVR': SVR(),
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'GaussianProcess': GaussianProcessRegressor()
}

# Hyper-parameter grids for the grid search, keyed by the same names as `models`.
# An empty grid means the model is evaluated with its default settings only.
def _tree_limits():
    """Return a fresh copy of the depth/split/leaf limits shared by the tree models."""
    return dict(
        max_depth=[None, 5, 10, 20],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    )


params = dict(
    RandomForest=dict(n_estimators=[10, 50, 100, 200], **_tree_limits()),
    DecisionTree=dict(splitter=['best', 'random'], **_tree_limits()),
    GradientBoosting=dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 0.5],
        max_depth=[3, 5, 7],
        subsample=[0.5, 0.8, 1.0],
    ),
    KNN=dict(n_neighbors=[3, 5, 7, 10]),
    SVR=dict(
        C=[0.1, 1, 10, 100],
        kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    ),
    GaussianProcess={},
    LinearRegression={},
    Lasso=dict(alpha=[0.1, 1, 10, 100]),
    Ridge=dict(alpha=[0.1, 1, 10, 100]),
)

In [18]:
# The averaged CSV/NPY inputs live in an "AVG" folder inside the current working directory.
curr_dir = os.getcwd()
avg_dir = f"{curr_dir}/AVG"

# Ratings table as a plain NumPy array; the first CSV column is consumed as the index.
rating_path = f"{avg_dir}/Dynamic_Rating.csv"
rating_frame = pd.read_csv(rating_path, index_col=0)
rating_data = rating_frame.values
rating_data.shape

(52, 16)

In [32]:
# Collect the feature tables in AVG: skip hidden entries, the ratings table and .npy
# files, and process the remaining files in sorted name order.
excluded_name = 'Dynamic_Rating.csv'
data_list = sorted(
    name
    for name in os.listdir(avg_dir)
    if not (name.startswith(".") or name == excluded_name or name.endswith(".npy"))
)

# One array per file, keyed by the file name up to its first ".".
avg_data = {}
for file_name in data_list:
    key = str(file_name).split(sep=".")[0]
    avg_data[key] = pd.read_csv(f"{avg_dir}/{file_name}", index_col=0).values

# Average each table across its columns (one value per row; 52 rows in the recorded
# run, presumably one per participant), then stack the results as columns.
# Flattening each table instead of averaging was the earlier alternative.
avg_arr = np.array([table.mean(axis=1) for table in avg_data.values()]).T

print("avg_data", avg_arr.shape, sep="\n")

avg_data
(52, 8)


In [33]:
# Dynamic rating DataFrame: the averaged feature columns followed by the row-wise
# mean of the ratings table as the last column (DR).
# (Appending the flattened ratings instead of their mean was the earlier alternative.)
rating_mean = rating_data.mean(axis=1)[:, np.newaxis]
arr = np.concatenate((avg_arr, rating_mean), axis=1)
print(f"arr shape :  {arr.shape}")

# NOTE(review): names are assigned by position, while the columns of avg_arr follow
# the sorted file order of the loading cell - confirm the two orders agree.
names = "BK FD SCN SCD LIT LPD RIT RPD DR".split()
df = pd.DataFrame(data=arr, columns=names)
print(f"Dataframe shape : {df.shape}")

arr shape :  (52, 9)
Dataframe shape : (52, 9)


In [34]:
# Single-feature experiment: predict the dynamic rating (DR) from BK alone.
X = df[['BK']]
y = df['DR']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tune every candidate with 5-fold CV on the training split, then score the refitted
# winner on the held-out split. "Best score" is the mean CV score of the winner;
# R2 and MSE are computed on the test split.
for model_name, model in models.items():
    grid_search = GridSearchCV(estimator=model, param_grid=params[model_name], cv=5, refit=True)
    grid_search.fit(X_train, y_train)

    pred = grid_search.best_estimator_.predict(X_test)
    r2, mse = r2_score(y_test, pred), MSE(y_test, pred)

    report = (
        f"Best parameters for {model_name}: {grid_search.best_params_}",
        f"Best R2 for {model_name}: {r2:.2f}",
        f"Best MSE for {model_name}: {mse:.2f}",
        f"Best score for {model_name}: {grid_search.best_score_:.2f}\n",
    )
    print(*report, sep="\n")

Best parameters for RandomForest: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 10}
Best R2 for RandomForest: -0.82
Best MSE for RandomForest: 1.00
Best score for RandomForest: 0.20

Best parameters for GradientBoosting: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.5}
Best R2 for GradientBoosting: -0.74
Best MSE for GradientBoosting: 0.96
Best score for GradientBoosting: 0.16

Best parameters for DecisionTree: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'splitter': 'random'}
Best R2 for DecisionTree: -0.67
Best MSE for DecisionTree: 0.92
Best score for DecisionTree: 0.14

Best parameters for KNN: {'n_neighbors': 5}
Best R2 for KNN: -0.52
Best MSE for KNN: 0.84
Best score for KNN: 0.12

Best parameters for SVR: {'C': 1, 'kernel': 'rbf'}
Best R2 for SVR: -0.02
Best MSE for SVR: 0.56
Best score for SVR: 0.06

Best parameters for LinearRegression: {}
Best R2 for LinearRegression: 0.06
Best MSE for Linea

In [44]:
# Baseline: fit every estimator with default hyper-parameters on the BK -> DR split
# and report R2 / MSE on the held-out 20 %.
# The three tree-based estimators draw random numbers while fitting, so they get a
# fixed seed; without it the printed metrics change from run to run.
RF = RandomForestRegressor(random_state=42)
Tree = DecisionTreeRegressor(random_state=42)
GB = GradientBoostingRegressor(random_state=42)
knn = KNeighborsRegressor()
svr = SVR()
# NOTE(review): the recorded run shows an R2 of about -21412 for this model -
# presumably the default kernel / unnormalised target is a poor fit for this data;
# consider normalize_y=True or an explicit kernel - TODO confirm.
GP = GaussianProcessRegressor()
LR = LinearRegression()
lasso = Lasso()
ridge = Ridge()

MinMax = MinMaxScaler()
STD = StandardScaler()

estimators = [ RF, Tree, GB, knn, svr, GP, LR, lasso, ridge ]

X, y = df[['BK']], df['DR']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optional feature scaling (disabled). Fit the scaler on the training split only and
# reuse it for the test split; calling fit_transform on X_test would rescale the test
# data with its own statistics instead of the training ones.
# X_train = MinMax.fit_transform(X_train)
# X_test = MinMax.transform(X_test)
# X_train = STD.fit_transform(X_train)
# X_test = STD.transform(X_test)

for estimator in estimators :
    estimator.fit(X_train, y_train)
    pred = estimator.predict(X_test)
    r2 = r2_score(y_test, pred)
    mse = MSE(y_test,pred)

    print(f"[{estimator}]")
    print(f"R2 : {r2:.2f}")
    print(f"MSE : {mse:.2f}")
    print("-"*50 + "\n")


[RandomForestRegressor()]
R2 : -1.43
MSE : 1.33
--------------------------------------------------

[DecisionTreeRegressor()]
R2 : -2.22
MSE : 1.77
--------------------------------------------------

[GradientBoostingRegressor()]
R2 : -1.77
MSE : 1.52
--------------------------------------------------

[KNeighborsRegressor()]
R2 : -0.52
MSE : 0.84
--------------------------------------------------

[SVR()]
R2 : -0.02
MSE : 0.56
--------------------------------------------------

[GaussianProcessRegressor()]
R2 : -21412.55
MSE : 11760.34
--------------------------------------------------

[LinearRegression()]
R2 : 0.06
MSE : 0.52
--------------------------------------------------

[Lasso()]
R2 : 0.06
MSE : 0.52
--------------------------------------------------

[Ridge()]
R2 : 0.06
MSE : 0.52
--------------------------------------------------



In [None]:
# Full-feature experiment: all eight averaged measures as predictors of DR.
feature_cols = ['BK', 'FD', 'SCN','SCD', 'LIT', 'LPD', 'RIT', 'RPD']
X = df[feature_cols]
y = df['DR']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5-fold grid search per model on the training split; only the winning parameters and
# their mean cross-validated R2 are reported (the test split is not scored here).
for model_name, model in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=params[model_name],
        scoring='r2',
        cv=5,
        refit=True,
    )
    grid_search.fit(X_train, y_train)
    print(
        f"Best parameters for {model_name}: {grid_search.best_params_}",
        f"Best score for {model_name}: {grid_search.best_score_:.2f}\n",
        sep="\n",
    )

-----

In [None]:
# Rebuild the feature matrix from the files in AVG, visiting them in sorted name
# order and skipping hidden entries, .npy files and the ratings table.
data_list = []
for entry in sorted(os.listdir(avg_dir)):
    if entry.startswith(".") or entry.endswith(".npy") or entry == 'Dynamic_Rating.csv':
        continue
    data_list.append(entry)

# One array per file, keyed by the file name up to its first ".".
avg_data = {}
for entry in data_list:
    stem = str(entry).split(sep=".")[0]
    avg_data[stem] = pd.read_csv(f"{avg_dir}/{entry}", index_col=0).values

# Average each table across its columns (one value per row), one output column per file.
row_means = [table.mean(axis=1) for table in avg_data.values()]
avg_arr = np.array(row_means).T

print("Avg_data", avg_arr.shape, sep="\n")

In [None]:
# Dynamic rating DataFrame.
# The ratings are averaged across columns (one value per row) and appended as the
# last column next to the averaged features.
rating_column = rating_data.mean(axis=1)[:, np.newaxis]
arr = np.concatenate((avg_arr, rating_column), axis=1)
print(f"arr shape :  {arr.shape}")

# NOTE(review): names are assigned by position, while the columns of avg_arr follow
# the sorted file order of the loading cell - confirm the two orders agree.
names = "BK FD SCN SCD LIT LPD RIT RPD DR".split()
df = pd.DataFrame(data=arr, columns=names)
print(f"Dataframe shape : {df.shape}")

In [None]:
# SSQ data: labels of the SSQ items, used as headings when iterating over the first
# axis of ssq_arr.
items = [
    'Bleary',
    'Dry_Eyed',
    'Eyestrain',
    'Gritty',
    'Eye_Ache',
    'Sting',
    'Heavy_Eyes',
    'Hazy',
    'Warm_Eyes',
    'Flickering',
    'Watery_Eyes',
    'Feeling_heavy_in_the_head',
    'Feel_heavy',
    'Difficulty_concentrating',
    'Dizzy',
    'Stiff_shoulder',
    'Stiff_neck',
    'Sleepy',
    'Vomiiting',
    'Vertigo',
    'Nausea',
    'Difficulty_focussing',
    'Double_vision',
    'Near_vision_difficulty',
    'Far_vision_difficulty',
    'Pain_in_the_temple',
    'Pain_in_the_middle_of_the_head',
    'Pain_in_the_back_of_the_head',
]

# Load the SSQ array and drop index 0 along its last axis.
ssq_path = f"{avg_dir}/ssq.npy"
ssq_arr = np.load(ssq_path)[..., 1:]

ssq_arr.shape

In [None]:
# Collapse axes 0 and 2 of ssq_arr so one value remains per entry of axis 1; the
# length of that axis has to match the number of rows in df.
ssq_per_row = ssq_arr.mean(axis=(0, 2))
df["SSQ"] = ssq_per_row

In [None]:
# Same grid search as the full-feature experiment, with the SSQ column added as a
# ninth predictor of DR.
predictor_cols = ['BK', 'FD', 'SCN','SCD', 'LIT', 'LPD', 'RIT', 'RPD', 'SSQ']
X = df[predictor_cols]
y = df['DR']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5-fold grid search per model; report the winning parameters and their mean CV R2.
for model_name, model in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=params[model_name],
        scoring='r2',
        cv=5,
        refit=True,
    )
    grid_search.fit(X_train, y_train)
    print(
        f"Best parameters for {model_name}: {grid_search.best_params_}",
        f"Best score for {model_name}: {grid_search.best_score_:.2f}\n",
        sep="\n",
    )

----


In [None]:
# Per-item experiment: each slice along the first axis of ssq_arr is averaged over
# its last axis and used as the regression target for the eight averaged features.
for ind, ssq in enumerate(ssq_arr):
    banner = "=" * 50
    print(f"[{items[ind]}]", banner, sep="\n")

    X = df[['BK', 'FD', 'SCN','SCD', 'LIT', 'LPD', 'RIT', 'RPD']]
    y = ssq.mean(axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 5-fold grid search per model; report the winning parameters and their mean CV R2.
    for model_name, model in models.items():
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=params[model_name],
            scoring='r2',
            cv=5,
            refit=True,
        )
        grid_search.fit(X_train, y_train)
        print(
            f"Best parameters for {model_name}: {grid_search.best_params_}",
            f"Best score for {model_name}: {grid_search.best_score_:.2f}\n",
            sep="\n",
        )

    print(banner)
    print("\n")