In [1]:
import numpy as np
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns 
import pingouin as pg
import scipy.stats as stats

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import r2_score, mean_squared_error as MSE
from sklearn.preprocessing import MinMaxScaler, StandardScaler 

In [3]:
def get_data_dir(data_name : str) :
    curr_dir = os.getcwd()
    data_dir = curr_dir + "/" + data_name

    return data_dir

def get_data_dict( data_name ) :
    data_dir = get_data_dir(data_name)
    parti_list = os.listdir(data_dir)
    parti_list = [ x for x in parti_list if not x.startswith(".")]
    parti_dict = { x.split(sep='.')[0] : np.load(data_dir + "/" + x) if x.endswith(".npy") \
                  else pd.read_csv(data_dir + "/" + x, index_col=0 ) for x in parti_list}

    return parti_dict

def dict_sort( data_dict ) : 
    sorted_dict = dict(sorted(data_dict.items()))

    return sorted_dict

def key_str2int( data_dict ) : 
    data_dict = { int(key.split(sep='_')[0]) : value for key, value in data_dict.items() }

    return data_dict
    

def get_data_arr( data_dict) :
    
    data_dict = dict_sort(data_dict)

    arr = [ value for key, value in data_dict.items() ]

    arr = np.array(arr)

    return arr

def sliding_wind_mean( data_dict, wind_size = 30, video_len = 120 ) :

    window_means = []

    if wind_size != video_len :
        
        for data in data_dict :
            data_len = len(data_dict[data])
            frame_in_1s = max(30, round(data_len/video_len)) # frames in 1-sec could be 30 of 60 
            wind_frames = frame_in_1s*wind_size

            window_mean = []
            for ind in range(0,video_len-wind_size+1):
                if ind*frame_in_1s + wind_frames <= data_len :
                    window_mean.append(data_dict[data][ind*frame_in_1s:ind*frame_in_1s+wind_frames].mean(axis=0))
                else : window_mean.append(data_dict[data][ind*frame_in_1s:].mean(axis=0))
            window_means.append(np.array(window_mean))
    
    else: 
        for data in data_dict:
            window_means.append(data_dict[data].mean(0))
        

    return np.array(window_means)


In [4]:
window_size = 30

In [5]:
rating_dict = get_data_dict('Dynamic_rating')
feature_dict = get_data_dict('Feature')
v_dict = get_data_dict('HSV')

# Change name of vidoes to int for sorting 
feature_dict = key_str2int(feature_dict)
v_dict = key_str2int(v_dict)

for key, value in feature_dict.items() : 
    feature_dict[key] = value[:3600]

rating_arr = get_data_arr(rating_dict)
feature_arr = get_data_arr(feature_dict)
v_arr = get_data_arr(v_dict)

print(feature_arr.shape)
print(v_arr.shape)
print(rating_arr.shape)

(20, 3600)
(20, 3600, 256, 256)
(30, 7150, 20)


In [6]:
feature_wind = sliding_wind_mean(feature_dict,window_size)
v_wind = sliding_wind_mean(v_dict,window_size)
rating_wind = sliding_wind_mean(rating_dict, wind_size=window_size).mean(axis=0)

print(feature_wind.shape)
print(v_wind.shape)
print(rating_wind.shape)

(20, 91)
(20, 91, 256, 256)
(91, 20)


In [7]:
v_wind = v_wind.mean(axis=(2,3))
rating_wind = rating_wind.T

print(feature_wind.shape)
print(v_wind.shape)
print(rating_wind.shape)

(20, 91)
(20, 91)
(20, 91)


In [8]:
models = {
    'RandomForest': RandomForestRegressor(),
    'GradientBoosting': GradientBoostingRegressor(),
    'DecisionTree': DecisionTreeRegressor(),
    'KNN': KNeighborsRegressor(),
    'SVR': SVR(),
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'GaussianProcess': GaussianProcessRegressor()
}

params = {
    'RandomForest': {'n_estimators': [10, 50, 100, 200],
                     'max_depth': [None, 5, 10, 20],
                     'min_samples_split': [2, 5, 10],
                     'min_samples_leaf': [1, 2, 4],},

    'DecisionTree': {'max_depth': [None, 5, 10, 20],
                     'min_samples_split': [2, 5, 10],
                     'min_samples_leaf': [1, 2, 4]},

    'GradientBoosting': {'n_estimators': [50, 100, 200],
                         'learning_rate': [0.01, 0.1, 0.5],
                         'max_depth': [3, 5, 7],
                         'subsample': [0.5, 0.8, 1.0]},


    'KNN': {'n_neighbors': [3, 5, 7, 10],},

    'SVR': {'C': [0.1, 1, 10, 100],
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},

    'GaussianProcess': {},

    'LinearRegression': {},

    'Lasso': {'alpha': [0.1, 1, 10, 100]},

    'Ridge': {'alpha': [0.1, 1, 10, 100]}
}

In [11]:
# 영상별 평균을 이용한 Model

feature = feature_arr.mean(axis=1)
v = v_arr.mean(axis=(1,2,3))
rating = rating_arr.mean(axis=(0,1))

# Load the dataset
X, y = np.array([feature.flatten(), v.flatten()]).T, rating.flatten()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform GridSearch for each model
for model_name, model in models.items():
    grid_search = GridSearchCV(model, params[model_name], cv=5, refit=True )
    grid_search.fit(X_train, y_train)
    pred = grid_search.best_estimator_.predict(X_test)
    r2 = r2_score(y_test, pred)
    mse = MSE(y_test, pred)
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best R2 for {model_name}: {r2:.2f}")
    print(f"Best MSE for {model_name}: {mse:.2f}")
    print(f"Best score for {model_name}: {grid_search.best_score_:.2f}\n")

Best parameters for RandomForest: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 10}
Best R2 for RandomForest: 0.94
Best MSE for RandomForest: 0.02
Best score for RandomForest: 0.65

Best parameters for GradientBoosting: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.5}
Best R2 for GradientBoosting: 0.97
Best MSE for GradientBoosting: 0.01
Best score for GradientBoosting: 0.52

Best parameters for DecisionTree: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'splitter': 'random'}
Best R2 for DecisionTree: 0.89
Best MSE for DecisionTree: 0.04
Best score for DecisionTree: 0.48

Best parameters for KNN: {'n_neighbors': 3}
Best R2 for KNN: 0.95
Best MSE for KNN: 0.02
Best score for KNN: 0.61

Best parameters for SVR: {'C': 1, 'kernel': 'rbf'}
Best R2 for SVR: 0.91
Best MSE for SVR: 0.03
Best score for SVR: 0.51

Best parameters for LinearRegression: {}
Best R2 for LinearRegression: 0.22
Best MSE for LinearRegres

In [12]:
# 30초 구간별 평균 

feature = feature_arr.reshape(-1, 4, 900).mean(axis=-1)
v = v_arr.mean(axis=(2,3)).reshape(-1, 4, 900).mean(axis=-1)

rating = rating_arr.mean(axis=0).T
rating = np.array([ rating[:,x:x+1800].mean(axis=1) if x + 1800 < rating.shape[1]  else rating[:,x:].mean(axis=1) for x in range(0, rating.shape[1], 1800)])

# Load the dataset
X, y = np.array([feature.flatten(), v.flatten()]).T, rating.flatten()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform GridSearch for each model
for model_name, model in models.items():
    grid_search = GridSearchCV(model, params[model_name], cv=5, refit=True )
    grid_search.fit(X_train, y_train)
    pred = grid_search.best_estimator_.predict(X_test)
    r2 = r2_score(y_test, pred)
    mse = MSE(y_test, pred)
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best R2 for {model_name}: {r2:.2f}")
    print(f"Best MSE for {model_name}: {mse:.2f}")
    print(f"Best score for {model_name}: {grid_search.best_score_:.2f}\n")

Best parameters for RandomForest: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best R2 for RandomForest: 0.10
Best MSE for RandomForest: 0.32
Best score for RandomForest: 0.29

Best parameters for GradientBoosting: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'subsample': 1.0}
Best R2 for GradientBoosting: 0.08
Best MSE for GradientBoosting: 0.32
Best score for GradientBoosting: 0.22

Best parameters for DecisionTree: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'splitter': 'random'}
Best R2 for DecisionTree: -0.69
Best MSE for DecisionTree: 0.59
Best score for DecisionTree: 0.28

Best parameters for KNN: {'n_neighbors': 7}
Best R2 for KNN: -0.20
Best MSE for KNN: 0.42
Best score for KNN: -0.02

Best parameters for SVR: {'C': 0.1, 'kernel': 'rbf'}
Best R2 for SVR: -0.10
Best MSE for SVR: 0.39
Best score for SVR: -0.07

Best parameters for LinearRegression: {}
Best R2 for LinearRegression: 0.05
Best MSE for Line

In [13]:
feature_wind_1s = sliding_wind_mean(feature_dict,wind_size=1)
v_wind_1s = sliding_wind_mean(v_dict,wind_size=1).mean(axis=(2,3))
rating_wind_1s = sliding_wind_mean(rating_dict, wind_size=1).mean(axis=0).T

print(feature_wind_1s.shape)
print(v_wind_1s.shape)
print(rating_wind_1s.shape)

(20, 120)
(20, 120)
(20, 120)


In [14]:
# 1s Window별 평균을 이용한 Model

# Load the dataset
X, y = np.array([feature_wind_1s.flatten(), v_wind_1s.flatten()]).T, rating_wind_1s.flatten()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform GridSearch for each model
for model_name, model in models.items():
    grid_search = GridSearchCV(model, params[model_name], cv=5, refit=True )
    grid_search.fit(X_train, y_train)
    pred = grid_search.best_estimator_.predict(X_test)
    r2 = r2_score(y_test, pred)
    mse = MSE(y_test, pred)
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best R2 for {model_name}: {r2:.2f}")
    print(f"Best MSE for {model_name}: {mse:.2f}")
    print(f"Best score for {model_name}: {grid_search.best_score_:.2f}\n")

Best parameters for RandomForest: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Best R2 for RandomForest: 0.51
Best MSE for RandomForest: 0.23
Best score for RandomForest: 0.49

Best parameters for GradientBoosting: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.5}
Best R2 for GradientBoosting: 0.48
Best MSE for GradientBoosting: 0.25
Best score for GradientBoosting: 0.50

Best parameters for DecisionTree: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'splitter': 'random'}
Best R2 for DecisionTree: 0.49
Best MSE for DecisionTree: 0.24
Best score for DecisionTree: 0.46

Best parameters for KNN: {'n_neighbors': 10}
Best R2 for KNN: 0.45
Best MSE for KNN: 0.26
Best score for KNN: 0.48

Best parameters for SVR: {'C': 100, 'kernel': 'rbf'}
Best R2 for SVR: 0.37
Best MSE for SVR: 0.30
Best score for SVR: 0.39

Best parameters for LinearRegression: {}
Best R2 for LinearRegression: 0.27
Best MSE for Linear

In [15]:
# 30s Window별 평균을 이용한 Model

# Load the dataset
X, y = np.array([feature_wind.flatten(), v_wind.flatten()]).T, rating_wind.flatten()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform GridSearch for each model
for model_name, model in models.items():
    grid_search = GridSearchCV(model, params[model_name], cv=5, refit=True )
    grid_search.fit(X_train, y_train)
    pred = grid_search.best_estimator_.predict(X_test)
    r2 = r2_score(y_test, pred)
    mse = MSE(y_test, pred)
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best R2 for {model_name}: {r2:.2f}")
    print(f"Best MSE for {model_name}: {mse:.2f}")
    print(f"Best score for {model_name}: {grid_search.best_score_:.2f}\n")

Best parameters for RandomForest: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best R2 for RandomForest: 0.88
Best MSE for RandomForest: 0.06
Best score for RandomForest: 0.85

Best parameters for GradientBoosting: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.8}
Best R2 for GradientBoosting: 0.87
Best MSE for GradientBoosting: 0.07
Best score for GradientBoosting: 0.83

Best parameters for DecisionTree: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'splitter': 'random'}
Best R2 for DecisionTree: 0.87
Best MSE for DecisionTree: 0.07
Best score for DecisionTree: 0.83

Best parameters for KNN: {'n_neighbors': 5}
Best R2 for KNN: 0.80
Best MSE for KNN: 0.11
Best score for KNN: 0.81

Best parameters for SVR: {'C': 100, 'kernel': 'rbf'}
Best R2 for SVR: 0.60
Best MSE for SVR: 0.21
Best score for SVR: 0.58

Best parameters for LinearRegression: {}
Best R2 for LinearRegression: 0.38
Best MSE for LinearRe