In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler 
from sklearn.svm import SVR

# Read the combined dataset from disk and wrap it in a DataFrame.
data = pd.read_csv('/Users/yunjuha/Desktop/ML-DXA-BIS/ML-DXA-BIS_Combined Data.csv')
df = pd.DataFrame(data)

# Transformed targets:
#   tjumppownums   = log(jumppownums + 0.1)
#   tCOMB4IMaxGrip = sqrt(COMB4IMaxGrip)
df['tjumppownums'] = np.log(df['jumppownums'].add(0.1))
df['tCOMB4IMaxGrip'] = np.sqrt(df['COMB4IMaxGrip'])

# Columns removed from the working table before any rows are filtered.
column_to_drop = [
    'M2ID', 'MIDUSID', 'SAMPLMAJ',
    'Height.cm.', 'Weight.kg.', 'Age.years.',
    'TBW.litres.', 'ECF.litres.', 'ICF.litres.',
]
df = df.drop(columns=column_to_drop)

In [12]:
# Row count before rows with missing values are removed.
print(df.shape[0])

544


In [13]:
# Keep only complete rows (the race column is still part of the table here),
# then report how many individuals remain.
df = df.dropna(axis=0, how='any')
print(df.shape[0])

422


In [14]:
# Restrict the sample to race codes 1 and 2 (white / black); every other
# race code is left out for now.
race = df['COMB1PF7A'].eq(1) | df['COMB1PF7A'].eq(2)
df = df.loc[race]

print(df.shape[0])

413


# Support Vector Regression

### Quick note: whether to fit the SVR on scaled or unscaled features depends on how we want to interpret the model (features in standardized units vs. features in their original units). Here we fit the model on the unscaled features.

In [15]:
#SVR function
def train_test_svr(df, feature_columns, target_column, test_size=0.2, random_state=42, scale=False):
    """Tune, fit and evaluate a support vector regressor for one feature set.

    The rows of ``df`` are split into train and test sets, an ``SVR`` is tuned
    on the training rows with a 5-fold cross-validated grid search over
    kernel, C and epsilon (scored by negative mean squared error), and the
    winning model is scored on both splits. All results are reported with
    ``print``; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns and the target column.
    feature_columns : list of str
        Predictor column names. A name listed more than once is used once
        (first position kept).
    target_column : str
        Column to predict. Must not also appear in ``feature_columns``.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split``.
    scale : bool, default False
        When False (the default) the SVR is fitted on the features as given.
        When True the features are standardized with ``StandardScaler``
        inside a pipeline, so the scaler is re-fitted on the training part of
        every cross-validation fold and never sees held-out rows.

    Raises
    ------
    ValueError
        If ``target_column`` is also listed in ``feature_columns``.
    """
    # Selecting the same column twice would hand the model a duplicated
    # feature, so keep only the first occurrence of every name.
    features = list(dict.fromkeys(feature_columns))
    if target_column in features:
        raise ValueError(
            f"target column {target_column!r} must not also be listed in feature_columns"
        )

    X = df[features]
    y = df[target_column]

    #splitting our data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    #our parameter grid for hyperparameter tuning
    param_grid = {
        'kernel': ['linear', 'rbf', 'poly'],
        'C': [0.1, 1, 10],
        'epsilon': [0.01, 0.1, 1]
    }

    if scale:
        # Local import: only needed for the optional scaled fit.
        from sklearn.pipeline import make_pipeline

        # make_pipeline names each step after its lowercased class name, so
        # the SVR hyperparameters are addressed as 'svr__<name>'.
        estimator = make_pipeline(StandardScaler(), SVR())
        prefix = 'svr__'
    else:
        estimator = SVR()
        prefix = ''
    search_grid = {prefix + name: values for name, values in param_grid.items()}

    #doing grid search with cross-validation to find the best hyperparameters
    grid_search = GridSearchCV(estimator=estimator, param_grid=search_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    #our best hyperparameters from the grid search results (step prefix removed for display)
    best_params = {name[len(prefix):]: value for name, value in grid_search.best_params_.items()}
    print("Best Hyperparameters:", best_params)

    # GridSearchCV refits the best configuration on the whole training set
    # (refit=True is the default), so no manual second fit is needed.
    best_model = grid_search.best_estimator_

    #evaluation of the model on the TRAINING set
    y_train_pred = best_model.predict(X_train)
    mse_train = metrics.mean_squared_error(y_train, y_train_pred)
    r2_train = metrics.r2_score(y_train, y_train_pred)
    print("\nTrain set metrics:")
    print("MSE:", mse_train)
    print("R-squared:", r2_train)

    #evaluation of the model on TESTING set
    y_test_pred = best_model.predict(X_test)
    mse_test = metrics.mean_squared_error(y_test, y_test_pred)
    r2_test = metrics.r2_score(y_test, y_test_pred)
    print("\nTest set metrics:")
    print("MSE:", mse_test)
    print("R-squared:", r2_test, "\n")

    #frequency count for race (1 = white, 2 = black) in each split
    race_column = 'COMB1PF7A'
    # The counts are read from the split feature tables, so they are only
    # available when race is one of the features.
    if race_column in X_train.columns:
        # Summing the boolean mask yields 0 for an absent group instead of
        # failing on an empty value_counts() result.
        white_train = int((X_train[race_column] == 1).sum())
        white_test = int((X_test[race_column] == 1).sum())
        black_train = int((X_train[race_column] == 2).sum())
        black_test = int((X_test[race_column] == 2).sum())

        print("White train frequency: ", white_train)
        print("White test frequency: ", white_test, "\n")

        print("Black train frequency: ", black_train)
        print("Black test frequency: ", black_test)

In [16]:
# DXA model, arms feature set -> handgrip strength (tCOMB4IMaxGrip target)
print("Support Vector Regression: DXA Model, Handgrip Strength (Arms)")
train_test_svr(
    df,
    feature_columns=[
        'COMB1PF7A',
        'COMB4DALM', 'COMB4DABM', 'COMB4DAFM',
        'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A',
        'COMB4DLR3MD',
    ],
    target_column='tCOMB4IMaxGrip',
)

Support Vector Regression: DXA Model, Handgrip Strength (Arms)


Best Hyperparameters: {'C': 1, 'epsilon': 1, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.5564231614212546
R-squared: 0.46210640260541413

Test set metrics:
MSE: 0.5446845614069277
R-squared: 0.42978720281577854 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21


In [17]:
# DXA model, total-body feature set -> handgrip strength (tCOMB4IMaxGrip target)

print("Support Vector Regression: DXA Model, Handgrip Strength (TB)")
train_test_svr(
    df,
    feature_columns=[
        'COMB1PF7A',
        'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM',
        'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A',
        'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD',
    ],
    target_column='tCOMB4IMaxGrip',
)

Support Vector Regression: DXA Model, Handgrip Strength (TB)
Best Hyperparameters: {'C': 10, 'epsilon': 0.1, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.7314238755987985
R-squared: 0.292933423797086

Test set metrics:
MSE: 0.6863988998039168
R-squared: 0.2814310072780676 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21


In [18]:
# DXA model, legs feature set -> jump power (tjumppownums target)

print("Support Vector Regression: DXA Model, Jump Power (Legs)")
train_test_svr(
    df,
    feature_columns=[
        'COMB1PF7A',
        'COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM',
        'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A',
        'COMB4DLFNMD',
    ],
    target_column='tjumppownums',
)

Support Vector Regression: DXA Model, Jump Power (Legs)
Best Hyperparameters: {'C': 10, 'epsilon': 0.01, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.07349718504368476
R-squared: 0.42904530547475506

Test set metrics:
MSE: 0.08350454110071258
R-squared: 0.3875751994651504 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21


In [19]:
# DXA model, total-body feature set -> jump power (tjumppownums target)

print("Support Vector Regression: DXA Model, Jump Power (TB)")
train_test_svr(
    df,
    feature_columns=[
        'COMB1PF7A',
        'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM',
        'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A',
        'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD',
    ],
    target_column='tjumppownums',
)

Support Vector Regression: DXA Model, Jump Power (TB)
Best Hyperparameters: {'C': 10, 'epsilon': 0.1, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.07802851905928385
R-squared: 0.3938441419590304

Test set metrics:
MSE: 0.08452207028454224
R-squared: 0.3801126100151504 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21


In [20]:
# BIS model -> handgrip strength (tCOMB4IMaxGrip target)

print("Support Vector Regression: BIS Model, Handgrip Strength")
train_test_svr(
    df,
    feature_columns=[
        'COMB1PF7A',
        'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM',
        'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC',
        'COMB4IFCHAR', 'COMB4IMCAP',
        'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A',
    ],
    target_column='tCOMB4IMaxGrip',
)

Support Vector Regression: BIS Model, Handgrip Strength
Best Hyperparameters: {'C': 10, 'epsilon': 0.1, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.820340724221435
R-squared: 0.20697761373976442

Test set metrics:
MSE: 0.8460022338407766
R-squared: 0.11434739597465393 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21


In [21]:
# BIS model -> jump power (tjumppownums target)

print("Support Vector Regression: BIS Model, Jump Power")
train_test_svr(
    df,
    feature_columns=[
        'COMB1PF7A',
        'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM',
        'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC',
        'COMB4IFCHAR', 'COMB4IMCAP',
        'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A',
    ],
    target_column='tjumppownums',
)

Support Vector Regression: BIS Model, Jump Power
Best Hyperparameters: {'C': 10, 'epsilon': 0.01, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.07820887352559382
R-squared: 0.3924430783787439

Test set metrics:
MSE: 0.09125113611159222
R-squared: 0.33076144009558084 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21


In [22]:
# Combined model, arms feature set -> handgrip strength (tCOMB4IMaxGrip target)

print("Support Vector Regression: Combo Models, Handgrip Strength (Arms)")
train_test_svr(
    df,
    feature_columns=[
        # predictors of the DXA arms cell
        'COMB1PF7A',
        'COMB4DALM', 'COMB4DABM', 'COMB4DAFM',
        'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A',
        'COMB4DLR3MD',
        # predictors of the BIS cell that are not already listed
        'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM',
        'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC',
        'COMB4IFCHAR', 'COMB4IMCAP',
    ],
    target_column='tCOMB4IMaxGrip',
)

Support Vector Regression: Combo Models, Handgrip Strength (Arms)
Best Hyperparameters: {'C': 10, 'epsilon': 0.01, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.5581385175822817
R-squared: 0.4604481698785249

Test set metrics:
MSE: 0.6043958421202077
R-squared: 0.3672773782100932 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21


In [23]:
#COMBO: Handgrip Strength (Total Body)

print("Support Vector Regression: Combo Models, Handgrip Strength (TB)")
# 'COMB4DTBFM' belongs to both the DXA total-body list and the BIS list; it is
# listed once here so the model is not fitted with a duplicated column.
train_test_svr(
    df,
    [
        'COMB1PF7A',
        'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM',
        'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A',
        'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD',
        'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM',
        'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC',
        'COMB4IFCHAR', 'COMB4IMCAP',
    ],
    'tCOMB4IMaxGrip',
)

Support Vector Regression: Combo Models, Handgrip Strength (TB)


In [None]:
# Combined model, legs feature set -> jump power (tjumppownums target)

print("Support Vector Regression: Combo Models, Jump Power (Legs)")
train_test_svr(
    df,
    feature_columns=[
        # predictors of the DXA legs cell
        'COMB1PF7A',
        'COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM',
        'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A',
        'COMB4DLFNMD',
        # predictors of the BIS cell that are not already listed
        'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM',
        'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC',
        'COMB4IFCHAR', 'COMB4IMCAP',
    ],
    target_column='tjumppownums',
)

In [None]:
#COMBO: Jump Power (TOTAL BODY)

print("Support Vector Regression: Combo Models, Jump Power (TB)")
# 'COMB4DTBFM' belongs to both the DXA total-body list and the BIS list; it is
# listed once here so the model is not fitted with a duplicated column.
train_test_svr(
    df,
    [
        'COMB1PF7A',
        'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM',
        'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A',
        'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD',
        'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM',
        'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC',
        'COMB4IFCHAR', 'COMB4IMCAP',
    ],
    'tjumppownums',
)