# Regression analysis and training

In [1]:
import glob
import os
import os.path
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import logging
from sklearn.preprocessing import StandardScaler
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models.annotations import Title
from maweight import model_selection
from config import elastix_params
import pickle
output_notebook()
# for getting the equation attributes
from sklearn.linear_model import Ridge, Lasso
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import math

from config import leg_features_path, loin_features_path, belly_features_path, groin_features_path
from config import shoulder_features_path, body_features_path
from config import xls_path, path_prefix_results

import warnings
warnings.filterwarnings('ignore')

# setting the logging format
# Only standard LogRecord attributes are referenced here: custom fields such as
# a client ip or user name would have to be passed through ``extra=`` on every
# logging call, and any record lacking them fails to format.
FORMAT = '%(asctime)-15s %(message)s'
logging.basicConfig(format=FORMAT, level=logging.INFO)

Executables being used: /opt/elastix-5.1.0-linux/bin/elastix /opt/elastix-5.1.0-linux/bin/transformix


## Constructing the training dataframe

In [2]:
# Load the extracted feature tables, one CSV file per region
# (leg, loin, belly, groin, shoulder, body), in that order.
(leg_features,
 loin_features,
 belly_features,
 groin_features,
 shoulder_features,
 body_features) = (pd.read_csv(csv_path) for csv_path in (leg_features_path,
                                                          loin_features_path,
                                                          belly_features_path,
                                                          groin_features_path,
                                                          shoulder_features_path,
                                                          body_features_path))

# Determining the ids and positions
def _scan_id(path):
    """Return the file name of *path* without its extension.

    A trailing ".nii.gz" is removed as a whole; for any other name the last
    four characters are cut off.
    """
    file_name = path.split(os.sep)[-1]
    if file_name.endswith(".nii.gz"):
        return file_name[:-7]
    return file_name[:-4]


for _feature_table in (leg_features, loin_features, belly_features,
                       groin_features, shoulder_features, body_features):
    _feature_table['id'] = _feature_table['filename'].apply(_scan_id)

# Reading the XLS data
xls_data = pd.read_excel(xls_path, engine='openpyxl')

# Extracting the ground truth data: the id is stringified and zero-padded to
# three characters, every region column is read as float.
_ground_truth = {'ct_num': xls_data['id'].astype(str).str.zfill(3)}
for _region, _xls_column in (('leg', 'HAM'),
                             ('loin', 'LOIN'),
                             ('belly', 'BELLY'),
                             ('groin', 'VENTERAL_PART_BELLY'),
                             ('shoulder', 'SHOULDER'),
                             ('body', 'L_CW')):
    _ground_truth[_region] = xls_data[_xls_column].astype(float)
target = pd.DataFrame(data=_ground_truth)

# The zero-padded number doubles as the join key used for the feature tables.
target['id'] = target['ct_num']
target = target.sort_values('id').reset_index()

# Merging each region's features with its ground truth column on the shared
# 'id' key; rows containing any missing value are dropped afterwards.
leg_target = target[['id', 'leg']]
leg_data = leg_features.merge(leg_target, how='inner', on='id').dropna()

loin_target = target[['id', 'loin']]
loin_data = loin_features.merge(loin_target, how='inner', on='id').dropna()

belly_target = target[['id', 'belly']]
belly_data = belly_features.merge(belly_target, how='inner', on='id').dropna()

groin_target = target[['id', 'groin']]
groin_data = groin_features.merge(groin_target, how='inner', on='id').dropna()

shoulder_target = target[['id', 'shoulder']]
shoulder_data = shoulder_features.merge(shoulder_target, how='inner', on='id').dropna()

body_target = target[['id', 'body']]
body_data = body_features.merge(body_target, how='inner', on='id').dropna()

In [3]:
# Split every merged table into the target series and the pure feature matrix
# (bookkeeping columns and the target itself are removed from the features).
leg_target = leg_data['leg']
leg_features = leg_data.drop(columns=['filename', 'id', 'leg'])

loin_target = loin_data['loin']
loin_features = loin_data.drop(columns=['filename', 'id', 'loin'])

belly_target = belly_data['belly']
belly_features = belly_data.drop(columns=['filename', 'id', 'belly'])

groin_target = groin_data['groin']
groin_features = groin_data.drop(columns=['filename', 'id', 'groin'])

shoulder_target = shoulder_data['shoulder']
shoulder_features = shoulder_data.drop(columns=['filename', 'id', 'shoulder'])

body_target = body_data['body']
body_features = body_data.drop(columns=['filename', 'id', 'body'])

# encoding the positions
#skull_features['pos']= skull_features['pos'].apply(lambda x: {'a': 0, 'k': 1, 'f': 2}[x])
#head_features['pos']= head_features['pos'].apply(lambda x: {'a': 0, 'k': 1, 'f': 2}[x])

### get equation attributes

In [4]:
def rr_rmse(model, features, target):
    """Fit *model* and score it on the data it was fitted on.

    Args:
        model: estimator exposing ``fit(features, target)`` and ``predict``.
        features: feature matrix used for both fitting and prediction.
        target: target values.

    Returns:
        Tuple ``(fitted_model, r2, rmse)`` where ``fitted_model`` is whatever
        ``model.fit`` returned and both scores compare *target* with the
        predictions for *features*.
    """
    fitted = model.fit(features, target)
    predicted = fitted.predict(features)
    r_squared = r2_score(target, predicted)
    root_mse = math.sqrt(mean_squared_error(target, predicted))
    return fitted, r_squared, root_mse

def get_equation_params(df_results, df_features, targets):
    """Refit the linear-type models of a result table and attach their equations.

    For every row of ``df_results`` whose ``'model'`` is the linear, lasso,
    ridge or PLS objective, the model is refitted on the row's selected feature
    columns (names containing ``'Unnamed'`` are ignored) and two columns are
    filled in:

    * ``'equation_attributes'`` -- a one-element list holding a dict with the
      fitted ``'coef'`` and ``'intercept'``, the ``'r2'`` and ``'rmse'``
      measured on the fitting data and, for lasso/ridge/PLS, the
      hyper-parameter under its original parameter name;
    * ``'equation'`` -- the fitted linear equation as a readable string.

    Rows of any other model (e.g. ``'KNNR_Objective'``) keep NaN in both
    columns.

    Args:
        df_results: DataFrame with the columns ``'model'``, ``'parameters'``
            and ``'features'``; it is modified in place.
        df_features: DataFrame containing every column named in ``'features'``.
        targets: target values passed to the model's ``fit``.

    Returns:
        ``df_results`` itself, with the two columns added.
    """
    # Object dtype from the start: the columns receive strings and lists, which
    # a float NaN column can only hold after an implicit upcast.
    df_results["equation_attributes"] = np.full(len(df_results), np.nan, dtype=object)
    df_results["equation"] = np.full(len(df_results), np.nan, dtype=object)

    for index, row in df_results.iterrows():
        model_name = row['model']
        print(model_name)

        # Build the estimator for this row. Everything is decided per row so
        # that nothing from a previous iteration can leak into this one.
        hyper_parameter = {}
        if model_name == 'LinearRegression_Objective':
            model = row["parameters"]["model"]
        elif model_name == 'LassoRegression_Objective':
            alpha = row["parameters"]["model__lasso__alpha"]
            hyper_parameter = {"model__lasso__alpha": alpha}
            model = Lasso(alpha=alpha)
        elif model_name == 'RidgeRegression_Objective':
            alpha = row["parameters"]["model__ridge__alpha"]
            hyper_parameter = {"model__ridge__alpha": alpha}
            model = Ridge(alpha=alpha)
        elif model_name == 'PLSRegression_Objective':
            n_components = row["parameters"]['model__n_components']
            hyper_parameter = {"model__n_components": n_components}
            model = PLSRegression(n_components=n_components)
        else:
            # KNNR and any unrecognised model have no linear equation.
            continue

        # remove unnamed items
        feature_names = [name for name in row['features'] if 'Unnamed' not in name]
        model, rr, rmse = rr_rmse(model, df_features[feature_names], targets)

        coef, intercept = model.coef_, model.intercept_
        if model_name == 'PLSRegression_Objective':
            # PLS exposes array-valued coefficients/intercept even for a single
            # target, and the orientation of coef_ differs between scikit-learn
            # releases; flattening is orientation-independent for one target.
            coef = np.ravel(coef)
            intercept = np.ravel(intercept)[0]

        equ_attrib = [{'coef': coef, 'intercept': intercept, 'r2': rr, 'rmse': rmse, **hyper_parameter}]

        # One "<coef>*<shortened feature name>" term per feature, then the intercept.
        equ = " + ".join([f"{i:.4g}*{'-'.join(j.replace('-0.500000','').replace('hist-', 'hist').split('-')[:2])}" for i, j in zip(coef, feature_names)])
        equ = f"{equ} + {intercept:.4g}".replace("+ -", "- ")

        row.loc["equation_attributes"] = equ_attrib
        row.loc["equation"] = equ
        df_results.loc[index] = row
    return df_results
        

## Model selection with feature selection

In [5]:
# Collect the distinct four-character mask identifiers found in the second
# '-'-separated field of the feature names; mean-mask columns and names of at
# most ten characters are ignored.
_mask_ids = []
for _column in leg_features.columns:
    if 'mean_mask' in _column or len(_column) <= 10:
        continue
    _mask_id = _column.split('-')[1]
    if len(_mask_id) == 4:
        _mask_ids.append(_mask_id)
masks = np.unique(_mask_ids)

In [6]:
# Display the mask identifiers detected above.
masks

array(['p004', 'p029', 'p032', 'p041', 'p048'], dtype='<U4')

In [7]:
# Accumulators for the per-run result tables: with and without feature selection.
results, results_no_fs = [], []

### leg using all features

In [None]:
# Run model selection on the complete leg feature table and attach the fitted
# equations before storing the result table.
leg_all_selection = model_selection(leg_features, leg_target, dataset='leg', type='all')
results.append(get_equation_params(leg_all_selection, leg_features, leg_target))

Objective KNNR_Objective:


  1%|▎                                        | 53/8000 [00:07<17:33,  7.54it/s]

In [None]:
# Display the result tables collected so far.
results

### leg using the mean mask features

In [None]:
# Model selection on the mean-mask columns only; the equations are refitted
# against the full leg table, which contains every selected column.
results.append(get_equation_params(
    model_selection(leg_features[[c for c in leg_features.columns if 'mean_mask' in c or c == 'type']],
                    leg_target, dataset='leg', type='mean_mask'),
    leg_features, leg_target))

### leg using the features of the individual masks

In [None]:
# One model selection per mask, restricted to that mask's columns; the
# equations are refitted against the full leg table.
for m in masks:
    results.append(get_equation_params(
        model_selection(leg_features[[c for c in leg_features.columns if m in c or c == 'type']],
                        leg_target, dataset='leg', type=m),
        leg_features, leg_target))

### loin using all features

In [None]:
# Model selection on the complete loin feature table, followed by the
# equation refit on the same table.
results.append(get_equation_params(
    model_selection(loin_features, loin_target, dataset='loin', type='all'),
    loin_features, loin_target))

### loin using the mean mask features

In [None]:
# Model selection on the mean-mask columns only; the equations are refitted
# against the full loin table, which contains every selected column.
results.append(get_equation_params(
    model_selection(loin_features[[c for c in loin_features.columns if 'mean_mask' in c or c == 'type']],
                    loin_target, dataset='loin', type='mean_mask'),
    loin_features, loin_target))

### loin using the features of the individual masks

In [None]:
# One model selection per mask, restricted to that mask's columns; the
# equations are refitted against the full loin table.
for m in masks:
    results.append(get_equation_params(
        model_selection(loin_features[[c for c in loin_features.columns if m in c or c == 'type']],
                        loin_target, dataset='loin', type=m),
        loin_features, loin_target))

### belly using all features

In [None]:
# Model selection on the complete belly feature table, followed by the
# equation refit on the same table.
results.append(get_equation_params(
    model_selection(belly_features, belly_target, dataset='belly', type='all'),
    belly_features, belly_target))

### belly using the mean mask features

In [None]:
# Model selection on the mean-mask columns only; the equations are refitted
# against the full belly table, which contains every selected column.
results.append(get_equation_params(
    model_selection(belly_features[[c for c in belly_features.columns if 'mean_mask' in c or c == 'type']],
                    belly_target, dataset='belly', type='mean_mask'),
    belly_features, belly_target))

### belly using the features of the individual masks

In [None]:
# One model selection per mask, restricted to that mask's columns; the
# equations are refitted against the full belly table.
for m in masks:
    results.append(get_equation_params(
        model_selection(belly_features[[c for c in belly_features.columns if m in c or c == 'type']],
                        belly_target, dataset='belly', type=m),
        belly_features, belly_target))

### groin using all features

In [None]:
# Model selection on the complete groin feature table, followed by the
# equation refit on the same table.
results.append(get_equation_params(
    model_selection(groin_features, groin_target, dataset='groin', type='all'),
    groin_features, groin_target))

### groin using the mean mask features

In [None]:
# Model selection on the mean-mask columns only; the equations are refitted
# against the full groin table, which contains every selected column.
results.append(get_equation_params(
    model_selection(groin_features[[c for c in groin_features.columns if 'mean_mask' in c or c == 'type']],
                    groin_target, dataset='groin', type='mean_mask'),
    groin_features, groin_target))

### groin using the features of the individual masks

In [None]:
# One model selection per mask, restricted to that mask's columns; the
# equations are refitted against the full groin table.
for m in masks:
    results.append(get_equation_params(
        model_selection(groin_features[[c for c in groin_features.columns if m in c or c == 'type']],
                        groin_target, dataset='groin', type=m),
        groin_features, groin_target))

### shoulder using all features

In [None]:
# Model selection on the complete shoulder feature table, followed by the
# equation refit on the same table.
results.append(get_equation_params(
    model_selection(shoulder_features, shoulder_target, dataset='shoulder', type='all'),
    shoulder_features, shoulder_target))

### shoulder using the mean mask features

In [None]:
# Model selection on the mean-mask columns only; the equations are refitted
# against the full shoulder table, which contains every selected column.
results.append(get_equation_params(
    model_selection(shoulder_features[[c for c in shoulder_features.columns if 'mean_mask' in c or c == 'type']],
                    shoulder_target, dataset='shoulder', type='mean_mask'),
    shoulder_features, shoulder_target))

### shoulder using the features of the individual masks

In [None]:
# One model selection per mask, restricted to that mask's columns; the
# equations are refitted against the full shoulder table.
for m in masks:
    results.append(get_equation_params(
        model_selection(shoulder_features[[c for c in shoulder_features.columns if m in c or c == 'type']],
                        shoulder_target, dataset='shoulder', type=m),
        shoulder_features, shoulder_target))

### body using all features

In [None]:
# Model selection on the complete body feature table, followed by the
# equation refit on the same table.
results.append(get_equation_params(
    model_selection(body_features, body_target, dataset='body', type='all'),
    body_features, body_target))

### body using the mean mask features

In [None]:
# Model selection on the mean-mask columns only; the equations are refitted
# against the full body table, which contains every selected column.
results.append(get_equation_params(
    model_selection(body_features[[c for c in body_features.columns if 'mean_mask' in c or c == 'type']],
                    body_target, dataset='body', type='mean_mask'),
    body_features, body_target))

### body using the features of the individual masks

In [None]:
# One model selection per mask, restricted to that mask's columns; the
# equations are refitted against the full body table.
for m in masks:
    results.append(get_equation_params(
        model_selection(body_features[[c for c in body_features.columns if m in c or c == 'type']],
                        body_target, dataset='body', type=m),
        body_features, body_target))

### Saving the results

In [None]:
# Stack the per-run result tables and persist them as CSV and as a pickle.
results = pd.concat(results)
results.to_csv(os.path.join(path_prefix_results, 'results.csv'), index=False)
# The context manager closes the file handle once the pickle is written.
with open(os.path.join(path_prefix_results, 'results.pickle'), 'wb') as pickle_file:
    pickle.dump(results, pickle_file)

## Without feature selection

### leg with all features

In [None]:
# Model selection on the complete leg feature table with feature selection
# disabled, followed by the equation refit on the same table.
results_no_fs.append(get_equation_params(
    model_selection(leg_features, leg_target, dataset='leg', type='all',
                    disable_feature_selection=True),
    leg_features, leg_target))

### leg mean mask

In [None]:
# Mean-mask columns only, feature selection disabled; the equations are
# refitted against the full leg table, which contains every used column.
results_no_fs.append(get_equation_params(
    model_selection(leg_features[[c for c in leg_features.columns if 'mean_mask' in c or c == 'type']],
                    leg_target, dataset='leg', type='mean_mask',
                    disable_feature_selection=True),
    leg_features, leg_target))

### leg individual masks

In [None]:
# One run per mask on that mask's columns, feature selection disabled; the
# equations are refitted against the full leg table.
for m in masks:
    results_no_fs.append(get_equation_params(
        model_selection(leg_features[[c for c in leg_features.columns if m in c or c == 'type']],
                        leg_target, dataset='leg', type=m,
                        disable_feature_selection=True),
        leg_features, leg_target))

### loin all features

In [None]:
# Model selection on the complete loin feature table with feature selection
# disabled, followed by the equation refit on the same table.
results_no_fs.append(get_equation_params(
    model_selection(loin_features, loin_target, dataset='loin', type='all',
                    disable_feature_selection=True),
    loin_features, loin_target))

### loin mean mask

In [None]:
# Mean-mask columns only, feature selection disabled; the equations are
# refitted against the full loin table, which contains every used column.
results_no_fs.append(get_equation_params(
    model_selection(loin_features[[c for c in loin_features.columns if 'mean_mask' in c or c == 'type']],
                    loin_target, dataset='loin', type='mean_mask',
                    disable_feature_selection=True),
    loin_features, loin_target))

### loin individual masks

In [None]:
# One run per mask on that mask's columns, feature selection disabled; the
# equations are refitted against the full loin table.
for m in masks:
    results_no_fs.append(get_equation_params(
        model_selection(loin_features[[c for c in loin_features.columns if m in c or c == 'type']],
                        loin_target, dataset='loin', type=m,
                        disable_feature_selection=True),
        loin_features, loin_target))

### belly with all features

In [None]:
# Model selection on the complete belly feature table with feature selection
# disabled, followed by the equation refit on the same table.
results_no_fs.append(get_equation_params(
    model_selection(belly_features, belly_target, dataset='belly', type='all',
                    disable_feature_selection=True),
    belly_features, belly_target))

### belly mean mask

In [None]:
# Mean-mask columns only, feature selection disabled; the equations are
# refitted against the full belly table, which contains every used column.
results_no_fs.append(get_equation_params(
    model_selection(belly_features[[c for c in belly_features.columns if 'mean_mask' in c or c == 'type']],
                    belly_target, dataset='belly', type='mean_mask',
                    disable_feature_selection=True),
    belly_features, belly_target))

### belly individual masks

In [None]:
# One run per mask on that mask's columns, feature selection disabled; the
# equations are refitted against the full belly table.
for m in masks:
    results_no_fs.append(get_equation_params(
        model_selection(belly_features[[c for c in belly_features.columns if m in c or c == 'type']],
                        belly_target, dataset='belly', type=m,
                        disable_feature_selection=True),
        belly_features, belly_target))

### groin all features

In [None]:
# Model selection on the complete groin feature table with feature selection
# disabled, followed by the equation refit on the same table.
results_no_fs.append(get_equation_params(
    model_selection(groin_features, groin_target, dataset='groin', type='all',
                    disable_feature_selection=True),
    groin_features, groin_target))

### groin mean mask

In [None]:
# Mean-mask columns only, feature selection disabled; the equations are
# refitted against the full groin table, which contains every used column.
results_no_fs.append(get_equation_params(
    model_selection(groin_features[[c for c in groin_features.columns if 'mean_mask' in c or c == 'type']],
                    groin_target, dataset='groin', type='mean_mask',
                    disable_feature_selection=True),
    groin_features, groin_target))

### groin individual masks

In [None]:
# One run per mask on that mask's columns, feature selection disabled; the
# equations are refitted against the full groin table.
for m in masks:
    results_no_fs.append(get_equation_params(
        model_selection(groin_features[[c for c in groin_features.columns if m in c or c == 'type']],
                        groin_target, dataset='groin', type=m,
                        disable_feature_selection=True),
        groin_features, groin_target))

### shoulder with all features

In [None]:
# Model selection on the complete shoulder feature table with feature
# selection disabled, followed by the equation refit on the same table.
results_no_fs.append(get_equation_params(
    model_selection(shoulder_features, shoulder_target, dataset='shoulder', type='all',
                    disable_feature_selection=True),
    shoulder_features, shoulder_target))

### shoulder mean mask

In [None]:
# Mean-mask columns only, feature selection disabled; the equations are
# refitted against the full shoulder table, which contains every used column.
results_no_fs.append(get_equation_params(
    model_selection(shoulder_features[[c for c in shoulder_features.columns if 'mean_mask' in c or c == 'type']],
                    shoulder_target, dataset='shoulder', type='mean_mask',
                    disable_feature_selection=True),
    shoulder_features, shoulder_target))

### shoulder individual masks

In [None]:
# One run per mask on that mask's columns, feature selection disabled; the
# equations are refitted against the full shoulder table.
for m in masks:
    results_no_fs.append(get_equation_params(
        model_selection(shoulder_features[[c for c in shoulder_features.columns if m in c or c == 'type']],
                        shoulder_target, dataset='shoulder', type=m,
                        disable_feature_selection=True),
        shoulder_features, shoulder_target))

### body all features

In [None]:
# Model selection on the complete body feature table with feature selection
# disabled, followed by the equation refit on the same table.
results_no_fs.append(get_equation_params(
    model_selection(body_features, body_target, dataset='body', type='all',
                    disable_feature_selection=True),
    body_features, body_target))

### body mean mask

In [None]:
# Mean-mask columns only, feature selection disabled; the equations are
# refitted against the full body table, which contains every used column.
results_no_fs.append(get_equation_params(
    model_selection(body_features[[c for c in body_features.columns if 'mean_mask' in c or c == 'type']],
                    body_target, dataset='body', type='mean_mask',
                    disable_feature_selection=True),
    body_features, body_target))

### body individual masks

In [None]:
# One run per mask on that mask's columns, feature selection disabled; the
# equations are refitted against the full body table.
for m in masks:
    results_no_fs.append(get_equation_params(
        model_selection(body_features[[c for c in body_features.columns if m in c or c == 'type']],
                        body_target, dataset='body', type=m,
                        disable_feature_selection=True),
        body_features, body_target))

### Saving the results

In [None]:
# Stack the no-feature-selection result tables and persist them as CSV and as a pickle.
results_no_fs = pd.concat(results_no_fs)
results_no_fs.to_csv(os.path.join(path_prefix_results, 'results_no_fs.csv'), index=False)
# The context manager closes the file handle once the pickle is written.
with open(os.path.join(path_prefix_results, 'results_no_fs.pickle'), 'wb') as pickle_file:
    pickle.dump(results_no_fs, pickle_file)