In [58]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [25]:
def preprocess_clinical_data(oppScrData, mean=True):
    clinical_data = oppScrData.filter(['Record ID','Clinical F/U interval  [d from CT]','BMI','BMI >30', 'Sex', 'Tobacco', 'Met Sx', 'FRAX 10y Fx Prob (Orange-w/ DXA)',
                                'FRAX 10y Hip Fx Prob (Orange-w/ DXA)','FRS 10-year risk (%)','Age at CT', 'DEATH [d from CT]' ], axis=1)
    # Replace all _,X,blanks with nan
    clinical_data = clinical_data.replace(r'_', np.nan, regex=True)
    clinical_data = clinical_data.replace(r'X', np.nan, regex=True)
    clinical_data = clinical_data.replace(r'^\s*$', np.nan, regex=True)

    # Fill na in bmi column with mean
    clinical_data['BMI'].fillna(value=clinical_data['BMI'].mean(skipna=True), inplace=True)
    
    # Fill na in bmi>30 column based on bmi col
    clinical_data.loc[clinical_data.BMI>30, 'BMI >30'] = 1
    clinical_data.loc[clinical_data.BMI<=30, 'BMI >30'] = -1
    
    clinical_data['Sex'] = np.where(clinical_data['Sex']=='Male',1,-1)
    clinical_data['Met Sx'] = np.where(clinical_data['Met Sx']=='Y',1,-1) 

    # Treat no data in tobacco as no tobacco usage 
    clinical_data['Tobacco'] = np.where(clinical_data['Tobacco']=='Yes',1,-1) 
  
    clinical_data['FRS 10-year risk (%)'] = clinical_data['FRS 10-year risk (%)'].replace("<1", 0.01, regex=True)
    clinical_data['FRS 10-year risk (%)'] = clinical_data['FRS 10-year risk (%)'].replace(">30", 0.30, regex=True)
    clinical_data['FRS 10-year risk (%)'] =  clinical_data['FRS 10-year risk (%)'] * 100
   
    cols_to_be_filled = ['FRAX 10y Fx Prob (Orange-w/ DXA)','FRAX 10y Hip Fx Prob (Orange-w/ DXA)','FRS 10-year risk (%)']
    for c in cols_to_be_filled:
      if mean:  
        clinical_data[c].fillna(value=clinical_data[c].mean(skipna=True), inplace=True)
      else :
        clinical_data[c].fillna(value=clinical_data[c].median(skipna=True), inplace=True)

    # clinical_data['Age_at_death'] = np.where(clinical_data['DEATH [d from CT]'].isna(), np.nan, (clinical_data['Age at CT']*365 + clinical_data['DEATH [d from CT]'])/365)
    clinical_data = fill_death_col(clinical_data)
    return [clinical_data, np.array(clinical_data, dtype=np.float32)]

In [26]:
def clean_ct_data(oppScrData):
    # Delete rows with empty values
    ct_data= oppScrData[["Record ID", "L1_HU_BMD", "TAT Area (cm2)", 'Total Body                Area EA (cm2)',
       'VAT Area (cm2)', 'SAT Area (cm2)', 'VAT/SAT     Ratio', 'Muscle HU',
       ' Muscle Area (cm2)', 'L3 SMI (cm2/m2)', 'AoCa        Agatston',
       'Liver HU    (Median)', 'Age at CT']]
    ct_data['Liver HU    (Median)'].replace(' ', np.nan, inplace=True)
    ct_data.dropna(axis = 0, how ='any', thresh = None, subset = None, inplace=True)
    return [ct_data, np.array(ct_data, dtype=np.float32)]



In [27]:
def fill_death_col(clinical_data):
  
  male_obese = (clinical_data['Sex'] == 1) & (clinical_data['BMI'] > 30)
  female_obese = (clinical_data['Sex'] == -1) & (clinical_data['BMI'] > 30)
  male_underweight = (clinical_data['Sex'] == 1) & (clinical_data['BMI'] < 18.5)
  female_underweight = (clinical_data['Sex'] == -1) & (clinical_data['BMI'] < 18.5)
  clinical_data['BMI_reduction_factor'] = 0
  clinical_data.loc[male_obese, 'BMI_reduction_factor'] = 4.2
  clinical_data.loc[female_obese, 'BMI_reduction_factor'] = 3.5
  clinical_data.loc[male_underweight, 'BMI_reduction_factor'] = 4.3
  clinical_data.loc[female_underweight, 'BMI_reduction_factor'] = 4.5

  clinical_data['Tobacco_reduction_factor'] = np.where(clinical_data['Tobacco']==1,10,0)

  
  #Leave na on rows with no death data, fill the rest with age*365+death days 
  # clinical_data['Age_at_death'] = np.where(clinical_data['DEATH [d from CT]'].isna(), np.nan, clinical_data['Age at CT']*365 + clinical_data['DEATH [d from CT]'])
  clinical_data['Age_at_death'] = np.where(clinical_data['DEATH [d from CT]'].isna(), np.nan, clinical_data['Age at CT'] + clinical_data['DEATH [d from CT]']/365)
  clinical_data['generated_age_at_death'] = np.where(clinical_data['DEATH [d from CT]'].isna(), 1, -1)
  # fill death with avg lifespan - alpha(reduction for tobacco) - beta(reduction for bmi)
  corr_bmi_death = clinical_data['BMI'].corr(clinical_data['Age_at_death'])
  corr_tobacco_death = clinical_data['Tobacco'].corr(clinical_data['Age_at_death'])

  # print(corr_bmi_death)
  # print(corr_tobacco_death)

  #normalize
  alpha = corr_bmi_death/(corr_bmi_death+corr_tobacco_death)
  beta = corr_tobacco_death/(corr_bmi_death+corr_tobacco_death)

  # print(alpha)
  # print(beta)
  
  avg_lifespan_male=76.3  
  avg_lifespan_female=81.2

  clinical_data['avg_life'] = np.where(clinical_data['Sex']==1, avg_lifespan_male, avg_lifespan_female)
  clinical_data['avg_life'] = clinical_data['avg_life'] - alpha*(clinical_data['BMI_reduction_factor'])-beta*(clinical_data['Tobacco_reduction_factor'])
  clinical_data['DEATH [d from CT]'].fillna((clinical_data['avg_life']-clinical_data['Age at CT'])*365, inplace=True)
  clinical_data['Age_at_death'].fillna(clinical_data['Age at CT'] + clinical_data['DEATH [d from CT]']/365, inplace=True)
  
  return clinical_data


In [22]:
oppScrData = pd.read_excel (r'OppScrData.xlsx')  

In [29]:
clinical_data, clinical_data_np = preprocess_clinical_data(oppScrData)
ct_data, ct_data_np = clean_ct_data(oppScrData)
ct_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ct_data.dropna(axis = 0, how ='any', thresh = None, subset = None, inplace=True)


Unnamed: 0,Record ID,L1_HU_BMD,TAT Area (cm2),Total Body Area EA (cm2),VAT Area (cm2),SAT Area (cm2),VAT/SAT Ratio,Muscle HU,Muscle Area (cm2),L3 SMI (cm2/m2),AoCa Agatston,Liver HU (Median),Age at CT
1,2236,192.0,485.502857,694.314286,183.497143,302.008571,0.607589,16.150123,123.968745,48.413187,2709.064,52.0,55
2,5962,256.0,490.054545,706.051515,159.706061,330.348485,0.483447,23.337964,136.566261,50.101332,0.000,54.0,52
3,665,149.0,289.108108,584.489189,144.002703,145.102703,0.992419,30.804567,212.296726,69.115854,2586.575,57.0,60
4,478,106.0,315.530769,588.892308,202.317949,113.212821,1.787059,-3.181874,168.923950,47.814581,431.519,53.0,88
5,8952,94.0,247.412821,601.705128,145.653846,101.758974,1.431361,40.243137,174.813554,53.751548,35.760,54.0,68
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9217,9262,148.0,420.046763,642.357864,157.105587,262.941176,0.597493,31.595440,165.413121,55.447848,0.000,38.0,52
9218,9287,167.0,185.183362,500.051686,92.813192,92.370170,1.004796,41.896333,188.676052,59.683360,139.967,54.0,51
9219,956,135.0,822.287179,1154.605128,247.784615,574.502564,0.431303,17.102222,143.309862,57.786350,1115.997,45.0,63
9220,6097,219.0,418.200000,660.346154,213.748718,204.448718,1.045488,29.610469,138.824463,50.929786,3515.371,44.0,55


In [30]:
ct_data['Age_at_death'] = clinical_data['Age_at_death']
ct_data['generated_age_at_death'] = clinical_data['generated_age_at_death']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ct_data['Age_at_death'] = clinical_data['Age_at_death']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ct_data['generated_age_at_death'] = clinical_data['generated_age_at_death']


In [31]:
print(ct_data['generated_age_at_death'])

1      -1
2       1
3      -1
4      -1
5       1
       ..
9217    1
9218    1
9219    1
9220    1
9221    1
Name: generated_age_at_death, Length: 8878, dtype: int64


In [72]:
ct_data

df=ct_data[ct_data['generated_age_at_death']==-1]
df2=ct_data[ct_data['generated_age_at_death']==1]
len(df)
df.columns
#df.shape

Index(['Record ID', 'L1_HU_BMD', 'TAT Area (cm2)',
       'Total Body                Area EA (cm2)', 'VAT Area (cm2)',
       'SAT Area (cm2)', 'VAT/SAT     Ratio', 'Muscle HU',
       ' Muscle Area (cm2)', 'L3 SMI (cm2/m2)', 'AoCa        Agatston',
       'Liver HU    (Median)', 'Age at CT', 'Age_at_death',
       'generated_age_at_death'],
      dtype='object')

In [66]:
#cleaning df

df.drop(columns=['Record ID','Age at CT','generated_age_at_death'], inplace=True)
df.columns

df2.drop(columns=['Record ID','Age at CT','generated_age_at_death'], inplace=True)
df2.columns


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Index(['L1_HU_BMD', 'TAT Area (cm2)',
       'Total Body                Area EA (cm2)', 'VAT Area (cm2)',
       'SAT Area (cm2)', 'VAT/SAT     Ratio', 'Muscle HU',
       ' Muscle Area (cm2)', 'L3 SMI (cm2/m2)', 'AoCa        Agatston',
       'Liver HU    (Median)', 'Age_at_death'],
      dtype='object')

In [67]:
X_new=df.drop(columns=['Age_at_death'])

y_new=df['Age_at_death']

In [74]:
X_new.columns
#X_new.shape

Index(['L1_HU_BMD', 'TAT Area (cm2)',
       'Total Body                Area EA (cm2)', 'VAT Area (cm2)',
       'SAT Area (cm2)', 'VAT/SAT     Ratio', 'Muscle HU',
       ' Muscle Area (cm2)', 'L3 SMI (cm2/m2)', 'AoCa        Agatston',
       'Liver HU    (Median)'],
      dtype='object')

In [75]:
from sklearn.svm import SVR

from sklearn.model_selection import GridSearchCV

#clinical_data

In [70]:
#selected_rows = clinical_data[~clinical_data['DEATH [d from CT]'].isnull()]
#selected_rows

#selected_row2=clinical_data[clinical_data['DEATH [d from CT]'].isnull()]

#X=selected_rows.iloc[:, 0:9]
#X
#X2=selected_row2.iloc[:, 0:9]

#y=selected_rows.iloc[:, 9]
#y
#y2=selected_row2.iloc[:, 9]

from sklearn.preprocessing import MinMaxScaler,StandardScaler
sc = StandardScaler()
#X = pd.DataFrame(sc.fit_transform(X), columns = X.columns)
#X2=pd.DataFrame(sc.fit_transform(X2), columns = X.columns)
#X
#X2

X_new=pd.DataFrame(sc.fit_transform(X_new), columns = X_new.columns)

In [53]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test  = train_test_split(X_new, y_new, test_size=0.2, random_state=0)



In [54]:
RFmod2 = RandomForestRegressor(random_state=21)

#Choose some hyperparameter values 
param_grid2 = {
    'bootstrap': [True],
    'max_depth': [10,20,30],
    'max_features': [2, 3,4,5,6],
    'min_samples_leaf': [3, 4, 5,6,7,8,9,10],
    'min_samples_split': [8, 10, 12,14,16,18,20],
    'n_estimators': [50,30,40,20,10,100, 200, 300, 1000]
}
clfRF2 = RandomizedSearchCV(RFmod2,param_grid2,#model and parameters
                             #number of cross validation folds
                             )


clfRF2.fit(X_train,y_train)
params3=clfRF2.best_params_
params3

{'n_estimators': 200,
 'min_samples_split': 18,
 'min_samples_leaf': 3,
 'max_features': 3,
 'max_depth': 20,
 'bootstrap': True}

In [59]:
RFparams2={'n_estimators':[200],
          'min_samples_split':[18],
          'min_samples_leaf': [3],
           'max_features': [3],
           'max_depth': [20],
           'bootstrap': [True]}

rf = RandomForestRegressor(random_state = 21)


clf= GridSearchCV(rf,RFparams2,refit=True)
clf.fit(X_train,y_train)
        
clf.best_estimator_.score(X_test, y_test)

0.19662576261960418

In [60]:
p=clf.predict(X_test)
null=clf.predict(X_new)


In [61]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

In [62]:
Mse = mean_squared_error(y_test, p
                         )
error=mean_absolute_error(y_test, p)
mape=mean_absolute_percentage_error(y_test, p)
print("RMSE: %.2f" % (Mse**(1/2.0)))
print("MApe: %.2f" %(mape))
print("error",error)

RMSE: 10.04
MApe: 0.11
error 8.129270676441093


In [64]:
from sklearn.neighbors import KNeighborsRegressor
df

Unnamed: 0,L1_HU_BMD,TAT Area (cm2),Total Body Area EA (cm2),VAT Area (cm2),SAT Area (cm2),VAT/SAT Ratio,Muscle HU,Muscle Area (cm2),L3 SMI (cm2/m2),AoCa Agatston,Liver HU (Median),Age_at_death
1,192.0,485.502857,694.314286,183.497143,302.008571,0.607589,16.150123,123.968745,48.413187,2709.064,52.0,55.983562
3,149.0,289.108108,584.489189,144.002703,145.102703,0.992419,30.804567,212.296726,69.115854,2586.575,57.0,66.441096
4,106.0,315.530769,588.892308,202.317949,113.212821,1.787059,-3.181874,168.923950,47.814581,431.519,53.0,89.802740
6,77.0,213.537143,452.882857,113.628571,99.908571,1.137326,1.489383,89.505890,30.905415,1599.994,49.0,87.397260
15,113.0,282.230769,525.187179,175.897436,106.333333,1.654208,8.435009,128.848958,47.270127,10194.401,105.0,84.304110
...,...,...,...,...,...,...,...,...,...,...,...,...
8758,167.0,806.529326,1442.195831,648.017450,158.511876,4.088132,-15.231386,92.217487,34.896824,545.183,61.0,57.635616
8905,259.0,757.449442,1318.053124,355.873830,401.575611,0.886194,21.173827,228.256523,76.513477,122.388,56.0,49.923288
8941,138.0,339.766667,585.413725,145.992157,193.776471,0.753405,18.209326,70.598145,28.467051,2688.114,55.0,72.512329
9019,118.0,457.367774,930.073801,148.154982,309.212792,0.479136,8.695780,99.085831,38.695728,0.000,43.0,65.073973


In [18]:
null
selected_rows
selected_row2['DEATH [d from CT]']=null

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_row2['DEATH [d from CT]']=null


In [19]:
X3=pd.concat([selected_rows,selected_row2])



In [20]:
y3=X3.iloc[:,9]
X3=X3.iloc[:, 0:9]
from sklearn.model_selection import train_test_split
X_train3, X_test3, y_train3, y_test3  = train_test_split(X3, y3, test_size=0.2, random_state=0)

In [24]:
knn2 = KNeighborsRegressor(n_neighbors=9)
weight_options = ["uniform", "distance"]
param_grid2 = dict(weights = weight_options)
    
grid = GridSearchCV(knn2, param_grid2)
grid.fit(X_train3,y_train3)
p3=grid.predict(X_test3)
Mse = mean_squared_error(y_test3, p3
                         )

mape=mean_absolute_percentage_error(y_test3, p3)
print("RMSE: %.2f" % (Mse**(1/2.0)))
print("MApe: %.2f" %(mape))

mean_absolute_error(y_test3, p3)

RMSE: 194.44
MApe: 0.07


89.7835556471623