In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
def fill_death_col(clinical_data):
  
  male_obese = (clinical_data['Sex'] == 1) & (clinical_data['BMI'] > 30)
  female_obese = (clinical_data['Sex'] == -1) & (clinical_data['BMI'] > 30)
  male_underweight = (clinical_data['Sex'] == 1) & (clinical_data['BMI'] < 18.5)
  female_underweight = (clinical_data['Sex'] == -1) & (clinical_data['BMI'] < 18.5)
  clinical_data['BMI_reduction_factor'] = 0
  clinical_data.loc[male_obese, 'BMI_reduction_factor'] = 4.2
  clinical_data.loc[female_obese, 'BMI_reduction_factor'] = 3.5
  clinical_data.loc[male_underweight, 'BMI_reduction_factor'] = 4.3
  clinical_data.loc[female_underweight, 'BMI_reduction_factor'] = 4.5

  clinical_data['Tobacco_reduction_factor'] = np.where(clinical_data['Tobacco']==1,10,0)

  #Leave na on rows with no death data, fill the rest with age*365+death days 
  #clinical_data['Age_at_death'] = np.where(clinical_data['DEATH [d from CT]'].isna(), np.nan, clinical_data['Age at CT']*365 + clinical_data['DEATH [d from CT]'])
  
  # fill death with avg lifespan - alpha(reduction for tobacco) - beta(reduction for bmi)
  #corr_bmi_death = clinical_data['BMI'].corr(clinical_data['Age_at_death'])
  #corr_tobacco_death = clinical_data['Tobacco'].corr(clinical_data['Age_at_death'])

 # print(corr_bmi_death)
  #print(corr_tobacco_death)

 # alpha=corr_bmi_death
  #beta=corr_tobacco_death
  #normalize
#   alpha = corr_bmi_death/(corr_bmi_death+corr_tobacco_death)
#   beta = corr_tobacco_death/(corr_bmi_death+corr_tobacco_death)

  #print(alpha)
  #print(beta)
  
  avg_lifespan_male=76.3  
  avg_lifespan_female=81.2

  #clinical_data['avg_life'] = np.where(clinical_data['Sex']==1, avg_lifespan_male, avg_lifespan_female)
  #clinical_data['avg_life'] = clinical_data['avg_life'] + alpha*(clinical_data['BMI_reduction_factor'])+ beta*(clinical_data['Tobacco_reduction_factor'])
  #clinical_data['DEATH [d from CT]'].fillna((clinical_data['avg_life']-clinical_data['Age at CT'])*365, inplace=True)
  

In [3]:
def preprocess_clinical_data(oppScrData, mean=True):
    clinical_data = oppScrData.filter(['Clinical F/U interval  [d from CT]','BMI','BMI >30', 'Sex', 'Tobacco', 'Met Sx', 'FRAX 10y Fx Prob (Orange-w/ DXA)',
                                'FRAX 10y Hip Fx Prob (Orange-w/ DXA)','FRS 10-year risk (%)', 'DEATH [d from CT]' , 'Age at CT'], axis=1)
    # Replace all _,X,blanks with nan
    clinical_data = clinical_data.replace(r'_', np.nan, regex=True)
    clinical_data = clinical_data.replace(r'X', np.nan, regex=True)
    clinical_data = clinical_data.replace(r'^\s*$', np.nan, regex=True)

    # Fill na in bmi column with mean
    clinical_data['BMI'].fillna(value=clinical_data['BMI'].mean(skipna=True), inplace=True)
    
    # Fill na in bmi>30 column based on bmi col
    clinical_data.loc[clinical_data.BMI>30, 'BMI >30'] = 1
    clinical_data.loc[clinical_data.BMI<=30, 'BMI >30'] = -1
    
    clinical_data['Sex'] = np.where(clinical_data['Sex']=='Male',1,-1)
    clinical_data['Met Sx'] = np.where(clinical_data['Met Sx']=='Y',1,-1) 

    # Treat no data in tobacco as no tobacco usage 
    clinical_data['Tobacco'] = np.where(clinical_data['Tobacco']=='Yes',1,-1) 
    
    #Fill death col
    #fill_death_col(clinical_data)

    clinical_data['FRS 10-year risk (%)'] = clinical_data['FRS 10-year risk (%)'].replace("<1", 0.01, regex=True)
    clinical_data['FRS 10-year risk (%)'] = clinical_data['FRS 10-year risk (%)'].replace(">30", 0.30, regex=True)
    clinical_data['FRS 10-year risk (%)'] =  clinical_data['FRS 10-year risk (%)'] * 100
 
    cols_to_be_filled = ['FRAX 10y Fx Prob (Orange-w/ DXA)','FRAX 10y Hip Fx Prob (Orange-w/ DXA)','FRS 10-year risk (%)']
    for c in cols_to_be_filled:
      if mean:  
        clinical_data[c].fillna(value=clinical_data[c].mean(skipna=True), inplace=True)
      else :
        clinical_data[c].fillna(value=clinical_data[c].median(skipna=True), inplace=True)

    return [clinical_data, np.array(clinical_data, dtype=np.float32)]

In [5]:
oppScrData = pd.read_excel (r'OppScrData.xlsx')  

In [6]:
clinical_data, clinical_data_np = preprocess_clinical_data(oppScrData)

In [8]:
clinical_data

Unnamed: 0,Clinical F/U interval [d from CT],BMI,BMI >30,Sex,Tobacco,Met Sx,FRAX 10y Fx Prob (Orange-w/ DXA),FRAX 10y Hip Fx Prob (Orange-w/ DXA),FRS 10-year risk (%),DEATH [d from CT],Age at CT
0,907,37.7,1,1,-1,-1,4.60,1.04,3.807755,967.0,73
1,359,30.1,1,-1,-1,-1,5.01,0.27,3.807755,359.0,55
2,412,28.1,-1,-1,-1,-1,4.13,0.21,3.807755,,52
3,2333,25.0,-1,1,1,-1,5.46,0.75,6.000000,2351.0,60
4,532,22.4,-1,1,1,-1,11.65,8.40,16.000000,658.0,88
...,...,...,...,...,...,...,...,...,...,...,...
9217,603,28.3,-1,1,-1,-1,4.97,0.27,5.000000,,52
9218,564,23.4,-1,1,1,-1,3.30,0.25,6.000000,,51
9219,614,43.6,1,-1,-1,1,8.26,0.34,2.000000,,63
9220,365,26.9,-1,-1,-1,-1,5.45,0.32,1.000000,,55


In [9]:
from sklearn.svm import SVR

from sklearn.model_selection import GridSearchCV

clinical_data

Unnamed: 0,Clinical F/U interval [d from CT],BMI,BMI >30,Sex,Tobacco,Met Sx,FRAX 10y Fx Prob (Orange-w/ DXA),FRAX 10y Hip Fx Prob (Orange-w/ DXA),FRS 10-year risk (%),DEATH [d from CT],Age at CT
0,907,37.7,1,1,-1,-1,4.60,1.04,3.807755,967.0,73
1,359,30.1,1,-1,-1,-1,5.01,0.27,3.807755,359.0,55
2,412,28.1,-1,-1,-1,-1,4.13,0.21,3.807755,,52
3,2333,25.0,-1,1,1,-1,5.46,0.75,6.000000,2351.0,60
4,532,22.4,-1,1,1,-1,11.65,8.40,16.000000,658.0,88
...,...,...,...,...,...,...,...,...,...,...,...
9217,603,28.3,-1,1,-1,-1,4.97,0.27,5.000000,,52
9218,564,23.4,-1,1,1,-1,3.30,0.25,6.000000,,51
9219,614,43.6,1,-1,-1,1,8.26,0.34,2.000000,,63
9220,365,26.9,-1,-1,-1,-1,5.45,0.32,1.000000,,55


In [10]:
selected_rows = clinical_data[~clinical_data['DEATH [d from CT]'].isnull()]
selected_rows

selected_row2=clinical_data[clinical_data['DEATH [d from CT]'].isnull()]

X=selected_rows.iloc[:, 0:9]
X
X2=selected_row2.iloc[:, 0:9]

y=selected_rows.iloc[:, 9]
y
y2=selected_row2.iloc[:, 9]

from sklearn.preprocessing import MinMaxScaler,StandardScaler
sc = StandardScaler()
X = pd.DataFrame(sc.fit_transform(X), columns = X.columns)
X2=pd.DataFrame(sc.fit_transform(X2), columns = X.columns)
X
X2

Unnamed: 0,Clinical F/U interval [d from CT],BMI,BMI >30,Sex,Tobacco,Met Sx,FRAX 10y Fx Prob (Orange-w/ DXA),FRAX 10y Hip Fx Prob (Orange-w/ DXA),FRS 10-year risk (%)
0,-1.740305,-0.109551,-0.697954,-0.879969,-0.805323,-0.296074,-0.388895,-0.331133,0.044942
1,0.589774,-0.639414,-0.697954,1.136404,1.241737,-0.296074,0.286312,0.564507,1.713275
2,0.991651,0.030707,-0.697954,-0.879969,-0.805323,-0.296074,4.346680,4.244416,0.096738
3,0.927378,-0.140720,-0.697954,1.136404,1.241737,-0.296074,-0.121245,-0.045567,1.713275
4,0.920612,1.402117,1.432759,1.136404,1.241737,-0.296074,-0.467973,-0.149409,1.174429
...,...,...,...,...,...,...,...,...,...
8668,-1.611081,-0.078383,-0.697954,1.136404,-0.805323,-0.296074,-0.133411,-0.292192,0.366160
8669,-1.637467,-0.842009,-0.697954,1.136404,1.241737,-0.296074,-0.641337,-0.305172,0.635583
8670,-1.603639,2.306001,1.432759,-0.879969,-0.805323,3.377532,0.867234,-0.246761,-0.442108
8671,-1.772103,-0.296562,-0.697954,-0.879969,-0.805323,-0.296074,0.012580,-0.259741,-0.711531


In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2, random_state=0)



In [12]:
RFmod2 = RandomForestRegressor(random_state=21)

#Choose some hyperparameter values 
param_grid2 = {
    'bootstrap': [True],
    'max_depth': [10,20,30],
    'max_features': [2, 3,4,5,6],
    'min_samples_leaf': [3, 4, 5,6,7,8,9,10],
    'min_samples_split': [8, 10, 12,14,16,18,20],
    'n_estimators': [50,30,40,20,10,100, 200, 300, 1000]
}
clfRF2 = RandomizedSearchCV(RFmod2,param_grid2,#model and parameters
                             #number of cross validation folds
                             )


clfRF2.fit(X_train,y_train)
params3=clfRF2.best_params_
params3

{'n_estimators': 30,
 'min_samples_split': 10,
 'min_samples_leaf': 3,
 'max_features': 6,
 'max_depth': 20,
 'bootstrap': True}

In [13]:
RFparams2={'n_estimators':[50],
          'min_samples_split':[16],
          'min_samples_leaf': [4],
           'max_features': [6],
           'max_depth': [20],
           'bootstrap': [True]}

rf = RandomForestRegressor(random_state = 21)


clf= GridSearchCV(rf,RFparams2,refit=True)
clf.fit(X_train,y_train)
        
clf.best_estimator_.score(X_test, y_test)

0.9675514805768979

In [14]:
p=clf.predict(X_test)
null=clf.predict(X2)


In [15]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

In [16]:
Mse = mean_squared_error(y_test, p
                         )
error=mean_absolute_error(y_test, p)
mape=mean_absolute_percentage_error(y_test, p)
print("RMSE: %.2f" % (Mse**(1/2.0)))
print("MApe: %.2f" %(mape))
print("error",error)

RMSE: 237.25
MApe: 0.55
error 160.64980233315563


In [17]:
from sklearn.neighbors import KNeighborsRegressor

In [18]:
null
selected_rows
selected_row2['DEATH [d from CT]']=null

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_row2['DEATH [d from CT]']=null


In [19]:
X3=pd.concat([selected_rows,selected_row2])



In [20]:
y3=X3.iloc[:,9]
X3=X3.iloc[:, 0:9]
from sklearn.model_selection import train_test_split
X_train3, X_test3, y_train3, y_test3  = train_test_split(X3, y3, test_size=0.2, random_state=0)

In [24]:
knn2 = KNeighborsRegressor(n_neighbors=9)
weight_options = ["uniform", "distance"]
param_grid2 = dict(weights = weight_options)
    
grid = GridSearchCV(knn2, param_grid2)
grid.fit(X_train3,y_train3)
p3=grid.predict(X_test3)
Mse = mean_squared_error(y_test3, p3
                         )

mape=mean_absolute_percentage_error(y_test3, p3)
print("RMSE: %.2f" % (Mse**(1/2.0)))
print("MApe: %.2f" %(mape))

mean_absolute_error(y_test3, p3)

RMSE: 194.44
MApe: 0.07


89.7835556471623