In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.neighbors import KNeighborsRegressor

In [3]:
oppScrData = pd.read_excel(r'sample_data/OppScrData.xlsx')

In [5]:
def clean_ct_data(oppScrData):
    # Delete rows with empty values
    cols = ["L1_HU_BMD", "TAT Area (cm2)", 'Total Body                Area EA (cm2)',
       'VAT Area (cm2)', 'SAT Area (cm2)', 'VAT/SAT     Ratio', 'Muscle HU',
       ' Muscle Area (cm2)', 'L3 SMI (cm2/m2)', 'AoCa        Agatston',
       'Liver HU    (Median)', 'Age at CT','DEATH [d from CT]']
    ct_data= oppScrData[cols]
    n = ct_data.shape[0]
    preprocessed_ct_data = pd.DataFrame( columns=cols)
    for i in range(n):
        row = ct_data.loc[i]
        ignore = False
        for k,j in enumerate(row):
          if pd.isna(j) or j == ' ': 
            if k == 12:              
              ignore = False
            else:
              ignore = True
              break
        if not ignore:
          preprocessed_ct_data.loc[i]= row
    return preprocessed_ct_data

clinical_data = clean_ct_data(oppScrData=oppScrData)

In [6]:
clinical_data

Unnamed: 0,L1_HU_BMD,TAT Area (cm2),Total Body Area EA (cm2),VAT Area (cm2),SAT Area (cm2),VAT/SAT Ratio,Muscle HU,Muscle Area (cm2),L3 SMI (cm2/m2),AoCa Agatston,Liver HU (Median),Age at CT,DEATH [d from CT]
1,192.0,485.502857,694.314286,183.497143,302.008571,0.607589,16.150123,123.968745,48.413187,2709.064,52,55,359.0
2,256.0,490.054545,706.051515,159.706061,330.348485,0.483447,23.337964,136.566261,50.101332,0.000,54,52,
3,149.0,289.108108,584.489189,144.002703,145.102703,0.992419,30.804567,212.296726,69.115854,2586.575,57,60,2351.0
4,106.0,315.530769,588.892308,202.317949,113.212821,1.787059,-3.181874,168.923950,47.814581,431.519,53,88,658.0
5,94.0,247.412821,601.705128,145.653846,101.758974,1.431361,40.243137,174.813554,53.751548,35.760,54,68,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9217,148.0,420.046763,642.357864,157.105587,262.941176,0.597493,31.595440,165.413121,55.447848,0.000,38,52,
9218,167.0,185.183362,500.051686,92.813192,92.370170,1.004796,41.896333,188.676052,59.683360,139.967,54,51,
9219,135.0,822.287179,1154.605128,247.784615,574.502564,0.431303,17.102222,143.309862,57.786350,1115.997,45,63,
9220,219.0,418.200000,660.346154,213.748718,204.448718,1.045488,29.610469,138.824463,50.929786,3515.371,44,55,


**Model training**

In [7]:
sc = StandardScaler()

non_null_death_rows = clinical_data[~clinical_data['DEATH [d from CT]'].isnull()]

# Convert to years
non_null_death_rows['DEATH [d from CT]'] /= 365

X = non_null_death_rows.iloc[:, 0:12]
y = non_null_death_rows.iloc[:, 12]
X = pd.DataFrame(sc.fit_transform(X), columns = X.columns)

null_death_rows=clinical_data[clinical_data['DEATH [d from CT]'].isnull()]
X2 = null_death_rows.iloc[:, 0:12]
X2 = pd.DataFrame(sc.fit_transform(X2), columns = X2.columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [8]:
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2, random_state=0)

In [9]:
RFparams2={'n_estimators':[50],
          'min_samples_split':[16],
          'min_samples_leaf': [4],
           'max_features': [6],
           'max_depth': [20],
           'bootstrap': [True]}

rf = RandomForestRegressor(random_state = 21)
clf = GridSearchCV(rf, RFparams2, refit=True)
clf.fit(X_train, y_train)
clf.best_estimator_.score(X_test, y_test)

0.034842850423695615

In [10]:
p = clf.predict(X_test)
Mse = mean_squared_error(y_test, p)
error = mean_absolute_error(y_test, p)
mape = mean_absolute_percentage_error(y_test, p)
print("RMSE : %.2f" % (Mse**(1/2.0)))
print("MApe : %.2f" %(mape))
print("MAE :", error)

RMSE : 3.24
MApe : 4.02
MAE : 2.686602856158458


**Fill death column**

In [11]:
null_death_rows['DEATH [d from CT]'] = clf.predict(X2)
df = pd.concat([non_null_death_rows,null_death_rows])
y = df.iloc[:,12]
X = df.iloc[:, 0:12]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


**Train model**

In [12]:
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2)

knn = KNeighborsRegressor(n_neighbors=9)
param_grid2 = dict(weights = ["uniform", "distance"])
    
grid = GridSearchCV(knn, param_grid2)
grid.fit(X_train, y_train)
p3 = grid.predict(X_test) 
Mse = mean_squared_error(y_test, p3)

mape=mean_absolute_percentage_error(y_test, p3)
print("RMSE: %.2f" % (Mse**(1/2.0)))
print("MApe: %.2f" %(mape))
print("MAE: %.2f" %(mean_absolute_error(y_test, p3)))


RMSE: 1.06
MApe: 0.19
MAE: 0.68
