In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

In [5]:
oppScrData = pd.read_excel (r'sample_data/OppScrData.xlsx')  

In [9]:
def preprocess_data(oppScrData, mean=True):
    
    l = ['BMI','BMI >30', 'Sex', 'Tobacco', 'Met Sx', 'Age at CT', 'CVD DX', 'Heart failure DX',
                                       'MI DX','Type 2 Diabetes DX', 'Primary Cancer Site', 'Humerus fracture DX',
                                       'Femoral neck fracture DX', 'Unspec femoral fracture DX', 'Forearm fracture DX'
                                       ,'Primary Cancer Site 2','Pathologic fracture DX','DEATH [d from CT]']

    clinical_data = oppScrData.filter(l, axis=1)
    # Replace all _,X,blanks with nan
    clinical_data = clinical_data.replace(r'_', np.nan, regex=True)
    clinical_data = clinical_data.replace(r'X', np.nan, regex=True)
    clinical_data = clinical_data.replace(r'^\s*$', np.nan, regex=True)

    # Fill na in bmi column with mean
    clinical_data['BMI'].fillna(value=clinical_data['BMI'].mean(skipna=True), inplace=True)
    
    # Fill na in bmi>30 column based on bmi col
    clinical_data.loc[clinical_data.BMI>30, 'BMI >30'] = 1
    clinical_data.loc[clinical_data.BMI<=30, 'BMI >30'] = -1
    
    clinical_data['Sex'] = np.where(clinical_data['Sex']=='Male',1,-1)
    clinical_data['Met Sx'] = np.where(clinical_data['Met Sx']=='Y',1,-1) 

    # Treat no data in tobacco as no tobacco usage
    clinical_data['Tobacco'] = np.where(clinical_data['Tobacco']=='Yes',1,-1) 

    for col in l[l.index('CVD DX'):l.index('DEATH [d from CT]')]:      
      clinical_data.loc[clinical_data[col].notna(), col] = 1
      clinical_data[col].fillna(value=0, inplace=True)
  
    return clinical_data

In [10]:
clinical_data = preprocess_data(oppScrData)

In [11]:
sc = StandardScaler()

non_null_death_rows = clinical_data[~clinical_data['DEATH [d from CT]'].isnull()]

# Convert to years
non_null_death_rows['DEATH [d from CT]'] /= 365

m = non_null_death_rows.columns.get_loc('DEATH [d from CT]')

X = non_null_death_rows.iloc[:, 0:m]
y = non_null_death_rows.iloc[:, m]
X = pd.DataFrame(sc.fit_transform(X), columns = X.columns)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


**Train**

In [12]:
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2)

knn = KNeighborsRegressor()
param_grid2 = dict(weights = ["uniform", "distance"], n_neighbors=list(range(2, 14)))
    
grid = GridSearchCV(knn, param_grid2)
grid.fit(X_train, y_train)
p3 = grid.predict(X_test)
print("MAE: %.2f" %(mean_absolute_error(y_test, p3)))
print(grid.best_params_)


MAE: 3.13
{'n_neighbors': 13, 'weights': 'uniform'}


**Fill death column**

In [None]:
clinical_data = preprocess_data(oppScrData)

for idx, row in clinical_data.iterrows():  
  if np.isnan(clinical_data.at[idx,'DEATH [d from CT]']):
    l = ['BMI','BMI >30', 'Sex', 'Tobacco', 'Met Sx', 'Age at CT', 'CVD DX', 'Heart failure DX',
                                       'MI DX','Type 2 Diabetes DX', 'Primary Cancer Site', 'Humerus fracture DX',
                                       'Femoral neck fracture DX', 'Unspec femoral fracture DX', 'Forearm fracture DX'
                                       ,'Primary Cancer Site 2','Pathologic fracture DX']
    row = clinical_data.iloc[idx]     
    r = row.filter(l).to_numpy().reshape(1, -1) 
    clinical_data.at[idx,'DEATH [d from CT]'] = (grid.predict(r))
  else:
    clinical_data.at[idx,'DEATH [d from CT]'] /= 365

In [14]:
oppScrData['DEATH [d from CT]'] = clinical_data['DEATH [d from CT]']

**Create X and Y for prediction**

In [19]:
def preprocess_clinical_data(oppScrData, mean=True):
    clinical_data = oppScrData.filter(['Record ID','BMI','BMI >30', 'Clinical F/U interval  [d from CT]', 'Sex', 'Tobacco', 'Met Sx', 'FRAX 10y Fx Prob (Orange-w/ DXA)',
                                'FRAX 10y Hip Fx Prob (Orange-w/ DXA)','FRS 10-year risk (%)','Age at CT' ], axis=1)
    # Replace all _,X,blanks with nan
    clinical_data = clinical_data.replace(r'_', np.nan, regex=True)
    clinical_data = clinical_data.replace(r'X', np.nan, regex=True)
    clinical_data = clinical_data.replace(r'^\s*$', np.nan, regex=True)

    # Fill na in bmi column with mean
    clinical_data['BMI'].fillna(value=clinical_data['BMI'].mean(skipna=True), inplace=True)
    
    # Fill na in bmi>30 column based on bmi col
    clinical_data.loc[clinical_data.BMI>30, 'BMI >30'] = 1
    clinical_data.loc[clinical_data.BMI<=30, 'BMI >30'] = -1
    
    clinical_data['Sex'] = np.where(clinical_data['Sex']=='Male',1,-1)
    clinical_data['Met Sx'] = np.where(clinical_data['Met Sx']=='Y',1,-1) 

    # Treat no data in tobacco as no tobacco usage 
    clinical_data['Tobacco'] = np.where(clinical_data['Tobacco']=='Yes',1,-1) 
  
    clinical_data['FRS 10-year risk (%)'] = clinical_data['FRS 10-year risk (%)'].replace("<1", 0.01, regex=True)
    clinical_data['FRS 10-year risk (%)'] = clinical_data['FRS 10-year risk (%)'].replace(">30", 0.30, regex=True)
    clinical_data['FRS 10-year risk (%)'] =  clinical_data['FRS 10-year risk (%)'] * 100
   
    cols_to_be_filled = ['FRAX 10y Fx Prob (Orange-w/ DXA)','FRAX 10y Hip Fx Prob (Orange-w/ DXA)','FRS 10-year risk (%)']
    for c in cols_to_be_filled:
      if mean:  
        clinical_data[c].fillna(value=clinical_data[c].mean(skipna=True), inplace=True)
      else :
        clinical_data[c].fillna(value=clinical_data[c].median(skipna=True), inplace=True)
    
    return clinical_data

In [18]:
def clean_ct_data(oppScrData):
    # Delete rows with empty values
    cols = ["Record ID", "L1_HU_BMD", "TAT Area (cm2)", 'Total Body                Area EA (cm2)',
       'VAT Area (cm2)', 'SAT Area (cm2)', 'VAT/SAT     Ratio', 'Muscle HU',
       ' Muscle Area (cm2)', 'L3 SMI (cm2/m2)', 'AoCa        Agatston',
       'Liver HU    (Median)','DEATH [d from CT]']
    ct_data= oppScrData[cols]
    n = ct_data.shape[0]
    preprocessed_ct_data = pd.DataFrame( columns=cols)
    for i in range(n):
        row = ct_data.loc[i]
        ignore = False
        for k,j in enumerate(row):
          if pd.isna(j) or j == ' ': 
            if k == 12:              
              ignore = False
            else:
              ignore = True
              break
        if not ignore:
          preprocessed_ct_data.loc[i]= row
    return preprocessed_ct_data

In [20]:
clinical_data = preprocess_clinical_data(oppScrData)
ct_data = clean_ct_data(oppScrData)
combined_data = clinical_data.merge(ct_data, on='Record ID', how='inner' )
combined_data.drop(columns=['Record ID'], inplace=True)
combined_data.drop(columns=['Clinical F/U interval  [d from CT]'], inplace=True)

**Build regressor**

In [24]:
y_idx = combined_data.columns.get_loc("DEATH [d from CT]")
y = combined_data.iloc[:,y_idx]
X = combined_data.iloc[:, 0:y_idx]

In [26]:
y

0       0.983562
1       4.617914
2       6.441096
3       1.802740
4       3.941201
          ...   
8873    4.442150
8874    4.060485
8875    4.220653
8876    4.373446
8877    3.771338
Name: DEATH [d from CT], Length: 8878, dtype: float64

In [27]:
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2)

knn = KNeighborsRegressor(n_neighbors=9)
param_grid2 = dict(weights = ["uniform", "distance"])
    
grid = GridSearchCV(knn, param_grid2)
grid.fit(X_train, y_train)
p3 = grid.predict(X_test) 
Mse = mean_squared_error(y_test, p3)

mape=mean_absolute_percentage_error(y_test, p3)
print("RMSE: %.2f" % (Mse**(1/2.0)))
print("MApe: %.2f" %(mape))
print("MAE: %.2f" %(mean_absolute_error(y_test, p3)))


RMSE: 1.14
MApe: 0.37
MAE: 0.53
