# Build a model that can predict whether a patient (has) survived at least 2 years.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.pipeline import Pipeline

In [3]:
names=['survival','still_alive','age_at_heart_attack','pericardial_effusion','fractional_shortening','epss','lvdd','wall_motion_score','wall_motion_index','mult','name','group','alive_at_1']
df=pd.read_csv('echocardiogram.data',header=None,names=names)

In [4]:
df=df.replace('?',np.nan)

In [5]:
df.isna().sum()

survival                  0
still_alive               0
age_at_heart_attack       2
pericardial_effusion      0
fractional_shortening     7
epss                     14
lvdd                     10
wall_motion_score         3
wall_motion_index         1
mult                      3
name                      0
group                     2
alive_at_1               37
dtype: int64

In [None]:
plt.hist(df)
plt.show()

## As per the requirement the columns mult, name, group can be ignored

In [6]:
df=df.drop(['mult','name','group'],axis=1)

In [7]:
df['age_at_heart_attack']=df['age_at_heart_attack'].astype(float)
df['fractional_shortening']=df['fractional_shortening'].astype(float)
df['epss']=df['epss'].astype(float)
df['lvdd']=df['lvdd'].astype(float)
df['wall_motion_score']=df['wall_motion_score'].astype(float)
df['wall_motion_index']=df['wall_motion_index'].astype(float)
df['alive_at_1']=df['alive_at_1'].astype(float)

### Imputing null values

In [8]:
print(df.columns)
from sklearn.impute import SimpleImputer

Index(['survival', 'still_alive', 'age_at_heart_attack',
       'pericardial_effusion', 'fractional_shortening', 'epss', 'lvdd',
       'wall_motion_score', 'wall_motion_index', 'alive_at_1'],
      dtype='object')


In [9]:
imputer=SimpleImputer(strategy='median')

In [10]:
columns=df.columns
features=columns[1:]
df[features] =imputer.fit_transform(df[features])

In [11]:
df.isna().sum()

survival                 0
still_alive              0
age_at_heart_attack      0
pericardial_effusion     0
fractional_shortening    0
epss                     0
lvdd                     0
wall_motion_score        0
wall_motion_index        0
alive_at_1               0
dtype: int64

## Feature Generation

Target Variable Generation using Survival and Still Alive features and dropping the records which are not to be used for building the model as per the requirement.

In [12]:
df['alive_at_2']=df.apply(lambda x:1 if (x['survival']>=24 and x['still_alive']==1) else 1 if (x['survival']>=24 and x['still_alive']==0) else 0,axis=1)

df=df.drop(df[(df['survival']<24) & (df['still_alive']==1)].index)

In [13]:
df.columns

Index(['survival', 'still_alive', 'age_at_heart_attack',
       'pericardial_effusion', 'fractional_shortening', 'epss', 'lvdd',
       'wall_motion_score', 'wall_motion_index', 'alive_at_1', 'alive_at_2'],
      dtype='object')

## Feature Removal
#As the features survival,still_alive, alive_at_1 are null in the test data, we are eliminating those features from the model.

In [14]:
df=df.drop(['survival','still_alive','alive_at_1'],axis=1)


In [15]:
x=df.drop(['alive_at_2'],axis=1)
y=df['alive_at_2']

In [16]:
y.value_counts()

1    52
0    19
Name: alive_at_2, dtype: int64

In [17]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(sampling_strategy='minority',random_state=27)
X,Y = sm.fit_sample(x, y)

In [18]:
Y.value_counts()

1    52
0    52
Name: alive_at_2, dtype: int64

## Splitting into train and test data

In [19]:
from sklearn.model_selection import train_test_split,RepeatedStratifiedKFold,cross_val_score
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, stratify=Y,random_state=50)

## Building the Model

In [20]:
from sklearn.ensemble import RandomForestClassifier


In [27]:
rfc=RandomForestClassifier(n_estimators=200)
rfc.fit(x_train,y_train)

RandomForestClassifier(n_estimators=200)

In [28]:
y_pred=rfc.predict(x_test)

In [29]:
from sklearn.metrics import f1_score,roc_auc_score
print(f1_score(y_pred,y_test))
print(roc_auc_score(y_pred,y_test))

0.7741935483870969
0.7823529411764706
