In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import scale
from scipy import stats

# Data Explanation
### Predictor:
#### num: diagnosis of heart disease (angiographic disease status) 
- Value 0: < 50% diameter narrowing 
- Value 1: > 50% diameter narrowing 

### Only 14 attributes used: 

#### age: age in years 
#### sex: sex (1 = male; 0 = female) 
#### cp: chest pain type 
- Value 1: typical angina 
- Value 2: atypical angina 
- Value 3: non-anginal pain 
- Value 4: asymptomatic 
#### trestbps: resting blood pressure (in mm Hg on admission to the hospital) 
#### chol: serum cholestoral in mg/dl 
#### fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) 
#### restecg: resting electrocardiographic results 
- Value 0: normal 
- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) 
- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria 
#### thalach: maximum heart rate achieved 
#### exang: exercise induced angina (1 = yes; 0 = no) 
#### oldpeak = ST depression induced by exercise relative to rest 
#### slope: the slope of the peak exercise ST segment 
- Value 1: upsloping 
- Value 2: flat 
- Value 3: downsloping 
#### ca: number of major vessels (0-3) colored by flourosopy 
#### thal: 3 = normal; 6 = fixed defect; 7 = reversable defect 

In [2]:
data1 = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.switzerland.data",header=None)
data2 = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/reprocessed.hungarian.data',header=None,delim_whitespace=True)
data3 = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data',header=None)
data4 = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.va.data',header=None)
data = data1.append(data2).append(data3).append(data4)
data.columns=['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','num']
data['num'] = data['num'].replace([1, 2, 3, 4, 5, 6], 1)
data.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
5,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0
6,62.0,0.0,4.0,140.0,268.0,0.0,2.0,160.0,0.0,3.6,3.0,2.0,3.0,1
7,57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0
8,63.0,1.0,4.0,130.0,254.0,0.0,2.0,147.0,0.0,1.4,2.0,1.0,7.0,1
9,53.0,1.0,4.0,140.0,203.0,1.0,2.0,155.0,1.0,3.1,3.0,0.0,7.0,1


In [3]:
data.shape

(303, 14)

In [4]:
data.isnull().sum().sum()

0

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null float64
sex         303 non-null float64
cp          303 non-null float64
trestbps    303 non-null float64
chol        303 non-null float64
fbs         303 non-null float64
restecg     303 non-null float64
thalach     303 non-null float64
exang       303 non-null float64
oldpeak     303 non-null float64
slope       303 non-null float64
ca          303 non-null object
thal        303 non-null object
num         303 non-null int64
dtypes: float64(11), int64(1), object(2)
memory usage: 33.2+ KB


In [6]:
data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,num
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.458746
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.49912
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,1.0


In [12]:
print('The sum of all the ? symbols is',data.trestbps.str.contains('\?').sum())
print('The sum of all the ? symbols is',data.chol.str.contains('\?').sum())
print('The sum of all the ? symbols is',data.fbs.str.contains('\?').sum())
print('The sum of all the ? symbols is',data.restecg.str.contains('\?').sum())
print('The sum of all the ? symbols is',data.thalach.str.contains('\?').sum())
print('The sum of all the ? symbols is',data.exang.str.contains('\?').sum())
print('The sum of all the ? symbols is',data.oldpeak.str.contains('\?').sum())
print('The sum of all the ? symbols is',data.slope.str.contains('\?').sum())
print('The sum of all the ? symbols is',data.ca.str.contains('\?').sum())
print('The sum of all the ? symbols is',data.thal.str.contains('\?').sum())

The sum of all the ? symbols is 4
The sum of all the ? symbols is 2


In [13]:
data.trestbps.replace('?',np.nan,inplace=True)
data.chol.replace('?',np.nan,inplace=True)
data.fbs.replace('?',np.nan,inplace=True)
data.restecg.replace('?',np.nan,inplace=True)
data.thalach.replace('?',np.nan,inplace=True)
data.exang.replace('?',np.nan,inplace=True)
data.oldpeak.replace('?',np.nan,inplace=True)
data.slope.replace('?',np.nan,inplace=True)
data.ca.replace('?',np.nan,inplace=True)
data.thal.replace('?',np.nan,inplace=True)
data.isnull().sum().sum()

6

In [14]:
#data.dropna(inplace=True)
#data.isnull().sum().sum()

In [15]:
data.drop_duplicates(inplace=True)

In [16]:
np.asarray(data.trestbps.dropna(), dtype=np.float).mean()

131.68976897689768

In [17]:
data.trestbps.fillna(np.asarray(data.trestbps.dropna(), dtype=np.float).mean(),inplace=True)

In [18]:
print(stats.mode(np.asarray(data.restecg.dropna(),dtype= np.int)).mode)

[0]


In [19]:
np.asarray(data.chol.dropna(), dtype=np.float).mean()

246.69306930693068

In [20]:
data.chol.fillna(np.asarray(data.chol.dropna(), dtype=np.float).mean(),inplace=True)

In [21]:
data.isnull().sum().sum()

6

In [22]:
print(stats.mode(np.asarray(data.fbs.dropna(),dtype= np.int)).mode[0])

0


In [23]:
data.fbs.fillna(stats.mode(np.asarray(data.fbs.dropna(),dtype= np.int)).mode[0],inplace=True)

In [24]:
data.isnull().sum().sum()

6

In [25]:
print(stats.mode(np.asarray(data.restecg.dropna(),dtype= np.int)))

ModeResult(mode=array([0]), count=array([151]))


In [26]:
print(stats.mode(np.asarray(data.restecg.dropna(),dtype= np.int)).mode[0])
data.restecg.fillna(stats.mode(np.asarray(data.restecg.dropna(),dtype= np.int)).mode[0],inplace=True)

0


In [27]:
data.isnull().sum().sum()

6

In [28]:
np.asarray(data.thalach.dropna(), dtype=np.float).mean()

149.6072607260726

In [29]:
data.thalach.fillna(np.asarray(data.thalach.dropna(), dtype=np.float).mean(),inplace=True)

In [30]:
data.isnull().sum().sum()

6

In [31]:
print(stats.mode(np.asarray(data.exang.dropna(),dtype= np.int)))

ModeResult(mode=array([0]), count=array([204]))


In [32]:
print(stats.mode(np.asarray(data.exang.dropna(),dtype= np.int)).mode[0])
data.exang.fillna(stats.mode(np.asarray(data.exang.dropna(),dtype= np.int)).mode[0],inplace=True)

0


In [33]:
data.isnull().sum().sum()

6

In [34]:
np.asarray(data.oldpeak.dropna(), dtype=np.float).mean()

1.0396039603960396

In [35]:
data.oldpeak.fillna(np.asarray(data.oldpeak.dropna(), dtype=np.float).mean(),inplace=True)

In [36]:
data.isnull().sum().sum()

6

In [37]:
print(stats.mode(np.asarray(data.slope.dropna(),dtype= np.int)))

ModeResult(mode=array([1]), count=array([142]))


In [38]:
print(stats.mode(np.asarray(data.slope.dropna(),dtype= np.int)).mode[0])
data.slope.fillna(stats.mode(np.asarray(data.slope.dropna(),dtype= np.int)).mode[0],inplace=True)

1


In [39]:
data.isnull().sum().sum()

6

In [40]:
print(stats.mode(np.asarray(data.ca.dropna(),dtype= np.float)))

ModeResult(mode=array([0.]), count=array([176]))


In [41]:
print(stats.mode(np.asarray(data.ca.dropna(),dtype= np.float)).mode[0])
data.ca.fillna(stats.mode(np.asarray(data.ca.dropna(),dtype= np.float)).mode[0],inplace=True)

0.0


In [42]:
data.isnull().sum().sum()

2

In [43]:
print(stats.mode(np.asarray(data.thal.dropna(),dtype= np.float)))

ModeResult(mode=array([3.]), count=array([166]))


In [44]:
print(stats.mode(np.asarray(data.thal.dropna(),dtype= np.float)).mode[0])
data.thal.fillna(stats.mode(np.asarray(data.thal.dropna(),dtype= np.float)).mode[0],inplace=True)

3.0


In [53]:
data.isnull().sum().sum()

0

In [46]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [47]:
# converting all the data to numeric data type
data.trestbps=pd.to_numeric(data.trestbps)
data.fbs=pd.to_numeric(data.fbs)
data.chol=pd.to_numeric(data.chol)
data.restecg=pd.to_numeric(data.restecg)
data.thalach=pd.to_numeric(data.thalach)
data.exang=pd.to_numeric(data.exang)
data.oldpeak=pd.to_numeric(data.oldpeak)
data.slope=pd.to_numeric(data.slope)
data.ca=pd.to_numeric(data.ca)
data.thal=pd.to_numeric(data.thal)

In [48]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null float64
sex         303 non-null float64
cp          303 non-null float64
trestbps    303 non-null float64
chol        303 non-null float64
fbs         303 non-null float64
restecg     303 non-null float64
thalach     303 non-null float64
exang       303 non-null float64
oldpeak     303 non-null float64
slope       303 non-null float64
ca          303 non-null float64
thal        303 non-null float64
num         303 non-null int64
dtypes: float64(13), int64(1)
memory usage: 35.5 KB


In [49]:
data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.663366,4.722772,0.458746
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.934375,1.938383,0.49912
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,3.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,3.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0,1.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,1.0


In [50]:
data.corr()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
age,1.0,-0.097542,0.104139,0.284946,0.20895,0.11853,0.148868,-0.393806,0.091661,0.203805,0.16177,0.365323,0.128303,0.22312
sex,-0.097542,1.0,0.010084,-0.064456,-0.199915,0.047862,0.021647,-0.048663,0.146201,0.102173,0.037533,0.086048,0.380581,0.276816
cp,0.104139,0.010084,1.0,-0.036077,0.072319,-0.039975,0.067505,-0.334422,0.38406,0.202277,0.15205,0.233117,0.262089,0.414446
trestbps,0.284946,-0.064456,-0.036077,1.0,0.13012,0.17534,0.14656,-0.045351,0.064762,0.189171,0.117382,0.097528,0.134424,0.150825
chol,0.20895,-0.199915,0.072319,0.13012,1.0,0.009841,0.171043,-0.003432,0.06131,0.046564,-0.004062,0.123726,0.018351,0.085164
fbs,0.11853,0.047862,-0.039975,0.17534,0.009841,1.0,0.069564,-0.007854,0.025665,0.005747,0.059894,0.140764,0.064625,0.025264
restecg,0.148868,0.021647,0.067505,0.14656,0.171043,0.069564,1.0,-0.083389,0.084867,0.114133,0.133946,0.131749,0.024325,0.169202
thalach,-0.393806,-0.048663,-0.334422,-0.045351,-0.003432,-0.007854,-0.083389,1.0,-0.378103,-0.343085,-0.385601,-0.265699,-0.274142,-0.417167
exang,0.091661,0.146201,0.38406,0.064762,0.06131,0.025665,0.084867,-0.378103,1.0,0.288223,0.257748,0.145788,0.32524,0.431894
oldpeak,0.203805,0.102173,0.202277,0.189171,0.046564,0.005747,0.114133,-0.343085,0.288223,1.0,0.577537,0.301067,0.342405,0.42451


In [51]:
data = pd.concat([data, pd.get_dummies(data["restecg"],prefix="resg")], axis=1)
data = pd.concat([data,pd.get_dummies(data["slope"],prefix="slope")], axis=1)
data = pd.concat([data,pd.get_dummies(data["thal"],prefix="thal")], axis=1)
data = pd.concat([data,pd.get_dummies(data["cp"],prefix="cp")], axis=1)
data.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num', 'resg_0.0',
       'resg_1.0', 'resg_2.0', 'slope_1.0', 'slope_2.0', 'slope_3.0',
       'thal_3.0', 'thal_6.0', 'thal_7.0', 'cp_1.0', 'cp_2.0', 'cp_3.0',
       'cp_4.0'],
      dtype='object')

In [52]:
X = data.loc[:,['age', 'sex',  'trestbps', 'chol', 'fbs',  'thalach',
       'exang', 'oldpeak', 'ca',  'thal_-9.0',
       'thal_3.0', 'thal_6.0', 'thal_7.0', 'resg_-9.0', 'resg_0.0', 'resg_1.0',
       'resg_2.0', 'slope_-9.0', 'slope_1.0', 'slope_2.0', 'slope_3.0',
       'thal_-9.0', 'thal_3.0', 'thal_6.0', 'thal_7.0', 'cp_1.0', 'cp_2.0',
       'cp_3.0', 'cp_4.0']]
X_scaled = pd.DataFrame(scale(X))
pca= PCA()
X_pca = pd.DataFrame(pca.fit_transform(X))
y = data.num
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.3,random_state=10)
X_train_scaled,X_test_scaled,y_train_scaled,y_test_scaled = train_test_split(X_scaled,y, test_size=0.3,random_state=10)
X_train_pca,X_test_pca,y_train_pca,y_test_pca = train_test_split(X_pca,y, test_size=0.3,random_state=10)

#y = pd.DataFrame(scale(y))

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [54]:
model_logr = LogisticRegression()
model_logr.fit(X_train,y_train)
y_predict = model_logr.predict(X_test)
print(pd.crosstab(y_test,y_predict))
print(confusion_matrix(y_test,y_predict))
print(accuracy_score(y_test,y_predict))

NameError: name 'X_train' is not defined

In [55]:
model_logr = LogisticRegression()
model_logr.fit(X_train_scaled,y_train_scaled)
y_predict = model_logr.predict(X_test_scaled)
print(pd.crosstab(y_test_scaled,y_predict))
print(confusion_matrix(y_test_scaled,y_predict))
print(accuracy_score(y_test_scaled,y_predict))

NameError: name 'X_train_scaled' is not defined

In [66]:
model_logr = LogisticRegression()
model_logr.fit(X_train_pca,y_train_pca)
y_predict = model_logr.predict(X_test_pca)
print(pd.crosstab(y_test_pca,y_predict))
print(confusion_matrix(y_test_pca,y_predict))
print(accuracy_score(y_test_pca,y_predict))

col_0    0    1
num            
0      101   30
1       18  127
[[101  30]
 [ 18 127]]
0.8260869565217391


In [67]:
parameters = {'n_estimators': [4,6,9],
             'max_features': ['log2', 'sqrt', 'auto',3],
             'criterion': ['entropy', 'gini'],
             'max_depth': [2, 3, 5, 10],
             'min_samples_split': [2, 3, 5],
             'min_samples_leaf': [1, 5, 8]}

acc_scorer = make_scorer(accuracy_score)
clf = RandomForestClassifier()
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)
clf = grid_obj.best_estimator_
clf.fit(X_train, y_train)

predictions = clf.predict(X_test)
print(accuracy_score(y_test, predictions))

#model_rfc = RandomForestClassifier(max_features=3,random_state=4)
#model_rfc.fit(X_train,y_train)
#y_predict = model_rfc.predict(X_test)
#print(accuracy_score(y_test,y_predict))
#print(pd.crosstab(y_test,y_predict))
#print(confusion_matrix(y_test,y_predict))

0.8043478260869565


In [84]:
#model_rfc = RandomForestClassifier(max_features=3,random_state=4)
#model_rfc.fit(X_train_scaled,y_train_scaled)

parameters = {'max_features': [3],
             
             'random_state': [4]}

acc_scorer = make_scorer(accuracy_score)
clf = RandomForestClassifier()
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train_scaled, y_train_scaled)
model_rfc = grid_obj.best_estimator_
model_rfc.fit(X_train_scaled, y_train_scaled)

y_predict = model_rfc.predict(X_test_scaled)
print(accuracy_score(y_test_scaled,y_predict))
print(pd.crosstab(y_test_scaled,y_predict))
print(confusion_matrix(y_test_scaled,y_predict))

0.8369565217391305
col_0    0    1
num            
0      107   24
1       21  124
[[107  24]
 [ 21 124]]


In [69]:
parameters = {'n_estimators': [4,6,9],
             'max_features': ['log2', 'sqrt', 'auto',3],
             'criterion': ['entropy', 'gini'],
             'max_depth': [2, 3, 5, 10],
             'min_samples_split': [2, 3, 5],
             'min_samples_leaf': [1, 5, 8]}

acc_scorer = make_scorer(accuracy_score)
clf = RandomForestClassifier()
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train_pca, y_train_pca)
clf = grid_obj.best_estimator_
clf.fit(X_train_pca, y_train_pca)

predictions = clf.predict(X_test_pca)
print(accuracy_score(y_test_pca, predictions))

0.7789855072463768


In [70]:
model_knn = KNeighborsClassifier(n_neighbors = 2, metric='euclidean')
model_knn.fit(X_train,y_train)
y_predict = model_knn.predict(X_test)
print(confusion_matrix(y_test,y_predict))
print(accuracy_score(y_test,y_predict))

[[97 34]
 [67 78]]
0.6340579710144928


In [71]:
model_knn = KNeighborsClassifier(n_neighbors = 2, metric='euclidean')
model_knn.fit(X_train_scaled,y_train_scaled)
y_predict = model_knn.predict(X_test_scaled)
print(confusion_matrix(y_test_scaled,y_predict))
print(accuracy_score(y_test_scaled,y_predict))

[[112  19]
 [ 53  92]]
0.7391304347826086


In [72]:
model_knn = KNeighborsClassifier(n_neighbors = 2, metric='euclidean')
model_knn.fit(X_train_pca,y_train_pca)
y_predict = model_knn.predict(X_test_pca)
print(confusion_matrix(y_test_pca,y_predict))
print(accuracy_score(y_test_pca,y_predict))

[[97 34]
 [67 78]]
0.6340579710144928


In [73]:
model_dt = DecisionTreeClassifier()
model_dt.fit(X_train,y_train)
y_predict = model_dt.predict(X_test)
print(accuracy_score(y_test,y_predict))
print(confusion_matrix(y_test,y_predict))

0.7355072463768116
[[ 95  36]
 [ 37 108]]


In [74]:
model_dt = DecisionTreeClassifier()
model_dt.fit(X_train_scaled,y_train_scaled)
y_predict = model_dt.predict(X_test_scaled)
print(accuracy_score(y_test_scaled,y_predict))
print(confusion_matrix(y_test_scaled,y_predict))

0.7391304347826086
[[ 94  37]
 [ 35 110]]


In [75]:
model_dt = DecisionTreeClassifier()
model_dt.fit(X_train_pca,y_train_pca)
y_predict = model_dt.predict(X_test_pca)
print(accuracy_score(y_test_pca,y_predict))
print(confusion_matrix(y_test_pca,y_predict))

0.7318840579710145
[[ 94  37]
 [ 37 108]]


In [76]:
model = SVC(C=1000.0, kernel='rbf', gamma=0.1)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
print(accuracy_score(y_test,y_predict))
print(pd.crosstab(y_test,y_predict))

0.5362318840579711
col_0  0    1
num          
0      3  128
1      0  145


In [77]:
model = SVC(C=1000.0, kernel='rbf', gamma=0.1)
model.fit(X_train_scaled, y_train_scaled)
y_predict = model.predict(X_test_scaled)
print(accuracy_score(y_test_scaled,y_predict))
print(pd.crosstab(y_test_scaled,y_predict))

0.7971014492753623
col_0   0    1
num           
0      99   32
1      24  121
