# classification

In [1]:
import pandas as pd
from scipy.io.arff import loadarff
import numpy as np
from numpy.random import seed
seed(42)
from sklearn.preprocessing import LabelEncoder
import re,os

#feature selection
from sklearn.feature_selection import SelectFromModel
from imblearn.datasets import fetch_datasets

#model selection imports
from sklearn.model_selection import train_test_split,cross_val_score,KFold,StratifiedKFold

#algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

#metrics
from sklearn.metrics import f1_score,roc_auc_score,accuracy_score

PATH = "data/"

# pima diabetes

In [17]:
df = pd.read_csv(os.path.join(PATH,"pima/diabetes.csv"))
y = df['Outcome']
df = df.drop(['Outcome'],axis=1)
X_train,X_test,y_train,y_test = train_test_split(df,y,random_state=42,test_size=0.3)

In [18]:
dt_clf = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf,df,y,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf,df,y,cv=10,scoring='f1')))

0.7018284347231716
0.5660823367518942


In [19]:
svm_clf = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf,df,y,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf,df,y,cv=10,scoring='f1')))

0.7382775119617225
0.551915423751622


In [20]:
dt_clf.fit(X_train,y_train)
sel = SelectFromModel(dt_clf,prefit=True)
# sel.fit(df,y)
print("features selected: ",sum(sel.get_support()))
selected_feat_df = df.loc[:,sel.get_support()]
# dt_clf.fit(X_train,y_train)
selected_feat_df['dt_output']=dt_clf.predict(df)
print(np.mean(cross_val_score(svm_clf,selected_feat_df,y,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf,selected_feat_df,y,cv=10,scoring='f1')))

features selected:  3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.7747095010252905
0.6432295924116992


# SPECT images

In [2]:
df_spect = pd.read_csv(os.path.join(PATH,"spect/SPECT.train"),header=None)
y_train_spect = df_spect[0]
X_train_spect = df_spect.drop([0],axis=1)
df_spect_test = pd.read_csv(os.path.join(PATH,"spect/SPECT.test"),header=None)
y_test_spect = df_spect_test[0]
X_test_spect = df_spect_test.drop([0],axis=1)
df_spect = df_spect.append(df_spect_test,ignore_index=True)
df_spect = df_spect.drop([0],axis=1)
y_spect = y_train_spect.append(y_test_spect,ignore_index=True)

In [22]:
dt_clf_spect = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_spect,df_spect,y_spect,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_spect,df_spect,y_spect,cv=10,scoring='f1')))

0.7598087098087098
0.837036168734322


In [25]:
svm_clf_spect = SVC(random_state=42,gamma='scale')
print(np.mean(cross_val_score(svm_clf_spect,df_spect,y_spect,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_spect,df_spect,y_spect,cv=10,scoring='f1')))

0.823819698819699
0.8904873222018754


In [26]:
dt_clf_spect.fit(X_train_spect,y_train_spect)
sel = SelectFromModel(dt_clf_spect,prefit=True)
# sel.fit(df_spect,y_spect)
print("features selected: ",sum(sel.get_support()))
selected_feat_df_spect = df_spect.loc[:,sel.get_support()]
# dt_clf_spect.fit(X_train_spect,y_train_spect)
selected_feat_df_spect['dt_output']=dt_clf_spect.predict(df_spect)
print(np.mean(cross_val_score(svm_clf_spect,selected_feat_df_spect,y_spect,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_spect,selected_feat_df_spect,y_spect,cv=10,scoring='f1')))

features selected:  6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.8154558404558404
0.880510805357041


# Breast Cancer

In [57]:
df_wcancer = pd.read_csv(os.path.join(PATH,"breast_cancer/breast_cancer_data.csv"))
df_wcancer = df_wcancer.drop(['id','Unnamed: 32'],axis=1)
y_wcancer = df_wcancer['diagnosis']
le_wcancer = LabelEncoder()
y_wcancer = le_wcancer.fit_transform(y_wcancer)
df_wcancer = df_wcancer.drop(['diagnosis'],axis=1)
X_train_wcancer,X_test_wcancer,y_train_wcancer,y_test_wcancer = train_test_split(df_wcancer,y_wcancer,random_state=42)

In [58]:
dt_clf_wcancer = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_wcancer,df_wcancer,y_wcancer,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_wcancer,df_wcancer,y_wcancer,cv=10,scoring='f1')))

0.9281220292109584
0.9037729964676375


In [59]:
svm_clf_wcancer = SVC(random_state=42,gamma='scale')
print(np.mean(cross_val_score(svm_clf_wcancer,df_wcancer,y_wcancer,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_wcancer,df_wcancer,y_wcancer,cv=10,scoring='f1')))

0.9367999740731138
0.9135985288424313


In [31]:
dt_clf_wcancer.fit(X_train_wcancer,y_train_wcancer)
sel_wcancer = SelectFromModel(dt_clf_wcancer,prefit=True)
# sel_wcancer.fit(X_train_wcancer,y_train_wcancer)
print("features selected: ",sum(sel_wcancer.get_support()))
selected_feat_df_wcancer = df_wcancer.loc[:,sel_wcancer.get_support()]
# dt_clf_wcancer.fit(X_train_wcancer,y_train_wcancer)
selected_feat_df_wcancer['dt_output']=dt_clf_wcancer.predict(df_wcancer)
print(np.mean(cross_val_score(svm_clf_wcancer,selected_feat_df_wcancer,y_wcancer,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_wcancer,selected_feat_df_wcancer,y_wcancer,cv=10,scoring='f1')))

features selected:  4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9158996629504796
0.8785589442649776


# Statlog Heart 

In [33]:
df_heart = pd.read_csv(os.path.join(PATH,"statlog_heart/heart.dat"),header=None,sep=" ")
y_heart = df_heart[13]
y_heart = y_heart.map({1:0,2:1})
df_heart = df_heart.drop([13],axis=1)
X_train_heart,X_test_heart,y_train_heart,y_test_heart = train_test_split(df_heart,y_heart,random_state=42)

In [34]:
dt_clf_heart = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_heart,df_heart,y_heart,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_heart,df_heart,y_heart,cv=10,scoring='f1')))

0.7444444444444444
0.7014562405134488


In [35]:
svm_clf_heart = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_heart,df_heart,y_heart,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_heart,df_heart,y_heart,cv=10,scoring='f1')))

0.6814814814814815
0.6180819582993495


In [36]:
dt_clf_heart.fit(X_train_heart,y_train_heart)
sel_heart = SelectFromModel(dt_clf_heart,prefit=True)
# sel_heart.fit(df_heart,y_heart)
print("features selected: ",sum(sel_heart.get_support()))
selected_feat_df_heart = df_heart.loc[:,sel_heart.get_support()]
# dt_clf_heart.fit(X_train_heart,y_train_heart)
selected_feat_df_heart['dt_output']=dt_clf_heart.predict(df_heart)
print(np.mean(cross_val_score(svm_clf_heart,selected_feat_df_heart,y_heart,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_heart,selected_feat_df_heart,y_heart,cv=10,scoring='f1')))

features selected:  4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.6703703703703704
0.594139449414049


# Hepatitis

In [38]:
df_hepat = pd.read_csv(os.path.join(PATH,"hepatitis/hepatitis.data"),header=None,sep=",")
df_hepat = df_hepat.replace('?',np.NAN)
df_hepat = df_hepat.dropna(axis=0)
df_hepat = df_hepat.reset_index(drop=True)
df_hepat = pd.get_dummies(df_hepat)

y_hepat = df_hepat[0]
df_hepat = df_hepat.drop([0],axis=1)
y_hepat = y_hepat.map({1:0,2:1})
X_train_hepat,X_test_hepat,y_train_hepat,y_test_hepat = train_test_split(df_hepat,y_hepat,random_state=42)

In [39]:
dt_clf_hepat = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_hepat,df_hepat,y_hepat,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_hepat,df_hepat,y_hepat,cv=10,scoring='f1')))

0.7089285714285715
0.8096536796536796


In [40]:
svm_clf_hepat = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_hepat,df_hepat,y_hepat,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_hepat,df_hepat,y_hepat,cv=10,scoring='f1')))

0.8404761904761904
0.9127564102564103


In [41]:
dt_clf_hepat.fit(X_train_hepat,y_train_hepat)
sel_hepat = SelectFromModel(dt_clf_hepat,prefit=True)
# sel_hepat.fit(df_hepat,y_hepat)
print("features selected: ",sum(sel_hepat.get_support()))
selected_feat_df_hepat = df_hepat.loc[:,sel_hepat.get_support()]
# dt_clf_hepat.fit(X_train_hepat,y_train_hepat)
selected_feat_df_hepat['dt_output']=dt_clf_hepat.predict(df_hepat)
print(np.mean(cross_val_score(svm_clf_hepat,selected_feat_df_hepat,y_hepat,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_hepat,selected_feat_df_hepat,y_hepat,cv=10,scoring='f1')))

features selected:  7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.8404761904761904
0.9127564102564103


# HypoThyroid

In [43]:
df_hypo = pd.read_csv(os.path.join(PATH,"hypothyroid/hypothyroid.data"),header=None)

df_hypo = df_hypo.replace('?',np.NAN)
df_hypo = df_hypo.drop([25],axis=1)
df_hypo = df_hypo.dropna(axis=0)
df_hypo = df_hypo.reset_index(drop=True)

y_hypo = df_hypo[0]
y_hypo = y_hypo.map({'hypothyroid':0,'negative':1})
df_hypo = df_hypo.drop([0],axis=1)
df_hypo = pd.get_dummies(df_hypo)
X_train_hypo,X_test_hypo,y_train_hypo,y_test_hypo = train_test_split(df_hypo,y_hypo,random_state=42)

In [44]:
dt_clf_hypo = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_hypo,df_hypo,y_hypo,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_hypo,df_hypo,y_hypo,cv=10,scoring='f1')))

0.9445020125503139
0.9705242559667727


In [45]:
svm_clf_hypo = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_hypo,df_hypo,y_hypo,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_hypo,df_hypo,y_hypo,cv=10,scoring='f1')))

0.9390043751093776
0.9685418777615119


In [46]:
dt_clf_hypo.fit(X_train_hypo,y_train_hypo)
sel_hypo = SelectFromModel(dt_clf_hypo,prefit=True)
# sel_hypo.fit(df_hypo,y_hypo)
print("features selected: ",sum(sel_hypo.get_support()))
selected_feat_df_hypo = df_hypo.loc[:,sel_hypo.get_support()]
# dt_clf_hypo.fit(X_train_hypo,y_train_hypo)
selected_feat_df_hypo['dt_output']=dt_clf_hypo.predict(df_hypo)
print(np.mean(cross_val_score(svm_clf_hypo,selected_feat_df_hypo,y_hypo,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_hypo,selected_feat_df_hypo,y_hypo,cv=10,scoring='f1')))

features selected:  72


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9900074376859422
0.9946821344699549


# Hungarian-14-Heart

In [60]:
df_hung = pd.read_csv(os.path.join(PATH,"hungarian_heart/hungarian.data"),header=None)

df_hung = df_hung.replace('?',np.NAN)
df_hung = df_hung.drop([10,11,12],axis=1)
df_hung = df_hung.dropna(axis=0)
df_hung = df_hung.reset_index(drop=True)

y_hung = df_hung[13]
df_hung = df_hung.drop([13],axis=1)
df_hung = pd.get_dummies(df_hung)
X_train_hung,X_test_hung,y_train_hung,y_test_hung = train_test_split(df_hung,y_hung,random_state=42)

In [61]:
dt_clf_hung = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_hung,df_hung,y_hung,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_hung,df_hung,y_hung,cv=10,scoring='f1')))

0.699925925925926
0.6130465587044533


In [65]:
svm_clf_hung = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_hung,df_hung,y_hung,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_hung,df_hung,y_hung,cv=10,scoring='f1_micro')))

0.6284273504273503
0.6284273504273503


In [66]:
dt_clf_hung.fit(X_train_hung,y_train_hung)
sel_hung = SelectFromModel(dt_clf_hung,prefit=True)
# sel_hung.fit(df_hung,y_hung)
print("features selected: ",sum(sel_hung.get_support()))
selected_feat_df_hung = df_hung.loc[:,sel_hung.get_support()]
# dt_clf_hung.fit(X_train_hung,y_train_hung)
selected_feat_df_hung['dt_output']=dt_clf_hung.predict(df_hung)
print(np.mean(cross_val_score(svm_clf_hung,selected_feat_df_hung,y_hung,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_hung,selected_feat_df_hung,y_hung,cv=10,scoring='f1')))

features selected:  32


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9425584045584046
0.9250543024227236


# Ionosphere

In [68]:
df_ion = pd.read_csv(os.path.join(PATH,"ionosphere/ionosphere.data"),header=None,sep=',')
y_ion = df_ion[34]
df_ion = df_ion.drop([34],axis=1)
y_ion = y_ion.map({'g':0,'b':1})
X_train_ion,X_test_ion,y_train_ion,y_test_ion = train_test_split(df_ion,y_ion,random_state=42)

In [69]:
dt_clf_ion = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_ion,df_ion,y_ion,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_ion,df_ion,y_ion,cv=10,scoring='f1')))

0.8647152194211017
0.803930213104126


In [71]:
svm_clf_ion = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_ion,df_ion,y_ion,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_ion,df_ion,y_ion,cv=10,scoring='f1')))

0.9406115779645191
0.9104268774703558


In [72]:
dt_clf_ion.fit(X_train_ion,y_train_ion)
sel_ion = SelectFromModel(dt_clf_ion,prefit=True)
# sel_ion.fit(df_ion,y_ion)
print("features selected: ",sum(sel_ion.get_support()))
selected_feat_df_ion = df_ion.loc[:,sel_ion.get_support()]
# dt_clf_ion.fit(X_train_ion,y_train_ion)
selected_feat_df_ion['dt_output']=dt_clf_ion.predict(df_ion)
print(np.mean(cross_val_score(svm_clf_ion,selected_feat_df_ion,y_ion,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_ion,selected_feat_df_ion,y_ion,cv=10,scoring='f1')))

features selected:  6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9776143790849673
0.9677621348056131


# Sonar

In [74]:
df_sonar = pd.read_csv(os.path.join(PATH,"sonar/sonar.all-data"),header=None)
y_sonar = df_sonar[60]
df_sonar = df_sonar.drop([60],axis=1)
y_sonar = y_sonar.map({'R':0,'M':1})
X_train_sonar,X_test_sonar,y_train_sonar,y_test_sonar = train_test_split(df_sonar,y_sonar,random_state=42)

In [75]:
dt_clf_sonar = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_sonar,df_sonar,y_sonar,cv=20,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_sonar,df_sonar,y_sonar,cv=20,scoring='f1')))

0.6531313131313131
0.6653138528138528


In [76]:
svm_clf_sonar = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_sonar,df_sonar,y_sonar,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_sonar,df_sonar,y_sonar,cv=10,scoring='f1')))

0.6161255411255411
0.6493208360267184


In [77]:
dt_clf_sonar.fit(X_train_sonar,y_train_sonar)
sel_sonar = SelectFromModel(dt_clf_sonar,prefit=True)
# sel_sonar.fit(df_sonar,y_sonar)
print("features selected: ",sum(sel_sonar.get_support()))
selected_feat_df_sonar = df_sonar.loc[:,sel_sonar.get_support()]
# dt_clf_sonar.fit(X_train_sonar,y_train_sonar)
selected_feat_df_sonar['dt_output']=dt_clf_sonar.predict(df_sonar)
print(np.mean(cross_val_score(svm_clf_sonar,selected_feat_df_sonar,y_sonar,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_sonar,selected_feat_df_sonar,y_sonar,cv=10,scoring='f1')))

features selected:  14
0.9285281385281385


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9309796725014117


# Vote

In [79]:
df_vote = pd.read_csv(os.path.join(PATH,"vote/house-votes-84.data"),header=None)

df_vote = df_vote.replace('?',np.NAN)
df_vote = df_vote.dropna(axis=0)
df_vote = df_vote.reset_index(drop=True)

y_vote = df_vote[0]
df_vote = df_vote.drop([0],axis=1)
y_vote = y_vote.map({'democrat':0,'republican':1})

df_vote = pd.get_dummies(df_vote)
X_train_vote,X_test_vote,y_train_vote,y_test_vote = train_test_split(df_vote,y_vote,random_state=42)

In [80]:
dt_clf_vote = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_vote,df_vote,y_vote,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_vote,df_vote,y_vote,cv=10,scoring='f1')))

0.9347002635046113
0.9289103192764522


In [81]:
svm_clf_vote = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_vote,df_vote,y_vote,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_vote,df_vote,y_vote,cv=10,scoring='f1')))

0.96933465085639
0.9687041219649914


In [82]:
dt_clf_vote.fit(X_train_vote,y_train_vote)
sel_vote = SelectFromModel(dt_clf_vote,prefit=True)
# sel_vote.fit(df_vote,y_vote)
print("features selected: ",sum(sel_vote.get_support()))
selected_feat_df_vote = df_vote.loc[:,sel_vote.get_support()]
# dt_clf_vote.fit(X_train_vote,y_train_vote)
selected_feat_df_vote['dt_output']=dt_clf_vote.predict(df_vote)
print(np.mean(cross_val_score(svm_clf_vote,selected_feat_df_vote,y_vote,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_vote,selected_feat_df_vote,y_vote,cv=10,scoring='f1')))

features selected:  2
0.9827898550724639


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.980952380952381


# Kr vs Kp

In [84]:
df_krkp = pd.read_csv(os.path.join(PATH,"krvskp/kr-vs-kp.data"),header=None)

y_krkp = df_krkp[36]
y_krkp = y_krkp.map({'won':0,'nowin':1})
df_krkp = df_krkp.drop([36],axis=1)

df_krkp = pd.get_dummies(df_krkp)
X_train_krkp,X_test_krkp,y_train_krkp,y_test_krkp = train_test_split(df_krkp,y_krkp,random_state=42)

In [85]:
dt_clf_krkp = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_krkp,df_krkp,y_krkp,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_krkp,df_krkp,y_krkp,cv=10,scoring='f1')))

0.9783835898838745
0.9761502600552785


In [86]:
svm_clf_krkp = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_krkp,df_krkp,y_krkp,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_krkp,df_krkp,y_krkp,cv=10,scoring='f1')))

0.9258444295755209
0.9172402992518884


In [87]:
dt_clf_krkp.fit(X_train_krkp,y_train_krkp)
sel_krkp = SelectFromModel(dt_clf_krkp,prefit=True)
# sel_krkp.fit(df_krkp,y_krkp)
print("features selected: ",sum(sel_krkp.get_support()))
selected_feat_df_krkp = df_krkp.loc[:,sel_krkp.get_support()]
# dt_clf_krkp.fit(X_train_krkp,y_train_krkp)
selected_feat_df_krkp['dt_output']=dt_clf_krkp.predict(df_krkp)
print(np.mean(cross_val_score(svm_clf_krkp,selected_feat_df_krkp,y_krkp,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_krkp,selected_feat_df_krkp,y_krkp,cv=10,scoring='f1')))

features selected:  8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9971796506870921
0.9970274814438115


# Thyroid-Sick

In [89]:
sick = fetch_datasets()['thyroid_sick']
df_sick = pd.DataFrame(sick.data)
y_sick = sick.target
y_sick = pd.Series(y_sick).map({-1:0,1:1})
X_train_sick,X_test_sick,y_train_sick,y_test_sick = train_test_split(df_sick,y_sick,random_state=42)

In [90]:
dt_clf_sick = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_sick,df_sick,y_sick,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_sick,df_sick,y_sick,cv=10,scoring='f1')))

0.9870054520131856
0.8962754210534044


In [92]:
svm_clf_sick = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_sick,df_sick,y_sick,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_sick,df_sick,y_sick,cv=10,scoring='f1')))

0.9387603843704291
0.9387603843704291


In [94]:
dt_clf_sick.fit(X_train_sick,y_train_sick)
sel_sick = SelectFromModel(dt_clf_sick,prefit=True)
# sel_sick.fit(df_sick,y_sick)
print("features selected: ",sum(sel_sick.get_support()))
selected_feat_df_sick = df_sick.loc[:,sel_sick.get_support()]
# dt_clf_sick.fit(X_train_sick,y_train_sick)
selected_feat_df_sick['dt_output']=dt_clf_sick.predict(df_sick)
print(np.mean(cross_val_score(svm_clf_sick,selected_feat_df_sick,y_sick,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_sick,selected_feat_df_sick,y_sick,cv=10,scoring='f1')))

features selected:  8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9411448527816466
0.9411448527816466


# Monks1

In [96]:
df_monks1 = pd.DataFrame()
df_monks1_train = pd.read_csv(os.path.join(PATH,"monks/monks-1.train"),header=None,sep=" ")
df_monks1_test = pd.read_csv(os.path.join(PATH,"monks/monks-1.test"),header=None,sep=" ")

df_monks1 = df_monks1.append(df_monks1_train,ignore_index=True)
df_monks1 = df_monks1.append(df_monks1_test,ignore_index=True)

y_monks1 = df_monks1[1]
df_monks1 = df_monks1.drop([0,1,8],axis=1)

X_train_monks1,X_test_monks1,y_train_monks1,y_test_monks1 = train_test_split(df_monks1,y_monks1,random_state=42)

In [97]:
dt_clf_monks1 = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_monks1,df_monks1,y_monks1,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_monks1,df_monks1,y_monks1,cv=10,scoring='f1')))

1.0
1.0


In [98]:
svm_clf_monks1 = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_monks1,df_monks1,y_monks1,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_monks1,df_monks1,y_monks1,cv=10,scoring='f1')))

0.8850529100529101
0.8541546285846447


In [99]:
dt_clf_monks1.fit(X_train_monks1,y_train_monks1)
sel_monks1 = SelectFromModel(dt_clf_monks1,prefit=True)
# sel_monks1.fit(df_monks1,y_monks1)
print("features selected: ",sum(sel_monks1.get_support()))
selected_feat_df_monks1 = df_monks1.loc[:,sel_monks1.get_support()]
# dt_clf_monks1.fit(X_train_monks1,y_train_monks1)
selected_feat_df_monks1['dt_output']=dt_clf_monks1.predict(df_monks1)
print(np.mean(cross_val_score(svm_clf_monks1,selected_feat_df_monks1,y_monks1,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_monks1,selected_feat_df_monks1,y_monks1,cv=10,scoring='f1')))

features selected:  3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9871693121693121
0.9869209010122615


# Monks2

In [101]:
df_monks2 = pd.DataFrame()
df_monks2_train = pd.read_csv(os.path.join(PATH,"monks/monks-2.train"),header=None,sep=" ")
df_monks2_test = pd.read_csv(os.path.join(PATH,"monks/monks-2.test"),header=None,sep=" ")

df_monks2 = df_monks2.append(df_monks2_train,ignore_index=True)
df_monks2 = df_monks2.append(df_monks2_test,ignore_index=True)

y_monks2 = df_monks2[1]
df_monks2 = df_monks2.drop([0,1,8],axis=1)

X_train_monks2,X_test_monks2,y_train_monks2,y_test_monks2 = train_test_split(df_monks2,y_monks2,random_state=42)

In [102]:
dt_clf_monks2 = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_monks2,df_monks2,y_monks2,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_monks2,df_monks2,y_monks2,cv=10,scoring='f1')))

0.9664925442252477
0.9510059975277366


In [106]:
svm_clf_monks2 = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_monks2,df_monks2,y_monks2,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_monks2,df_monks2,y_monks2,cv=10,scoring='f1')))

0.6506362878577383
0.6506362878577383


In [107]:
dt_clf_monks2.fit(X_train_monks2,y_train_monks2)
sel_monks2 = SelectFromModel(dt_clf_monks2,prefit=True)
# sel_monks2.fit(df_monks2,y_monks2)
print("features selected: ",sum(sel_monks2.get_support()))
selected_feat_df_monks2 = df_monks2.loc[:,sel_monks2.get_support()]
# dt_clf_monks2.fit(X_train_monks2,y_train_monks2)
selected_feat_df_monks2['dt_output']=dt_clf_monks2.predict(df_monks2)
print(np.mean(cross_val_score(svm_clf_monks2,selected_feat_df_monks2,y_monks2,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_monks2,selected_feat_df_monks2,y_monks2,cv=10,scoring='f1')))

features selected:  3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9916092433083264
0.9876681646959582


# Monks3

In [109]:
df_monks3 = pd.DataFrame()
df_monks3_train = pd.read_csv(os.path.join(PATH,"monks/monks-3.train"),header=None,sep=" ")
df_monks3_test = pd.read_csv(os.path.join(PATH,"monks/monks-3.test"),header=None,sep=" ")

df_monks3 = df_monks3.append(df_monks3_train,ignore_index=True)
df_monks3 = df_monks3.append(df_monks3_test,ignore_index=True)

y_monks3 = df_monks3[1]
df_monks3 = df_monks3.drop([0,1,8],axis=1)

X_train_monks3,X_test_monks3,y_train_monks3,y_test_monks3 = train_test_split(df_monks3,y_monks3,random_state=42)

In [110]:
dt_clf_monks3 = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_monks3,df_monks3,y_monks3,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_monks3,df_monks3,y_monks3,cv=10,scoring='f1')))

0.9638924963924964
0.9639681784677471


In [111]:
svm_clf_monks3 = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_monks3,df_monks3,y_monks3,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_monks3,df_monks3,y_monks3,cv=10,scoring='f1')))

0.9638912938912938
0.963921448921449


In [112]:
dt_clf_monks3.fit(X_train_monks3,y_train_monks3)
sel_monks3 = SelectFromModel(dt_clf_monks3,prefit=True)
# sel_monks2.fit(df_monks2,y_monks2)
print("features selected: ",sum(sel_monks3.get_support()))
selected_feat_df_monks3 = df_monks3.loc[:,sel_monks3.get_support()]
# dt_clf_monks2.fit(X_train_monks2,y_train_monks2)
selected_feat_df_monks3['dt_output']=dt_clf_monks3.predict(df_monks3)
print(np.mean(cross_val_score(svm_clf_monks3,selected_feat_df_monks3,y_monks3,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_monks3,selected_feat_df_monks3,y_monks3,cv=10,scoring='f1')))

features selected:  2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9874350649350649
0.987712926383756


# Mushroom

In [114]:
df_mushroom = pd.read_csv(os.path.join(PATH,"mushroom/mushrooms.csv"))
y_mushroom = df_mushroom['class']
df_mushroom = df_mushroom.drop(['class'],axis=1)
df_mushroom = pd.get_dummies(df_mushroom)
y_mushroom = y_mushroom.map({'p':0,'e':1})
X_train_mushroom,X_test_mushroom,y_train_mushroom,y_test_mushroom = train_test_split(df_mushroom,y_mushroom,random_state=42)

In [115]:
dt_clf_mushroom = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_mushroom,df_mushroom,y_mushroom,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_mushroom,df_mushroom,y_mushroom,cv=10,scoring='f1')))

0.9564284446790214
0.9635446889718182


In [116]:
svm_clf_mushroom = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_mushroom,df_mushroom,y_mushroom,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_mushroom,df_mushroom,y_mushroom,cv=10,scoring='f1')))

0.9438531993211423
0.9461093424144631


In [117]:
dt_clf_mushroom.fit(X_train_mushroom,y_train_mushroom)
sel_mushroom = SelectFromModel(dt_clf_mushroom,prefit=True)
# sel_mushroom.fit(df_mushroom,y_mushroom)
print("features selected: ",sum(sel_mushroom.get_support()))
selected_feat_df_mushroom = df_mushroom.loc[:,sel_mushroom.get_support()]
# dt_clf_mushroom.fit(X_train_mushroom,y_train_mushroom)
selected_feat_df_mushroom['dt_output']=dt_clf_mushroom.predict(df_mushroom)
print(np.mean(cross_val_score(svm_clf_mushroom,selected_feat_df_mushroom,y_mushroom,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_mushroom,selected_feat_df_mushroom,y_mushroom,cv=10,scoring='f1')))

features selected:  7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


1.0
1.0


# Musk1

In [118]:
df_musk1 = pd.read_csv(os.path.join(PATH,"musk1/clean1.data"),header=None)
y_musk1 = df_musk1[168]
df_musk1 = df_musk1.drop([0,1,168],axis=1)
X_train_musk1,X_test_musk1,y_train_musk1,y_test_musk1 = train_test_split(df_musk1,y_musk1,random_state=42)

In [119]:
dt_clf_musk1 = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_musk1,df_musk1,y_musk1,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_musk1,df_musk1,y_musk1,cv=10,scoring='f1')))

0.7131109312365094
0.6441861746907149


In [121]:
svm_clf_musk1 = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_musk1,df_musk1,y_musk1,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_musk1,df_musk1,y_musk1,cv=10,scoring='f1')))

0.569332022818378
0.569332022818378


In [123]:
dt_clf_musk1.fit(X_train_musk1,y_train_musk1)
sel_musk1= SelectFromModel(dt_clf_musk1,prefit=True)
# sel_musk1.fit(df_musk1,y_musk1)
print("features selected: ",sum(sel_musk1.get_support()))
selected_feat_df_musk1 = df_musk1.loc[:,sel_musk1.get_support()]
# dt_clf_musk1.fit(X_train_musk1,y_train_musk1)
selected_feat_df_musk1['dt_output']=dt_clf_musk1.predict(df_musk1)
print(np.mean(cross_val_score(svm_clf_musk1,selected_feat_df_musk1,y_musk1,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_musk1,selected_feat_df_musk1,y_musk1,cv=10,scoring='f1')))

features selected:  29


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.5946904872032068
0.5946904872032068


# Musk2

In [125]:
df_musk2 = pd.read_csv(os.path.join(PATH,"musk2/clean2.data"),header=None)
y_musk2 = df_musk2[168]
df_musk2 = df_musk2.drop([0,1,168],axis=1)
X_train_musk2,X_test_musk2,y_train_musk2,y_test_musk2 = train_test_split(df_musk2,y_musk2,random_state=42)

In [126]:
dt_clf_musk2 = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_musk2,df_musk2,y_musk2,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_musk2,df_musk2,y_musk2,cv=10,scoring='f1')))

0.7806383091287671
0.5885950132869692


In [130]:
svm_clf_musk2 = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_musk2,df_musk2,y_musk2,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_musk2,df_musk2,y_musk2,cv=10,scoring='f1_micro')))

0.8508616605750896
0.8508616605750896


In [131]:
dt_clf_musk2.fit(X_train_musk2,y_train_musk2)
sel_musk2= SelectFromModel(dt_clf_musk2,prefit=True)
# sel_musk2.fit(df_musk2,y_musk2)
print("features selected: ",sum(sel_musk2.get_support()))
selected_feat_df_musk2 = df_musk2.loc[:,sel_musk2.get_support()]
# dt_clf_musk2.fit(X_train_musk2,y_train_musk2)
selected_feat_df_musk2['dt_output']=dt_clf_musk2.predict(df_musk2)
print(np.mean(cross_val_score(svm_clf_musk2,selected_feat_df_musk2,y_musk2,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_musk2,selected_feat_df_musk2,y_musk2,cv=10,scoring='f1_micro')))

features selected:  37


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.8537402261894055
0.8537402261894054


# Ozone

In [132]:
df_ozone = pd.read_csv(os.path.join(PATH,"ozone/ozone.data"),header=None)

df_ozone = df_ozone.replace('?',np.NAN)
df_ozone = df_ozone.dropna()

y_ozone = df_ozone[73]
df_ozone = df_ozone.drop([0,73],axis=1)

X_train_ozone,X_test_ozone,y_train_ozone,y_test_ozone = train_test_split(df_ozone,y_ozone,random_state=42)

In [135]:
dt_clf_ozone = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_ozone,df_ozone,y_ozone,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_ozone,df_ozone,y_ozone,cv=10,scoring='f1_micro')))

0.892270215689674
0.892270215689674


In [137]:
svm_clf_ozone = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_ozone,df_ozone,y_ozone,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_ozone,df_ozone,y_ozone,cv=10,scoring='f1_micro')))

0.9307046676041688
0.9307046676041688


In [140]:
dt_clf_ozone.fit(X_train_ozone,y_train_ozone)
sel_ozone= SelectFromModel(dt_clf_ozone,prefit=True)
# sel_ozone.fit(df_ozone,y_ozone)
print("features selected: ",sum(sel_ozone.get_support()))
selected_feat_df_ozone = df_ozone.loc[:,sel_ozone.get_support()]
# dt_clf_ozone.fit(X_train_ozone,y_train_ozone)
selected_feat_df_ozone['dt_output']=dt_clf_ozone.predict(df_ozone)
print(np.mean(cross_val_score(svm_clf_ozone,selected_feat_df_ozone,y_ozone,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_ozone,selected_feat_df_ozone,y_ozone,cv=10,scoring='f1_micro')))

features selected:  24


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9307046676041688
0.9307046676041688


# Parkinsons

In [141]:
df_parkinsons = pd.read_csv(os.path.join(PATH,"parkinsons/parkinsons.data"))

y_parkinsons = df_parkinsons['status']
df_parkinsons = df_parkinsons.drop(['name','status'],axis=1)

X_train_parkinsons,X_test_parkinsons,y_train_parkinsons,y_test_parkinsons = train_test_split(df_parkinsons,y_parkinsons,random_state=42)

In [142]:
dt_clf_parkinsons = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_parkinsons,df_parkinsons,y_parkinsons,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_parkinsons,df_parkinsons,y_parkinsons,cv=10,scoring='f1')))

0.8040058479532164
0.8666767188749948


In [143]:
svm_clf_parkinsons = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_parkinsons,df_parkinsons,y_parkinsons,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_parkinsons,df_parkinsons,y_parkinsons,cv=10,scoring='f1')))

0.7637134502923976
0.8546570258927945


In [144]:
dt_clf_parkinsons.fit(X_train_parkinsons,y_train_parkinsons)
sel_parkinsons= SelectFromModel(dt_clf_parkinsons,prefit=True)
# sel_parkinsons.fit(df_parkinsons,y_parkinsons)
print("features selected: ",sum(sel_parkinsons.get_support()))
selected_feat_df_parkinsons = df_parkinsons.loc[:,sel_parkinsons.get_support()]
# dt_clf_parkinsons.fit(X_train_parkinsons,y_train_parkinsons)
selected_feat_df_parkinsons['dt_output']=dt_clf_parkinsons.predict(df_parkinsons)
print(np.mean(cross_val_score(svm_clf_parkinsons,selected_feat_df_parkinsons,y_parkinsons,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_parkinsons,selected_feat_df_parkinsons,y_parkinsons,cv=10,scoring='f1')))

features selected:  6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.8236842105263158
0.8957600337131515


# Planning

In [145]:
df_planning = pd.read_csv(os.path.join(PATH,"planning/plrx.txt"),sep='\t',header=None)

y_planning = df_planning[12]
df_planning = df_planning.drop([12,13],axis=1)
y_planning = y_planning.map({1.0:0,2.0:1})

X_train_planning,X_test_planning,y_train_planning,y_test_planning = train_test_split(df_planning,y_planning,random_state=42)

In [150]:
dt_clf_planning = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_planning,df_planning,y_planning,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_planning,df_planning,y_planning,cv=10,scoring='f1')))

0.5336257309941521
0.5336257309941521


In [154]:
svm_clf_planning = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_planning,df_planning,y_planning,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_planning,df_planning,y_planning,cv=10,scoring='f1')))

0.7146198830409357
0.7146198830409357


In [155]:
dt_clf_planning.fit(X_train_planning,y_train_planning)
sel_planning= SelectFromModel(dt_clf_planning,prefit=True)
# sel_planning.fit(df_planning,y_planning)
print("features selected: ",sum(sel_planning.get_support()))
selected_feat_df_planning = df_planning.loc[:,sel_planning.get_support()]
# dt_clf_planning.fit(X_train_planning,y_train_planning)
selected_feat_df_planning['dt_output']=dt_clf_planning.predict(df_planning)
print(np.mean(cross_val_score(svm_clf_planning,selected_feat_df_planning,y_planning,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_planning,selected_feat_df_planning,y_planning,cv=10,scoring='f1')))

features selected:  4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.87953216374269
0.7795998445998445


# Ringnorm

In [157]:
df_ringnorm = pd.read_csv(os.path.join(PATH,"ringnorm/ringnorm.csv"))

y_ringnorm = df_ringnorm['Class']
df_ringnorm = df_ringnorm.drop(['Class'],axis=1)
y_ringnorm = y_ringnorm.map({1:0,2:1})

X_train_ringnorm,X_test_ringnorm,y_train_ringnorm,y_test_ringnorm = train_test_split(df_ringnorm,y_ringnorm,random_state=42)

In [158]:
dt_clf_ringnorm = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_ringnorm,df_ringnorm,y_ringnorm,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_ringnorm,df_ringnorm,y_ringnorm,cv=10,scoring='f1')))

0.8759433910580553
0.8790831806992138


In [162]:
svm_clf_ringnorm = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_ringnorm,df_ringnorm,y_ringnorm,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_ringnorm,df_ringnorm,y_ringnorm,cv=10,scoring='f1')))

KeyboardInterrupt: 

In [160]:
dt_clf_ringnorm.fit(X_train_ringnorm,y_train_ringnorm)
sel_ringnorm= SelectFromModel(dt_clf_ringnorm,prefit=True)
# sel_ringnorm.fit(df_ringnorm,y_ringnorm)
print("features selected: ",sum(sel_ringnorm.get_support()))
selected_feat_df_ringnorm = df_ringnorm.loc[:,sel_ringnorm.get_support()]
# dt_clf_ringnorm.fit(X_train_ringnorm,y_train_ringnorm)
selected_feat_df_ringnorm['dt_output']=dt_clf_ringnorm.predict(df_ringnorm)
print(np.mean(cross_val_score(svm_clf_ringnorm,selected_feat_df_ringnorm,y_ringnorm,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_ringnorm,selected_feat_df_ringnorm,y_ringnorm,cv=10,scoring='f1')))

features selected:  8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.504864871774636
0.6709769747219991


# Spambase

In [None]:
df_spam = pd.read_csv(os.path.join(PATH,"spambase/spambase.data"),header=None)

y_spam = df_spam[57]
df_spam = df_spam.drop([57],axis=1)

X_train_spam,X_test_spam,y_train_spam,y_test_spam = train_test_split(df_spam,y_spam,random_state=42)

In [None]:
dt_clf_spam = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_spam,df_spam,y_spam,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_spam,df_spam,y_spam,cv=10,scoring='f1')))

In [None]:
svm_clf_spam = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_spam,df_spam,y_spam,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_spam,df_spam,y_spam,cv=10,scoring='f1')))

In [103]:
dt_clf_spam.fit(X_train_spam,y_train_spam)
sel_spam = SelectFromModel(dt_clf_spam,prefit=True)
# sel_spam.fit(df_spam,y_spam)
print("features selected: ",sum(sel_spam.get_support()))
selected_feat_df_spam = df_spam.loc[:,sel_spam.get_support()]
# dt_clf_spam.fit(X_train_spam,y_train_spam)
selected_feat_df_spam['dt_output']=dt_clf_spam.predict(df_spam)
np.mean(cross_val_score(svm_clf_spam,selected_feat_df_spam,y_spam,cv=10,scoring='accuracy'))

features selected:  11


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.7876648272563136

# Australian Credit

In [164]:
df_auscred = pd.read_csv(os.path.join(PATH,"australian-credit/australian.dat"),header=None,sep=" ")

y_auscred = df_auscred[14]
df_auscred = df_auscred.drop([14],axis=1)

X_train_auscred,X_test_auscred,y_train_auscred,y_test_auscred = train_test_split(df_auscred,y_auscred,random_state=42)

In [165]:
dt_clf_auscred = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_auscred,df_auscred,y_auscred,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_auscred,df_auscred,y_auscred,cv=10,scoring='f1')))

0.8070539520155888
0.7837576306951084


In [166]:
svm_clf_auscred = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_auscred,df_auscred,y_auscred,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_auscred,df_auscred,y_auscred,cv=10,scoring='f1')))

0.6796078431372549
0.5834503478442151


In [167]:
dt_clf_auscred.fit(X_train_auscred,y_train_auscred)
sel_auscred = SelectFromModel(dt_clf_auscred,prefit=True)
# sel_auscred.fit(df_auscred,y_auscred)
print("features selected: ",sum(sel_auscred.get_support()))
selected_feat_df_auscred = df_auscred.loc[:,sel_auscred.get_support()]
# selected_feat_df_auscred = df_auscred
# dt_clf_auscred.fit(X_train_auscred,y_train_auscred)
selected_feat_df_auscred['dt_output']=dt_clf_auscred.predict(df_auscred)
print(np.mean(cross_val_score(svm_clf_auscred,selected_feat_df_auscred,y_auscred,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_auscred,selected_feat_df_auscred,y_auscred,cv=10,scoring='f1')))

features selected:  2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.9463080014614542
0.9394157177578899


# German credit

In [169]:
df_german = pd.read_csv(os.path.join(PATH,'german-credit/german_credit_data.csv'))

df_german = df_german.dropna()
y_german = df_german['Risk']
df_german = df_german.drop(['Unnamed: 0','Risk'],axis=1)
df_german = pd.get_dummies(df_german)
y_german = y_german.map({'bad':1,'good':0})
X_train_german,X_test_german,y_train_german,y_test_german = train_test_split(df_german,y_german,random_state=42)

In [170]:
dt_clf_german = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_german,df_german,y_german,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_german,df_german,y_german,cv=10,scoring='f1')))

0.5816239316239317
0.5205523078796737


In [171]:
svm_clf_german = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_german,df_german,y_german,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_german,df_german,y_german,cv=10,scoring='f1')))

0.5898148148148148
0.46755866667669127


In [172]:
dt_clf_german.fit(X_train_german,y_train_german)
sel_german = SelectFromModel(dt_clf_german,prefit=True)
# sel_german.fit(df_german,y_german)
print("features selected: ",sum(sel_german.get_support()))
selected_feat_df_german = df_german.loc[:,sel_german.get_support()]
# selected_feat_df_german = df_german
# dt_clf_german.fit(X_train_german,y_train_german)
selected_feat_df_german['dt_output']=dt_clf_german.predict(df_german)
print(np.mean(cross_val_score(svm_clf_german,selected_feat_df_german,y_german,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_german,selected_feat_df_german,y_german,cv=10,scoring='f1')))

features selected:  5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.5841168091168092
0.47310897025254955


# Tic-Tac-Toe

In [174]:
df_tictac = pd.read_csv(os.path.join(PATH,"tictactoe/tic-tac-toe.data"),header=None)

y_tictac = df_tictac[9]
y_tictac = y_tictac.map({'positive':0,'negative':1})
df_tictac = df_tictac.drop([9],axis=1)

df_tictac = pd.get_dummies(df_tictac)
X_train_tictac,X_test_tictac,y_train_tictac,y_test_tictac = train_test_split(df_tictac,y_tictac,random_state=42)

In [175]:
dt_clf_tictac  = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_tictac,df_tictac,y_tictac,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_tictac,df_tictac,y_tictac,cv=10,scoring='f1')))

0.811394239464641
0.7660120247143699


In [176]:
svm_clf_tictac = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_tictac,df_tictac,y_tictac,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_tictac,df_tictac,y_tictac,cv=10,scoring='f1')))

0.8875460074154458
0.8263199413950838


In [177]:
dt_clf_tictac.fit(X_train_tictac,y_train_tictac)
sel_tictac = SelectFromModel(dt_clf_tictac,prefit=True)
# sel_tictac.fit(df_tictac,y_tictac)
print("features selected: ",sum(sel_tictac.get_support()))
selected_feat_df_tictac = df_tictac.loc[:,sel_tictac.get_support()]
# dt_clf_tictac.fit(X_train_tictac,y_train_tictac)
selected_feat_df_tictac['dt_output']=dt_clf_tictac.predict(df_tictac)
print(np.mean(cross_val_score(svm_clf_tictac,selected_feat_df_tictac,y_tictac,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_tictac,selected_feat_df_tictac,y_tictac,cv=10,scoring='f1')))

features selected:  13


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9823229788388497
0.9750876388175389


# Twonorm

In [179]:
df_twonorm = pd.read_csv(os.path.join(PATH,"twonorm/twonorm.csv"))

y_twonorm = df_twonorm['Class']
df_twonorm = df_twonorm.drop(['Class'],axis=1)
y_twonorm = y_twonorm.map({1:0,2:1})

X_train_twonorm,X_test_twonorm,y_train_twonorm,y_test_twonorm = train_test_split(df_twonorm,y_twonorm,random_state=42)

In [180]:
dt_clf_twonorm  = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_twonorm,df_twonorm,y_twonorm,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_twonorm,df_twonorm,y_twonorm,cv=10,scoring='f1')))

0.8377012568330849
0.8376408221948891


In [182]:
svm_clf_twonorm = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_twonorm,df_twonorm,y_twonorm,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_twonorm,df_twonorm,y_twonorm,cv=10,scoring='f1')))

0.9767547220759243
0.9766595710608416


In [183]:
dt_clf_twonorm.fit(X_train_twonorm,y_train_twonorm)
sel_twonorm = SelectFromModel(dt_clf_twonorm,prefit=True)
# sel_twonorm.fit(df_twonorm,y_twonorm)
print("features selected: ",sum(sel_twonorm.get_support()))
selected_feat_df_twonorm = df_twonorm.loc[:,sel_twonorm.get_support()]
# dt_clf_twonorm.fit(X_train_twonorm,y_train_twonorm)
selected_feat_df_twonorm['dt_output']=dt_clf_twonorm.predict(df_twonorm)
print(np.mean(cross_val_score(svm_clf_twonorm,selected_feat_df_twonorm,y_twonorm,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_twonorm,selected_feat_df_twonorm,y_twonorm,cv=10,scoring='f1')))

features selected:  7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9636495200201489
0.9636400821947365


# Nomao

In [27]:
ls = [49,50,51,52,53,54,57,58,59,60,61,62,65,66,67,68,69,70,73,74,75,76,77,78,97,98,99,101,102,103,105,106,107,117]
dtype_dict = {}
for l in ls:
    dtype_dict[l]='float64'
df_nomao = pd.read_csv(os.path.join(PATH,"nomao/Nomao.data"),header=None,low_memory=False)

df_nomao = df_nomao.replace('?',np.NAN)
df_nomao = df_nomao.dropna(thresh=df_nomao.shape[0]*0.6,how='all',axis=1) #dropping columns with more than 60 percent NaN values
df_nomao = df_nomao.dropna(axis=0)#dropping rows with NaN values
df_nomao = df_nomao.reset_index(drop=True)

y_nomao = df_nomao[119]
df_nomao = df_nomao.drop([0,119],axis=1)
y_nomao = y_nomao.map({1:0,-1:1})

df_nomao = df_nomao.astype(dtype_dict)
df_nomao = pd.get_dummies(df_nomao)

X_train_nomao,X_test_nomao,y_train_nomao,y_test_nomao = train_test_split(df_nomao,y_nomao,random_state=42)

In [3]:
dt_clf_nomao  = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_nomao,df_nomao,y_nomao,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_nomao,df_nomao,y_nomao,cv=10,scoring='f1')))
print(np.mean(cross_val_score(dt_clf_nomao,df_nomao,y_nomao,cv=10,scoring='roc_auc')))

0.9454955826607426
0.8660396829673994
0.9254648376328705


In [4]:
svm_clf_nomao = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_nomao,df_nomao,y_nomao,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_nomao,df_nomao,y_nomao,cv=10,scoring='f1')))
print(np.mean(cross_val_score(svm_clf_nomao,df_nomao,y_nomao,cv=10,scoring='roc_auc')))

0.9560566847050787
0.8849356377055422
0.9916763617642657


In [5]:
dt_clf_nomao.fit(X_train_nomao,y_train_nomao)
sel_nomao = SelectFromModel(dt_clf_nomao,prefit=True)
# sel_twonorm.fit(df_twonorm,y_twonorm)
print("features selected: ",sum(sel_nomao.get_support()))
selected_feat_df_nomao = df_nomao.loc[:,sel_nomao.get_support()]
# dt_clf_twonorm.fit(X_train_twonorm,y_train_twonorm)
selected_feat_df_nomao['dt_output']=dt_clf_nomao.predict(df_nomao)
print(np.mean(cross_val_score(svm_clf_nomao,selected_feat_df_nomao,y_nomao,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_nomao,selected_feat_df_nomao,y_nomao,cv=10,scoring='f1')))
print(np.mean(cross_val_score(svm_clf_nomao,selected_feat_df_nomao,y_nomao,cv=10,scoring='roc_auc')))

features selected:  12


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9912298671101805
0.977713207562967
0.9946628417905343


# Credit-card

In [29]:
df_credit_card = pd.read_excel(os.path.join(PATH,"credit-card/clients.xls"))
df_credit_card = df_credit_card.drop([0],axis=0)

df_credit_card_gender = pd.get_dummies(df_credit_card['X2'],prefix='X2')
df_credit_card_education = pd.get_dummies(df_credit_card['X3'],prefix='X3')
df_credit_card_marital_status = pd.get_dummies(df_credit_card['X4'],prefix='X4')

df_credit_card = pd.concat([df_credit_card,df_credit_card_gender,df_credit_card_education,df_credit_card_marital_status],axis=1)

y_credit_card = df_credit_card['Y']
y_credit_card = y_credit_card.astype(int)
df_credit_card = df_credit_card.drop(['Unnamed: 0','X2','X3','X4','Y'],axis=1)

X_train_credit_card,X_test_credit_card,y_train_credit_card,y_test_credit_card = train_test_split(df_credit_card,y_credit_card,random_state=42)

In [7]:
dt_clf_credit_card  = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_credit_card,df_credit_card,y_credit_card,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_credit_card,df_credit_card,y_credit_card,cv=10,scoring='f1')))
print(np.mean(cross_val_score(dt_clf_credit_card,df_credit_card,y_credit_card,cv=10,scoring='roc_auc')))

0.7282680648223775
0.40294084361827043
0.6160577704853407


In [8]:
svm_clf_credit_card = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_credit_card,df_credit_card,y_credit_card,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_credit_card,df_credit_card,y_credit_card,cv=10,scoring='f1_micro')))
print(np.mean(cross_val_score(svm_clf_credit_card,df_credit_card,y_credit_card,cv=10,scoring='roc_auc')))

0.7788000247851878
0.778800024785188
0.5653757220427703


In [9]:
dt_clf_credit_card.fit(X_train_credit_card,y_train_credit_card)
sel_credit_card = SelectFromModel(dt_clf_credit_card,prefit=True)
# sel_twonorm.fit(df_twonorm,y_twonorm)
print("features selected: ",sum(sel_credit_card.get_support()))
selected_feat_df_credit_card = df_credit_card.loc[:,sel_credit_card.get_support()]
# dt_clf_twonorm.fit(X_train_twonorm,y_train_twonorm)
selected_feat_df_credit_card['dt_output']=dt_clf_credit_card.predict(df_credit_card)
print(np.mean(cross_val_score(svm_clf_credit_card,selected_feat_df_credit_card,y_credit_card,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_credit_card,selected_feat_df_credit_card,y_credit_card,cv=10,scoring='f1_micro')))
print(np.mean(cross_val_score(svm_clf_credit_card,selected_feat_df_credit_card,y_credit_card,cv=10,scoring='roc_auc')))

features selected:  16


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.7788000247851878
0.778800024785188
0.553004923029219


# Occupancy

In [31]:
df_occupancy = pd.read_csv(os.path.join(PATH,"occupancy/datatraining.txt"))
df_occupancy_test = pd.read_csv(os.path.join(PATH,"occupancy/datatest.txt"))
df_occupancy_test2 = pd.read_csv(os.path.join(PATH,"occupancy/datatest2.txt"))

df_occupancy = df_occupancy.append([df_occupancy_test,df_occupancy_test2],ignore_index=True)

y_occupancy = df_occupancy['Occupancy']
df_occupancy = df_occupancy.drop(['date','Occupancy'],axis=1)

X_train_occupancy,X_test_occupancy,y_train_occupancy,y_test_occupancy = train_test_split(df_occupancy,y_occupancy,random_state=42)

In [11]:
dt_clf_occupancy  = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_occupancy,df_occupancy,y_occupancy,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_occupancy,df_occupancy,y_occupancy,cv=10,scoring='f1')))
print(np.mean(cross_val_score(dt_clf_occupancy,df_occupancy,y_occupancy,cv=10,scoring='roc_auc')))

0.9302042801556422
0.8714430439318059
0.9264141282998768


In [12]:
svm_clf_occupancy = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_occupancy,df_occupancy,y_occupancy,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_occupancy,df_occupancy,y_occupancy,cv=10,scoring='f1')))
print(np.mean(cross_val_score(svm_clf_occupancy,df_occupancy,y_occupancy,cv=10,scoring='roc_auc')))

0.9848735408560312
0.9689296489990534
0.9925464895635674


In [13]:
dt_clf_occupancy.fit(X_train_occupancy,y_train_occupancy)
sel_occupancy = SelectFromModel(dt_clf_occupancy,prefit=True)
# sel_twonorm.fit(df_twonorm,y_twonorm)
print("features selected: ",sum(sel_occupancy.get_support()))
selected_feat_df_occupancy = df_occupancy.loc[:,sel_occupancy.get_support()]
# dt_clf_twonorm.fit(X_train_twonorm,y_train_twonorm)
selected_feat_df_occupancy['dt_output']=dt_clf_occupancy.predict(df_occupancy)
print(np.mean(cross_val_score(svm_clf_occupancy,selected_feat_df_occupancy,y_occupancy,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_occupancy,selected_feat_df_occupancy,y_occupancy,cv=10,scoring='f1')))
print(np.mean(cross_val_score(svm_clf_occupancy,selected_feat_df_occupancy,y_occupancy,cv=10,scoring='roc_auc')))

features selected:  1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9871108949416343
0.9732160987283581
0.9910883185192583


# MAGIC GAMMA

In [33]:
df_magic_gamma = pd.read_csv(os.path.join(PATH,"magic_gamma/magic04.data"),header=None)
y_magic_gamma = df_magic_gamma[10]
df_magic_gamma = df_magic_gamma.drop([10],axis=1)
y_magic_gamma = y_magic_gamma.map({'g':0,'h':1})

X_train_magic_gamma,X_test_magic_gamma,y_train_magic_gamma,y_test_magic_gamma = train_test_split(df_magic_gamma,y_magic_gamma,random_state=42)

In [15]:
dt_clf_magic_gamma  = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_magic_gamma,df_magic_gamma,y_magic_gamma,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_magic_gamma,df_magic_gamma,y_magic_gamma,cv=10,scoring='f1')))
print(np.mean(cross_val_score(dt_clf_magic_gamma,df_magic_gamma,y_magic_gamma,cv=10,scoring='roc_auc')))

0.8127762259144067
0.7350038349789894
0.7957782661403531


In [16]:
svm_clf_magic_gamma = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_magic_gamma,df_magic_gamma,y_magic_gamma,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_magic_gamma,df_magic_gamma,y_magic_gamma,cv=10,scoring='f1')))
print(np.mean(cross_val_score(svm_clf_magic_gamma,df_magic_gamma,y_magic_gamma,cv=10,scoring='roc_auc')))

0.82455427169318
0.7011136325207061
0.8722524537956021


In [17]:
dt_clf_magic_gamma.fit(X_train_magic_gamma,y_train_magic_gamma)
sel_magic_gamma = SelectFromModel(dt_clf_magic_gamma,prefit=True)
# sel_twonorm.fit(df_twonorm,y_twonorm)
print("features selected: ",sum(sel_magic_gamma.get_support()))
selected_feat_df_magic_gamma = df_magic_gamma.loc[:,sel_magic_gamma.get_support()]
# dt_clf_twonorm.fit(X_train_twonorm,y_train_twonorm)
selected_feat_df_magic_gamma['dt_output']=dt_clf_magic_gamma.predict(df_magic_gamma)
print(np.mean(cross_val_score(svm_clf_magic_gamma,selected_feat_df_magic_gamma,y_magic_gamma,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_magic_gamma,selected_feat_df_magic_gamma,y_magic_gamma,cv=10,scoring='f1')))
print(np.mean(cross_val_score(svm_clf_magic_gamma,selected_feat_df_magic_gamma,y_magic_gamma,cv=10,scoring='roc_auc')))

features selected:  4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.8672457646010768
0.7843554602265531
0.9574159240154202


# HTRU2

In [35]:
df_htru = pd.read_csv(os.path.join(PATH,"htru2/HTRU_2.csv"),header=None)
y_htru = df_htru[8]
df_htru = df_htru.drop([8],axis=1)

X_train_htru,X_test_htru,y_train_htru,y_test_htru = train_test_split(df_htru,y_htru,random_state=42)

In [19]:
dt_clf_htru  = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_htru,df_htru,y_htru,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_htru,df_htru,y_htru,cv=10,scoring='f1')))
print(np.mean(cross_val_score(dt_clf_htru,df_htru,y_htru,cv=10,scoring='roc_auc')))

0.9680966217989576
0.8276122177631409
0.9091853245257322


In [20]:
svm_clf_htru = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_htru,df_htru,y_htru,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_htru,df_htru,y_htru,cv=10,scoring='f1')))
print(np.mean(cross_val_score(svm_clf_htru,df_htru,y_htru,cv=10,scoring='roc_auc')))

0.9724545386374712
0.8317286070969198
0.9483304043186728


In [21]:
dt_clf_htru.fit(X_train_htru,y_train_htru)
sel_htru = SelectFromModel(dt_clf_htru,prefit=True)
# sel_twonorm.fit(df_twonorm,y_twonorm)
print("features selected: ",sum(sel_htru.get_support()))
selected_feat_df_htru = df_htru.loc[:,sel_htru.get_support()]
# dt_clf_twonorm.fit(X_train_twonorm,y_train_twonorm)
selected_feat_df_htru['dt_output']=dt_clf_htru.predict(df_htru)
print(np.mean(cross_val_score(svm_clf_htru,selected_feat_df_htru,y_htru,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_htru,selected_feat_df_htru,y_htru,cv=10,scoring='f1')))
print(np.mean(cross_val_score(svm_clf_htru,selected_feat_df_htru,y_htru,cv=10,scoring='roc_auc')))

features selected:  1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9927925462112407
0.9607016717923316
0.9804348875499137


# EEG

In [37]:
df_eeg = pd.DataFrame(loadarff(os.path.join(PATH,"eeg/EEG Eye State.arff"))[0])
y_eeg = df_eeg['eyeDetection']
y_eeg = y_eeg.map({b'0':0,b'1':1})
df_eeg = df_eeg.drop(['eyeDetection'],axis=1)
X_train_eeg,X_test_eeg,y_train_eeg,y_test_eeg = train_test_split(df_eeg,y_eeg,random_state=42)

In [23]:
dt_clf_eeg  = DecisionTreeClassifier(random_state=42)
print(np.mean(cross_val_score(dt_clf_eeg,df_eeg,y_eeg,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(dt_clf_eeg,df_eeg,y_eeg,cv=10,scoring='f1')))
print(np.mean(cross_val_score(dt_clf_eeg,df_eeg,y_eeg,cv=10,scoring='roc_auc')))

0.54324731461514
0.48806834127111676
0.540393104497206


In [24]:
svm_clf_eeg = SVC(random_state=42,kernel='rbf',gamma='scale')
print(np.mean(cross_val_score(svm_clf_eeg,df_eeg,y_eeg,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_eeg,df_eeg,y_eeg,cv=10,scoring='f1_micro')))
print(np.mean(cross_val_score(svm_clf_eeg,df_eeg,y_eeg,cv=10,scoring='roc_auc')))

0.5512016157907329
0.5512016157907329
0.45834126902651223


In [25]:
dt_clf_eeg.fit(X_train_eeg,y_train_eeg)
sel_eeg = SelectFromModel(dt_clf_eeg,prefit=True)
# sel_twonorm.fit(df_twonorm,y_twonorm)
print("features selected: ",sum(sel_eeg.get_support()))
selected_feat_df_eeg = df_eeg.loc[:,sel_eeg.get_support()]
# dt_clf_twonorm.fit(X_train_twonorm,y_train_twonorm)
selected_feat_df_eeg['dt_output']=dt_clf_eeg.predict(df_eeg)
print(np.mean(cross_val_score(svm_clf_eeg,selected_feat_df_eeg,y_eeg,cv=10,scoring='accuracy')))
print(np.mean(cross_val_score(svm_clf_eeg,selected_feat_df_eeg,y_eeg,cv=10,scoring='f1_micro')))
print(np.mean(cross_val_score(svm_clf_eeg,selected_feat_df_eeg,y_eeg,cv=10,scoring='roc_auc')))

features selected:  5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.5512016157907329
0.5512016157907329
0.5142706985920372
