# classification

In [2]:
import pandas as pd
import numpy as np
from numpy.random import seed
seed(42)
from sklearn.preprocessing import LabelEncoder
import re,os

#feature selection
from sklearn.feature_selection import SelectFromModel
from imblearn.datasets import fetch_datasets

#model selection imports
from sklearn.model_selection import train_test_split,cross_val_score,KFold,StratifiedKFold

#algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

#metrics
from sklearn.metrics import f1_score,roc_auc_score,accuracy_score

PATH = "data/"

# pima diabetes

In [3]:
df = pd.read_csv(os.path.join(PATH,"pima/diabetes.csv"))
y = df['Outcome']
df = df.drop(['Outcome'],axis=1)
X_train,X_test,y_train,y_test = train_test_split(df,y,random_state=42,test_size=0.3)

In [4]:
dt_clf = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf,df,y,cv=10,scoring='accuracy'))

0.7018284347231716

In [5]:
svm_clf = SVC(random_state=42,gamma='scale')
np.mean(cross_val_score(svm_clf,df,y,cv=10,scoring='accuracy'))

0.7382775119617225

In [6]:
dt_clf.fit(X_train,y_train)
sel = SelectFromModel(dt_clf,prefit=True)
# sel.fit(df,y)
print("features selected: ",sum(sel.get_support()))
selected_feat_df = df.loc[:,sel.get_support()]
# dt_clf.fit(X_train,y_train)
selected_feat_df['dt_output']=dt_clf.predict(df)
np.mean(cross_val_score(svm_clf,selected_feat_df,y,cv=10,scoring='accuracy'))

features selected:  3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.7747095010252905

# SPECT images

In [8]:
df_spect = pd.read_csv(os.path.join(PATH,"spect/SPECT.train"),header=None)
y_train_spect = df_spect[0]
X_train_spect = df_spect.drop([0],axis=1)
df_spect_test = pd.read_csv(os.path.join(PATH,"spect/SPECT.test"),header=None)
y_test_spect = df_spect_test[0]
X_test_spect = df_spect_test.drop([0],axis=1)
df_spect = df_spect.append(df_spect_test,ignore_index=True)
df_spect = df_spect.drop([0],axis=1)
y_spect = y_train_spect.append(y_test_spect,ignore_index=True)

In [12]:
dt_clf_spect = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_spect,df_spect,y_spect,cv=10,scoring='accuracy'))

0.7598087098087098

In [13]:
svm_clf_spect = SVC(random_state=42,gamma='scale')
np.mean(cross_val_score(svm_clf_spect,df_spect,y_spect,cv=10,scoring='accuracy'))

0.823819698819699

In [15]:
dt_clf_spect.fit(X_train_spect,y_train_spect)
sel = SelectFromModel(dt_clf_spect,prefit=True)
# sel.fit(df_spect,y_spect)
print("features selected: ",sum(sel.get_support()))
selected_feat_df_spect = df_spect.loc[:,sel.get_support()]
# dt_clf_spect.fit(X_train_spect,y_train_spect)
selected_feat_df_spect['dt_output']=dt_clf_spect.predict(df_spect)
np.mean(cross_val_score(svm_clf_spect,selected_feat_df_spect,y_spect,cv=10,scoring='accuracy'))

features selected:  6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.8154558404558404

# Breast Cancer

In [16]:
df_wcancer = pd.read_csv(os.path.join(PATH,"breast_cancer/breast_cancer_data.csv"))
df_wcancer = df_wcancer.drop(['id','Unnamed: 32'],axis=1)
y_wcancer = df_wcancer['diagnosis']
le_wcancer = LabelEncoder()
y_wcancer = le_wcancer.fit_transform(y_wcancer)
df_wcancer = df_wcancer.drop(['diagnosis'],axis=1)
X_train_wcancer,X_test_wcancer,y_train_wcancer,y_test_wcancer = train_test_split(df_wcancer,y_wcancer,random_state=42)

In [18]:
dt_clf_wcancer = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_wcancer,df_wcancer,y_wcancer,cv=10,scoring='accuracy'))

0.9281220292109584

In [19]:
svm_clf_wcancer = SVC(random_state=42,gamma='scale')
np.mean(cross_val_score(svm_clf_wcancer,df_wcancer,y_wcancer,cv=10,scoring='accuracy'))

0.9367999740731138

In [20]:
dt_clf_wcancer.fit(X_train_wcancer,y_train_wcancer)
sel_wcancer = SelectFromModel(dt_clf_wcancer,prefit=True)
# sel_wcancer.fit(X_train_wcancer,y_train_wcancer)
print("features selected: ",sum(sel_wcancer.get_support()))
selected_feat_df_wcancer = df_wcancer.loc[:,sel_wcancer.get_support()]
# dt_clf_wcancer.fit(X_train_wcancer,y_train_wcancer)
selected_feat_df_wcancer['dt_output']=dt_clf_wcancer.predict(df_wcancer)
np.mean(cross_val_score(svm_clf_wcancer,selected_feat_df_wcancer,y_wcancer,cv=10,scoring='accuracy'))

features selected:  4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9158996629504796

# Statlog Heart 

In [23]:
df_heart = pd.read_csv(os.path.join(PATH,"statlog_heart/heart.dat"),header=None,sep=" ")
y_heart = df_heart[13]
y_heart = y_heart.map({1:0,2:1})
df_heart = df_heart.drop([13],axis=1)
X_train_heart,X_test_heart,y_train_heart,y_test_heart = train_test_split(df_heart,y_heart,random_state=42)

In [25]:
dt_clf_heart = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_heart,df_heart,y_heart,cv=10,scoring='accuracy'))

0.7444444444444444

In [26]:
svm_clf_heart = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_heart,df_heart,y_heart,cv=10,scoring='accuracy'))

0.6814814814814815

In [27]:
dt_clf_heart.fit(X_train_heart,y_train_heart)
sel_heart = SelectFromModel(dt_clf_heart,prefit=True)
# sel_heart.fit(df_heart,y_heart)
print("features selected: ",sum(sel_heart.get_support()))
selected_feat_df_heart = df_heart.loc[:,sel_heart.get_support()]
# dt_clf_heart.fit(X_train_heart,y_train_heart)
selected_feat_df_heart['dt_output']=dt_clf_heart.predict(df_heart)
np.mean(cross_val_score(svm_clf_heart,selected_feat_df_heart,y_heart,cv=10,scoring='accuracy'))

features selected:  4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.6703703703703704

# Hepatitis

In [28]:
df_hepat = pd.read_csv(os.path.join(PATH,"hepatitis/hepatitis.data"),header=None,sep=",")
df_hepat = df_hepat.replace('?',np.NAN)
df_hepat = df_hepat.dropna(axis=0)
df_hepat = df_hepat.reset_index(drop=True)
df_hepat = pd.get_dummies(df_hepat)

y_hepat = df_hepat[0]
df_hepat = df_hepat.drop([0],axis=1)
y_hepat = y_hepat.map({1:0,2:1})
X_train_hepat,X_test_hepat,y_train_hepat,y_test_hepat = train_test_split(df_hepat,y_hepat,random_state=42)

In [29]:
dt_clf_hepat = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_hepat,df_hepat,y_hepat,cv=10,scoring='accuracy'))

0.7089285714285715

In [30]:
svm_clf_hepat = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_hepat,df_hepat,y_hepat,cv=10,scoring='accuracy'))

0.8404761904761904

In [31]:
dt_clf_hepat.fit(X_train_hepat,y_train_hepat)
sel_hepat = SelectFromModel(dt_clf_hepat,prefit=True)
# sel_hepat.fit(df_hepat,y_hepat)
print("features selected: ",sum(sel_hepat.get_support()))
selected_feat_df_hepat = df_hepat.loc[:,sel_hepat.get_support()]
# dt_clf_hepat.fit(X_train_hepat,y_train_hepat)
selected_feat_df_hepat['dt_output']=dt_clf_hepat.predict(df_hepat)
np.mean(cross_val_score(svm_clf_hepat,selected_feat_df_hepat,y_hepat,cv=10,scoring='accuracy'))

features selected:  7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.8404761904761904

# HypoThyroid

In [32]:
df_hypo = pd.read_csv(os.path.join(PATH,"hypothyroid/hypothyroid.data"),header=None)

df_hypo = df_hypo.replace('?',np.NAN)
df_hypo = df_hypo.drop([25],axis=1)
df_hypo = df_hypo.dropna(axis=0)
df_hypo = df_hypo.reset_index(drop=True)

y_hypo = df_hypo[0]
y_hypo = y_hypo.map({'hypothyroid':0,'negative':1})
df_hypo = df_hypo.drop([0],axis=1)
df_hypo = pd.get_dummies(df_hypo)
X_train_hypo,X_test_hypo,y_train_hypo,y_test_hypo = train_test_split(df_hypo,y_hypo,random_state=42)

In [33]:
dt_clf_hypo = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_hypo,df_hypo,y_hypo,cv=10,scoring='accuracy'))

0.9445020125503139

In [34]:
svm_clf_hypo = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_hypo,df_hypo,y_hypo,cv=10,scoring='accuracy'))

0.9390043751093776

In [35]:
dt_clf_hypo.fit(X_train_hypo,y_train_hypo)
sel_hypo = SelectFromModel(dt_clf_hypo,prefit=True)
# sel_hypo.fit(df_hypo,y_hypo)
print("features selected: ",sum(sel_hypo.get_support()))
selected_feat_df_hypo = df_hypo.loc[:,sel_hypo.get_support()]
# dt_clf_hypo.fit(X_train_hypo,y_train_hypo)
selected_feat_df_hypo['dt_output']=dt_clf_hypo.predict(df_hypo)
np.mean(cross_val_score(svm_clf_hypo,selected_feat_df_hypo,y_hypo,cv=10,scoring='accuracy'))

features selected:  72


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9900074376859422

# Hungarian-14-Heart

In [36]:
df_hung = pd.read_csv(os.path.join(PATH,"hungarian_heart/hungarian.data"),header=None)

df_hung = df_hung.replace('?',np.NAN)
df_hung = df_hung.drop([10,11,12],axis=1)
df_hung = df_hung.dropna(axis=0)
df_hung = df_hung.reset_index(drop=True)

y_hung = df_hung[13]
df_hung = df_hung.drop([13],axis=1)
df_hung = pd.get_dummies(df_hung)
X_train_hung,X_test_hung,y_train_hung,y_test_hung = train_test_split(df_hung,y_hung,random_state=42)

In [37]:
dt_clf_hung = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_hung,df_hung,y_hung,cv=10,scoring='accuracy'))

0.699925925925926

In [38]:
svm_clf_hung = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_hung,df_hung,y_hung,cv=10,scoring='accuracy'))

0.6284273504273503

In [39]:
dt_clf_hung.fit(X_train_hung,y_train_hung)
sel_hung = SelectFromModel(dt_clf_hung,prefit=True)
# sel_hung.fit(df_hung,y_hung)
print("features selected: ",sum(sel_hung.get_support()))
selected_feat_df_hung = df_hung.loc[:,sel_hung.get_support()]
# dt_clf_hung.fit(X_train_hung,y_train_hung)
selected_feat_df_hung['dt_output']=dt_clf_hung.predict(df_hung)
np.mean(cross_val_score(svm_clf_hung,selected_feat_df_hung,y_hung,cv=10,scoring='accuracy'))

features selected:  32


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9425584045584046

# Ionosphere

In [40]:
df_ion = pd.read_csv(os.path.join(PATH,"ionosphere/ionosphere.data"),header=None,sep=',')
y_ion = df_ion[34]
df_ion = df_ion.drop([34],axis=1)
y_ion = y_ion.map({'g':0,'b':1})
X_train_ion,X_test_ion,y_train_ion,y_test_ion = train_test_split(df_ion,y_ion,random_state=42)

In [41]:
dt_clf_ion = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_ion,df_ion,y_ion,cv=10,scoring='accuracy'))

0.8647152194211017

In [42]:
svm_clf_ion = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_ion,df_ion,y_ion,cv=10,scoring='accuracy'))

0.9406115779645191

In [43]:
dt_clf_ion.fit(X_train_ion,y_train_ion)
sel_ion = SelectFromModel(dt_clf_ion,prefit=True)
# sel_ion.fit(df_ion,y_ion)
print("features selected: ",sum(sel_ion.get_support()))
selected_feat_df_ion = df_ion.loc[:,sel_ion.get_support()]
# dt_clf_ion.fit(X_train_ion,y_train_ion)
selected_feat_df_ion['dt_output']=dt_clf_ion.predict(df_ion)
np.mean(cross_val_score(svm_clf_ion,selected_feat_df_ion,y_ion,cv=10,scoring='accuracy'))

features selected:  6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9776143790849673

# Sonar

In [44]:
df_sonar = pd.read_csv(os.path.join(PATH,"sonar/sonar.all-data"),header=None)
y_sonar = df_sonar[60]
df_sonar = df_sonar.drop([60],axis=1)
y_sonar = y_sonar.map({'R':0,'M':1})
X_train_sonar,X_test_sonar,y_train_sonar,y_test_sonar = train_test_split(df_sonar,y_sonar,random_state=42)

In [45]:
dt_clf_sonar = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_sonar,df_sonar,y_sonar,cv=20,scoring='accuracy'))

0.6531313131313131

In [46]:
svm_clf_sonar = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_sonar,df_sonar,y_sonar,cv=10,scoring='accuracy'))

0.6161255411255411

In [47]:
dt_clf_sonar.fit(X_train_sonar,y_train_sonar)
sel_sonar = SelectFromModel(dt_clf_sonar,prefit=True)
# sel_sonar.fit(df_sonar,y_sonar)
print("features selected: ",sum(sel_sonar.get_support()))
selected_feat_df_sonar = df_sonar.loc[:,sel_sonar.get_support()]
# dt_clf_sonar.fit(X_train_sonar,y_train_sonar)
selected_feat_df_sonar['dt_output']=dt_clf_sonar.predict(df_sonar)
np.mean(cross_val_score(svm_clf_sonar,selected_feat_df_sonar,y_sonar,cv=10,scoring='accuracy'))

features selected:  14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9285281385281385

# Vote

In [48]:
df_vote = pd.read_csv(os.path.join(PATH,"vote/house-votes-84.data"),header=None)

df_vote = df_vote.replace('?',np.NAN)
df_vote = df_vote.dropna(axis=0)
df_vote = df_vote.reset_index(drop=True)

y_vote = df_vote[0]
df_vote = df_vote.drop([0],axis=1)
y_vote = y_vote.map({'democrat':0,'republican':1})

df_vote = pd.get_dummies(df_vote)
X_train_vote,X_test_vote,y_train_vote,y_test_vote = train_test_split(df_vote,y_vote,random_state=42)

In [49]:
dt_clf_vote = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_vote,df_vote,y_vote,cv=10,scoring='accuracy'))

0.9347002635046113

In [50]:
svm_clf_vote = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_vote,df_vote,y_vote,cv=10,scoring='accuracy'))

0.96933465085639

In [51]:
dt_clf_vote.fit(X_train_vote,y_train_vote)
sel_vote = SelectFromModel(dt_clf_vote,prefit=True)
# sel_vote.fit(df_vote,y_vote)
print("features selected: ",sum(sel_vote.get_support()))
selected_feat_df_vote = df_vote.loc[:,sel_vote.get_support()]
# dt_clf_vote.fit(X_train_vote,y_train_vote)
selected_feat_df_vote['dt_output']=dt_clf_vote.predict(df_vote)
np.mean(cross_val_score(svm_clf_vote,selected_feat_df_vote,y_vote,cv=10,scoring='accuracy'))

features selected:  2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9827898550724639

# Kr vs Kp

In [52]:
df_krkp = pd.read_csv(os.path.join(PATH,"krvskp/kr-vs-kp.data"),header=None)

y_krkp = df_krkp[36]
y_krkp = y_krkp.map({'won':0,'nowin':1})
df_krkp = df_krkp.drop([36],axis=1)

df_krkp = pd.get_dummies(df_krkp)
X_train_krkp,X_test_krkp,y_train_krkp,y_test_krkp = train_test_split(df_krkp,y_krkp,random_state=42)

In [53]:
dt_clf_krkp = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_krkp,df_krkp,y_krkp,cv=10,scoring='accuracy'))

0.9783835898838745

In [54]:
svm_clf_krkp = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_krkp,df_krkp,y_krkp,cv=10,scoring='accuracy'))

0.9258444295755209

In [55]:
dt_clf_krkp.fit(X_train_krkp,y_train_krkp)
sel_krkp = SelectFromModel(dt_clf_krkp,prefit=True)
# sel_krkp.fit(df_krkp,y_krkp)
print("features selected: ",sum(sel_krkp.get_support()))
selected_feat_df_krkp = df_krkp.loc[:,sel_krkp.get_support()]
# dt_clf_krkp.fit(X_train_krkp,y_train_krkp)
selected_feat_df_krkp['dt_output']=dt_clf_krkp.predict(df_krkp)
np.mean(cross_val_score(svm_clf_krkp,selected_feat_df_krkp,y_krkp,cv=10,scoring='accuracy'))

features selected:  8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9971796506870921

# Thyroid-Sick

In [56]:
sick = fetch_datasets()['thyroid_sick']
df_sick = pd.DataFrame(sick.data)
y_sick = sick.target
y_sick = pd.Series(y_sick).map({-1:0,1:1})
X_train_sick,X_test_sick,y_train_sick,y_test_sick = train_test_split(df_sick,y_sick,random_state=42)

In [57]:
dt_clf_sick = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_sick,df_sick,y_sick,cv=10,scoring='accuracy'))

0.9870054520131856

In [58]:
svm_clf_sick = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_sick,df_sick,y_sick,cv=10,scoring='accuracy'))

0.9387603843704291

In [59]:
dt_clf_sick.fit(X_train_sick,y_train_sick)
sel_sick = SelectFromModel(dt_clf_sick,prefit=True)
# sel_sick.fit(df_sick,y_sick)
print("features selected: ",sum(sel_sick.get_support()))
selected_feat_df_sick = df_sick.loc[:,sel_sick.get_support()]
# dt_clf_sick.fit(X_train_sick,y_train_sick)
selected_feat_df_sick['dt_output']=dt_clf_sick.predict(df_sick)
np.mean(cross_val_score(svm_clf_sick,selected_feat_df_sick,y_sick,cv=10,scoring='accuracy'))

features selected:  8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9411448527816466

# Monks1

In [60]:
df_monks1 = pd.DataFrame()
df_monks1_train = pd.read_csv(os.path.join(PATH,"monks/monks-1.train"),header=None,sep=" ")
df_monks1_test = pd.read_csv(os.path.join(PATH,"monks/monks-1.test"),header=None,sep=" ")

df_monks1 = df_monks1.append(df_monks1_train,ignore_index=True)
df_monks1 = df_monks1.append(df_monks1_test,ignore_index=True)

y_monks1 = df_monks1[1]
df_monks1 = df_monks1.drop([0,1,8],axis=1)

X_train_monks1,X_test_monks1,y_train_monks1,y_test_monks1 = train_test_split(df_monks1,y_monks1,random_state=42)

In [61]:
dt_clf_monks1 = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_monks1,df_monks1,y_monks1,cv=10,scoring='accuracy'))

1.0

In [62]:
svm_clf_monks1 = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_monks1,df_monks1,y_monks1,cv=10,scoring='accuracy'))

0.8850529100529101

In [63]:
dt_clf_monks1.fit(X_train_monks1,y_train_monks1)
sel_monks1 = SelectFromModel(dt_clf_monks1,prefit=True)
# sel_monks1.fit(df_monks1,y_monks1)
print("features selected: ",sum(sel_monks1.get_support()))
selected_feat_df_monks1 = df_monks1.loc[:,sel_monks1.get_support()]
# dt_clf_monks1.fit(X_train_monks1,y_train_monks1)
selected_feat_df_monks1['dt_output']=dt_clf_monks1.predict(df_monks1)
np.mean(cross_val_score(svm_clf_monks1,selected_feat_df_monks1,y_monks1,cv=10,scoring='accuracy'))

features selected:  3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9871693121693121

# Monks2

In [64]:
df_monks2 = pd.DataFrame()
df_monks2_train = pd.read_csv(os.path.join(PATH,"monks/monks-2.train"),header=None,sep=" ")
df_monks2_test = pd.read_csv(os.path.join(PATH,"monks/monks-2.test"),header=None,sep=" ")

df_monks2 = df_monks2.append(df_monks2_train,ignore_index=True)
df_monks2 = df_monks2.append(df_monks2_test,ignore_index=True)

y_monks2 = df_monks2[1]
df_monks2 = df_monks2.drop([0,1,8],axis=1)

X_train_monks2,X_test_monks2,y_train_monks2,y_test_monks2 = train_test_split(df_monks2,y_monks2,random_state=42)

In [65]:
dt_clf_monks2 = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_monks2,df_monks2,y_monks2,cv=10,scoring='accuracy'))

0.9664925442252477

In [66]:
svm_clf_monks2 = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_monks2,df_monks2,y_monks2,cv=10,scoring='accuracy'))

0.6506362878577383

In [67]:
dt_clf_monks2.fit(X_train_monks2,y_train_monks2)
sel_monks2 = SelectFromModel(dt_clf_monks2,prefit=True)
# sel_monks2.fit(df_monks2,y_monks2)
print("features selected: ",sum(sel_monks2.get_support()))
selected_feat_df_monks2 = df_monks2.loc[:,sel_monks2.get_support()]
# dt_clf_monks2.fit(X_train_monks2,y_train_monks2)
selected_feat_df_monks2['dt_output']=dt_clf_monks2.predict(df_monks2)
np.mean(cross_val_score(svm_clf_monks2,selected_feat_df_monks2,y_monks2,cv=10,scoring='accuracy'))

features selected:  3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9916092433083264

# Monks3

In [68]:
df_monks3 = pd.DataFrame()
df_monks3_train = pd.read_csv(os.path.join(PATH,"monks/monks-3.train"),header=None,sep=" ")
df_monks3_test = pd.read_csv(os.path.join(PATH,"monks/monks-3.test"),header=None,sep=" ")

df_monks3 = df_monks3.append(df_monks3_train,ignore_index=True)
df_monks3 = df_monks3.append(df_monks3_test,ignore_index=True)

y_monks3 = df_monks3[1]
df_monks3 = df_monks3.drop([0,1,8],axis=1)

X_train_monks3,X_test_monks3,y_train_monks3,y_test_monks3 = train_test_split(df_monks3,y_monks3,random_state=42)

In [69]:
dt_clf_monks3 = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_monks3,df_monks3,y_monks3,cv=10,scoring='accuracy'))

0.9638924963924964

In [70]:
svm_clf_monks3 = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_monks3,df_monks3,y_monks3,cv=10,scoring='accuracy'))

0.9638912938912938

In [71]:
dt_clf_monks3.fit(X_train_monks3,y_train_monks3)
sel_monks3 = SelectFromModel(dt_clf_monks3,prefit=True)
# sel_monks2.fit(df_monks2,y_monks2)
print("features selected: ",sum(sel_monks3.get_support()))
selected_feat_df_monks3 = df_monks3.loc[:,sel_monks3.get_support()]
# dt_clf_monks2.fit(X_train_monks2,y_train_monks2)
selected_feat_df_monks3['dt_output']=dt_clf_monks3.predict(df_monks3)
np.mean(cross_val_score(svm_clf_monks3,selected_feat_df_monks3,y_monks3,cv=10,scoring='accuracy'))

features selected:  2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9874350649350649

# Mushroom

In [72]:
df_mushroom = pd.read_csv(os.path.join(PATH,"mushroom/mushrooms.csv"))
y_mushroom = df_mushroom['class']
df_mushroom = df_mushroom.drop(['class'],axis=1)
df_mushroom = pd.get_dummies(df_mushroom)
y_mushroom = y_mushroom.map({'p':0,'e':1})
X_train_mushroom,X_test_mushroom,y_train_mushroom,y_test_mushroom = train_test_split(df_mushroom,y_mushroom,random_state=42)

In [73]:
dt_clf_mushroom = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_mushroom,df_mushroom,y_mushroom,cv=10,scoring='accuracy'))

0.9564284446790214

In [74]:
svm_clf_mushroom = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_mushroom,df_mushroom,y_mushroom,cv=10,scoring='accuracy'))

0.9438531993211423

In [75]:
dt_clf_mushroom.fit(X_train_mushroom,y_train_mushroom)
sel_mushroom = SelectFromModel(dt_clf_mushroom,prefit=True)
# sel_mushroom.fit(df_mushroom,y_mushroom)
print("features selected: ",sum(sel_mushroom.get_support()))
selected_feat_df_mushroom = df_mushroom.loc[:,sel_mushroom.get_support()]
# dt_clf_mushroom.fit(X_train_mushroom,y_train_mushroom)
selected_feat_df_mushroom['dt_output']=dt_clf_mushroom.predict(df_mushroom)
np.mean(cross_val_score(svm_clf_mushroom,selected_feat_df_mushroom,y_mushroom,cv=10,scoring='accuracy'))

features selected:  7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


1.0

# Musk1

In [77]:
df_musk1 = pd.read_csv(os.path.join(PATH,"musk1/clean1.data"),header=None)
y_musk1 = df_musk1[168]
df_musk1 = df_musk1.drop([0,1,168],axis=1)
X_train_musk1,X_test_musk1,y_train_musk1,y_test_musk1 = train_test_split(df_musk1,y_musk1,random_state=42)

In [78]:
dt_clf_musk1 = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_musk1,df_musk1,y_musk1,cv=10,scoring='accuracy'))

0.7131109312365094

In [79]:
svm_clf_musk1 = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_musk1,df_musk1,y_musk1,cv=10,scoring='accuracy'))

0.569332022818378

In [80]:
dt_clf_musk1.fit(X_train_musk1,y_train_musk1)
sel_musk1= SelectFromModel(dt_clf_musk1,prefit=True)
# sel_musk1.fit(df_musk1,y_musk1)
print("features selected: ",sum(sel_musk1.get_support()))
selected_feat_df_musk1 = df_musk1.loc[:,sel_musk1.get_support()]
# dt_clf_musk1.fit(X_train_musk1,y_train_musk1)
selected_feat_df_musk1['dt_output']=dt_clf_musk1.predict(df_musk1)
np.mean(cross_val_score(svm_clf_musk1,selected_feat_df_musk1,y_musk1,cv=10,scoring='accuracy'))

features selected:  29


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.5946904872032068

# Musk2

In [81]:
df_musk2 = pd.read_csv(os.path.join(PATH,"musk2/clean2.data"),header=None)
y_musk2 = df_musk2[168]
df_musk2 = df_musk2.drop([0,1,168],axis=1)
X_train_musk2,X_test_musk2,y_train_musk2,y_test_musk2 = train_test_split(df_musk2,y_musk2,random_state=42)

In [82]:
dt_clf_musk2 = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_musk2,df_musk2,y_musk2,cv=10,scoring='accuracy'))

0.7806383091287671

In [83]:
svm_clf_musk2 = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_musk2,df_musk2,y_musk2,cv=10,scoring='accuracy'))

0.8508616605750896

In [84]:
dt_clf_musk2.fit(X_train_musk2,y_train_musk2)
sel_musk2= SelectFromModel(dt_clf_musk2,prefit=True)
# sel_musk2.fit(df_musk2,y_musk2)
print("features selected: ",sum(sel_musk2.get_support()))
selected_feat_df_musk2 = df_musk2.loc[:,sel_musk2.get_support()]
# dt_clf_musk2.fit(X_train_musk2,y_train_musk2)
selected_feat_df_musk2['dt_output']=dt_clf_musk2.predict(df_musk2)
np.mean(cross_val_score(svm_clf_musk2,selected_feat_df_musk2,y_musk2,cv=10,scoring='accuracy'))

features selected:  37


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.8537402261894055

# Ozone

In [85]:
df_ozone = pd.read_csv(os.path.join(PATH,"ozone/ozone.data"),header=None)

df_ozone = df_ozone.replace('?',np.NAN)
df_ozone = df_ozone.dropna()

y_ozone = df_ozone[73]
df_ozone = df_ozone.drop([0,73],axis=1)

X_train_ozone,X_test_ozone,y_train_ozone,y_test_ozone = train_test_split(df_ozone,y_ozone,random_state=42)

In [86]:
dt_clf_ozone = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_ozone,df_ozone,y_ozone,cv=10,scoring='accuracy'))

0.892270215689674

In [87]:
svm_clf_ozone = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_ozone,df_ozone,y_ozone,cv=10,scoring='accuracy'))

0.9307046676041688

In [88]:
dt_clf_ozone.fit(X_train_ozone,y_train_ozone)
sel_ozone= SelectFromModel(dt_clf_ozone,prefit=True)
# sel_ozone.fit(df_ozone,y_ozone)
print("features selected: ",sum(sel_ozone.get_support()))
selected_feat_df_ozone = df_ozone.loc[:,sel_ozone.get_support()]
# dt_clf_ozone.fit(X_train_ozone,y_train_ozone)
selected_feat_df_ozone['dt_output']=dt_clf_ozone.predict(df_ozone)
np.mean(cross_val_score(svm_clf_ozone,selected_feat_df_ozone,y_ozone,cv=10,scoring='accuracy'))

features selected:  24


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9307046676041688

# Parkinsons

In [89]:
df_parkinsons = pd.read_csv(os.path.join(PATH,"parkinsons/parkinsons.data"))

y_parkinsons = df_parkinsons['status']
df_parkinsons = df_parkinsons.drop(['name','status'],axis=1)

X_train_parkinsons,X_test_parkinsons,y_train_parkinsons,y_test_parkinsons = train_test_split(df_parkinsons,y_parkinsons,random_state=42)

In [90]:
dt_clf_parkinsons = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_parkinsons,df_parkinsons,y_parkinsons,cv=10,scoring='accuracy'))

0.8040058479532164

In [91]:
svm_clf_parkinsons = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_parkinsons,df_parkinsons,y_parkinsons,cv=10,scoring='accuracy'))

0.7637134502923976

In [182]:
dt_clf_parkinsons.fit(X_train_parkinsons,y_train_parkinsons)
sel_parkinsons= SelectFromModel(dt_clf_parkinsons,prefit=True)
# sel_parkinsons.fit(df_parkinsons,y_parkinsons)
print("features selected: ",sum(sel_parkinsons.get_support()))
selected_feat_df_parkinsons = df_parkinsons.loc[:,sel_parkinsons.get_support()]
# dt_clf_parkinsons.fit(X_train_parkinsons,y_train_parkinsons)
selected_feat_df_parkinsons['dt_output']=dt_clf_parkinsons.predict(df_parkinsons)
np.mean(cross_val_score(svm_clf_parkinsons,selected_feat_df_parkinsons,y_parkinsons,cv=10,scoring='f1'))

features selected:  6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.8957600337131515

# Planning

In [92]:
df_planning = pd.read_csv(os.path.join(PATH,"planning/plrx.txt"),sep='\t',header=None)

y_planning = df_planning[12]
df_planning = df_planning.drop([12,13],axis=1)
y_planning = y_planning.map({1.0:0,2.0:1})

X_train_planning,X_test_planning,y_train_planning,y_test_planning = train_test_split(df_planning,y_planning,random_state=42)

In [93]:
dt_clf_planning = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_planning,df_planning,y_planning,cv=10,scoring='accuracy'))

0.5336257309941521

In [94]:
svm_clf_planning = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_planning,df_planning,y_planning,cv=10,scoring='accuracy'))

0.7146198830409357

In [95]:
dt_clf_planning.fit(X_train_planning,y_train_planning)
sel_planning= SelectFromModel(dt_clf_planning,prefit=True)
# sel_planning.fit(df_planning,y_planning)
print("features selected: ",sum(sel_planning.get_support()))
selected_feat_df_planning = df_planning.loc[:,sel_planning.get_support()]
# dt_clf_planning.fit(X_train_planning,y_train_planning)
selected_feat_df_planning['dt_output']=dt_clf_planning.predict(df_planning)
np.mean(cross_val_score(svm_clf_planning,selected_feat_df_planning,y_planning,cv=10,scoring='accuracy'))

features selected:  4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.87953216374269

# Ringnorm

In [96]:
df_ringnorm = pd.read_csv(os.path.join(PATH,"ringnorm/ringnorm.csv"))

y_ringnorm = df_ringnorm['Class']
df_ringnorm = df_ringnorm.drop(['Class'],axis=1)
y_ringnorm = y_ringnorm.map({1:0,2:1})

X_train_ringnorm,X_test_ringnorm,y_train_ringnorm,y_test_ringnorm = train_test_split(df_ringnorm,y_ringnorm,random_state=42)

In [97]:
dt_clf_ringnorm = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_ringnorm,df_ringnorm,y_ringnorm,cv=10,scoring='accuracy'))

0.8759433910580553

In [98]:
svm_clf_ringnorm = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_ringnorm,df_ringnorm,y_ringnorm,cv=10,scoring='accuracy'))

0.504864871774636

In [99]:
dt_clf_ringnorm.fit(X_train_ringnorm,y_train_ringnorm)
sel_ringnorm= SelectFromModel(dt_clf_ringnorm,prefit=True)
# sel_ringnorm.fit(df_ringnorm,y_ringnorm)
print("features selected: ",sum(sel_ringnorm.get_support()))
selected_feat_df_ringnorm = df_ringnorm.loc[:,sel_ringnorm.get_support()]
# dt_clf_ringnorm.fit(X_train_ringnorm,y_train_ringnorm)
selected_feat_df_ringnorm['dt_output']=dt_clf_ringnorm.predict(df_ringnorm)
np.mean(cross_val_score(svm_clf_ringnorm,selected_feat_df_ringnorm,y_ringnorm,cv=10,scoring='accuracy'))

features selected:  8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.504864871774636

# Spambase

In [100]:
df_spam = pd.read_csv(os.path.join(PATH,"spambase/spambase.data"),header=None)

y_spam = df_spam[57]
df_spam = df_spam.drop([57],axis=1)

X_train_spam,X_test_spam,y_train_spam,y_test_spam = train_test_split(df_spam,y_spam,random_state=42)

In [101]:
dt_clf_spam = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_spam,df_spam,y_spam,cv=10,scoring='accuracy'))

0.8993253255696738

In [102]:
svm_clf_spam = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_spam,df_spam,y_spam,cv=10,scoring='accuracy'))

0.7717937980310172

In [103]:
dt_clf_spam.fit(X_train_spam,y_train_spam)
sel_spam = SelectFromModel(dt_clf_spam,prefit=True)
# sel_spam.fit(df_spam,y_spam)
print("features selected: ",sum(sel_spam.get_support()))
selected_feat_df_spam = df_spam.loc[:,sel_spam.get_support()]
# dt_clf_spam.fit(X_train_spam,y_train_spam)
selected_feat_df_spam['dt_output']=dt_clf_spam.predict(df_spam)
np.mean(cross_val_score(svm_clf_spam,selected_feat_df_spam,y_spam,cv=10,scoring='accuracy'))

features selected:  11


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.7876648272563136

# Australian Credit

In [104]:
df_auscred = pd.read_csv(os.path.join(PATH,"australian-credit/australian.dat"),header=None,sep=" ")

y_auscred = df_auscred[14]
df_auscred = df_auscred.drop([14],axis=1)

X_train_auscred,X_test_auscred,y_train_auscred,y_test_auscred = train_test_split(df_auscred,y_auscred,random_state=42)

In [105]:
dt_clf_auscred = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_auscred,df_auscred,y_auscred,cv=10,scoring='accuracy'))

0.8070539520155888

In [106]:
svm_clf_auscred = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_auscred,df_auscred,y_auscred,cv=10,scoring='accuracy'))

0.6796078431372549

In [107]:
dt_clf_auscred.fit(X_train_auscred,y_train_auscred)
sel_auscred = SelectFromModel(dt_clf_auscred,prefit=True)
# sel_auscred.fit(df_auscred,y_auscred)
print("features selected: ",sum(sel_auscred.get_support()))
selected_feat_df_auscred = df_auscred.loc[:,sel_auscred.get_support()]
# selected_feat_df_auscred = df_auscred
# dt_clf_auscred.fit(X_train_auscred,y_train_auscred)
selected_feat_df_auscred['dt_output']=dt_clf_auscred.predict(df_auscred)
np.mean(cross_val_score(svm_clf_auscred,selected_feat_df_auscred,y_auscred,cv=10,scoring='accuracy'))

features selected:  2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.9463080014614542

# German credit

In [108]:
df_german = pd.read_csv(os.path.join(PATH,'german-credit/german_credit_data.csv'))

df_german = df_german.dropna()
y_german = df_german['Risk']
df_german = df_german.drop(['Unnamed: 0','Risk'],axis=1)
df_german = pd.get_dummies(df_german)
y_german = y_german.map({'bad':1,'good':0})
X_train_german,X_test_german,y_train_german,y_test_german = train_test_split(df_german,y_german,random_state=42)

In [109]:
dt_clf_german = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_german,df_german,y_german,cv=10,scoring='accuracy'))

0.5816239316239317

In [110]:
svm_clf_german = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_german,df_german,y_german,cv=10,scoring='accuracy'))

0.5898148148148148

In [111]:
dt_clf_german.fit(X_train_german,y_train_german)
sel_german = SelectFromModel(dt_clf_german,prefit=True)
# sel_german.fit(df_german,y_german)
print("features selected: ",sum(sel_german.get_support()))
selected_feat_df_german = df_german.loc[:,sel_german.get_support()]
# selected_feat_df_german = df_german
# dt_clf_german.fit(X_train_german,y_train_german)
selected_feat_df_german['dt_output']=dt_clf_german.predict(df_german)
np.mean(cross_val_score(svm_clf_german,selected_feat_df_german,y_german,cv=10,scoring='accuracy'))

features selected:  5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.5841168091168092

# Tic-Tac-Toe

In [112]:
df_tictac = pd.read_csv(os.path.join(PATH,"tictactoe/tic-tac-toe.data"),header=None)

y_tictac = df_tictac[9]
y_tictac = y_tictac.map({'positive':0,'negative':1})
df_tictac = df_tictac.drop([9],axis=1)

df_tictac = pd.get_dummies(df_tictac)
X_train_tictac,X_test_tictac,y_train_tictac,y_test_tictac = train_test_split(df_tictac,y_tictac,random_state=42)

In [113]:
dt_clf_tictac  = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_tictac,df_tictac,y_tictac,cv=10,scoring='accuracy'))

0.811394239464641

In [114]:
svm_clf_tictac = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_tictac,df_tictac,y_tictac,cv=10,scoring='accuracy'))

0.8875460074154458

In [115]:
dt_clf_tictac.fit(X_train_tictac,y_train_tictac)
sel_tictac = SelectFromModel(dt_clf_tictac,prefit=True)
# sel_tictac.fit(df_tictac,y_tictac)
print("features selected: ",sum(sel_tictac.get_support()))
selected_feat_df_tictac = df_tictac.loc[:,sel_tictac.get_support()]
# dt_clf_tictac.fit(X_train_tictac,y_train_tictac)
selected_feat_df_tictac['dt_output']=dt_clf_tictac.predict(df_tictac)
np.mean(cross_val_score(svm_clf_tictac,selected_feat_df_tictac,y_tictac,cv=10,scoring='accuracy'))

features selected:  13


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9823229788388497

# Twonorm

In [116]:
df_twonorm = pd.read_csv(os.path.join(PATH,"twonorm/twonorm.csv"))

y_twonorm = df_twonorm['Class']
df_twonorm = df_twonorm.drop(['Class'],axis=1)
y_twonorm = y_twonorm.map({1:0,2:1})

X_train_twonorm,X_test_twonorm,y_train_twonorm,y_test_twonorm = train_test_split(df_twonorm,y_twonorm,random_state=42)

In [117]:
dt_clf_twonorm  = DecisionTreeClassifier(random_state=42)
np.mean(cross_val_score(dt_clf_twonorm,df_twonorm,y_twonorm,cv=10,scoring='accuracy'))

0.8377012568330849

In [118]:
svm_clf_twonorm = SVC(random_state=42,kernel='rbf',gamma='scale')
np.mean(cross_val_score(svm_clf_twonorm,df_twonorm,y_twonorm,cv=10,scoring='accuracy'))

0.9767547220759243

In [119]:
dt_clf_twonorm.fit(X_train_twonorm,y_train_twonorm)
sel_twonorm = SelectFromModel(dt_clf_twonorm,prefit=True)
# sel_twonorm.fit(df_twonorm,y_twonorm)
print("features selected: ",sum(sel_twonorm.get_support()))
selected_feat_df_twonorm = df_twonorm.loc[:,sel_twonorm.get_support()]
# dt_clf_twonorm.fit(X_train_twonorm,y_train_twonorm)
selected_feat_df_twonorm['dt_output']=dt_clf_twonorm.predict(df_twonorm)
np.mean(cross_val_score(svm_clf_twonorm,selected_feat_df_twonorm,y_twonorm,cv=10,scoring='accuracy'))

features selected:  7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


0.9636495200201489