Data source: UCI Machine Learning Repository, Lymphography data set:
https://archive.ics.uci.edu/ml/datasets/Lymphography

In [19]:
import numpy as np

# Seed NumPy's global random state once, up front. The scikit-learn calls
# further down leave random_state unset, so this is the only thing that
# makes a top-to-bottom run repeatable.
np.random.seed(seed=42)

In [20]:
import pandas as pd

# Raw data file, resolved relative to the notebook's working directory.
arquivo = 'lymphography.data'

# Labels applied to the columns in file order: the target first, then the
# 18 attributes.
# NOTE(review): 'ymphaticsormal' looks like a garbled label (presumably
# 'lymphatics'); it is kept verbatim because the one-hot encoding cell
# refers to the column by this exact string.
col_names = [
    'class',
    'ymphaticsormal', 'block of affere', 'bl. of lymph. c',
    'bl. of lymph. s', 'by pass', 'extravasates',
    'regeneration of', 'early uptake in', 'lym.nodes dimin',
    'lym.nodes enlar', 'changes in lym.', 'defect in node',
    'changes in node', 'changes in stru', 'special forms',
    'dislocation of', 'exclusion of no', 'no. of nodes in',
]

# header=None spells out what passing `names` already implies: the first
# line of the file is read as data, not as a header row.
df = pd.read_csv(arquivo, header=None, names=col_names)

In [21]:
# Swap the integer class codes for their diagnosis names, using a single
# code -> label mapping instead of two parallel lists.
df['class'] = df['class'].replace({
    1: 'normal find',
    2: 'metastases',
    3: 'malign lymph',
    4: 'fibrosis',
})

In [22]:
# Target vector: the 'class' column of the loaded frame.
y = df.loc[:, 'class']

In [23]:
# Feature matrix: every column except the target, values left as loaded.
X = df.drop(columns='class')

In [24]:
# One-hot encode every feature column.
# The column list is taken from X (the feature frame, target already
# removed) rather than from a second hand-maintained copy of the 18 names,
# so it cannot drift out of sync with `col_names`. `columns` is still passed
# explicitly because, without it, get_dummies only encodes object/category
# columns and would leave numeric ones untouched.
X_d = pd.get_dummies(X, columns=list(X.columns))

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows of the un-encoded features for evaluation.
# No random_state is passed here, so the shuffle relies on the global
# NumPy seed set at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

In [26]:
from sklearn.tree import DecisionTreeClassifier
# Baseline classifier: a decision tree built with the library's default
# hyperparameters (no arguments are overridden).
model = DecisionTreeClassifier()

In [27]:
# Train the baseline tree on the training part of the un-encoded split.
# fit() is the last expression, so the cell displays the estimator's repr.
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [28]:
# Accuracy of the baseline tree on the held-out 20% of the un-encoded split.
model.score(X_test, y_test)

0.7

In [29]:
# Same 80/20 hold-out, this time on the one-hot encoded features.
# NOTE(review): this is a separate random shuffle from the split of X, so
# the two hold-out sets are not guaranteed to contain the same rows; the
# single-split scores of the two trees are therefore only loosely comparable.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_d,
    y,
    test_size=0.2,
)

In [30]:
# Second decision tree with the same default settings, trained on the
# training part of the one-hot encoded split.
model2 = DecisionTreeClassifier()
model2.fit(X_train2, y_train2)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [31]:
# Accuracy of the one-hot tree on the held-out part of its own split.
model2.score(X_test2, y_test2)

0.8

In [32]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the default decision tree on each encoding.
# Un-encoded features go with `model`, one-hot features with `model2`,
# matching how the two trees are paired with their data everywhere else.
# cross_validate fits fresh clones of the estimator it is given, so the
# earlier fit() calls on `model` / `model2` do not influence these scores.
cv_score = cross_validate(model, X, y, cv=5)
cv_score2 = cross_validate(model2, X_d, y, cv=5)



In [33]:
# Mean of the five per-fold test scores for the un-encoded features.
cv_score['test_score'].mean()

0.7429885057471264

In [34]:
# Mean of the five per-fold test scores for the one-hot encoded features.
cv_score2['test_score'].mean()

0.8035509031198688

In [35]:
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest: 100 trees, each capped at depth 2.
model3 = RandomForestClassifier(max_depth=2, n_estimators=100)

# NOTE(review): this fit on the one-hot training split is not what the
# cross-validation below scores (cross_validate fits its own clones);
# presumably it is kept so `model3` is usable afterwards — confirm.
model3.fit(X_train2, y_train2)

# 5-fold cross-validation on the full one-hot encoded data; the cell
# displays the mean of the per-fold test scores.
cv_score3 = cross_validate(model3, X_d, y, cv=5)
cv_score3['test_score'].mean()



0.8184667487684729

In [36]:
import xgboost as xgb
# XGBoost classifier with its default settings, on the one-hot features.
model4 = xgb.XGBClassifier()
# NOTE(review): this fit on the training split is not what cross_validate
# evaluates below; presumably kept so `model4` is usable afterwards — confirm.
model4.fit(X_train2, y_train2)
# 5-fold cross-validation on the full one-hot encoded data.
cv_score4 = cross_validate(model4, X_d, y, cv=5)
# Displayed value: mean of the per-fold test scores.
cv_score4['test_score'].mean()



0.8369704433497537