In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import math
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, precision_recall_fscore_support, precision_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

path = r"C:\Users\James\machinelearning\Datasets\MidtermDataset"

os.chdir(path)

cleandata = []

filename = os.listdir(path)

print(filename[0])

breast-cancer-wisconsin-dataset.txt


In [2]:
f = open(filename[0], 'r')
rawdata = f.readlines()

In [3]:
def cleanData(data):
    temp = []
    temp2 = []
    for lines in data:
        temp.append(lines.rstrip("\n"))
    for lines in temp:
        temp2.append(lines.split(","))
    
    for subject in temp2:
        temp3 = []
        for info in subject:
            temp = 0
            if info == '?':
                temp = 0
            else:
                temp = int(info)
            temp3.append(temp)
        cleandata.append(temp3)

def fillMissing(column, mean):
    if column == 0:
        output = mean
    else:
        output = column
    return output

def binarize(column):
    if column == 2:
        output = 0
    else:
        output = 1
    return output

In [4]:
cleanData(rawdata)

print(cleandata[0])

[1000025, 5, 1, 1, 1, 2, 1, 3, 1, 1, 2]


In [5]:
df = pd.DataFrame(cleandata, columns = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell size',
                                       'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size',
                                       'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class'])
df.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [6]:
print(df['Bare Nuclei'].value_counts())

1     402
10    132
2      30
5      30
3      28
8      21
4      19
0      16
9       9
7       8
6       4
Name: Bare Nuclei, dtype: int64


In [7]:
print(df['Class'].value_counts())

2    458
4    241
Name: Class, dtype: int64


In [8]:
missing_mean = math.floor(df['Bare Nuclei'].mean())
print(missing_mean)

3


In [9]:
df['Bare Nuclei'] = df['Bare Nuclei'].apply(lambda x: fillMissing(x, missing_mean))
df['Class'] = df['Class'].apply(lambda x: binarize(x))

In [10]:
print(df['Bare Nuclei'].value_counts())

1     402
10    132
3      44
2      30
5      30
8      21
4      19
9       9
7       8
6       4
Name: Bare Nuclei, dtype: int64


In [11]:
print(df['Class'].value_counts())

0    458
1    241
Name: Class, dtype: int64


In [12]:
print(df['Clump Thickness'].value_counts())
print(df['Uniformity of Cell size'].value_counts())
print(df['Uniformity of Cell Shape'].value_counts())
print(df['Marginal Adhesion'].value_counts())
print(df['Single Epithelial Cell Size'].value_counts())
print(df['Bare Nuclei'].value_counts())
print(df['Bland Chromatin'].value_counts())
print(df['Normal Nucleoli'].value_counts())
print(df['Mitoses'].value_counts())
print(df['Class'].value_counts())

1     145
5     130
3     108
4      80
10     69
2      50
8      46
6      34
7      23
9      14
Name: Clump Thickness, dtype: int64
1     384
10     67
3      52
2      45
4      40
5      30
8      29
6      27
7      19
9       6
Name: Uniformity of Cell size, dtype: int64
1     353
2      59
10     58
3      56
4      44
5      34
6      30
7      30
8      28
9       7
Name: Uniformity of Cell Shape, dtype: int64
1     407
3      58
2      58
10     55
4      33
8      25
5      23
6      22
7      13
9       5
Name: Marginal Adhesion, dtype: int64
2     386
3      72
4      48
1      47
6      41
5      39
10     31
8      21
7      12
9       2
Name: Single Epithelial Cell Size, dtype: int64
1     402
10    132
3      44
2      30
5      30
8      21
4      19
9       9
7       8
6       4
Name: Bare Nuclei, dtype: int64
2     166
3     165
1     152
7      73
4      40
5      34
8      28
10     20
9      11
6      10
Name: Bland Chromatin, dtype: int64
1     443
10     61
3

In [13]:
df_pruned = df.drop(['Clump Thickness', 'Marginal Adhesion', 'Single Epithelial Cell Size',
                    'Bland Chromatin', 'Mitoses'], axis=1)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Sample code number           699 non-null    int64
 1   Clump Thickness              699 non-null    int64
 2   Uniformity of Cell size      699 non-null    int64
 3   Uniformity of Cell Shape     699 non-null    int64
 4   Marginal Adhesion            699 non-null    int64
 5   Single Epithelial Cell Size  699 non-null    int64
 6   Bare Nuclei                  699 non-null    int64
 7   Bland Chromatin              699 non-null    int64
 8   Normal Nucleoli              699 non-null    int64
 9   Mitoses                      699 non-null    int64
 10  Class                        699 non-null    int64
dtypes: int64(11)
memory usage: 60.2 KB


In [15]:
df_pruned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Sample code number        699 non-null    int64
 1   Uniformity of Cell size   699 non-null    int64
 2   Uniformity of Cell Shape  699 non-null    int64
 3   Bare Nuclei               699 non-null    int64
 4   Normal Nucleoli           699 non-null    int64
 5   Class                     699 non-null    int64
dtypes: int64(6)
memory usage: 32.9 KB


In [16]:
X = df.drop(['Class', 'Sample code number'], axis=1)
X_pruned = df_pruned.drop(['Class', 'Sample code number'], axis=1)

y = df['Class'].copy()
y_pruned = df_pruned['Class'].copy()

In [17]:
X.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1


In [18]:
X_pruned.head()

Unnamed: 0,Uniformity of Cell size,Uniformity of Cell Shape,Bare Nuclei,Normal Nucleoli
0,1,1,1,1
1,4,4,10,2
2,1,1,2,1
3,8,8,4,7
4,1,1,1,1


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_train_pruned, X_test_pruned, y_train_pruned, y_test_pruned = train_test_split(X_pruned, y_pruned, test_size=0.3, random_state=42)

In [20]:
# Non-pruned
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)

# Pruned
tree_reg_pruned = DecisionTreeRegressor()
tree_reg_pruned.fit(X_train_pruned, y_train_pruned)

DecisionTreeRegressor()

In [21]:
# Test set predictions
initial_predictions_test = tree_reg.predict(X_test)

In [22]:
# Test set predictions: pruned
initial_predictions_test_pruned = tree_reg_pruned.predict(X_test_pruned)

In [23]:
# Non-pruned test rmse
tree_mse = mean_squared_error(y_test, initial_predictions_test)
tree_rmse = np.sqrt(tree_mse)
print("Test rmse:")
print(tree_rmse)

Test rmse:
0.23904572186687872


In [24]:
# Pruned test rmse
tree_mse_pruned = mean_squared_error(y_test_pruned, initial_predictions_test_pruned)
tree_rmse_pruned = np.sqrt(tree_mse_pruned)
print("Test rmse:")
print(tree_rmse_pruned)

Test rmse:
0.2342950701448494


In [25]:
# Train set predictions
initial_predictions_train = tree_reg.predict(X_train)

In [26]:
# Train set predictions: pruned
initial_predictions_train_pruned = tree_reg_pruned.predict(X_train_pruned)

In [27]:
# Non-pruned train rmse
tree_mse_train = mean_squared_error(y_train, initial_predictions_train)
tree_rmse_train = np.sqrt(tree_mse_train)
print("Train rmse:")
print(tree_rmse_train)

Train rmse:
0.0


In [28]:
# Pruned train rmse
tree_mse_train_pruned = mean_squared_error(y_train_pruned, initial_predictions_train_pruned)
tree_rmse_train_pruned = np.sqrt(tree_mse_train_pruned)
print("Train rmse:")
print(tree_rmse_train_pruned)

Train rmse:
0.061230285977831776


In [29]:
print("INITIAL ACCURACY SCORES")

INITIAL ACCURACY SCORES


In [30]:
print("Test accuracy score:")
print(np.mean(initial_predictions_test == y_test))

Test accuracy score:
0.9428571428571428


In [31]:
print("Train accuracy score:")
print(np.mean(initial_predictions_train == y_train))

Train accuracy score:
1.0


In [32]:
print("Pruned test accuracy score:")
print(np.mean(initial_predictions_test_pruned == y_test_pruned))

Pruned test accuracy score:
0.9333333333333333


In [33]:
print("Pruned train accuracy score:")
print(np.mean(initial_predictions_train_pruned == y_train_pruned))

Pruned train accuracy score:
0.9795501022494888


In [34]:
scores = cross_val_score(tree_reg, X_train, y_train, scoring="neg_mean_squared_error", cv=15)
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [35]:
display_scores(tree_rmse_scores)

Scores: [0.17407766 0.24618298 0.24618298 0.38924947 0.24618298 0.24618298
 0.30151134 0.         0.34815531 0.30618622 0.1767767  0.25
 0.35355339 0.30618622 0.25      ]
Mean: 0.25602854892973237
Standard deviation: 0.09013115197964608


In [36]:
scores_pruned = cross_val_score(tree_reg_pruned, X_train_pruned, y_train_pruned, scoring="neg_mean_squared_error", cv=15)
tree_rmse_scores_pruned = np.sqrt(-scores_pruned)

In [37]:
display_scores(tree_rmse_scores_pruned)

Scores: [0.17407766 0.24618298 0.17407766 0.24618298 0.24618298 0.17407766
 0.30772873 0.         0.1        0.35355339 0.26516504 0.1767767
 0.43445368 0.03535534 0.25      ]
Mean: 0.21225431926734561
Standard deviation: 0.10975982923625509


In [38]:
param_grid = [
    {
    'criterion': ['absolute_error'],
    'splitter': ['best'],
    'max_depth': [6],
    'min_samples_split': [2],
    'min_samples_leaf': [6],
    'min_weight_fraction_leaf': [0],
    'max_features': [None],
    'random_state':[None],
    'max_leaf_nodes': [None],
    'min_impurity_decrease': [0],
    'ccp_alpha': [0]
    },

]

#Non-pruned
grid_search = GridSearchCV(tree_reg, param_grid, cv=5,
    return_train_score=True)

#Pruned
grid_search_pruned = GridSearchCV(tree_reg_pruned, param_grid, cv=5,
    return_train_score=True)

grid_search.fit(X_train, y_train)
grid_search_pruned.fit(X_train_pruned, y_train_pruned)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),
             param_grid=[{'ccp_alpha': [0], 'criterion': ['absolute_error'],
                          'max_depth': [6], 'max_features': [None],
                          'max_leaf_nodes': [None],
                          'min_impurity_decrease': [0], 'min_samples_leaf': [6],
                          'min_samples_split': [2],
                          'min_weight_fraction_leaf': [0],
                          'random_state': [None], 'splitter': ['best']}],
             return_train_score=True)

In [39]:
#grid_search.best_params_

In [40]:
grid_search.best_score_

0.7147929588183979

In [41]:
grid_search_pruned.best_score_

0.7411012388442877

In [42]:
grid_predicted_test = grid_search.predict(X_test)
print("GridSearch prediction on testing set:")
print(np.mean(grid_predicted_test == y_test))

GridSearch prediction on testing set:
0.9523809523809523


In [43]:
grid_predicted_test_pruned = grid_search_pruned.predict(X_test_pruned)
print("GridSearch prediction on pruned testing set:")
print(np.mean(grid_predicted_test_pruned == y_test_pruned))

GridSearch prediction on pruned testing set:
0.9523809523809523


In [44]:
grid_predicted_train = grid_search.predict(X_train)
print("GridSearch prediction on training set:")
print(np.mean(grid_predicted_train == y_train))

GridSearch prediction on training set:
0.9591002044989775


In [45]:
grid_predicted_train_pruned = grid_search_pruned.predict(X_train_pruned)
print("GridSearch prediction on pruned training set:")
print(np.mean(grid_predicted_train_pruned == y_train_pruned))

GridSearch prediction on pruned training set:
0.9591002044989775


In [46]:
best_tree = grid_search.best_estimator_
best_tree_pruned = grid_search_pruned.best_estimator_

In [47]:
best_feats = best_tree.feature_importances_
best_feats_pruned = best_tree_pruned.feature_importances_

columns = X_train.columns
columns_pruned = X_train_pruned.columns

In [48]:
final_predictors_train = best_tree.predict(X_train)
final_tree_mse_train = mean_squared_error(y_train, final_predictors_train)
final_tree_rmse_train = np.sqrt(final_tree_mse_train)
print('Best tree on train set RMSE:', final_tree_rmse_train) 

Best tree on train set RMSE: 0.20223697856975242


In [49]:
final_predictors_train_pruned = best_tree_pruned.predict(X_train_pruned)
final_tree_mse_train_pruned = mean_squared_error(y_train_pruned, final_predictors_train_pruned)
final_tree_rmse_train_pruned = np.sqrt(final_tree_mse_train_pruned)
print('Best tree on pruned train set RMSE:', final_tree_rmse_train_pruned) 

Best tree on pruned train set RMSE: 0.20223697856975242


In [50]:
print("BEST NON-PRUNED PRECISION SCORES OF TRAIN SET:")
micro_precision_train = precision_score(final_predictors_train, y_train, average='micro')
print('Micro-averaged precision score: {0:0.2f}'.format(
      micro_precision_train))

macro_precision_train = precision_score(final_predictors_train, y_train, average='macro')
print('Macro-averaged precision score: {0:0.2f}'.format(
      macro_precision_train))

per_class_precision_train = precision_score(final_predictors_train, y_train, average=None)
print('Per-class precision score:', per_class_precision_train)

BEST NON-PRUNED PRECISION SCORES OF TRAIN SET:
Micro-averaged precision score: 0.96
Macro-averaged precision score: 0.96
Per-class precision score: [0.95555556 0.96551724]


In [51]:
print("BEST PRUNED PRECISION SCORES OF TRAIN SET:")
micro_precision_train_pruned = precision_score(final_predictors_train_pruned, y_train_pruned, average='micro')
print('Micro-averaged precision score: {0:0.2f}'.format(
      micro_precision_train_pruned))

macro_precision_train_pruned = precision_score(final_predictors_train_pruned, y_train_pruned, average='macro')
print('Macro-averaged precision score: {0:0.2f}'.format(
      macro_precision_train_pruned))

per_class_precision_train_pruned = precision_score(final_predictors_train_pruned, y_train_pruned, average=None)
print('Per-class precision score:', per_class_precision_train_pruned)

BEST PRUNED PRECISION SCORES OF TRAIN SET:
Micro-averaged precision score: 0.96
Macro-averaged precision score: 0.96
Per-class precision score: [0.95555556 0.96551724]


In [52]:
final_predictors_test = best_tree.predict(X_test)
final_tree_mse_test = mean_squared_error(y_test, final_predictors_test )
final_tree_rmse_test = np.sqrt(final_tree_mse_test )

print('Best tree on test set RMSE:', final_tree_rmse_test) 

Best tree on test set RMSE: 0.21821789023599236


In [53]:
final_predictors_test_pruned = best_tree_pruned.predict(X_test_pruned)
final_tree_mse_test_pruned = mean_squared_error(y_test_pruned, final_predictors_test_pruned)
final_tree_rmse_test_pruned = np.sqrt(final_tree_mse_test_pruned)

print('Best tree on pruned test set RMSE:', final_tree_rmse_test_pruned) 

Best tree on pruned test set RMSE: 0.21821789023599236


In [54]:
print("BEST NON-PRUNED PRECISION SCORES OF TRAIN SET:")
micro_precision_test = precision_score(final_predictors_test, y_test, average='micro')
print('Micro-averaged precision score: {0:0.2f}'.format(
      micro_precision_test))

macro_precision_test = precision_score(final_predictors_test, y_test, average='macro')
print('Macro-averaged precision score: {0:0.2f}'.format(
      macro_precision_test))

per_class_precision_test = precision_score(final_predictors_test, y_test, average=None)
print('Per-class precision score:', per_class_precision_test)

BEST NON-PRUNED PRECISION SCORES OF TRAIN SET:
Micro-averaged precision score: 0.95
Macro-averaged precision score: 0.96
Per-class precision score: [0.94405594 0.97014925]


In [55]:
print("BEST PRUNED PRECISION SCORES OF TRAIN SET:")
micro_precision_test_pruned = precision_score(final_predictors_test_pruned, y_test_pruned, average='micro')
print('Micro-averaged precision score: {0:0.2f}'.format(
      micro_precision_test_pruned))

macro_precision_test_pruned = precision_score(final_predictors_test_pruned, y_test_pruned, average='macro')
print('Macro-averaged precision score: {0:0.2f}'.format(
      macro_precision_test_pruned))

per_class_precision_test_pruned = precision_score(final_predictors_test_pruned, y_test_pruned, average=None)
print('Per-class precision score:', per_class_precision_test_pruned)

BEST PRUNED PRECISION SCORES OF TRAIN SET:
Micro-averaged precision score: 0.95
Macro-averaged precision score: 0.96
Per-class precision score: [0.94405594 0.97014925]


In [56]:
importances_table = pd.Series(best_feats, columns)
importances_table

Clump Thickness                0.000000
Uniformity of Cell size        0.006494
Uniformity of Cell Shape       0.902597
Marginal Adhesion              0.000000
Single Epithelial Cell Size    0.000000
Bare Nuclei                    0.077922
Bland Chromatin                0.000000
Normal Nucleoli                0.012987
Mitoses                        0.000000
dtype: float64

In [57]:
importances_table_pruned = pd.Series(best_feats_pruned, columns_pruned)
importances_table_pruned

Uniformity of Cell size     0.006494
Uniformity of Cell Shape    0.902597
Bare Nuclei                 0.077922
Normal Nucleoli             0.012987
dtype: float64

In [58]:
from sklearn.tree import export_graphviz
from subprocess import check_call

export_graphviz(
    best_tree,
    out_file="cancer.dot",
    feature_names=X_train.columns,
    rounded=True,
    filled=True
)

check_call(['dot','-Tpng','cancer.dot','-o','cancer.png'])

0

In [59]:
export_graphviz(
    best_tree_pruned,
    out_file="pruned-cancer.dot",
    feature_names=X_train_pruned.columns,
    rounded=True,
    filled=True
)

check_call(['dot','-Tpng','pruned-cancer.dot','-o','pruned-cancer.png'])

0