### 1. Load Libraries

In [1]:
import pandas as pd
import src.util as utils
import joblib
from sklearn.model_selection import train_test_split

### 2. Load Config

In [2]:
# Read the project configuration through the project's own helper (src.util).
# Later cells index the result like a dict; keys used in this notebook are
# dataset_path, column_name, used_columns, predictors, label and
# traintest_set_path, plus the dtype lists and per-column ranges read by
# data_checking.
config = utils.load_config()

### 3. Load Dataset

In [3]:
# Read the raw CSV. The file is treated as header-less (header=None), so the
# column labels are supplied from the config instead of the first row.
data = pd.read_csv(
    config["dataset_path"],
    header=None,
    names=config["column_name"],
)
data

Unnamed: 0,Sample_code_number,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,2
1,1002945,5,4,4,5,7,10.0,3,2,1,2
2,1015425,3,1,1,1,2,2.0,3,1,1,2
3,1016277,6,8,8,1,3,4.0,3,7,1,2
4,1017023,4,1,1,3,2,1.0,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2.0,1,1,1,2
695,841769,2,1,1,1,2,1.0,1,1,1,2
696,888820,5,10,10,3,7,3.0,8,10,2,4
697,897471,4,8,6,4,3,4.0,10,6,1,4


In [4]:
# Keep only the columns listed under config['used_columns']; comparing the
# displayed frames before and after, this removes Sample_code_number.
data = data[config['used_columns']]

In [5]:
# Display the column-filtered frame to confirm the selection.
data

Unnamed: 0,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses,Class
0,5,1,1,1,2,1.0,3,1,1,2
1,5,4,4,5,7,10.0,3,2,1,2
2,3,1,1,1,2,2.0,3,1,1,2
3,6,8,8,1,3,4.0,3,7,1,2
4,4,1,1,3,2,1.0,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2.0,1,1,1,2
695,2,1,1,1,2,1.0,1,1,1,2
696,5,10,10,3,7,3.0,8,10,2,4
697,4,8,6,4,3,4.0,10,6,1,4


### 4. Data Validation

In [6]:
# Drop rows that are exact duplicates across all remaining columns. The first
# occurrence is kept and the surviving rows keep their original index labels.
# NOTE(review): Sample_code_number was removed in an earlier cell, so distinct
# samples that happen to share identical measurements also collapse into one
# row here — confirm this is intended.
data = data.drop_duplicates()

In [7]:
# Display the de-duplicated frame; gaps in the index labels mark removed rows.
data

Unnamed: 0,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses,Class
0,5,1,1,1,2,1.0,3,1,1,2
1,5,4,4,5,7,10.0,3,2,1,2
2,3,1,1,1,2,2.0,3,1,1,2
3,6,8,8,1,3,4.0,3,7,1,2
4,4,1,1,3,2,1.0,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
693,3,1,1,1,2,1.0,2,1,2,2
694,3,1,1,1,3,2.0,1,1,1,2
696,5,10,10,3,7,3.0,8,10,2,4
697,4,8,6,4,3,4.0,10,6,1,4


In [8]:
# Count missing values per column (in the recorded output only Bare_nuclei has any).
data.isnull().sum()

Clump_thickness                 0
Uniformity_of_cell_size         0
Uniformity_of_cell_shape        0
Marginal_adhesion               0
Single_epithelial_cell_size     0
Bare_nuclei                    14
Bland_chromatin                 0
Normal_nucleoli                 0
Mitoses                         0
Class                           0
dtype: int64

In [9]:
# Show each column's dtype; data_checking compares the int/float column sets
# against the lists stored in the config.
data.dtypes

Clump_thickness                  int64
Uniformity_of_cell_size          int64
Uniformity_of_cell_shape         int64
Marginal_adhesion                int64
Single_epithelial_cell_size      int64
Bare_nuclei                    float64
Bland_chromatin                  int64
Normal_nucleoli                  int64
Mitoses                          int64
Class                            int64
dtype: object

In [10]:
# Summary statistics per column (count, mean, std, min, quartiles, max), used
# here to eyeball the value ranges.
data.describe()

Unnamed: 0,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses,Class
count,463.0,463.0,463.0,463.0,463.0,449.0,463.0,463.0,463.0,463.0
mean,5.323974,4.174946,4.239741,3.691145,3.840173,4.806236,4.170626,3.803456,1.885529,3.028078
std,2.873752,3.239645,3.121767,3.15228,2.446914,3.880509,2.639002,3.385452,2.043326,1.000687
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,3.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,5.0,3.0,3.0,2.0,3.0,3.0,3.0,2.0,1.0,4.0
75%,8.0,7.0,7.0,6.0,5.0,10.0,7.0,7.0,2.0,4.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


### 5. Data Defense

In [11]:
def data_checking(input_data, config):
    """Validate the column dtypes and value ranges of ``input_data``.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame to validate. It must contain every range-checked column listed
        in the function body.
    config : dict
        Must provide ``"int_columns"`` and ``"float_columns"`` (the expected
        column names of each dtype, in frame order) and, for every
        range-checked column, a ``[lower, upper]`` pair stored under the
        column's own name.

    Raises
    ------
    AssertionError
        If the int/float column lists differ from the config, or if any value
        of a range-checked column lies outside its inclusive bounds. Missing
        values (NaN) never count as in range, so they fail the check as well.
    """
    # Columns are addressed by name rather than by position in
    # config["int_columns"]: Bare_nuclei is a float column, so positions in
    # the int list do not line up with the order of the predictors.
    range_checked_columns = (
        "Clump_thickness",
        "Uniformity_of_cell_size",
        "Uniformity_of_cell_shape",
        "Marginal_adhesion",
        "Single_epithelial_cell_size",
        "Bare_nuclei",
        "Bland_chromatin",
        "Normal_nucleoli",
        "Mitoses",
    )

    # Every row has to pass each range check, so the number of in-range rows
    # is compared against the total row count.
    input_length = len(input_data)

    # check input data type
    assert input_data.select_dtypes("int").columns.to_list() == config["int_columns"], "an error occurs in int column(s)."
    assert input_data.select_dtypes("float").columns.to_list() == config["float_columns"], "an error occurs in float column(s)."

    # check data range
    for column in range_checked_columns:
        lower, upper = config[column][0], config[column][1]
        in_range = input_data[column].between(lower, upper)
        # The message uses the column name with underscores shown as spaces.
        label = column.replace("_", " ")
        assert in_range.sum() == input_length, f"an error occurs in {label} range."

### 6. Data Splitting

In [35]:
# Separate predictors (x) from the target (y) using the column names held in
# the config. Both selections are wrapped in pd.DataFrame, so y is a
# one-column frame rather than a Series.
x = pd.DataFrame(data[config["predictors"]])
y = pd.DataFrame(data[config["label"]])

In [37]:
# Display the target frame to confirm it holds only the label column.
y


Unnamed: 0,Class
0,2
1,2
2,2
3,2
4,2
...,...
693,2
694,2
696,4
697,4


In [38]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions alike in both parts, and the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=123,
)

In [39]:
# Persist the four pieces of the split with the project's pickle helper.
# config["traintest_set_path"] is read positionally:
#   [0] -> x_train, [1] -> y_train, [2] -> x_test, [3] -> y_test
utils.pickle_dump(x_train[config["predictors"]], config["traintest_set_path"][0])
utils.pickle_dump(y_train[config["label"]], config["traintest_set_path"][1])

utils.pickle_dump(x_test[config["predictors"]], config["traintest_set_path"][2])
utils.pickle_dump(y_test[config["label"]], config["traintest_set_path"][3])