In [3]:
import numpy as np

In [22]:
class DataGeneration:
    """Load a comma-delimited numeric dataset and split it into training/test sets.

    The last column of each row is used as the target value when the
    per-class distribution of the splits is printed.
    """

    def __init__(self, file_path, split_rate):
        """Read ``file_path`` into a float32 array and store the split ratio.

        Args:
            file_path: Path of a comma-delimited file that ``np.loadtxt`` can parse.
            split_rate: Fraction of the rows that ``generate()`` places in the
                training set; the remaining rows form the test set.

        Raises:
            Exception: If the file cannot be read or parsed. The message is the
                text of the underlying error, which is chained as ``__cause__``.
        """
        try:
            # ndmin=2 keeps the array two-dimensional even when the file holds
            # a single row or a single column, so the column slicing used for
            # the target distribution keeps working.
            self.data = np.loadtxt(file_path, delimiter=',', dtype=np.float32, ndmin=2)
        except Exception as err:
            print('[DataGeneration::__init__()]  ', str(err))
            raise Exception(str(err)) from err

        self.split_rate = split_rate

    def generate(self):
        """Shuffle the rows and split them into training and test sets.

        ``self.data`` is shuffled in place, and the two returned arrays are
        slices of it (the first ``int(len(data) * split_rate)`` rows and the
        rest). The shapes and the target distribution are printed.

        Returns:
            A ``(training_data, test_data)`` tuple of arrays.
        """
        np.random.shuffle(self.data)

        split_point = int(len(self.data) * self.split_rate)

        training_data = self.data[:split_point]
        test_data = self.data[split_point:]

        print("training_data.shape = ", training_data.shape, ", test_data.shape = ", test_data.shape)
        print("===============================================")

        # Hand the fresh splits over explicitly: the helper must not rely on
        # same-named globals, which are either missing or left over from a
        # previous run.
        self.__display_target_distribution(training_data, test_data)

        return training_data, test_data

    def __display_target_distribution(self, training_data=None, test_data=None):
        """Print count and ratio of every distinct target value per data set.

        Args:
            training_data: Training rows to report on; skipped when ``None``.
            test_data: Test rows to report on; skipped when ``None``.
        """
        datasets = [('original', self.data)]
        if training_data is not None:
            datasets.append(('training', training_data))
        if test_data is not None:
            datasets.append(('test', test_data))

        for name, rows in datasets:
            # The target is the last column of each row.
            unique, counts = np.unique(rows[:, -1:], return_counts=True)
            total = counts.sum()

            for value, count in zip(unique, counts):
                print("unique number of {} data = ".format(name), value, "count = ", count)

            for value, count in zip(unique, counts):
                # NOTE: the printed value is a fraction between 0 and 1; the
                # trailing "%" label is kept so the output format is unchanged.
                ratio = count / total

                print("unique number of {} data = ".format(name), value, "ratio = ", round(ratio, 2), "%")

            print("===============================================")

In [23]:
# Load the thoracic-surgery CSV and split it with a 0.7 training ratio.
# Any failure is reported on stdout instead of aborting the cell.
try:
    data_obj = DataGeneration('./ThoracicSurgery.csv', 0.7)
    training_data, test_data = data_obj.generate()
except Exception as err:
    print('Exception Occur !!', str(err), sep='\n')

training_data.shape =  (329, 18) , test_data.shape =  (141, 18)
unique number of original data =  0.0 count =  400
unique number of original data =  1.0 count =  70
unique number of original data =  0.0 ratio =  0.85 %
unique number of original data =  1.0 ratio =  0.15 %
unique number of training data =  0.0 count =  280
unique number of training data =  1.0 count =  49
unique number of training data =  0.0 ratio =  0.85 %
unique number of training data =  1.0 ratio =  0.15 %
unique number of test data =  0.0 count =  120
unique number of test data =  1.0 count =  21
unique number of test data =  0.0 ratio =  0.85 %
unique number of test data =  1.0 ratio =  0.15 %
