In [36]:
import numpy as np

class DataFrame:

    def __init__(self, file_path):
        assert file_path is not None, 'Please inform the file path'
        self.__read_csv(file_path)
        
        '''dict for each column type by column'''
        self.__init_col_types()
        '''list for each column name by index'''
        self.__init_col_dict()        
        self.shape = (len(self.values), len(self.columns))

        self.factorized_cols = dict()

    def __getitem__(self, key):
        if type(key) == str:
            j = self.__get_col_index_by_key(key)
            return np.array(
                [self.values[i][j] for i in range(self.shape[0])],
                dtype=self.column_types[j]
            )

    def __init_col_dict(self):
        self.column_dict = dict()
        for i, c in enumerate(self.columns):
            self.column_dict[c] = i

    def __init_col_types(self):
        def get_value_type(v):
            try:
                int(v)
                return int
            except Exception:
                pass

            try:
                float(v)
                return float
            except Exception:
                pass

            return str

        self.column_types = []
        for j in range(len(self.columns)):
            type = get_value_type(self.values[0][j])
            self.column_types.append(type)
            for i in range(len(self.values)):
                self.values[i][j] = type(self.values[i][j])

    def __read_csv(self, file_path):
        with open(file_path, 'r') as f:
            lines = f.readlines()
        
        self.columns = lines[0].replace('\n', '').split(',')
        self.values = [l.replace('\n', '').split(',') for l in lines[1:]]


    def __get_col_index_by_key(self, key):
        key_type = type(key)
        if key_type == str:
            assert key in self.columns, 'Invalid key'
            return self.column_dict[key]
        elif key_type == int:
            return key
        else:
            raise Exception('Invalid key type')

    def factorize(self, key):
        col_i = self.__get_col_index_by_key(key)

        classes = set(self.values[i][col_i] for i in range(self.shape[0]))
        class_dict = dict()
        inverse_class_dict = dict()
        for i, c in enumerate(classes):
            class_dict[c] = i
            inverse_class_dict[i] = c
        self.factorized_cols[self.columns[col_i]] = inverse_class_dict

        for i in range(self.shape[0]):
            self.values[i][col_i] = class_dict[self.values[i][col_i]]
        self.column_types[col_i] = int


df = DataFrame(file_path='datasets/iris.csv')
df.factorize('class')

print(df.columns)
print(df['class'])

['ID', 'sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'class']
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [None]:
class DecisionTree:

    class Node:

        def __init__(self, X, y, columns):
            self.X, self.y, self.columns = X, y, columns
            
        def __init_col_dict(self):
            self.column_dict = dict()
            for i, c in enumerate(self.columns):
                self.column_dict[c] = i
        
        def entropy(self, column):
            if type(column) == str:
                col_i = self.column_dict[column]
            elif type(column) == int:
                col_i = column
            else:
                raise Exception('Invalid column type')

            
            
            
            
            

    def __init__(self, X, y):
        pass


        