# 1. 데이터 로드

In [1]:
import pandas as pd

# Read the training data from train.csv into a DataFrame.
train = pd.read_csv("train.csv")

# Show the first ten rows.
print(train.head(10))

# Show summary statistics (count, mean, standard deviation, min, ...).
print(train.describe())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   
5                                   Moran, Mr. James    male   NaN      0   
6                            McCarthy, Mr. Timothy J    male  54

# 2. 데이터 전처리

In [2]:
# Replace missing ages with the mean age.
train['Age'] = train['Age'].fillna(train['Age'].mean())

# Columns used as model inputs; 'Survived' is the target.
features = ['Sex', 'Pclass', 'SibSp', 'Age', 'Fare']
y = train['Survived']

# One-hot encode the selected columns. drop_first=True drops the first
# level of each encoded column, leaving the single indicator 'Sex_male',
# which is then cast to int.
X = pd.get_dummies(train[features], drop_first=True).astype({'Sex_male': int})

print("X: \n", X)
print('-' * 100)
print("y: \n", y)


X: 
      Pclass  SibSp        Age     Fare  Sex_male
0         3      1  22.000000   7.2500         1
1         1      1  38.000000  71.2833         0
2         3      0  26.000000   7.9250         0
3         1      1  35.000000  53.1000         0
4         3      0  35.000000   8.0500         1
..      ...    ...        ...      ...       ...
886       2      0  27.000000  13.0000         1
887       1      0  19.000000  30.0000         0
888       3      1  29.699118  23.4500         0
889       1      0  26.000000  30.0000         1
890       3      0  32.000000   7.7500         1

[891 rows x 5 columns]
----------------------------------------------------------------------------------------------------
y: 
 0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64


# 3. Decision Tree Node

In [3]:
class Node:
    """One node of a decision tree.

    Stores the statistics of the samples that reached the node together
    with the split applied there. A freshly created node has no split
    (``feature_index`` and ``threshold`` are -1) and no children.

    Args:
        gini: Impurity value recorded for the node.
        n_samples: Number of samples that reached the node.
        n_samples_per_class: Per-class sample counts at the node.
        predicted_class: Class predicted for samples ending at the node.
    """

    def __init__(self, gini, n_samples, n_samples_per_class, predicted_class):
        # Statistics supplied by the caller.
        self.gini = gini
        self.n_samples = n_samples
        self.n_samples_per_class = n_samples_per_class
        self.predicted_class = predicted_class

        # Split description; -1 means "not set".
        self.feature_index = self.threshold = -1

        # Child nodes; both stay None until the tree builder assigns them.
        self.left = self.right = None

In [4]:
import numpy as np

class DecisionTreeClassifier():
    """Decision tree classifier that picks splits by weighted Gini impurity.

    Every internal node tests ``x[feature_index] < threshold``: samples for
    which the test holds go to the left child, all others to the right.

    Args:
        max_depth: Maximum number of split levels between the root and a
            leaf. ``0`` yields a single leaf; a negative value yields no
            tree at all (``tree`` stays ``None``).
    """

    def __init__(self, max_depth=3):
        self.max_depth = max_depth
        self.tree = None  # root Node, assigned by fit()

    def fit(self, X, y):
        """Build the tree from training data and store its root in ``self.tree``.

        Args:
            X: 2-D array-like of numeric features, one row per sample.
            y: 1-D array-like of non-negative integer class labels
                (required by ``np.bincount``).

        Raises:
            ValueError: If there are no samples (and ``max_depth >= 0``).
        """
        # np.asarray lets callers pass lists or DataFrames as well as ndarrays.
        self.tree = self.build_tree(np.asarray(X), np.asarray(y), self.max_depth)

    def gini(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels *y*.

        Args:
            y: 1-D array of non-negative integer class labels.

        Returns:
            The impurity as a float; ``1.0`` for an empty array.
        """
        n_samples = len(y)

        impurity = 1.0
        # np.bincount gives the number of samples of each class 0..max(y).
        for count in np.bincount(y):
            impurity -= (count / n_samples) ** 2
        return impurity

    def find_best_split(self, X, y):
        """Find the split with the lowest sample-weighted Gini impurity.

        Every distinct value of every feature is tried as a threshold.
        Splits that would leave one side empty are ignored, because they do
        not partition the data.

        Args:
            X: 2-D array of shape (n_samples, n_features).
            y: 1-D array of integer class labels, one per row of *X*.

        Returns:
            ``(feature_index, threshold)`` of the best split, or
            ``(-1, -1)`` when no split separates the samples.
        """
        n_samples, n_features = X.shape
        best_gini = 1.0
        best_feature = -1
        best_threshold = -1

        for feature in range(n_features):
            column = X[:, feature]

            # np.unique returns the distinct values of the column.
            for threshold in np.unique(column):
                left_indices = column < threshold
                right_indices = ~left_indices

                n_left = np.sum(left_indices)
                n_right = n_samples - n_left
                if n_left == 0 or n_right == 0:
                    # Degenerate split: all samples land on one side.
                    continue

                left_gini = self.gini(y[left_indices])
                right_gini = self.gini(y[right_indices])

                # Average of the child impurities weighted by child size.
                gini = (left_gini * n_left + right_gini * n_right) / n_samples

                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def build_tree(self, X, y, depth):
        """Recursively build the subtree for the given samples.

        Args:
            X: 2-D array of shape (n_samples, n_features).
            y: 1-D array of integer class labels, one per row of *X*.
            depth: Number of split levels still allowed below this node.

        Returns:
            The root ``Node`` of the subtree, or ``None`` if ``depth < 0``.

        Raises:
            ValueError: If *y* is empty.
        """
        if depth < 0:
            return None
        if len(y) == 0:
            raise ValueError("cannot build a tree node from zero samples")

        n_samples_per_class = np.bincount(y)

        node = Node(
            gini=self.gini(y),  # the impurity value, not the method itself
            n_samples=len(y),
            n_samples_per_class=n_samples_per_class,
            # Index of the largest count, i.e. the majority class.
            predicted_class=np.argmax(n_samples_per_class),
        )

        # A pure node, or one at the depth limit, is a leaf: no split search.
        if node.gini == 0 or depth == 0:
            return node

        best_feature, best_threshold = self.find_best_split(X, y)
        if best_feature < 0:
            # No feature separates the samples, so the node stays a leaf.
            return node

        node.feature_index = best_feature
        node.threshold = best_threshold

        left_indices = X[:, best_feature] < best_threshold
        right_indices = ~left_indices

        node.left = self.build_tree(X[left_indices], y[left_indices], depth - 1)
        node.right = self.build_tree(X[right_indices], y[right_indices], depth - 1)

        return node

    def predict(self, X):
        """Predict a class for every row of *X*.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).

        Returns:
            NumPy array with one predicted class per row.
        """
        return np.array([self.predict_one(x) for x in np.asarray(X)])

    def predict_one(self, x):
        """Predict the class of one sample by walking from the root to a leaf.

        Args:
            x: 1-D sequence of feature values, indexable by feature index.

        Returns:
            ``predicted_class`` of the leaf the sample ends in.
        """
        node = self.tree

        # Internal nodes always have both children; a leaf has neither.
        while node.left is not None:
            if x[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right

        return node.predicted_class



In [5]:
# Fit a tree with the default max_depth on the full training set.
model = DecisionTreeClassifier()
model.fit(X.values, y.values)

# Training accuracy: fraction of rows whose prediction equals the label.
np.mean(model.predict(X.values) == y.values)

np.float64(0.8271604938271605)

In [6]:
# Show the feature matrix as passed to the model, followed by its type.
print(X.values, type(X.values), sep="\n")

[[ 3.          1.         22.          7.25        1.        ]
 [ 1.          1.         38.         71.2833      0.        ]
 [ 3.          0.         26.          7.925       0.        ]
 ...
 [ 3.          1.         29.69911765 23.45        0.        ]
 [ 1.          0.         26.         30.          1.        ]
 [ 3.          0.         32.          7.75        1.        ]]
<class 'numpy.ndarray'>


In [8]:
# Demo of np.bincount: it counts how often each non-negative integer occurs,
# so for an array of 0/1 values it yields [number of zeros, number of ones].
a = np.array([
    1, 0, 1, 1, 1, 0, 1,
    0, 0, 1, 0, 1, 1, 1,
])
print(np.bincount(a))

[5 9]
