## HW3: Decision Tree, AdaBoost and Random Forest
In hw3, you need to implement a decision tree, AdaBoost, and a random forest using only NumPy, then train your implemented models on the provided dataset. The TA will use the held-out test labels to evaluate your model performance.

Please note that only **NUMPY** can be used to implement your model, you will get no points by simply calling `sklearn.tree.DecisionTreeClassifier`

## Question 1
Gini Index or Entropy is often used for measuring the “best” splitting of the data. Please compute the Entropy and Gini Index of provided data. Please use the formula from [page 5 of hw3 slides](https://docs.google.com/presentation/d/1kIe_-YZdemRMmr_3xDy-l0OS2EcLgDH7Uan14tlU5KE/edit#slide=id.gd542a5ff75_0_15)

In [None]:
# Copy and paste your implementations right here to check your result
# (Of course you can add your classes not written here)
# return the probability of each classes
import numpy as np


def ParseClasses(sequence):
    """Report the distinct class labels in a label vector and how often each occurs.

    Args:
        sequence (1-d array): class label of every sample

    Returns:
        dictionary: key: class label, value: fraction of samples with that label
    """
    labels = np.unique(sequence)
    total = sequence.shape[0]
    prob = {}
    for label in labels:
        prob[label] = np.count_nonzero(sequence == label) / total
    print(f"There are {labels.shape[0]} classes in this dataset")
    print("The class labels and the probability of each class is")
    print(prob)
    return prob
    
def gini(sequence):
    """Compute the Gini impurity ``1 - sum_k p_k**2`` of a set of class labels.

    Args:
        sequence (array-like): class label of every sample. Lists, tuples and
            arrays of any shape are accepted; the input is flattened first.

    Returns:
        float: the impurity, 0 for a pure set. An empty input is treated as
        pure and yields 0.0.
    """
    labels = np.asarray(sequence).ravel()
    if labels.size == 0:
        return 0.0
    _, counts = np.unique(labels, return_counts=True)
    p = counts / labels.size
    return 1 - np.sum(p ** 2)


def entropy(sequence):
    """Compute the Shannon entropy ``-sum_k p_k * log2(p_k)`` of a set of class labels.

    Args:
        sequence (array-like): class label of every sample. Lists, tuples and
            arrays of any shape are accepted; the input is flattened first.

    Returns:
        float: the entropy in bits, 0 for a pure set. An empty input is
        treated as pure and yields 0.0.
    """
    labels = np.asarray(sequence).ravel()
    if labels.size == 0:
        return 0.0
    _, counts = np.unique(labels, return_counts=True)
    # np.unique only reports labels that occur, so every p is > 0 and log2 is finite
    p = counts / labels.size
    return -np.sum(p * np.log2(p))

In [None]:
# Toy label vector for Question 1:
# 1 = class 1,
# 2 = class 2
data = np.array([1,2,1,1,1,1,2,2,1,1,2])

In [None]:
# Gini impurity of the toy label vector.
print("Gini of data is ", gini(data))

In [None]:
# Entropy of the toy label vector.
print("Entropy of data is ", entropy(data))

## Load data
It is a binary classification dataset for predicting whether the price of a cell phone is high or not; the label is stored in the `price_range` column.

In [None]:
import pandas as pd

# Read the training and validation splits from CSV files in the working directory.
train_df = pd.read_csv('train.csv')
val_df = pd.read_csv('val.csv')
print(train_df.shape)
print(val_df.shape)
# NOTE(review): head() is not the last expression of the cell, so its result is
# presumably never displayed.
train_df.head()
# NumPy copies of every column of both frames; later cells slice [:, 0:20] as
# features and [:, -1] as the label.
x_train = np.array(train_df)
x_test = np.array(val_df)

In [None]:
# Print the class distribution of the last column of the training array.
ParseClasses(x_train[:,x_train.shape[1]-1])

## Question 2
Implement the Decision Tree algorithm (CART, Classification and Regression Trees), train the model with the given arguments, and print the accuracy score on the validation data. You should implement two arguments for the Decision Tree algorithm:
1. **criterion**: The function to measure the quality of a split. Your model should support `gini` for the Gini impurity and `entropy` for the information gain. 
2. **max_depth**: The maximum depth of the tree. If `max_depth=None`, then nodes are expanded until all leaves are pure. `max_depth=1` equals to split data once


In [None]:
class Question():
    """Threshold test on one feature: does ``row[column]`` reach ``value``?"""

    def __init__(self, column, value):
        # index of the feature to inspect and the threshold it is compared with
        self.column, self.value = column, value

    def match(self, row):
        """Return whether the row's entry in ``self.column`` is >= ``self.value``."""
        return row[self.column] >= self.value


def partition(x_data, question):
    """Split rows into those that satisfy ``question`` and those that do not.

    Args:
        x_data (iterable): rows to distribute
        question: object whose ``match(row)`` decides the side of each row

    Returns:
        tuple: (rows that matched, rows that did not), both lists in input order
    """
    true_rows = []
    false_rows = []
    for row in x_data:
        bucket = true_rows if question.match(row) else false_rows
        bucket.append(row)
    return true_rows, false_rows


class DecisionTree():
    """CART-style classification tree built from ``feature >= threshold`` questions.

    Args:
        criterion (str): 'gini' selects the Gini impurity; any other value
            selects entropy.
        max_depth (int or None): maximum number of successive splits on any
            root-to-leaf path. ``None`` keeps splitting until a node is pure or
            no split separates its rows.

    Attributes:
        root: root ``TreeNode`` after ``fit``, otherwise None.
        n_features (int): number of feature columns seen by ``fit``.
        feature_names (list): label used for each feature column in
            ``feature_count``.
        feature_count (dict): how many splits used each feature.
    """

    def __init__(self, criterion='gini', max_depth=None):
        self.criterion = criterion
        self.max_depth = max_depth
        self.n_features = None
        self.feature_names = None
        self.root = None
        self.feature_count = {}
        # anything other than 'gini' falls back to entropy
        self.measureFunc = gini if criterion == 'gini' else entropy

    class TreeNode():
        """One node of the tree; a node without a question is a leaf."""

        def __init__(self, rows=None, gain=None, question=None):
            self.rows = rows          # training rows that reached this node (internal nodes only)
            self.gain = gain          # information gain of the chosen split
            self.impurity = None      # impurity of the labels at this node
            self.depth = None         # distance from the root
            self.question = question  # split test; None for a leaf
            self.pred = None          # predicted label; set on leaves only
            self.left = None          # subtree for rows that match the question
            self.right = None         # subtree for rows that do not

        def print_node_info(self):
            """Print depth, impurity and the split (or the prediction for a leaf)."""
            print(f'The node\'s depth is {self.depth}, impurity is {self.impurity}')
            if self.question is None:
                print(f'Leaf predicting {self.pred}')
                return None
            print(f'Question is {self.question.column}, and threshold is {self.question.value}')
            print(f'{self.left}, {self.right}')
            return None

    def Informationgain(self, left_rows, right_rows, currentImpurity):
        """Return the impurity reduction achieved by a candidate split.

        Args:
            left_rows (2-d array): rows on one side, label in the last column
            right_rows (2-d array): rows on the other side, label in the last column
            currentImpurity (float): impurity of the unsplit node

        Returns:
            float: currentImpurity minus the size-weighted impurity of both sides
        """
        n_left, n_right = len(left_rows), len(right_rows)
        p = float(n_left) / (n_left + n_right)
        return (currentImpurity
                - p * self.measureFunc(left_rows[:, -1])
                - (1 - p) * self.measureFunc(right_rows[:, -1]))

    def find_best_split(self, rows):
        """Search every feature and threshold for the split with the largest gain.

        Candidate thresholds are the midpoints between neighbouring sorted
        values of a feature. Splits that would leave one side empty are
        skipped, because they cannot separate anything and would make the
        tree recurse on an unchanged node.

        Args:
            rows (2-d array): feature columns followed by one label column

        Returns:
            tuple: (best gain, best ``Question``). The question is None when
            no candidate separates the rows.
        """
        best_gain = 0
        best_question = None
        rows = np.asarray(rows)
        if len(rows) < 2:
            return best_gain, best_question
        current_impurity = self.measureFunc(rows[:, -1])

        for col in range(rows.shape[1] - 1):
            values = np.sort(rows[:, col])
            # np.unique drops repeated thresholds and keeps them in ascending order
            thresholds = np.unique((values[:-1] + values[1:]) / 2.0)
            for current_threshold in thresholds:
                matches = rows[:, col] >= current_threshold
                n_match = np.count_nonzero(matches)
                if n_match == 0 or n_match == len(rows):
                    continue
                current_gain = self.Informationgain(left_rows=rows[matches],
                                                    right_rows=rows[~matches],
                                                    currentImpurity=current_impurity)
                # '>=' keeps the latest of equally good candidates
                if current_gain >= best_gain:
                    best_gain = current_gain
                    best_question = Question(column=col, value=current_threshold)

        if best_question is not None:
            label = self._feature_label(best_question.column)
            self.feature_count[label] = self.feature_count.get(label, 0) + 1
        return best_gain, best_question

    def get_feature_count(self):
        """Print the split count of every feature used so far."""
        print(self.feature_count)
        return

    def generateTree(self, rows, cur_depth=None, depth=0):
        """Recursively build the subtree for ``rows``.

        Args:
            rows (2-d array): feature columns followed by one label column
            cur_depth (int or None): splits still allowed below this node;
                None means unlimited
            depth (int): distance of this node from the root

        Returns:
            TreeNode: root of the generated subtree
        """
        cur_node = self.TreeNode()
        cur_node.depth = depth
        labels = rows[:, -1]
        cur_node.impurity = self.measureFunc(labels)

        if cur_node.impurity == 0:
            # pure node: every row carries the same label
            cur_node.pred = int(rows[0, -1])
            return cur_node
        if cur_depth == 0:
            cur_node.pred = self._majority_label(labels)
            return cur_node

        best_gain, best_question = self.find_best_split(rows=rows)
        if best_question is None:
            # rows differ in label but not in any feature value
            cur_node.pred = self._majority_label(labels)
            return cur_node

        cur_node.rows = rows
        cur_node.gain = best_gain
        cur_node.question = best_question
        matches = rows[:, best_question.column] >= best_question.value
        remaining = None if cur_depth is None else cur_depth - 1
        cur_node.left = self.generateTree(rows=rows[matches],
                                          cur_depth=remaining, depth=depth + 1)
        cur_node.right = self.generateTree(rows=rows[~matches],
                                           cur_depth=remaining, depth=depth + 1)
        return cur_node

    def fit(self, x_data, y_data, feature_names=None):
        """Grow the tree from training data, replacing any previous tree.

        Args:
            x_data (2-d array-like): one row per sample, feature columns only
            y_data (1-d array-like): label of every sample
            feature_names (sequence, optional): name of every feature column,
                used as keys of ``feature_count``

        Raises:
            ValueError: if ``x_data`` is not 2-d, is empty, or its row count
                differs from the number of labels.
        """
        x_data = np.asarray(x_data)
        y_data = np.asarray(y_data).reshape(-1)
        if x_data.ndim != 2:
            raise ValueError("x_data must be a 2-d array of shape (n_samples, n_features)")
        if len(x_data) == 0:
            raise ValueError("x_data must contain at least one row")
        if len(x_data) != len(y_data):
            raise ValueError("x_data and y_data must have the same number of rows")

        self.feature_count = {}
        self.n_features = x_data.shape[1]
        self.feature_names = self._resolve_feature_names(self.n_features, feature_names)
        # labels ride along as the last column so rows and labels are split together
        rows = np.hstack((x_data, y_data[:, np.newaxis]))
        self.root = self.generateTree(rows=rows, cur_depth=self.max_depth)

    def feature_importance(self):
        """Return the split counts of the used features, in first-use order."""
        return list(self.feature_count.values())

    def traverse(self, cur_node, x_data):
        """Walk from ``cur_node`` to a leaf for one sample and return its prediction.

        Args:
            cur_node (TreeNode): node to start from
            x_data (1-d array-like): feature values of one sample

        Returns:
            the leaf's predicted label, or None when ``cur_node`` is None
        """
        if cur_node is None:
            return
        if cur_node.question is None:
            return cur_node.pred
        if cur_node.question.match(x_data):
            return self.traverse(cur_node=cur_node.left, x_data=x_data)
        return self.traverse(cur_node=cur_node.right, x_data=x_data)

    def print_acc(self, acc):
        """Print the tree's hyperparameters together with an accuracy value."""
        print(f'criterion = {self.criterion}')
        print(f'max depth = {self.max_depth}')
        print(f'acc       = {acc}')
        print('====================')

    def predict(self, x_data, y_data=None):
        """Predict every row, print the accuracy and return both.

        Args:
            x_data (2-d array-like): one row per sample
            y_data (1-d array-like, optional): true labels. When omitted, the
                last column of ``x_data`` is taken as the label, so the
                accuracy is only meaningful if that column holds labels.

        Returns:
            tuple: (accuracy, list of predicted labels) - a tuple, not a bare
            label array

        Raises:
            ValueError: if ``x_data`` has no rows.
        """
        x_data = np.asarray(x_data)
        total_num = len(x_data)
        if total_num == 0:
            raise ValueError("x_data must contain at least one row")
        pred = [self.traverse(cur_node=self.root, x_data=row) for row in x_data]
        if y_data is not None:
            truth = np.asarray(y_data).reshape(-1)
        else:
            truth = x_data[:, -1]
        miss_num = sum(1 for ans, target in zip(pred, truth) if ans != target)
        accuracy = 1 - (miss_num / total_num)
        self.print_acc(acc=accuracy)
        return accuracy, pred

    @staticmethod
    def _majority_label(labels):
        """Return the most frequent label; ties go to the largest label."""
        values, counts = np.unique(labels, return_counts=True)
        # scanning from the largest label keeps the old "at least half -> 1" rule for 0/1 labels
        return int(values[::-1][np.argmax(counts[::-1])])

    def _feature_label(self, column):
        """Return the ``feature_count`` key for a column index."""
        names = getattr(self, 'feature_names', None)
        if names is not None and column < len(names):
            return names[column]
        return column

    @staticmethod
    def _resolve_feature_names(n_features, feature_names):
        """Pick the label of every feature column.

        Explicit names win. Otherwise a module-level ``train_df`` is used if
        its column count fits the fitted data (legacy notebook behaviour), and
        plain column indices are the last resort.
        """
        if feature_names is not None:
            names = list(feature_names)
            if len(names) < n_features:
                raise ValueError("feature_names must name every feature column")
            return names
        columns = getattr(globals().get('train_df'), 'columns', None)
        if columns is not None and len(columns) in (n_features, n_features + 1):
            return list(columns[:n_features])
        return list(range(n_features))


### Question 2.1
Using `criterion=gini`, showing the accuracy score of validation data by `max_depth=3` and `max_depth=10`, respectively.


In [None]:
# Question 2.1: Gini trees limited to depth 3 and depth 10.
# Each tree is fit on columns 0..19 with the last column as the label; predict()
# receives the whole validation array and prints the accuracy it computes.
clf_depth3 = DecisionTree(criterion='gini', max_depth=3)
clf_depth3.fit(x_data=x_train[:, 0:20], y_data=x_train[:, -1])
# print how often each feature was chosen for a split
clf_depth3.get_feature_count()
acc, _ = clf_depth3.predict(x_test)

clf_depth10 = DecisionTree(criterion='gini', max_depth=10)
clf_depth10.fit(x_data=x_train[:, 0:20], y_data=x_train[:, -1])
clf_depth10.get_feature_count()
acc, _ = clf_depth10.predict(x_test)

### Question 2.2
Using `max_depth=3`, showing the accuracy score of validation data by `criterion=gini` and `criterion=entropy`, respectively.


In [10]:
# Question 2.2: compare the Gini and entropy criteria.
# NOTE(review): the question text asks for max_depth=3, but both trees here are
# built with max_depth=None - confirm which setting is intended.
clf_gini = DecisionTree(criterion='gini', max_depth=None)
clf_gini.fit(x_data=x_train[:, 0:20], y_data=x_train[:, -1])
# print how often each feature was chosen for a split
clf_gini.get_feature_count()
acc, _ = clf_gini.predict(x_test)

clf_entropy = DecisionTree(criterion='entropy', max_depth=None)
clf_entropy.fit(x_data=x_train[:, 0:20], y_data=x_train[:, -1])
clf_entropy.get_feature_count()
acc, _ = clf_entropy.predict(x_test)

{'ram': 9, 'battery_power': 8, 'px_height': 10, 'talk_time': 4, 'sc_h': 2, 'wifi': 3, 'px_width': 5, 'm_dep': 1, 'mobile_wt': 1, 'touch_screen': 2, 'sc_w': 1, 'fc': 1}
criterion = gini
max depth = None
acc       = 0.9433333333333334
{'ram': 11, 'battery_power': 6, 'sc_w': 3, 'px_width': 4, 'three_g': 2, 'px_height': 6, 'talk_time': 2, 'sc_h': 1, 'mobile_wt': 1, 'n_cores': 1, 'm_dep': 1}
criterion = entropy
max depth = None
acc       = 0.95


- Note: Your decision tree scores should be over **0.7**. It may suffer from overfitting; if so, you can tune hyperparameters such as `max_depth`.
- Note: You should get the same results when re-building the model with the same arguments,  no need to prune the trees
- Hint: You can use the recursive method to build the nodes


## Question 3
Plot the [feature importance](https://sefiks.com/2020/04/06/feature-importance-in-decision-trees/) of your Decision Tree model. You can get the feature importance by counting the feature used for splitting data.

- You can simply plot the **counts of feature used** for building tree without normalize the importance. Take the figure below as example, outlook feature has been used for splitting for almost 50 times. Therefore, it has the largest importance

![image](https://i2.wp.com/sefiks.com/wp-content/uploads/2020/04/c45-fi-results.jpg?w=481&ssl=1)

In [None]:
# ## Question 3
# Plot the [feature importance](https://sefiks.com/2020/04/06/feature-importance-in-decision-trees/) of your Decision Tree model. You can get the feature importance by counting the feature used for splitting data.
import matplotlib.pyplot as plt

def plot_feature_importance(clf, filename):
    """Save a horizontal bar chart of how often each feature was used for a split.

    Args:
        clf: fitted classifier exposing a ``feature_count`` dict that maps a
            feature label to its split count
        filename (str): path of the image file to write

    The sorted labels and counts are printed before plotting, and the current
    figure is cleared afterwards so successive calls do not overlap.
    """
    # ascending by count, so the most-used feature is drawn at the top
    feature_sorted = dict(sorted(clf.feature_count.items(), key=lambda item: item[1]))
    print(feature_sorted.keys())
    print(feature_sorted.values())
    feature_names = list(feature_sorted.keys())
    feature_counts = list(feature_sorted.values())

    x_pos = list(range(len(feature_names)))
    plt.barh(x_pos, feature_counts, height=0.4)
    plt.ylabel('feature names')
    plt.xlabel('feature importance')
    # one tick per integer count; default=0 keeps an empty tree from raising
    plt.xticks(np.arange(max(feature_counts, default=0) + 1))
    plt.yticks(x_pos, feature_names, rotation=45)
    plt.gca().grid(axis='x', which='major')
    plt.tight_layout()
    plt.savefig(filename, dpi=300, transparent=False)
    plt.clf()


plot_feature_importance(clf_depth3, 'fi_gini_d3.png')
plot_feature_importance(clf_depth10, 'fi_gini_d10.png')
# NOTE(review): clf_entropy is created with max_depth=None, yet the file name says d3 - confirm.
plot_feature_importance(clf_entropy, 'fi_entropy_d3.png')

## Question 4(working)
Implement the AdaBoost algorithm using the CART you implemented in Question 2 as the base learner. You should implement one argument for AdaBoost.
1. **n_estimators**: The maximum number of estimators at which boosting is terminated

In [None]:
class AdaBoost():
    """AdaBoost (SAMME) ensemble of shallow ``DecisionTree`` base learners.

    ``DecisionTree`` cannot weight individual samples, so every round trains
    its tree on a sample drawn with replacement according to the current
    weight distribution ("boosting by resampling").

    Args:
        n_estimators (int): maximum number of boosting rounds.
        max_features: accepted for backward compatibility; not used.
        bootstrap (bool): draw each round's training set by weighted
            resampling. When False every round trains on the full data set,
            so the weights cannot influence the base learner.
        criterion (str): impurity criterion passed to every tree.
        max_depth (int): depth of every base tree; 1 gives decision stumps.

    Attributes:
        n_trees (list): fitted base trees that were kept.
        alphas (list): voting weight of each kept tree.
        classes_ (array): sorted distinct training labels.
    """

    def __init__(self, n_estimators, max_features=None, bootstrap=True,
                 criterion='gini', max_depth=1):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.use_bootstrap = bootstrap
        self.criterion = criterion
        self.max_depth = max_depth
        self.meas_func = gini if criterion == 'gini' else entropy
        self.n_trees = []
        self.alphas = []
        self.classes_ = None

    def draw_bootstrap(self, x_data):
        """Return ``len(x_data)`` rows drawn uniformly with replacement, as a list."""
        picked = np.random.randint(len(x_data), size=len(x_data))
        return [x_data[i] for i in picked]

    @staticmethod
    def _tree_predict(tree, x_data):
        """Predict every row with one tree, bypassing its printing ``predict``."""
        return np.array([tree.traverse(cur_node=tree.root, x_data=row) for row in x_data])

    def fit(self, x_data, y_data):
        """Train up to ``n_estimators`` base trees by boosting.

        Args:
            x_data (2-d array-like): one row per sample, feature columns only
            y_data (1-d array-like): label of every sample

        Raises:
            ValueError: if the data is empty or the row counts differ.
        """
        x_data = np.asarray(x_data)
        y_data = np.asarray(y_data).reshape(-1)
        n_samples = len(x_data)
        if n_samples == 0 or n_samples != len(y_data):
            raise ValueError("x_data and y_data must be non-empty and equally long")

        self.classes_ = np.unique(y_data)
        n_classes = len(self.classes_)
        self.n_trees, self.alphas = [], []

        if n_classes == 1:
            # nothing to boost: a single tree already predicts the only label
            tree = DecisionTree(criterion=self.criterion, max_depth=self.max_depth)
            tree.fit(x_data, y_data)
            self.n_trees.append(tree)
            self.alphas.append(1.0)
            return

        uniform = np.full(n_samples, 1.0 / n_samples)
        weights = uniform.copy()
        chance_error = 1.0 - 1.0 / n_classes
        tree = None
        for _ in range(self.n_estimators):
            if self.use_bootstrap:
                picked = np.random.choice(n_samples, size=n_samples, replace=True, p=weights)
            else:
                picked = np.arange(n_samples)
            tree = DecisionTree(criterion=self.criterion, max_depth=self.max_depth)
            tree.fit(x_data[picked], y_data[picked])

            missed = self._tree_predict(tree, x_data) != y_data
            error = float(np.sum(weights[missed]))
            if error >= chance_error:
                # no better than guessing: discard it and restart from uniform weights
                weights = uniform.copy()
                continue

            clamped = max(error, 1e-10)  # keeps alpha finite for a perfect learner
            alpha = np.log((1.0 - clamped) / clamped) + np.log(n_classes - 1.0)
            self.n_trees.append(tree)
            self.alphas.append(alpha)
            if error == 0:
                break  # every sample is classified correctly; more rounds add nothing

            # raise the weight of misclassified samples, then renormalise
            weights = weights * np.exp(alpha * missed)
            weights = weights / weights.sum()

        if not self.n_trees and tree is not None:
            # every round was rejected; keep the last tree so predict still works
            self.n_trees.append(tree)
            self.alphas.append(1.0)

    def print_acc(self, acc):
        """Print the ensemble's hyperparameters together with an accuracy value."""
        print(f'criterion = {self.criterion}')
        print(f'max depth = {self.max_depth}')
        print(f'acc       = {acc}')
        print('====================')

    def predict(self, x_data):
        """Predict every row by alpha-weighted vote of the base trees.

        Args:
            x_data (2-d array-like): one row per sample; columns beyond the
                fitted features are ignored by the trees

        Returns:
            1-d array: predicted label of every row

        Raises:
            RuntimeError: if called before a successful ``fit``.
        """
        if not self.n_trees:
            raise RuntimeError("fit must be called before predict")
        x_data = np.asarray(x_data)
        n_rows = len(x_data)
        votes = np.zeros((n_rows, len(self.classes_)))
        row_index = np.arange(n_rows)
        for tree, alpha in zip(self.n_trees, self.alphas):
            predicted = self._tree_predict(tree, x_data)
            # translate labels to column positions in the vote table
            class_index = np.clip(np.searchsorted(self.classes_, predicted),
                                  0, len(self.classes_) - 1)
            votes[row_index, class_index] += alpha
        return self.classes_[np.argmax(votes, axis=1)]

In [None]:
# Question 4.1: boosted ensembles with 10 and 100 estimators.
ada_10est = AdaBoost(n_estimators=10)
ada_100est = AdaBoost(n_estimators=100)

### Question 4.1
Show the accuracy score of validation data by `n_estimators=10` and `n_estimators=100`, respectively.


## Question 5
implement the Random Forest algorithm by using the CART you just implemented from question 2. You should implement three arguments for the Random Forest.

1. **n_estimators**: The number of trees in the forest. 
2. **max_features**: The number of random select features to consider when looking for the best split
3. **bootstrap**: Whether bootstrap samples are used when building tree


In [None]:
import random


class RandomForest():
    """Ensemble of ``DecisionTree`` classifiers combined by majority vote.

    Every tree is trained on its own random subset of feature columns, and
    optionally on a bootstrap sample of the rows.

    Args:
        n_estimators (int): number of trees in the forest.
        max_features (number): number of feature columns drawn for each tree;
            rounded to an integer and capped at the number of available columns.
        bootstrap (bool): train every tree on rows drawn with replacement.
            When False all rows are used; the feature subset is still drawn.
        criterion (str): impurity criterion passed to every tree.
        max_depth (int or None): depth limit passed to every tree.

    Attributes:
        n_trees (list): the trees of the forest.
        tree_features (list): for every tree, the column indices it was trained on.
        n_features (int): number of feature columns seen by ``fit``.
    """

    def __init__(self, n_estimators, max_features, bootstrap=True, criterion='gini', max_depth=None):
        self.n_estimators = n_estimators
        self.max_features = int(np.round(max_features))
        self.use_bootstrap = bootstrap
        self.criterion = criterion
        self.max_depth = max_depth
        self.n_trees = [DecisionTree(self.criterion, self.max_depth)
                        for _ in range(self.n_estimators)]
        self.tree_features = []
        self.n_features = None

    def fit(self, x_data, y_data):
        """Train every tree of the forest.

        Args:
            x_data (2-d array-like): one row per sample, feature columns only
            y_data (1-d array-like): label of every sample
        """
        x_data = np.asarray(x_data)
        y_data = np.asarray(y_data).reshape(-1)
        n_samples, n_features = x_data.shape
        self.n_features = n_features
        # never ask for more columns than exist, and always use at least one
        n_selected = max(1, min(self.max_features, n_features))

        self.tree_features = []
        for tree in self.n_trees:
            cols = np.sort(np.random.choice(n_features, size=n_selected, replace=False))
            if self.use_bootstrap:
                picked = np.random.randint(n_samples, size=n_samples)
            else:
                picked = np.arange(n_samples)
            tree.fit(x_data[picked][:, cols], y_data[picked])
            # remembered so predict can hand each tree the columns it was trained on
            self.tree_features.append(cols)
        return

    def print_acc(self, acc):
        """Print the forest's hyperparameters together with an accuracy value."""
        print(f'n estimators = {self.n_estimators}')
        print(f'max features = {self.max_features}')
        print(f'boostrap     = {self.use_bootstrap}')
        print(f'criterion    = {self.criterion}')
        print(f'max depth    = {self.max_depth}')
        print(f'acc          = {acc}')
        print('====================')

    def predict(self, x_data, y_data=None):
        """Predict every row by majority vote and report accuracy when labels are known.

        Args:
            x_data (2-d array-like): one row per sample
            y_data (1-d array-like, optional): true labels. When omitted and
                ``x_data`` has more columns than were fitted, its last column
                is taken as the label; otherwise no accuracy is printed.

        Returns:
            1-d array: predicted label of every row

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if len(self.tree_features) != len(self.n_trees) or self.n_features is None:
            raise RuntimeError("fit must be called before predict")
        x_data = np.asarray(x_data)

        x_pred = []
        for row in x_data:
            # traverse() is used directly so each tree does not print its own accuracy
            votes = [tree.traverse(cur_node=tree.root, x_data=row[cols])
                     for tree, cols in zip(self.n_trees, self.tree_features)]
            label, cnt = np.unique(votes, return_counts=True)
            x_pred.append(label[np.argmax(cnt)])
        x_pred = np.array(x_pred)

        truth = None
        if y_data is not None:
            truth = np.asarray(y_data).reshape(-1)
        elif x_data.ndim == 2 and x_data.shape[1] > self.n_features:
            truth = x_data[:, -1]
        if truth is not None and len(x_pred) > 0:
            self.print_acc(float(np.mean(x_pred == truth)))
        return x_pred

### Question 5.1
Using `criterion=gini`, `max_depth=None`, `max_features=sqrt(n_features)`, showing the accuracy score of validation data by `n_estimators=10` and `n_estimators=100`, respectively.


In [None]:
# Question 5.1: forests of 10 and 100 Gini trees with unbounded depth.
# NOTE(review): x_train.shape[1] counts every column of x_train, including the
# last one that is passed as y_data, so the square root is taken of
# n_features + 1 - confirm this is intended.
clf_10tree = RandomForest(n_estimators=10, max_features=np.sqrt(x_train.shape[1]), max_depth=None, criterion='gini')
clf_10tree.fit(x_data=x_train[:, 0:20], y_data=x_train[:, -1])
clf_10tree.predict(x_data=x_test)
clf_100tree = RandomForest(n_estimators=100, max_features=np.sqrt(x_train.shape[1]), max_depth=None, criterion='gini')
clf_100tree.fit(x_data=x_train[:, 0:20], y_data=x_train[:, -1])
clf_100tree.predict(x_data=x_test)

### Question 5.2
Using `criterion=gini`, `max_depth=None`, `n_estimators=10`, showing the accuracy score of validation data by `max_features=sqrt(n_features)` and `max_features=n_features`, respectively.


In [None]:
# Question 5.2: sqrt(n_features) versus all features, 10 trees each.
# NOTE(review): x_train.shape[1] includes the label column, so both values are
# based on n_features + 1. The forests are only constructed here, never fitted.
clf_random_features = RandomForest(n_estimators=10, max_features=np.sqrt(x_train.shape[1]))
clf_all_features = RandomForest(n_estimators=10, max_features=x_train.shape[1])

- Note: Use majority votes to get the final prediction, you may get slightly different results when re-building the random forest model

### Question 6. Train and tune your model on a real-world dataset
Try your best to get a higher accuracy score for your model. After parameter tuning, you can train your model on the full dataset (train + val).
- Feature engineering
- Hyperparameter tuning
- Implement any other ensemble methods, such as gradient boosting. Please note that you **cannot** call any package. Also, only ensemble methods can be used; neural network methods are not allowed.

In [None]:
def train_your_model(data):
    """Placeholder for the Question 6 model: trains nothing and returns None.

    Args:
        data: training data; the notebook passes ``train_df``.
    """
    ## Define your model and training
    return

In [None]:
# Build the Question 6 model from the training frame.
my_model = train_your_model(train_df)

In [None]:
# NOTE(review): train_your_model returns None as written, so this call raises
# AttributeError until the placeholder is implemented.
y_pred = my_model.predict(x_test)

In [None]:
# The predictions must be a flat array with one entry per test sample (500).
assert y_pred.shape == (500, )

## Supplementary
If you have trouble implementing this homework, the TA strongly recommends watching [this video](https://www.youtube.com/watch?v=LDRbO9a6XPU), which explains the Decision Tree model clearly. But don't copy code from any resource; try to finish this homework by yourself!

### DO NOT MODIFY CODE BELOW

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score

# True test labels, read from the `price_range` column of y_test.csv.
y_test = pd.read_csv('y_test.csv')['price_range'].values

# Accuracy of the Question 6 predictions against those labels.
print('Test-set accuarcy score: ', accuracy_score(y_test, y_pred))

In [None]:
def discrete_checker(score, thres, clf, name, x_train, y_train, x_test, y_test):
    """Fit ``clf`` once and award ``score`` if its test accuracy reaches ``thres``.

    Args:
        score: points returned on success
        thres: minimum accuracy required
        clf: classifier instance providing ``fit`` and ``predict``
        name: label printed when the check fails
        x_train, y_train: training features and labels
        x_test, y_test: test features and labels

    Returns:
        ``score`` when ``accuracy_score(y_test, clf.predict(x_test)) >= thres``,
        otherwise 0 after printing "<name> failed".
    """
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    if accuracy_score(y_test, y_pred) - thres >= 0:
        return score
    else:
        print(f"{name} failed")
        return 0


def patient_checker(score, thres, CLS, kwargs, name,
                    x_train, y_train, x_test, y_test, patient=10):
    """Give a randomised classifier up to ``patient`` attempts to reach ``thres``.

    Each attempt builds a fresh ``CLS(**kwargs)``, fits it and scores its
    predictions on the test set.

    Args:
        score: points returned on success
        thres: minimum accuracy required
        CLS: classifier class to instantiate
        kwargs: keyword arguments for the constructor
        name: label printed when every attempt fails
        x_train, y_train: training features and labels
        x_test, y_test: test features and labels
        patient: maximum number of attempts

    Returns:
        ``score`` as soon as one attempt reaches the threshold, otherwise 0
        after printing the failure messages.
    """
    while patient > 0:
        patient -= 1
        # a new instance per attempt, so every try is trained from scratch
        clf = CLS(**kwargs)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        if accuracy_score(y_test, y_pred) - thres >= 0:
            return score
    print(f"{name} failed")
    print("Considering the randomness, we will check it manually")
    return 0


def load_dataset():
    """Download the abalone CSV and build a small binary-classification split.

    The target is 1 when ``Age`` is greater than 15 and 0 otherwise. Training
    rows are every 10th row starting at index 0; test rows are every 20th row
    starting at index 1.

    Returns:
        tuple: (x_train, y_train, x_test, y_test, feature_names), where the x
        and y entries are NumPy arrays and feature_names holds the names of
        the feature columns.
    """
    file_url = "http://storage.googleapis.com/download.tensorflow.org/data/abalone_train.csv"
    df = pd.read_csv(
        file_url,
        names=["Length", "Diameter", "Height", "Whole weight", "Shucked weight",
               "Viscera weight", "Shell weight", "Age"]
    )

    # binarise the age and drop the raw column so it cannot leak into the features
    df['Target'] = (df["Age"] > 15).astype(int)
    df = df.drop(labels=["Age"], axis="columns")

    train_idx = range(0, len(df), 10)
    test_idx = range(1, len(df), 20)

    train_df = df.iloc[train_idx]
    test_df = df.iloc[test_idx]

    x_train = train_df.drop(labels=["Target"], axis="columns")
    feature_names = x_train.columns.values
    x_train = x_train.values
    y_train = train_df['Target'].values

    x_test = test_df.drop(labels=["Target"], axis="columns")
    x_test = x_test.values
    y_test = test_df['Target'].values
    return x_train, y_train, x_test, y_test, feature_names


# Running total of the automatically graded points.
score = 0

# Question 1: two equally frequent classes must give Gini 0.5 and entropy 1.
data = np.array([1, 2])
if abs(gini(data) - 0.5) < 1e-4:
    score += 2.5
else:
    print("gini test failed")

if abs(entropy(data) - 1) < 1e-4:
    score += 2.5
else:
    print("entropy test failed")

# From here on x_train / y_train / x_test / y_test refer to the abalone split.
x_train, y_train, x_test, y_test, feature_names = load_dataset()

# Question 2: deterministic decision-tree checks, one attempt each.
score += discrete_checker(5, 0.9337,
                          DecisionTree(criterion='gini', max_depth=3),
                          "DecisionTree(criterion='gini', max_depth=3)",
                          x_train, y_train, x_test, y_test
                          )

score += discrete_checker(2.5, 0.9036,
                          DecisionTree(criterion='gini', max_depth=10),
                          "DecisionTree(criterion='gini', max_depth=10)",
                          x_train, y_train, x_test, y_test
                          )

score += discrete_checker(2.5, 0.9096,
                          DecisionTree(criterion='entropy', max_depth=3),
                          "DecisionTree(criterion='entropy', max_depth=3)",
                          x_train, y_train, x_test, y_test
                          )

print("*** We will check your result for Question 3 manually *** (5 points)")

# Question 4: AdaBoost checks, retried because the checker allows for randomness.
score += patient_checker(
    7.5, 0.91, AdaBoost, {"n_estimators": 10},
    "AdaBoost(n_estimators=10)",
    x_train, y_train, x_test, y_test
)

score += patient_checker(
    7.5, 0.87, AdaBoost, {"n_estimators": 100},
    "AdaBoost(n_estimators=100)",
    x_train, y_train, x_test, y_test
)

# Question 5: random-forest checks, also retried.
score += patient_checker(
    5, 0.91, RandomForest,
    {"n_estimators": 10, "max_features": np.sqrt(x_train.shape[1])},
    "RandomForest(n_estimators=10, max_features=sqrt(n_features))",
    x_train, y_train, x_test, y_test
)

score += patient_checker(
    5, 0.91, RandomForest,
    {"n_estimators": 100, "max_features": np.sqrt(x_train.shape[1])},
    "RandomForest(n_estimators=100, max_features=sqrt(n_features))",
    x_train, y_train, x_test, y_test
)

score += patient_checker(
    5, 0.92, RandomForest,
    {"n_estimators": 10, "max_features": x_train.shape[1]},
    "RandomForest(n_estimators=10, max_features=n_features)",
    x_train, y_train, x_test, y_test
)

# Questions 3 and 6 are graded by hand, hence the 25-point range printed below.
print("*** We will check your result for Question 6 manually *** (20 points)")
print("Approximate score range:", score, "~", score + 25)
print("*** This score is only for reference ***")
