In [3]:
import pandas as pd
import numpy as np

In [4]:
data = pd.read_csv('train.csv')
display(data.head(20))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [5]:
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
columns_to_drop = ["PassengerId", "Name", "Ticket", "Cabin", "Embarked"]
data_clean = data.drop(columns_to_drop, axis=1)
data_clean = data_clean.fillna(data_clean["Age"].mean())

In [7]:
data_clean.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
dtype: int64

In [8]:
input_cols = ['Pclass',"Sex","Age","SibSp","Parch","Fare"]
output_cols = ["Survived"]

X = data_clean[input_cols]
Y = data_clean[output_cols]

In [9]:
def entropy(cols):
    counts = np.unique(cols, return_counts = True)
    ent = 0.0
    for ix in counts[1]:
        pi = ix/len(cols)
        ent += -1*pi*np.log2(pi)
    return ent - pi

In [10]:
def info_gain(data_clean, attr, target):
    if isinstance(data_clean[attr].dtype, pd.CategoricalDtype) or data_clean[attr].dtype == object:
        val_uniques = np.unique(data_clean[attr])
        gain = 0.0
        for val in val_uniques:
            subset = data_clean[data_clean[attr] == val] [target]
            x = len(subset)/len(data_clean)
            gain += x * entropy(subset)
        best_gain = entropy(data_clean[target]) - gain
        return best_gain, None
    else:
        val_uniques = np.sort(np.unique(data_clean[attr]))
        best_gain = -np.inf
        best_thresh = 0
        thresholds = [(val_uniques[i] + val_uniques[i + 1])/2 for i in range(len(val_uniques) - 1)]
        for thresh in thresholds:
            left = data_clean[data_clean[attr] <= thresh] [target]
            right = data_clean[data_clean[attr] > thresh] [target]
            gain = entropy(data_clean[target]) - (len(left)/len(data_clean[attr]) * entropy(left) + len(right)/len(data_clean[attr]) * entropy(left))
            if gain > best_gain:
                best_gain = gain
                best_thresh = thresh
        return best_gain, best_thresh
        

In [11]:
class MixedTree:
    def __init__(self, depth = 0, max_depth = 5):
        self.depth = depth
        self.max_depth = max_depth
        self.children = {}
        self.threshold = None
        self.attr = None
        self.target = None
    def train(self, data_clean, features, target):
        if len(np.unique(data_clean[target])) == 1 or len(features) == 0 or self.depth >= self.max_depth:
            if pd.api.types.is_numeric_dtype(data_clean[target]):
                self.target = data_clean[target].mean()
                return
            else:
                self.target = data_clean[target].mode()[0]
                return
        results = [info_gain(data_clean, attr, target) for attr in features]
        gain = [x[0] for x in results]
        best_id = np.argmax(gain)
        best_attr = features[best_id]
        self.attr = best_attr
        if isinstance(data_clean[best_attr].dtype, pd.CategoricalDtype) or data_clean[best_attr].dtype == object:
            val_uniques = np.unique(data_clean[best_attr])
            for v in val_uniques:
                subset = data_clean[data_clean[best_attr] == v]
                child = MixedTree(depth = self.depth + 1, max_depth = self.max_depth)
                child.train(subset, list(filter(lambda f : f != best_attr, features)), target)
                self.children[v] = child
        else:
            best_gain, best_threshold = results[best_id]
            if best_gain == 0 or np.isnan(best_gain):
                self.target = data_clean[target].mean()
                return
            self.threshold = best_threshold
            left = data_clean[data_clean[best_attr] <= best_threshold]
            right = data_clean[data_clean[best_attr] > best_threshold]
            child_left = MixedTree(depth = self.depth + 1, max_depth = self.max_depth)
            child_left.train(left, list(filter(lambda f : f != best_attr, features)), target)
            child_right = MixedTree(depth = self.depth + 1, max_depth = self.max_depth)
            child_right.train(right, list(filter(lambda f : f != best_attr, features)), target)
            self.children['left'] = child_left
            self.children['right'] = child_right
    def predict(self, row):
        if self.attr is None or self.children == {}:
            if self.target is not None:
                    return self.target
            else:
                    return 0
        val = row[self.attr]
        if self.threshold is not None:
            if val <= self.threshold:
                return self.children['left'].predict(row)
            else:
                return self.children['right'].predict(row)
        else:
            if val in self.children and self.children[val] is not None:
                return self.children[val].predict(row)
            else:
                if self.target is not None:
                    return self.target
                else:
                    return 0
                 

In [12]:
split = int(0.7 * len(data_clean))
train_data = data_clean[:split]
test_data = data_clean[split:]

In [13]:
tree = MixedTree(max_depth = 6)
tree.train(train_data, input_cols, "Survived")

Como os dados não estão organizados em uma ordem relevante, podemos utilizar a divisão de dados sem usar o suffle

In [14]:
for i in range (len (test_data)):
    test_row = test_data.iloc[i]
    print(f"Example {i + 1}: Predicted = {tree.predict(test_row)}, Actual = {test_row['Survived']}")


Example 1: Predicted = 0.14743589743589744, Actual = 0
Example 2: Predicted = 0.14743589743589744, Actual = 0
Example 3: Predicted = 0.21505376344086022, Actual = 0
Example 4: Predicted = 0.21505376344086022, Actual = 0
Example 5: Predicted = 0.9814814814814815, Actual = 1
Example 6: Predicted = 0.14743589743589744, Actual = 0
Example 7: Predicted = 0.14743589743589744, Actual = 0
Example 8: Predicted = 0.21505376344086022, Actual = 1
Example 9: Predicted = 0.045454545454545456, Actual = 0
Example 10: Predicted = 0.21505376344086022, Actual = 1
Example 11: Predicted = 0.0, Actual = 0
Example 12: Predicted = 0.5641025641025641, Actual = 0
Example 13: Predicted = 0.7285714285714285, Actual = 1
Example 14: Predicted = 0.14743589743589744, Actual = 0
Example 15: Predicted = 0.4642857142857143, Actual = 0
Example 16: Predicted = 0.6956521739130435, Actual = 0
Example 17: Predicted = 0.13043478260869565, Actual = 0
Example 18: Predicted = 0.14743589743589744, Actual = 0
Example 19: Predicted

In [15]:
def mse(y_test, y_pred):
    y_test = np.array(y_test, dtype=int)
    y_pred = np.array(y_pred, dtype=int)
    return np.mean((y_test - y_pred) ** 2)
y_test = test_data['Survived'].values
y_pred = [tree.predict(test_data.iloc[i]) for i in range (len(test_data))]
val = mse(y_test, y_pred)
rmse = np.sqrt(val)

def rform(y_test, y_pred):
    result = 1 - np.sum((y_test - y_pred)**2)/ np.sum(((y_test - y_test.mean())**2))
    return result
val_r = rform(y_test, y_pred)

                                                    
print(f"R-Squared: {val_r}")
print(f"MSE: {val}")
print(f"RMSE:{rmse}")

R-Squared: 0.3701601906644997
MSE: 0.34701492537313433
RMSE:0.5890797275183847


Sempre dividir os dados para teste e treino