In [121]:
import numpy as np 
import pandas as pd

In [122]:
df = pd.read_csv('Titanic.csv')
df.head()
pd.unique(df.columns)

array(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

In [123]:
df.drop(['PassengerId','Name','Ticket','Cabin',"Embarked"],axis=1,inplace=True)

In [124]:
df['Sex']=df["Sex"].map({"male":0,"female":1}).astype(int)

In [125]:
df['Age'].fillna(df['Age'].mean(),inplace=True)

def z_norm(x):
    return x-x.mean()/x.std()
df.head()

df['Age']=z_norm(df['Age'])

df['Fare']=z_norm(df['Fare'])
df.head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(),inplace=True)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,0,19.715807,1,0,6.601942
1,1,1,1,35.715807,1,0,70.635242
2,1,3,1,23.715807,0,0,7.276942
3,1,1,1,32.715807,1,0,52.451942
4,0,3,0,32.715807,0,0,7.401942


In [126]:
x = df.iloc[:, 1:5]
y = df.iloc[:, :1]
z= df.idxmax(axis=0)
z
# np.argmax(y)
# y

Survived      1
Pclass        0
Sex           1
Age         630
SibSp       159
Parch       678
Fare        258
dtype: int64

In [127]:
def bootstrap_sample(x,y):
    n_samples =x.shape[0]
    indices =np.random.choice(n_samples,size= n_samples,replace=True)
    x_sample = x.iloc[indices,:]
    y_sample = y.iloc[indices,:]
    return x_sample,y_sample

In [128]:
def entropy(y):
    unique,counts =np.unique(y,return_counts=True)
    prob = counts/sum(counts)
    return -np.sum(prob*np.log2(prob+1e-9))

def information_gain(x,y,threshold):
    n=len(y)
    xleft_ind =x <=threshold
    xright_ind= x>threshold
    if (sum(xleft_ind)==0 or sum(xright_ind)==0):
        return 0
    leftentropy=entropy(y[xleft_ind])
    rightentropy= entropy(y[xright_ind])

    nl,nr = sum(xleft_ind),sum(xright_ind)

    weighted_entropy =(nl/n)*leftentropy+(nr/n)*rightentropy

    return entropy(y)-weighted_entropy

class Node:
    def __init__(self,feature=None,threshold=None,left=None,right=None,value=None):
        self.feature =feature
        self.threshold =threshold
        self.left =left
        self.right =right
        self.value =value

class DT:
    def __init__(self,max_depth=None):
        self.max_depth =max_depth
        self.root =None
    
    def fit(self,x,y):
        self.root=self.grow_tree(x,y)
    
    def most_common_label(self,y):
        unique,counts =np.unique(y,return_counts= True)
        return unique[np.argmax(counts)]
    
    def grow_tree(self,x,y,depth=0):

        n_samples,n_features =x.shape
        n_labels= len(np.unique(y))

        if (depth>= self.max_depth or n_labels ==1 or n_samples==0):
            leaf_value = self.most_common_label(y)
            return Node(value=leaf_value)
        
        best_gain=-1
        split_feature,split_thresh =None,None
        for feature_id in range(n_features):
            x_column= x.iloc[:,feature_id]
            thresholds =np.unique(x_column)

            for threshold in thresholds:
                gain =information_gain(x_column,y,threshold)
                if gain>best_gain:
                    best_gain=gain
                    split_thresh=threshold
                    split_feature=feature_id
        
        left_indices = x.iloc[:,split_feature]<=split_thresh
        right_indices = x.iloc[:,split_feature]>split_thresh

        left= self.grow_tree(x[left_indices],y[left_indices],depth+1)
        right =self.grow_tree(x[right_indices],y[right_indices],depth+1)

        return Node(split_feature,split_thresh,left,right)
    
    def traverse_tree(self,x,node):
        if node.value is not None:
            return node.value
        if x[node.feature]<= node.threshold:
            return self.traverse_tree(x,node.left)
        return self.traverse_tree(x,node.right)
    
    def predict(self,x):
        return np.array([self.traverse_tree(x,self.root)])

In [129]:
def train_dt(x_train,y_train,max_depth=None):
    dt= DT(max_depth)
    dt.fit(x_train,y_train)
    return dt

In [130]:
def bagging(X_train, y_train, n_trees=10, max_depth=None):
    trees = []
    for _ in range(n_trees):
        X_sample, y_sample = bootstrap_sample(X_train, y_train)
        tree = DT(max_depth)
        tree.fit(X_sample,y_sample)
        trees.append(tree)
    return trees

In [131]:
def predict_with_bagging(x,tree):
    predictions = np.array([tree.predict(xi) for xi in x])
    return predictions.T

In [132]:
from sklearn.model_selection import train_test_split
# df.dtypes
for i in range(100):
    x = df.iloc[:, 1:5]
    y = df.iloc[:, :1]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    colls=len(x.columns)-1
    n_trees = colls**0.5
    n_trees = int(n_trees)
    max_depth = 5
    x_train = x_train.apply(pd.to_numeric, errors='coerce')  # Converts all columns to numeric
    x_test = x_test.apply(pd.to_numeric, errors='coerce')    # Same for test set

    trees = bagging(x_train, y_train, n_trees=n_trees, max_depth=max_depth)
    
    y_pred  =[]
    yprd= predict_with_bagging(x_test, trees[0])
    
    
    accuracy = np.mean(y_pred == y_test.values.flatten())
    print(f"Accuracy with Bagging: {accuracy*100:.4f}")

UFuncTypeError: ufunc 'greater_equal' did not contain a loop with signature matching types (<class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.StrDType'>) -> None