In [5]:
import numpy as np
import pandas as pd
import csv

In [9]:
import csv

# Load the Titanic dataset with the standard-library csv module.
# Each row becomes a dict keyed by the CSV header; every value is still a string.
# newline='' hands line-ending handling to the csv module, as its documentation
# requires, so quoted fields containing line breaks are parsed correctly.
with open('Titanic-Dataset.csv', 'r', newline='') as file:
    reader = csv.DictReader(file)
    titanic_data = list(reader)


In [11]:

# Perform basic preprocessing: cast the columns the model uses from the strings
# produced by csv.DictReader to numeric types (rows are updated in place).
for entry in titanic_data:
    # Blank Age/Fare cells become 0.0 -- a placeholder, not a real measurement.
    entry['Age'] = float(entry['Age']) if entry['Age'] else 0.0
    entry['Fare'] = float(entry['Fare']) if entry['Fare'] else 0.0
    entry['Survived'] = int(entry['Survived'])
    # Pclass is used as a model feature and is given as an int when predicting;
    # if it stayed a string, comparing the two would raise TypeError.
    entry['Pclass'] = int(entry['Pclass'])


In [15]:

# Define decision tree model
def build_tree(data, features, max_depth=5, min_samples_split=10):
    """Grow a binary decision tree that predicts the 'Survived' label.

    Args:
        data: List of dict rows. Each row must contain 'Survived' (1 = survived)
            and every key named in ``features``. The list itself is not mutated.
        features: Keys to consider for splitting. All values of one feature
            must be mutually comparable with ``<=`` / ``>``.
        max_depth: Nodes at this depth are never split (root is depth 0).
        min_samples_split: Nodes with fewer rows than this are never split.

    Returns:
        A nested dict. Internal nodes have 'feature', 'threshold', 'left' and
        'right' (rows with ``row[feature] <= threshold`` go left). Leaf nodes
        have a single 'data' key holding the training rows that reached them.
    """
    def impurity(rows):
        """Return the Gini impurity of ``rows`` (0 for an empty or pure set)."""
        total = len(rows)
        if total == 0:  # Avoid division by zero
            return 0.0
        survival_rate = sum(1 for row in rows if row['Survived'] == 1) / total
        return 1 - survival_rate ** 2 - (1 - survival_rate) ** 2

    def split(node, depth):
        """Recursively replace ``node``'s rows with the best available split."""
        rows = node['data']
        if depth >= max_depth or len(rows) < min_samples_split:
            return
        # Loop-invariant: computed once instead of once per candidate split.
        parent_impurity = impurity(rows)
        if parent_impurity == 0:
            return  # Already pure; splitting cannot improve anything.

        total = len(rows)
        best_gain = float('-inf')
        best_feature = None
        best_threshold = None
        best_left = None
        best_right = None
        for feature in features:
            # Sorted so that ties between equally good splits resolve the same
            # way on every run (set iteration order is not guaranteed).
            for value in sorted(set(row[feature] for row in rows)):
                left_rows = [row for row in rows if row[feature] <= value]
                right_rows = [row for row in rows if row[feature] > value]
                if not left_rows or not right_rows:
                    continue  # Degenerate split: nothing is separated.
                gain = (
                    parent_impurity
                    - (len(left_rows) / total) * impurity(left_rows)
                    - (len(right_rows) / total) * impurity(right_rows)
                )
                # Keep the split with the LARGEST impurity reduction.
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = value
                    best_left = left_rows
                    best_right = right_rows

        if best_feature is None:
            return  # Every feature is constant in this node.

        node['feature'] = best_feature
        node['threshold'] = best_threshold
        node['left'] = {'data': best_left}
        node['right'] = {'data': best_right}
        del node['data']
        split(node['left'], depth + 1)
        split(node['right'], depth + 1)

    root = {'data': data}
    split(root, 0)
    return root


In [16]:

# Define prediction function
def predict(tree, entry):
    """Predict the 'Survived' label for ``entry`` using a fitted tree.

    Args:
        tree: Nested dict. Internal nodes carry 'feature', 'threshold', 'left'
            and 'right'; leaf nodes carry 'data', the training rows that
            reached the leaf (each with a 'Survived' key).
        entry: Dict holding a value for every feature tested on the path from
            the root to a leaf.

    Returns:
        1 if strictly more than half of the leaf's training rows survived,
        otherwise 0. Ties and empty leaves yield 0.
    """
    node = tree
    # Walk down until a leaf (a node that still holds its training rows).
    while 'data' not in node:
        if entry[node['feature']] <= node['threshold']:
            node = node['left']
        else:
            node = node['right']

    samples = node['data']
    if not samples:
        return 0  # No evidence in this leaf; fall back to "did not survive".
    survived = sum(1 for sample in samples if sample['Survived'] == 1)
    # Majority vote over the training rows in the leaf.
    return 1 if survived * 2 > len(samples) else 0


In [17]:

# Choose the input columns for the model, then fit the tree on the loaded rows
features = [
    'Age',
    'Sex',
    'Pclass',
    'Fare',
]
tree_model = build_tree(data=titanic_data, features=features)


In [18]:

# Demo: classify one hand-written passenger record with the fitted tree
new_entry = {
    'Age': 25,
    'Sex': 'female',
    'Pclass': 2,
    'Fare': 50,
}
prediction = predict(tree=tree_model, entry=new_entry)
print(prediction)



1
