In [None]:
 #Problem-2: Implementation of B Tree
 #Store customer data from the given file in B Tree using First-Name as key
 # Store the same data in any built-in Collection.
 # Search for the same 100 random keys in both data structures and count the comparisons.
 # Print the results. Ideally, searching the B-Tree should need fewer comparisons than scanning the built-in collection.

B Tree Class

In [None]:
# Create class for Nodes
class BTreeNode:
    """A single node of a B-Tree.

    Attributes:
        leaf: True when the node has no children (new nodes default to leaves).
        keys: keys stored in this node.
        values: values stored in parallel with ``keys`` (values[i] belongs to keys[i]).
        children: child BTreeNode objects; stays empty for a leaf.
    """

    def __init__(self, leaf=True):
        # Every node starts out empty; the tree fills these lists on insert/split.
        self.leaf = leaf
        self.keys, self.values = [], []
        self.children = []


# Create class for Tree
class BTree:
    """B-Tree of minimum degree ``t`` that maps keys to values.

    A node holds at most ``2t - 1`` keys. Insertion splits any full node it
    meets on the way down, so the leaf that finally receives the key always
    has room. Duplicate keys are allowed: each ``insert`` adds a new entry,
    and ``search`` returns the first matching entry met on the way down.
    """

    def __init__(self, t):
        """Create an empty tree.

        Args:
            t: minimum degree of the tree; each node holds at most 2t - 1 keys.

        Raises:
            ValueError: if ``t`` is smaller than 2 (a split would otherwise
                produce empty nodes).
        """
        if t < 2:
            raise ValueError(f"B-Tree minimum degree t must be >= 2, got {t!r}")
        self.root = BTreeNode(leaf=True)  # initial root node is an empty leaf
        self.t = t  # minimum degree; maximum keys per node is 2t - 1

    def _is_full(self, node):
        """Return True when ``node`` already holds the maximum of 2t - 1 keys."""
        return len(node.keys) == (2 * self.t) - 1

    @staticmethod
    def _upper_bound(keys, key):
        """Return the index just past the last entry of ``keys`` that is <= ``key``.

        Scans from the right using only ``<``, so an equal key is placed after
        the existing equal keys.
        """
        i = len(keys)
        while i > 0 and key < keys[i - 1]:
            i -= 1
        return i

    def insert(self, key, value):
        """Insert ``key`` with its ``value``; an existing equal key is kept as well."""
        root = self.root

        # A full root is split first: the old root becomes child 0 of a new,
        # non-leaf root, which is the only way the tree grows in height.
        if self._is_full(root):
            new_root = BTreeNode(leaf=False)
            new_root.children.append(root)
            self._split_child(new_root, 0)
            self.root = new_root
            self._insert_nonfull(new_root, key, value)
        else:
            self._insert_nonfull(root, key, value)

    def _split_child(self, parent, i):
        """Split the full child ``parent.children[i]`` around its median key.

        The median key/value moves up into ``parent`` at index ``i``; the keys
        to its right (and the matching children) move into a new sibling that
        is inserted at ``parent.children[i + 1]``.
        """
        t = self.t
        y = parent.children[i]                 # the full node being split
        new_node = BTreeNode(leaf=y.leaf)      # sibling is a leaf iff y is a leaf

        # Median key and value that will move up to the parent
        mid_key = y.keys[t - 1]
        mid_val = y.values[t - 1]

        # new_node gets the right half of y's keys/values
        new_node.keys = y.keys[t:]
        new_node.values = y.values[t:]

        # If y has children, the right half of them moves to the new node as well
        if not y.leaf:
            new_node.children = y.children[t:]
            y.children = y.children[:t]

        # y keeps the left half of keys/values
        y.keys = y.keys[:t - 1]
        y.values = y.values[:t - 1]

        # Hook the new sibling and the median into the parent
        parent.children.insert(i + 1, new_node)
        parent.keys.insert(i, mid_key)
        parent.values.insert(i, mid_val)

    def _insert_nonfull(self, node, key, value):
        """Insert into the subtree rooted at ``node``, which must not be full."""
        # Walk down to a leaf, splitting any full child before stepping into it
        # so the node we end up in always has room for one more key.
        while not node.leaf:
            i = self._upper_bound(node.keys, key)
            if self._is_full(node.children[i]):
                self._split_child(node, i)
                # The median moved up to node.keys[i]; pick the correct half.
                if key > node.keys[i]:
                    i += 1
            node = node.children[i]

        # Place the key (and its value) after all keys that are <= key.
        i = self._upper_bound(node.keys, key)
        node.keys.insert(i, key)
        node.values.insert(i, value)

    def search(self, key, node=None, comparisons=0):
        """Look up ``key`` and count how many stored keys were examined.

        Args:
            key: key to look for.
            node: node to start from; defaults to the root.
            comparisons: starting value of the counter (added to the result).

        Returns:
            ``(value, comparisons)`` when the key is found, otherwise
            ``(None, comparisons)``. Every stored key the scan looks at counts
            as one comparison, including the key that stops the scan.
        """
        if node is None:  # if no node is provided, start at the root
            node = self.root

        while True:
            # Find the first index i such that key <= node.keys[i]
            i = 0
            n_keys = len(node.keys)
            while i < n_keys:
                comparisons += 1  # node.keys[i] is examined
                if not key > node.keys[i]:
                    break
                i += 1

            # If key is found in this node, return its value and the count
            if i < n_keys and key == node.keys[i]:
                return node.values[i], comparisons
            # A leaf without the key means the key is not in the tree
            if node.leaf:
                return None, comparisons
            # Otherwise continue in the child between keys[i-1] and keys[i]
            node = node.children[i]

    def print_tree(self, node=None, level=0):
        """Print the keys of each node, indented four spaces per tree level."""
        if node is None:
            node = self.root
        print("    " * level + str(node.keys))
        if not node.leaf:
            for child in node.children:
                self.print_tree(child, level + 1)

Read in file & create B Tree

In [None]:
import os                       #Used to manage file system
import urllib.request           #Used to obtain file from GitHub
import pandas as pd             #Used for data manipulation
import csv

# B-Tree of minimum degree 3 that will hold one entry per CSV row,
# keyed by the customer's first name.
customer_tree = BTree(t=3)

#JVinas 2025-11-19 - Modified code to download csv from GitHub to ensure usability

source_file_url = "https://raw.githubusercontent.com/joelvinas/COMP-SCI_5501/refs/heads/main/Assignment4/Data/customers-100.csv"
filename = "/tmp/customers-100.csv" # Use a temporary path

# Download the file. If the download fails, fall back to a copy left by an
# earlier run; if there is none, re-raise so the cell stops with the real
# cause instead of a later, unrelated "file not found" error from open().
try:
    urllib.request.urlretrieve(source_file_url, filename)
    print(f"Source File downloaded successfully to: {filename}")
except Exception as e:
    print(f"Error downloading source file: {e}")
    if not os.path.exists(filename):
        raise
    print(f"Using previously downloaded copy at: {filename}")

# Insert every row; the value is a one-element list holding the row dict.
with open(filename, newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        key = row["First Name"]
        customer_tree.insert(key, [row])

customer_tree.print_tree()

Source File downloaded successfully to: /tmp/customers-100.csv
['Darren', 'Jenna', 'Maxwell', 'Samuel']
    ['Alexandria', 'Brett', 'Carl', 'Chad', 'Colleen']
        ['Aimee', 'Alejandro']
        ['Alison', 'Anita', 'Brady']
        ['Brittany', 'Bruce', 'Bryan', 'Candice']
        ['Caroline', 'Cassidy']
        ['Chloe', 'Clarence', 'Clifford']
        ['Collin', 'Corey', 'Dakota', 'Danny', 'Darrell']
    ['Faith', 'Frederick', 'Greg']
        ['Debra', 'Duane', 'Eddie', 'Eileen', 'Emma']
        ['Faith', 'Fernando', 'Fred']
        ['Gabriel', 'Geoffrey', 'Gerald', 'Gloria']
        ['Hunter', 'Jack', 'Janet']
    ['Kaitlyn', 'Kelli', 'Kristine', 'Luis']
        ['Joanna', 'Joanna', 'Joanne', 'Jordan']
        ['Karl', 'Kathleen', 'Kathy']
        ['Kent', 'Kiara']
        ['Latoya', 'Laurie', 'Leslie', 'Linda', 'Lori']
        ['Lynn', 'Lynn', 'Makayla', 'Marcus']
    ['Natalie', 'Preston', 'Riley']
        ['Michelle', 'Miranda']
        ['Nicholas', 'Nina', 'Patricia', 'Phylli

Since the assignment asks for 100 searches, I use `random.choice`, which may pick the same name more than once. The professor did not require unique values, so this is fine; it also does not bias the comparison, because the list and the tree are both searched with exactly the same keys, duplicates included.

Hence, only two lines are replaced in the cell below.

In [None]:
# Store the same data in any built-in Collection.
import csv
import random

# Load every CSV row (one dict per customer) into a plain Python list; this
# list is the built-in collection that the B-Tree is compared against.
with open(filename, newline="", encoding="utf-8") as f:
    rows = list(csv.DictReader(f))

# Candidate search keys: the first name of every row (repeats included).
all_first_names = [row["First Name"] for row in rows]

# A dedicated, seeded generator makes the sampled keys (and therefore the
# comparison counts reported below) identical on every Restart & Run All,
# without touching the global random state.
SEARCH_SEED = 42
NUM_SEARCHES = 100
_search_rng = random.Random(SEARCH_SEED)

# Always generate 100 search keys - repeated names are allowed
search_keys = [_search_rng.choice(all_first_names) for _ in range(NUM_SEARCHES)]

def list_search_with_count(lst, key):
    """Linear search for the first row whose "First Name" equals ``key``.

    Returns:
        ``(row, comparisons)`` for the first match, or ``(None, comparisons)``
        when no row matches; ``comparisons`` is the number of rows examined.
    """
    examined = 0
    for examined, record in enumerate(lst, start=1):
        if record["First Name"] == key:
            return record, examined
    # No match: every row was examined (0 when lst is empty).
    return None, examined

# Replaced the dictionary with a list because a dictionary lookup always takes just one comparison, no matter how big the dataset is, while a list takes multiple comparisons.

In [None]:
# Search same 100 random words from both data structures and count comparisons
# Per-search comparison counts and hit totals for each data structure.
bt_counts = []
bt_hits = 0
list_counts = []
list_hits = 0

for k in search_keys:
    # --- B-TREE SEARCH ---
    # Results get real names instead of "_", which IPython uses for the
    # last cell output and which should not be overwritten or read back.
    bt_value, c_bt = customer_tree.search(k)
    bt_counts.append(c_bt)
    if bt_value is not None:
        bt_hits += 1

    # --- LIST SEARCH ---
    list_row, c_list = list_search_with_count(rows, k)
    list_counts.append(c_list)
    if list_row is not None:
        list_hits += 1


# Summary line per structure: hit count, then average / min / max comparisons.
print(f"B-Tree: hits={bt_hits}/{len(search_keys)}, "
      f"avg={sum(bt_counts)/len(bt_counts):.2f}, "
      f"min={min(bt_counts)}, max={max(bt_counts)}")

print(f"List:   hits={list_hits}/{len(search_keys)}, "
      f"avg={sum(list_counts)/len(list_counts):.2f}, "
      f"min={min(list_counts)}, max={max(list_counts)}")

print("First 10 B-Tree comparisons:", bt_counts[:10])
print("First 10 List comparisons:", list_counts[:10])

B-Tree: hits=100/100, avg=5.84, min=1, max=11
List:   hits=100/100, avg=46.77, min=1, max=99
First 10 B-Tree comparisons: [6, 8, 6, 9, 8, 5, 10, 2, 5, 11]
First 10 List comparisons: [31, 89, 93, 4, 40, 1, 26, 15, 42, 99]
