In [6]:
import pandas as pd
import numpy as np

In [7]:
# Read the dataset
df = pd.read_csv(r'C:\Users\91996\Downloads\Interviews Prep & Assignments\dataset.csv')

In [8]:
df["Output"] = np.random.choice(['Yes', 'No'], len(df))

In [9]:
df.head()

Unnamed: 0,FirstName,LastName,Gender,Age,Output
0,John,Smith,Male,45,Yes
1,Mike,Johnson,Male,34,Yes
2,Sam,Williams,Male,29,Yes
3,David,Jones,Male,62,Yes
4,Chris,Brown,Male,51,Yes


In [10]:
def unique_vals(rows, col):
    """Find the unique values for a column in a dataset"""
    return set([row[col] for row in rows])

In [11]:
unique_vals(df[slice(-1)], 2)

{'e', 'n', 'r', 's', 't'}

In [12]:
def is_numeric(value):
    return isinstance(value, int) or isinstance(value, float)

In [60]:
header = df.columns
header

Index(['FirstName', 'LastName', 'Gender', 'Age', 'Output'], dtype='object')

In [14]:
class Question:
        """A Question is used to partition a dataset.

    This class just records a 'column number' (e.g., 0 for Color) and a
    'column value' (e.g., Green). The 'match' method is used to compare
    the feature value in an example to the feature value stored in the
    question. See the demo below.
    """
        def __init__(self, column, value):
            self.column = column
            self.value = value

        def match(self, example):
            # Compare the feature value in an example
            # to the value in this question

            val = example[self.column]
            if is_numeric(val):
                return val == self.value
            else:
                return val == self.value
            
        def __repr__(self):
            # THis is just a helper method to print
            # the question in a readable format
            condition = "=="
            if is_numeric(self.value):
                condition = ">="
            return "Is %s %s %s?" % (
                header[self.column], condition, str(self.value))

In [15]:
# Qwestion for numeric attrivbute
Question(3,30)

Is Age >= 30?

In [16]:
# Question for categorical attribute

q = Question(2, 'Male')

In [17]:
example = df.loc[152]

In [18]:
example

FirstName      Poppy
LastName     Ramirez
Gender        Female
Age               26
Output           Yes
Name: 152, dtype: object

In [19]:
q.match(example)

False

In [58]:
def partition(rows, question):
        
        """Partitions a dataset.

    For each row in the dataset, check if it matches the question. If
    so, add it to 'true rows', otherwise, add it to 'false rows'.
    """
        true_rows, false_rows = [], []
        for row in rows:
            # row = row.tolist()
            if question.match(row):
                true_rows.append(row)
            else:
                false_rows.append(row)
        return true_rows, false_rows

In [21]:
true_rows, false_rows = partition(df.iterrows(), Question(2, "Male"))

In [46]:
false_rows[0:2]

[['Emma', 'Johnson', 'Female', 23, 'Yes'],
 ['Mia', 'Smith', 'Female', 34, 'No']]

In [37]:
def class_counts(rows):
    
    """Counts the number of each type of example in a dataset."""
    counts = {} # a dict of label:count
    for row in rows:
        # in our dataset format the label is always last col
        print(row)
        # row = row.tolist()
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts


In [38]:
def gini(rows):
    
    """Calculate the Gini Impurity for a list of rows.

    There are a few different ways to do this, I thought this one was
    the most concise. """
    counts = class_counts(rows)
    impurity = 1
    # print(counts)
    for lbl in counts:
        prob_of_lbl = counts[lbl]/float(len(df))
        impurity -= prob_of_lbl**2

    return impurity

In [47]:
some_mixing = false_rows[0:2]
# this will return 0
gini(some_mixing)

['Emma', 'Johnson', 'Female', 23, 'Yes']
['Mia', 'Smith', 'Female', 34, 'No']


0.999914562775001

In [48]:
def info_gain(left, right, current_uncertainity):
    """Information Gain.
    The uncertainty of the starting node, minus the weighted impurity of
    two child nodes.
    """
    p = float(len(left)) / (len(left) + len(right))
    return current_uncertainity - p * gini(left) - (1-p)*gini(right)

In [49]:
# Calculate the uncertainy of our training data.
current_uncertainity = gini(df)

FirstName
LastName
Gender
Age
Output


In [50]:
current_uncertainity

0.9995300952625058

In [51]:
# How much information do we gain by partioning on 'Male'?
true_rows, false_rows = partition(df.iterrows(), Question(2, 'Male'))
info_gain(true_rows, false_rows, current_uncertainity)

['John', 'Smith', 'Male', 45, 'Yes']
['Mike', 'Johnson', 'Male', 34, 'Yes']
['Sam', 'Williams', 'Male', 29, 'Yes']
['David', 'Jones', 'Male', 62, 'Yes']
['Chris', 'Brown', 'Male', 51, 'Yes']
['Mark', 'Davis', 'Male', 73, 'No']
['Paul', 'Miller', 'Male', 37, 'Yes']
['Brian', 'Wilson', 'Male', 25, 'No']
['Kevin', 'Moore', 'Male', 48, 'Yes']
['Jason', 'Taylor', 'Male', 53, 'No']
['Andrew', 'Anderson', 'Male', 39, 'Yes']
['Steven', 'Thomas', 'Male', 31, 'Yes']
['Eric', 'Jackson', 'Male', 28, 'No']
['Jake', 'White', 'Male', 66, 'Yes']
['Ryan', 'Harris', 'Male', 44, 'Yes']
['Adam', 'Martin', 'Male', 59, 'Yes']
['Aaron', 'Thompson', 'Male', 23, 'No']
['Bob', 'Garcia', 'Male', 36, 'Yes']
['Charles', 'Martinez', 'Male', 41, 'Yes']
['Scott', 'Robinson', 'Male', 26, 'Yes']
['Frank', 'Clark', 'Male', 38, 'Yes']
['Jared', 'Rodriguez', 'Male', 48, 'No']
['Patrick', 'Lewis', 'Male', 70, 'Yes']
['Daniel', 'Lee', 'Male', 32, 'Yes']
['George', 'Walker', 'Male', 50, 'Yes']
['Tyler', 'Hall', 'Male', 57, '

0.1704846775596336

In [None]:
# How much information do we gain by partioning on Age 25?
true_rows, false_rows = partition(df.iterrows(), Question(3, 25))
info_gain(true_rows, false_rows, current_uncertainity)

['Brian', 'Wilson', 'Male', 25, 'No']
['Roy', 'Perez', 'Male', 25, 'Yes']
['Autumn', 'Cook', 'Female', 25, 'Yes']
['John', 'Smith', 'Male', 45, 'Yes']
['Mike', 'Johnson', 'Male', 34, 'Yes']
['Sam', 'Williams', 'Male', 29, 'Yes']
['David', 'Jones', 'Male', 62, 'Yes']
['Chris', 'Brown', 'Male', 51, 'Yes']
['Mark', 'Davis', 'Male', 73, 'No']
['Paul', 'Miller', 'Male', 37, 'Yes']
['Kevin', 'Moore', 'Male', 48, 'Yes']
['Jason', 'Taylor', 'Male', 53, 'No']
['Andrew', 'Anderson', 'Male', 39, 'Yes']
['Steven', 'Thomas', 'Male', 31, 'Yes']
['Eric', 'Jackson', 'Male', 28, 'No']
['Jake', 'White', 'Male', 66, 'Yes']
['Ryan', 'Harris', 'Male', 44, 'Yes']
['Adam', 'Martin', 'Male', 59, 'Yes']
['Aaron', 'Thompson', 'Male', 23, 'No']
['Bob', 'Garcia', 'Male', 36, 'Yes']
['Charles', 'Martinez', 'Male', 41, 'Yes']
['Scott', 'Robinson', 'Male', 26, 'Yes']
['Frank', 'Clark', 'Male', 38, 'Yes']
['Jared', 'Rodriguez', 'Male', 48, 'No']
['Patrick', 'Lewis', 'Male', 70, 'Yes']
['Daniel', 'Lee', 'Male', 32, 'Y

0.47077921262058575

In [None]:
def find_best_split(rows):
    """Find the best question to ask by iterating over every
    feature and calculating the information gain"""
    # rows = rows.tolist()
    
    best_gain = 0
    best_question = None
    current_uncertainity = gini(rows)
    # n_features = len(rows[0]) - 1 # number of cols
    n_features = len(header)
    
    for col in range(len(header)):
        # print(col)
        for row in rows:
            # print("row")
            pass
        # row_val = []
        # values = set([row_val.append(row[col]) for row in rows])
        values = set(row[col] for row in rows)
        # print(values)
        for val in values:
            print(val)
            question = Question(col, val)
            # print(question)
            # splitting the data
            true_rows, false_rows = partition(rows, question)

            # skip this split if it doesn't divide the data

            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            # Calc information gain from this split
            gain = info_gain(true_rows, false_rows, current_uncertainity)

            if gain >= best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question


best_gain, best_question = find_best_split(df)
best_question

FirstName
LastName
Gender
Age
Output
0
None
1
None
2
None
3


IndexError: string index out of range

In [None]:
best_gain, best_question = find_best_split(df.iterrows())
best_question

set()
set()
set()
set()
set()
