<a href="https://colab.research.google.com/github/jhbellingrath/CS-290-Classwork/blob/main/notebooks/gp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
import math

In [None]:
df=pd.read_csv('https://raw.githubusercontent.com/jhbellingrath/CS-290-Classwork/refs/heads/main/data/survey%20lung%20cancer.csv')

In [None]:
target="LUNG_CANCER"

In [None]:
attributes=df.columns[df.columns!=target]
attributes=attributes.drop("AGE")
attributes

Index(['GENDER', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY', 'PEER_PRESSURE',
       'CHRONIC DISEASE', 'FATIGUE ', 'ALLERGY ', 'WHEEZING',
       'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
       'SWALLOWING DIFFICULTY', 'CHEST PAIN'],
      dtype='object')

In [None]:
# entropy function for qualitative
def entropy(attribute):
  entropy = 0
  total = len( df )
  vals = df[attribute].unique()
  for val in vals:
    counts = len(df[ df[attribute] == val ])
    weight = counts / total
    props = df[ df[attribute] == val ][target].value_counts( normalize=True )
  for p in props.array:
    entropy =  entropy - weight*(p*math.log2(p))
  return entropy

In [None]:
# Gini function for qualitative
def gini(attribute):
    vals=df[attribute].unique()
    impurity=1
    for val in vals:
      impurity-=(len(df[df[attribute]==val])/len(df))**2
    return impurity

In [None]:
# Computes Entropies and Gini impurities for each qualitative split
entropies=[]
ginis=[]
for attribute in attributes:
  entropies.append([entropy(attribute),attribute])
  ginis.append([gini(attribute),attribute])
entropies.sort()
ginis.sort()
print(entropies)
print(ginis)

[[0.10559637699403665, 'ALLERGY '], [0.173117099729669, 'PEER_PRESSURE'], [0.21992131477035415, 'CHRONIC DISEASE'], [0.22188330357899055, 'SHORTNESS OF BREATH'], [0.23466973341918906, 'FATIGUE '], [0.28013677250726143, 'SMOKING'], [0.28971295630473837, 'GENDER'], [0.3068095912785343, 'YELLOW_FINGERS'], [0.31747205281380564, 'CHEST PAIN'], [0.3221585989549134, 'COUGHING'], [0.33468388433363483, 'ANXIETY'], [0.3362029448298184, 'WHEEZING'], [0.34768484145841194, 'ALCOHOL CONSUMING'], [0.3907992765733175, 'SWALLOWING DIFFICULTY']]
[[0.44004566353515356, 'FATIGUE '], [0.4603638420209256, 'SHORTNESS OF BREATH'], [0.4874268179009437, 'COUGHING'], [0.4903174453556205, 'YELLOW_FINGERS'], [0.49203506456781987, 'SMOKING'], [0.49358511117395076, 'ALCOHOL CONSUMING'], [0.49358511117395076, 'ALLERGY '], [0.49358511117395076, 'CHEST PAIN'], [0.49358511117395076, 'WHEEZING'], [0.49810957153779284, 'SWALLOWING DIFFICULTY'], [0.49882175511358273, 'GENDER'], [0.49995287020454326, 'CHRONIC DISEASE'], [0.

In [None]:
#  entropy for a quantitative variable
def entropy_quantitative(attribute):
  entropies=[]
  total = len(df)
  vals = df[attribute].unique()
  for val in vals:
    entropy_value = 0
    # Left split
    left = df[df[attribute] <= val][[attribute, target]]
    props = left[target].value_counts(normalize=True)
    weight = len(left) / total
    for prop in props.array:
      entropy_value -= weight * prop * math.log2(prop)

    # Right split
    right = df[df[attribute] > val][[attribute, target]]
    props = right[target].value_counts(normalize=True)
    weight = len(right) / total
    for prop in props.array:
      entropy_value -= weight * prop * math.log2(prop)

    entropies.append([entropy_value,val])
  entropies.sort()
  return entropies
  #returns a list of [entropy value,split value]

In [None]:
Age_entropies=entropy_quantitative("AGE")
Age_entropies

[[0.5308987710238049, 71],
 [0.5312781918116776, 69],
 [0.5329683781322272, 70],
 [0.536762079214385, 72],
 [0.5372453003865665, 21],
 [0.5372453003865665, 81],
 [0.5389240759469978, 73],
 [0.5403639319600874, 63],
 [0.5403704065336898, 68],
 [0.5413412661776232, 64],
 [0.541865421817934, 47],
 [0.5419171016038008, 74],
 [0.5428368596279726, 46],
 [0.5431019303885338, 38],
 [0.5433445346854594, 65],
 [0.5435642785740513, 60],
 [0.5436483858080063, 48],
 [0.543673431972618, 61],
 [0.5441094582555415, 75],
 [0.5443048692770733, 66],
 [0.5443885623004059, 54],
 [0.5449207067118653, 59],
 [0.5449269217235458, 39],
 [0.5449269217235458, 79],
 [0.5452315737098028, 67],
 [0.5453678280067275, 49],
 [0.5455754495027709, 76],
 [0.5456500469398374, 62],
 [0.5458932366463729, 78],
 [0.5462869742620604, 53],
 [0.54645058643079, 44],
 [0.5464872138073784, 57],
 [0.5467598985291404, 56],
 [0.5467661501842692, 77],
 [0.5468142008091454, 52],
 [0.5468750719468112, 58],
 [0.5469124217816319, 55],
 [0.54

In [None]:
# gini impurity for a quantitative variable with a pecified split value
def gini_quantitative(attribute,split_val):
  impurity = 1
  # Left split
  impurity -= (len(df[df[attribute] <= split_val]) / len(df))**2

  # Right split
  impurity -= (len(df[df[attribute] > split_val]) / len(df))**2
  return impurity

In [None]:
print(gini_quantitative("AGE",71))

0.24423707334443498


In [None]:
# LabelEncoder
le = LabelEncoder()

In [None]:
# target variable
df["LUNG_CANCER"] = le.fit_transform(df["LUNG_CANCER"])

In [None]:
# Compare
X = df.drop(columns=["LUNG_CANCER"])
y = df["LUNG_CANCER"]

In [None]:
# Iterate over all columns in X
for col in X.columns:
    if X[col].dtype == 'object':  # Check if the column is object
        X[col] = le.fit_transform(X[col])  # Fit and transform


In [None]:
clf = DecisionTreeClassifier(criterion='entropy')
clf.fit(X, y)

In [None]:
# Get the first split attribute
first_split_attribute = X.columns[clf.tree_.feature[0]]
print("First attribute chosen by DecisionTreeClassifier:", first_split_attribute)

First attribute chosen by DecisionTreeClassifier: ALLERGY 


In [None]:
df2=pd.read_csv('https://raw.githubusercontent.com/jhbellingrath/CS-290-Classwork/refs/heads/main/data/Housing_Prices.csv')

In [None]:
df2.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built
0,376000.0,3,2.0,1340,1384,3.0,0,0,3,1340,0,2008
1,800000.0,4,3.25,3540,159430,2.0,0,0,3,3540,0,2007
2,2238888.0,5,6.5,7270,130017,2.0,0,0,3,6420,850,2010
3,324000.0,3,2.25,998,904,2.0,0,0,3,798,200,2007
4,549900.0,5,2.75,3060,7015,1.0,0,0,5,1600,1460,1979
