In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [2]:
dataset = pd.read_csv('cancer_reg.csv', encoding='latin-1')

Check for missing values

In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3047 entries, 0 to 3046
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   avgAnnCount              3047 non-null   float64
 1   avgDeathsPerYear         3047 non-null   int64  
 2   TARGET_deathRate         3047 non-null   float64
 3   incidenceRate            3047 non-null   float64
 4   medIncome                3047 non-null   int64  
 5   popEst2015               3047 non-null   int64  
 6   povertyPercent           3047 non-null   float64
 7   studyPerCap              3047 non-null   float64
 8   binnedInc                3047 non-null   object 
 9   MedianAge                3047 non-null   float64
 10  MedianAgeMale            3047 non-null   float64
 11  MedianAgeFemale          3047 non-null   float64
 12  Geography                3047 non-null   object 
 13  AvgHouseholdSize         3047 non-null   float64
 14  PercentMarried          

Handle missing values

In [4]:
#remove PctSomeCol18_24 column because numver of missing value for this column is large
dataset = dataset.drop(columns=['PctSomeCol18_24'])

In [5]:
#replace null value with mean of correspoding column
dataset.fillna(dataset.mean(), inplace=True)

Get Average of binnedInc

In [6]:
# Use regular expression to remove ( and ] 
dataset['binnedInc'] = dataset['binnedInc'].apply(lambda x: re.sub('[^0-9a-zA-Z:,.]+', '', x))
samples = dataset['binnedInc'].str.split(',',expand=True).astype(float)
avg = (samples[0] + samples[1])/2
dataset['avgbinnedInc'] = avg
dataset.drop(columns=['binnedInc'], inplace=True)

Feature Engineering

In [7]:
#Split the Geography column to two different column - County and State
dataset['County'] = dataset['Geography'].apply(lambda x: x.split(',')[0])
dataset['State'] = dataset['Geography'].apply(lambda x: x.split(',')[1])
dataset.drop(columns=['Geography'], inplace=True)

In [8]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3047 entries, 0 to 3046
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   avgAnnCount              3047 non-null   float64
 1   avgDeathsPerYear         3047 non-null   int64  
 2   TARGET_deathRate         3047 non-null   float64
 3   incidenceRate            3047 non-null   float64
 4   medIncome                3047 non-null   int64  
 5   popEst2015               3047 non-null   int64  
 6   povertyPercent           3047 non-null   float64
 7   studyPerCap              3047 non-null   float64
 8   MedianAge                3047 non-null   float64
 9   MedianAgeMale            3047 non-null   float64
 10  MedianAgeFemale          3047 non-null   float64
 11  AvgHouseholdSize         3047 non-null   float64
 12  PercentMarried           3047 non-null   float64
 13  PctNoHS18_24             3047 non-null   float64
 14  PctHS18_24              

In [9]:
dataset = dataset.set_index('County')

In [10]:
dataset.State.unique

<bound method Series.unique of County
Kitsap County        Washington
Kittitas County      Washington
Klickitat County     Washington
Lewis County         Washington
Lincoln County       Washington
                       ...     
Ellsworth County         Kansas
Finney County            Kansas
Ford County              Kansas
Franklin County          Kansas
Geary County             Kansas
Name: State, Length: 3047, dtype: object>

In [None]:
X = dataset.loc[: , dataset.columns != 'TARGET_deathRate'].values

In [None]:
Y = dataset.iloc[:, 2:3].values

In [None]:
X.shape

In [None]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [31])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [None]:
X

In [None]:
X.shape