In [1]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Read the raw CSV into a DataFrame, decoding the file as latin-1.
DATA_PATH = 'cancer_reg.csv'
dataset = pd.read_csv(DATA_PATH, encoding='latin-1')

## Check for missing values

In [3]:
# Summarise each column's dtype and non-null count to spot columns with missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3047 entries, 0 to 3046
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   avgAnnCount              3047 non-null   float64
 1   avgDeathsPerYear         3047 non-null   int64  
 2   TARGET_deathRate         3047 non-null   float64
 3   incidenceRate            3047 non-null   float64
 4   medIncome                3047 non-null   int64  
 5   popEst2015               3047 non-null   int64  
 6   povertyPercent           3047 non-null   float64
 7   studyPerCap              3047 non-null   float64
 8   binnedInc                3047 non-null   object 
 9   MedianAge                3047 non-null   float64
 10  MedianAgeMale            3047 non-null   float64
 11  MedianAgeFemale          3047 non-null   float64
 12  Geography                3047 non-null   object 
 13  AvgHouseholdSize         3047 non-null   float64
 14  PercentMarried          

## Handle missing values

In [4]:
# Remove the PctSomeCol18_24 column because the number of missing values
# in this column is large.
dataset = dataset.drop('PctSomeCol18_24', axis=1)

In [5]:
# Replace the remaining null values with the mean of the corresponding column.
# numeric_only=True restricts the means to the numeric columns: the frame still
# contains the string columns binnedInc and Geography at this point, and
# without the flag pandas >= 2.0 raises a TypeError when averaging them.
# The result is re-assigned instead of using inplace=True.
dataset = dataset.fillna(dataset.mean(numeric_only=True))

## Get the average of binnedInc

In [6]:
# Take the binnedInc column out of the frame; it is replaced by a numeric column below.
raw_bins = dataset.pop('binnedInc')

# Use a regular expression to strip everything except digits, letters, ':', ','
# and '.', which removes the enclosing brackets/parentheses of each interval label.
cleaned_bins = raw_bins.map(lambda label: re.sub('[^0-9a-zA-Z:,.]+', '', label))

# Split each label at the comma into its two bounds and convert them to floats.
bounds = cleaned_bins.str.split(',', expand=True).astype(float)

# Store the average of the two bounds as the new feature.
dataset['avgbinnedInc'] = (bounds[0] + bounds[1]) / 2

## Feature Engineering

In [7]:
# Geography holds a county and a state separated by a comma. Only the state is
# kept as a feature: the county is different for every row, so it is not extracted.
# The text after the comma starts with a space (e.g. ' Washington'), so it is
# stripped to give clean category labels.
dataset['State'] = dataset['Geography'].map(lambda geo: geo.split(',')[1].strip())
dataset = dataset.drop(columns=['Geography'])

In [8]:
# Separate the feature matrix from the target vector.
# Both are selected by column name: the previous positional slice (2:3) for the
# target would silently pick a different column if the column order changed.
TARGET_COLUMN = 'TARGET_deathRate'
X = dataset.drop(columns=[TARGET_COLUMN]).values
# .copy() gives y its own 1-D buffer, independent of the DataFrame.
y = dataset[TARGET_COLUMN].values.copy()

In [9]:
# Display the feature matrix; its last column holds the State strings, not yet encoded.
X

array([[1397.0, 469, 489.8, ..., 6.118831029, 93564.75, ' Washington'],
       [173.0, 70, 411.6, ..., 4.333095578, 49534.0, ' Washington'],
       [102.0, 50, 349.7, ..., 3.729487817, 49534.0, ' Washington'],
       ...,
       [1962.667684, 46, 453.5494221, ..., 7.80919168, 52796.0,
        ' Kansas'],
       [1962.667684, 52, 453.5494221, ..., 7.582938389, 49534.0,
        ' Kansas'],
       [1962.667684, 48, 453.5494221, ..., 8.981723237999999, 41543.55,
        ' Kansas']], dtype=object)

In [10]:
# Display the target vector.
y

array([164.9, 161.3, 174.7, ..., 153.9, 175. , 213.6])

## Encoding categorical data

In [11]:
# One-hot encode the State column and pass every other feature through unchanged.
# State is the last column of X (it is the last column added to the DataFrame),
# so its index is derived from the array shape instead of being hard-coded.
state_col = X.shape[1] - 1

# sparse_threshold=0 makes the transformer always return a dense array; without
# it the output type depends on the overall density, and np.array() applied to
# a sparse matrix would wrap it in a 0-d object array instead of a 2-D matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [state_col])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

## Splitting the dataset into the Training set and Test set

In [12]:
# Hold out 20% of the rows as the test set; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Multiple Linear Regression model

In [13]:
# Train the model (fit returns the fitted estimator itself).
lnr_regressor = LinearRegression().fit(X_train, y_train)

# Predict the test-set targets with the trained model.
y_pred_lnr = lnr_regressor.predict(X_test)

# Evaluate the performance using the R2 score.
r2_score(y_true=y_test, y_pred=y_pred_lnr)

0.6011030729412618

## Decision tree regression

In [14]:
# Train a regression tree limited to depth 6.
# random_state is fixed so that the fitted tree, and therefore the score below,
# is reproducible from run to run, like the seeded train/test split.
tree_regressor = DecisionTreeRegressor(max_depth=6, random_state=0)
tree_regressor.fit(X_train, y_train)

# Predict the test-set targets with the trained model.
y_pred_tree = tree_regressor.predict(X_test)

# Evaluate the performance using the R2 score.
r2_score(y_test, y_pred_tree)

0.4488036098256478

## Random forest regression

In [15]:
# Train a random forest of 40 trees; random_state fixes the seed.
forest_regressor = RandomForestRegressor(n_estimators=40, random_state=0)
forest_regressor.fit(X_train, y_train)

# Predict the test-set targets with the trained model.
y_pred_forest = forest_regressor.predict(X_test)

# Evaluate the performance using the R2 score.
r2_score(y_test, y_pred_forest)

0.5783966446005749

## Support Vector Regression (SVR)


In [16]:
# Feature scaling: standardise the non-one-hot features and the target.
sc_X = StandardScaler()
sc_y = StandardScaler()

# The one-hot State columns come first in X (the encoder is the first
# transformer, the passthrough columns follow). Their count is read from the
# fitted encoder instead of being hard-coded, and they are left unscaled.
n_state_cols = len(ct.named_transformers_['encoder'].categories_[0])

# Work on copies so the unscaled splits used by the other models stay intact.
X_train_svr = X_train.copy()
X_test_svr = X_test.copy()
X_train_svr[:, n_state_cols:] = sc_X.fit_transform(X_train_svr[:, n_state_cols:])
X_test_svr[:, n_state_cols:] = sc_X.transform(X_test_svr[:, n_state_cols:])

# StandardScaler expects 2-D input, so the target is reshaped to one column for
# scaling. The training target is flattened back to 1-D because passing a
# column vector to SVR.fit triggers a DataConversionWarning.
y_train_svr = sc_y.fit_transform(y_train.reshape(-1, 1)).ravel()
# Scaled test target; not needed for the score below, which is computed on the
# original scale.
y_test_svr = sc_y.transform(y_test.reshape(-1, 1))

# Train the model.
svr_regressor = SVR(kernel='rbf')
svr_regressor.fit(X_train_svr, y_train_svr)

# Predict the (scaled) test-set targets with the trained model.
y_pred_svr = svr_regressor.predict(X_test_svr)

# Evaluate the performance using the R2 score on the original target scale.
# predict returns a 1-D array, but inverse_transform requires 2-D input, so the
# predictions are reshaped to a column first and flattened afterwards.
y_pred_svr_unscaled = sc_y.inverse_transform(y_pred_svr.reshape(-1, 1)).ravel()
r2_score(y_test, y_pred_svr_unscaled)

  return f(*args, **kwargs)


0.5253980525435493

### The R2 score is used to select the best model. From the experiments above, we conclude that multiple linear regression fits best, with an R2 score of about 0.6 (which could be improved with more feature engineering and model tuning).