# Explainable AI Cancer Data Set Example 
The book that this notebook is implementing is available at: https://christophm.github.io/interpretable-ml-book/

# Import Relevant libraries

In [1]:
#importing pandas to read the CSV file. The CSV file can be downloaded from the link UCI- Machine Learning Laboratory 
#The link to download the data is https://archive.ics.uci.edu/ml/datasets/Cervical+cancer+%28Risk+Factors%29
#Download the data set and 
# put it in the same folder as this file 
import pandas as pd 
import tensorflow as tf 
from tensorflow import keras
from sklearn import linear_model
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from scipy import stats
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression

# Data Properties:
The cervical cancer dataset contains indicators and risk factors for predicting whether a woman will get cervical cancer. The features include demographic data (such as age), lifestyle, and medical history. 
The subset of data features used in the book's examples are:

- Age in years
- Number of sexual partners
- First sexual intercourse (age in years)
- Number of pregnancies
- Smoking yes or no
- Smoking (in years)
- Hormonal contraceptives yes or no
- Hormonal contraceptives (in years)
- Intrauterine device yes or no (IUD)
- Number of years with an intrauterine device (IUD)
- Has patient ever had a sexually transmitted disease (STD) yes or no
- Number of STD diagnoses
- Time since first STD diagnosis
- Time since last STD diagnosis
- The biopsy results "Healthy" or "Cancer". Target outcome.

In [23]:
#importing data into a data frame
cancer_data_pd=pd.read_csv('https://github.com/christophM/interpretable-ml-book/blob/master/data/cervical.csv')
cancer_data_pd.shape

ParserError: Error tokenizing data. C error: Expected 1 fields in line 47, saw 2


In [None]:
#displaying feature names 
cancer_data_pd.all()

In [4]:
#features used by the book
feature_names=['Age','Number of sexual partners','Num of pregnancies','Smokes','Smokes (years)','Hormonal Contraceptives','Hormonal Contraceptives (years)','IUD','STDs','STDs (number)','STDs: Time since first diagnosis','STDs: Time since last diagnosis','Biopsy']

In [5]:
#displaying the book features
feature_names

['Age',
 'Number of sexual partners',
 'Num of pregnancies',
 'Smokes',
 'Smokes (years)',
 'Hormonal Contraceptives',
 'Hormonal Contraceptives (years)',
 'IUD',
 'STDs',
 'STDs (number)',
 'STDs: Time since first diagnosis',
 'STDs: Time since last diagnosis',
 'Biopsy']

In [6]:
#creating a new pandas data frame 
cancer_data_book_pd=cancer_data_pd[feature_names]
cancer_data_book_np=cancer_data_book_pd.to_numpy()
cancer_data_book_np.shape

(858, 13)

# Making a Logistic Regression Model
- Define Model 
- Split Data 
- Apply Model 
- Check accracy 

In [23]:
#defining a regressor
reg= LogisticRegression
#predicting count based on all other features 
#Converting the df into a numpy array 
#X represents the input features 
#Y represents the Ouput features 


#Spliting into input and output features

X=cancer_data_book_np[:,:12]
Y=cancer_data_book_np[:,12]
#X=normalize(X)
#Splitting the data into training and testing 
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
#fitting the model 
reg.fit(X,Y)
#Predicting the values 
y_pred = reg.predict(X_test)
#As it is already intrepretable we just look at the weights within the model




TypeError: fit() missing 1 required positional argument: 'y'

In [19]:
cancer_data_book_np[:,1]

array(['4.0', '1.0', '1.0', '5.0', '3.0', '3.0', '3.0', '1.0', '1.0',
       '3.0', '3.0', '1.0', '4.0', '2.0', '2.0', '3.0', '4.0', '3.0',
       '2.0', '2.0', '2.0', '3.0', '1.0', '1.0', '3.0', '3.0', '5.0',
       '2.0', '3.0', '6.0', '3.0', '2.0', '3.0', '3.0', '2.0', '3.0',
       '2.0', '?', '1.0', '2.0', '?', '2.0', '3.0', '3.0', '3.0', '1.0',
       '2.0', '2.0', '2.0', '3.0', '3.0', '2.0', '5.0', '3.0', '1.0',
       '4.0', '5.0', '3.0', '2.0', '3.0', '3.0', '3.0', '1.0', '1.0',
       '2.0', '3.0', '3.0', '4.0', '3.0', '5.0', '4.0', '?', '2.0', '2.0',
       '2.0', '2.0', '5.0', '3.0', '3.0', '2.0', '1.0', '3.0', '2.0',
       '3.0', '2.0', '1.0', '2.0', '5.0', '3.0', '1.0', '1.0', '3.0',
       '4.0', '1.0', '3.0', '4.0', '5.0', '1.0', '1.0', '3.0', '3.0',
       '5.0', '3.0', '3.0', '2.0', '4.0', '4.0', '5.0', '2.0', '2.0',
       '3.0', '7.0', '3.0', '5.0', '3.0', '3.0', '3.0', '3.0', '2.0',
       '3.0', '1.0', '5.0', '3.0', '5.0', '3.0', '3.0', '2.0', '2.0',
       '1.0'