In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports all
# modules before each cell executes so edits to imported .py files are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
warnings.filterwarnings('ignore')

>> #### Import Libraries

In [13]:
import sys
import os

import numpy as np 
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this import fails on newer versions. The replacement is
# sklearn.metrics.ConfusionMatrixDisplay.from_estimator -- confirm the pinned
# scikit-learn version before relying on this line.
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, roc_curve, precision_recall_curve, auc
from plotly.subplots import make_subplots
import itertools
# Initialise plotly's offline notebook mode. These two lines work around the
# "Uncaught Error: Script error for plotly" message when rendering figures.
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)

>> #### Import required modules

In [3]:
# Put the sibling ``scripts`` directory on the import path so the project's
# helper modules can be imported below. The relative path is resolved against
# the current working directory at the time this cell runs.
sys.path.append(os.path.abspath('../scripts'))

# Project-local helpers (loaded from ../scripts).
from load_data import LoadData
from plot import Plot
from overview import Overview
from preprocessing import PreProcess

Initialize the data loader and the plotting helper

In [4]:
# Instantiate the project helpers imported from ../scripts.
# `loader` is used further down to read the CSV file.
loader = LoadData()
# NOTE(review): this rebinds `plot`, which the imports cell also brings in via
# `from plotly.offline import plot, ...`; after this line plotly's offline
# `plot` function is no longer reachable under that name.
plot = Plot()

Load data from a CSV file using DVC

In [None]:
# Coordinates of the DVC-tracked dataset: the Git repository that hosts it,
# the exact commit to read from, and the path of the tracked file.
repo = "https://github.com/jedisam/casual_inference/"
version = "bee80042c3c966ec98d90c247a8ccac0e5fd8aca"
# NOTE(review): dvc.api.get_url documents `path` as relative to the root of
# the repository; the leading "../" looks relative to this notebook instead --
# confirm against the repository layout.
data_path = "../data/data.csv"

In [None]:
# Resolve the remote-storage URL of the DVC-tracked file at the pinned
# revision using dvc.api.get_url.
# `dvc.api` is imported in this cell because no earlier cell imports it;
# without the import the call below raises NameError on a fresh kernel.
import dvc.api

data_url = dvc.api.get_url(
    path=data_path,
    repo=repo,
    rev=version
)
# NOTE(review): `data_url` is not consumed by the following cells, which read
# a local CSV file instead -- confirm whether this lookup is still needed.

In [5]:
# Read the cleaned dataset from the local data directory with the project
# loader. The result is used as a pandas-style frame in the cells below
# (.head(), .shape, .columns, .iloc).
data = loader.read_csv("../data/data_clean.csv")

In [6]:
# Build the project's helper objects around the loaded frame.
# `overview` is used below to report missing values; `prep` presumably exposes
# preprocessing steps -- confirm against scripts/preprocessing.py.
overview = Overview(data)
prep = PreProcess(data)

In [9]:
# Preview the first rows to sanity-check the feature columns and the
# `diagnosis` target column.
data.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0.22862,0.28241,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.62695,0.7119,0.2654,0.41915,0.1189,1
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,11.42,20.38,77.58,386.1,0.1425,0.22862,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.62695,0.6869,0.2575,0.41915,0.173,1
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1


In [10]:
# Report the number of rows (data points) and columns in the dataset.
print(f" There are {data.shape[0]} rows and {data.shape[1]} columns")

 There are 569 rows and 31 columns


In [11]:
# Report the percentage of missing values in the dataset
# (the helper prints a one-line summary).
overview.percent_missing(data)

The dataset contains 0.0 % missing values.


In [14]:
# List the column names; `diagnosis` (the target) is the last column.
data.columns

Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'diagnosis'],
      dtype='object')

In [15]:
# Hold out 20% of the rows for testing, with a fixed seed so the split is
# reproducible. The feature matrix is every column except the `diagnosis`
# target, selected by name rather than by position so the split stays correct
# even if the column order of `data` changes.
# NOTE(review): the split is not stratified; consider passing
# stratify=data['diagnosis'] if class balance across the splits matters.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='diagnosis'),
    data['diagnosis'],
    test_size=0.2,
    random_state=42,
)

>> #### Scale the data

In [17]:
# Learn the scaling parameters from the training split only, then apply that
# same fitted transform to both splits so no test-set statistics leak into
# the scaler.
# NOTE: this cell rebinds X_train / X_test to their scaled versions, so
# running it a second time without re-running the split cell would re-fit on,
# and re-scale, already-scaled data.
scale = RobustScaler()
scale.fit(X_train)
X_train = scale.transform(X_train)
X_test = scale.transform(X_test)