In [1]:
import numpy as np
import pandas as pd

import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import plotly.tools as tls
import plotly.offline as pyo
from plotly.offline import init_notebook_mode, iplot
# Enable plotly's offline notebook rendering for the figures drawn below.
# NOTE(review): connected=True presumably loads plotly.js from the web instead of
# embedding it in the notebook — confirm against the plotly.offline docs.
init_notebook_mode(connected=True)  

In [2]:
# Let pandas display up to 50 columns so wide frames are not truncated.
pd.set_option('display.max_columns', 50)

In [3]:
# Load the raw table; the path is relative to the notebook's working directory.
raw_cancer = pd.read_csv('../data/data_cancer.csv')

In [4]:
# Peek at five random rows (no random_state is passed, so the rows differ between runs).
raw_cancer.sample(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
69,859487,B,12.78,16.49,81.37,502.5,0.09831,0.05234,0.03653,0.02864,0.159,0.05653,0.2368,0.8732,1.471,18.33,0.007962,0.005612,0.01585,0.008662,0.02254,0.001906,13.46,19.76,85.67,554.9,0.1296,0.07061,0.1039,0.05882,0.2383,0.0641
348,898690,B,11.47,16.03,73.02,402.7,0.09076,0.05886,0.02587,0.02322,0.1634,0.06372,0.1707,0.7615,1.09,12.25,0.009191,0.008548,0.0094,0.006315,0.01755,0.003009,12.51,20.79,79.67,475.8,0.1531,0.112,0.09823,0.06548,0.2851,0.08763
526,91813701,B,13.46,18.75,87.44,551.1,0.1075,0.1138,0.04201,0.03152,0.1723,0.06317,0.1998,0.6068,1.443,16.07,0.004413,0.01443,0.01509,0.007369,0.01354,0.001787,15.35,25.16,101.9,719.8,0.1624,0.3124,0.2654,0.1427,0.3518,0.08665
407,905190,B,12.85,21.37,82.63,514.5,0.07551,0.08316,0.06126,0.01867,0.158,0.06114,0.4993,1.798,2.552,41.24,0.006011,0.0448,0.05175,0.01341,0.02669,0.007731,14.4,27.01,91.63,645.8,0.09402,0.1936,0.1838,0.05601,0.2488,0.08151
273,8910996,B,9.742,15.67,61.5,289.9,0.09037,0.04689,0.01103,0.01407,0.2081,0.06312,0.2684,1.409,1.75,16.39,0.0138,0.01067,0.008347,0.009472,0.01798,0.004261,10.75,20.88,68.09,355.2,0.1467,0.0937,0.04043,0.05159,0.2841,0.08175


In [5]:
def summary_statistics_num(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return one row of summary statistics per numerical feature of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; non-numerical columns are ignored.

    Returns
    -------
    pd.DataFrame
        Columns ``feature``, ``dtype``, ``num_unique``, ``num_Nan``, ``count``
        followed by the ``describe()`` statistics (``mean``, ``std``, ``min``,
        ``25%``, ``50%``, ``75%``, ``max``). The frame is empty (but keeps these
        columns) when ``df`` has no numerical column.
    """
    result_columns = ['feature', 'dtype', 'num_unique', 'num_Nan', 'count',
                      'mean', 'std', 'min', '25%', '50%', '75%', 'max']

    # Select the numerical columns explicitly: a bare `df.describe()` silently
    # falls back to describing object columns when no numerical column exists,
    # which would return count/unique/top/freq from a "numerical" summary.
    numeric_df = df.select_dtypes(include='number')
    if numeric_df.shape[1] == 0:
        return pd.DataFrame(columns=result_columns)

    feature_info = pd.DataFrame({
        'feature': numeric_df.columns,
        'dtype': numeric_df.dtypes.to_numpy(),
        'num_unique': numeric_df.nunique().to_numpy(),
        'num_Nan': numeric_df.isna().sum().to_numpy(),
        'count': numeric_df.count().to_numpy(),
    })

    # `count` is already part of feature_info, so drop the duplicate from describe().
    # rename_axis makes the key column 'feature' even when df.columns carries a name.
    feature_describe = (
        numeric_df.describe()
        .T
        .drop(columns='count')
        .rename_axis('feature')
        .reset_index()
    )

    summary_statistics_result = pd.merge(feature_info, feature_describe, on='feature')

    return summary_statistics_result

In [7]:
def summary_statistics_obj(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return one row of summary statistics per non-numerical feature of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; only ``object`` and ``bool`` columns are summarised.

    Returns
    -------
    pd.DataFrame
        Columns ``feature``, ``dtype``, ``num_unique``, ``num_nan`` followed by
        the ``describe()`` statistics for such columns (``count``, ``unique``,
        ``top``, ``freq``). The frame is empty (but keeps these columns) when
        ``df`` has no ``object``/``bool`` column.
    """
    result_columns = ['feature', 'dtype', 'num_unique', 'num_nan',
                      'count', 'unique', 'top', 'freq']

    non_numeric_df = df.select_dtypes(include=['object', 'bool'])
    # describe() raises ValueError on a frame without columns, so an
    # all-numerical input is answered with an empty summary instead.
    if non_numeric_df.shape[1] == 0:
        return pd.DataFrame(columns=result_columns)

    feature_info = pd.DataFrame({
        'feature': non_numeric_df.columns,
        'dtype': non_numeric_df.dtypes.to_numpy(),
        'num_unique': non_numeric_df.nunique().to_numpy(),
        'num_nan': non_numeric_df.isna().sum().to_numpy(),
    })

    # rename_axis makes the key column 'feature' even when df.columns carries a name.
    feature_describe_non_numeric = (
        non_numeric_df.describe()
        .T
        .rename_axis('feature')
        .reset_index()
    )

    summary_statistics_result = pd.merge(feature_info, feature_describe_non_numeric, on='feature', how='right')

    return summary_statistics_result

In [None]:
# List the column names of the raw table.
raw_cancer.columns

In [8]:
# Summary statistics for the numerical columns of the raw table.
summary_statistics_num(raw_cancer)

Unnamed: 0,feature,dtype,num_unique,num_Nan,count,mean,std,min,25%,50%,75%,max
0,id,int64,569,0,569,30371830.0,125020600.0,8670.0,869218.0,906024.0,8813129.0,911320500.0
1,radius_mean,float64,456,0,569,14.12729,3.524049,6.981,11.7,13.37,15.78,28.11
2,texture_mean,float64,479,0,569,19.28965,4.301036,9.71,16.17,18.84,21.8,39.28
3,perimeter_mean,float64,522,0,569,91.96903,24.29898,43.79,75.17,86.24,104.1,188.5
4,area_mean,float64,539,0,569,654.8891,351.9141,143.5,420.3,551.1,782.7,2501.0
5,smoothness_mean,float64,474,0,569,0.09636028,0.01406413,0.05263,0.08637,0.09587,0.1053,0.1634
6,compactness_mean,float64,537,0,569,0.104341,0.05281276,0.01938,0.06492,0.09263,0.1304,0.3454
7,concavity_mean,float64,537,0,569,0.08879932,0.07971981,0.0,0.02956,0.06154,0.1307,0.4268
8,concave points_mean,float64,542,0,569,0.04891915,0.03880284,0.0,0.02031,0.0335,0.074,0.2012
9,symmetry_mean,float64,432,0,569,0.1811619,0.02741428,0.106,0.1619,0.1792,0.1957,0.304


In [9]:
# Summary statistics for the non-numerical columns of the raw table.
summary_statistics_obj(raw_cancer)

Unnamed: 0,feature,dtype,num_unique,num_nan,count,unique,top,freq
0,diagnosis,object,2,0,569,2,B,357


In [None]:
# (number of rows, number of columns) of the raw table.
raw_cancer.shape

In [10]:
# Work on a copy so raw_cancer stays untouched, then encode the target as
# integers: malignant 'M' -> 1, benign 'B' -> 0.
# The column is reassigned rather than calling `replace(..., inplace=True)` on
# `cancer.diagnosis`: an in-place call on a column obtained that way is a
# chained assignment, which warns in pandas 2.x and no longer updates the
# frame under Copy-on-Write.
# Note: any label other than 'M'/'B' becomes NaN with `map`.
cancer = raw_cancer.copy()
cancer['diagnosis'] = cancer['diagnosis'].map({'M': 1, 'B': 0})

In [11]:
# Split the encoded frame by class (diagnosis 0 = benign, anything else = malignant).
M = cancer[(cancer['diagnosis'] != 0)]
B = cancer[(cancer['diagnosis'] == 0)]

#------------COUNT-------------------
trace = go.Bar(x=['malignant', 'benign'], y=[len(M), len(B)], opacity=0.8, marker=dict(
        color=['red', 'lightskyblue'],
        line=dict(color='#000000',width=1.5)))

layout = dict(title =  'Count of diagnosis variable')

fig = dict(data = [trace], layout=layout)
pyo.iplot(fig)

#------------PERCENTAGE-------------------
# The values are given in the same order as the labels. Relying on
# `value_counts()` would couple the labels to whichever class happens to be
# more frequent, silently swapping them if the class balance changed.
trace = go.Pie(labels=['benign','malignant'], values=[len(B), len(M)],
               textfont=dict(size=15), opacity = 0.8,
               marker=dict(colors=['lightskyblue', 'red'],
                           line=dict(color='#000000', width=1.5)))


layout = dict(title =  'Distribution of diagnosis variable')

fig = dict(data = [trace], layout=layout)
pyo.iplot(fig)

In [12]:
def plot_corr_heatmap(dataframe):
    """
    Draw a plotly heatmap of the pairwise correlations between numerical columns.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame whose numerical columns are correlated; other columns are ignored.

    Returns
    -------
    Whatever ``pyo.iplot`` returns; the figure is displayed as a side effect.
    """
    # Restrict to numerical columns first: DataFrame.corr() raises on
    # non-numerical columns in pandas >= 2.0 (older versions dropped them silently).
    correlation = dataframe.select_dtypes(include='number').corr()
    matrix_cols = correlation.columns.tolist()
    corr_array = correlation.to_numpy()

    trace = go.Heatmap(
        z=corr_array,
        x=matrix_cols,
        y=matrix_cols,
        xgap=2,
        ygap=2,
        colorscale='RdBu_r',
        # Pin the colour range to the full correlation range so the midpoint
        # of the diverging scale corresponds to zero correlation.
        zmin=-1,
        zmax=1,
        colorbar=dict(),
    )
    layout = go.Layout(
        dict(
            title='Correlation Matrix for variables',
            autosize=False,
            height=720,
            width=800,
            margin=dict(r=20, l=150, t=50, b=150),
            yaxis=dict(tickfont=dict(size=9)),
            xaxis=dict(tickfont=dict(size=9)),
        )
    )
    fig = go.Figure(data=[trace], layout=layout)
    return pyo.iplot(fig)

In [13]:
# Correlation heatmap of every column except the 'id' column.
plot_corr_heatmap(cancer.drop('id', axis=1))

In [None]:
# Peek at five random rows of the working copy (no random_state, so rows differ between runs).
cancer.sample(5)

In [None]:
# Feature matrix: every column except 'id' and the 'diagnosis' target.
X = cancer.drop(columns=['id', 'diagnosis'])
# Target vector: the 'diagnosis' column.
y = cancer.diagnosis

In [None]:
# Number of rows per diagnosis value (class balance).
cancer.diagnosis.value_counts()

In [None]:
# implementing Naive Bayes Classifier from scratch
class CustumNaiveBayesClassifier:
    """
    Gaussian Naive Bayes classifier implemented with NumPy only.

    Besides ``fit``/``predict`` the class implements the small duck-typed
    estimator protocol (``get_params``, ``set_params``, ``predict_proba``,
    ``score``) that model-selection utilities such as grid search and
    cross-validation call on the final step of a pipeline.

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Fraction of the largest feature variance that is added to every
        per-class variance, so that constant features do not cause a
        division by zero.

    Attributes (set by ``fit``)
    ---------------------------
    classes, classes_ : sorted unique class labels.
    num_classes       : number of classes.
    priors            : class frequencies, shape (num_classes,).
    means, variances  : per-class feature statistics, shape (num_classes, n_features).
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict."""
        return {'var_smoothing': self.var_smoothing}

    def set_params(self, **params):
        """Set constructor parameters; returns ``self``. Raises ValueError on unknown names."""
        valid = self.get_params()
        for key, value in params.items():
            if key not in valid:
                raise ValueError(f'Invalid parameter {key!r} for {type(self).__name__}')
            setattr(self, key, value)
        return self

    def fit(self, X, y):
        """Estimate class priors and per-class feature means/variances; returns ``self``."""
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        self.classes = np.unique(y)
        self.classes_ = self.classes  # alias following the scikit-learn naming convention
        self.num_classes = len(self.classes)
        self.priors = np.zeros(self.num_classes)
        self.means = np.zeros((self.num_classes, X.shape[1]))
        self.variances = np.zeros((self.num_classes, X.shape[1]))

        # Variance floor; falls back to var_smoothing itself when every feature is constant.
        epsilon = self.var_smoothing * X.var(axis=0).max()
        if epsilon == 0:
            epsilon = self.var_smoothing

        for i, c in enumerate(self.classes):
            X_c = X[y == c]
            self.priors[i] = X_c.shape[0] / X.shape[0]
            self.means[i] = X_c.mean(axis=0)
            self.variances[i] = X_c.var(axis=0) + epsilon
        return self

    def predict(self, X):
        """Return the most probable class label for every row of ``X``."""
        joint = self._joint_log_likelihood(X)
        return self.classes[np.argmax(joint, axis=1)]

    def predict_proba(self, X):
        """Return class probabilities, shape (n_samples, num_classes), columns ordered as ``classes``."""
        joint = self._joint_log_likelihood(X)
        # Subtract the row maximum before exponentiating to avoid underflow to 0/0.
        joint = joint - joint.max(axis=1, keepdims=True)
        likelihood = np.exp(joint)
        return likelihood / likelihood.sum(axis=1, keepdims=True)

    def score(self, X, y):
        """Return the mean accuracy of ``predict(X)`` against ``y``."""
        return float(np.mean(self.predict(X) == np.asarray(y)))

    def _joint_log_likelihood(self, X):
        """log P(class) + sum over features of log N(x | mean, variance), per class."""
        X = np.asarray(X, dtype=float)
        joint = np.zeros((X.shape[0], self.num_classes))
        for i in range(self.num_classes):
            log_pdf = self._gaussian_log_pdf(X, self.means[i], self.variances[i])
            joint[:, i] = np.log(self.priors[i]) + log_pdf.sum(axis=1)
        return joint

    def _gaussian_log_pdf(self, X, mean, variance):
        # Computed directly in log space: log(pdf) underflows to -inf for points
        # far from the mean, which makes all classes indistinguishable.
        return -0.5 * np.log(2 * np.pi * variance) - ((X - mean) ** 2) / (2 * variance)

    def _gaussian_pdf(self, X, mean, variance):
        # Kept for backward compatibility; prediction uses _gaussian_log_pdf.
        return (1 / np.sqrt(2 * np.pi * variance)) * np.exp(-((X - mean) ** 2) / (2 * variance))
    

In [None]:

class CustomKNN:
    """
    k-nearest-neighbours classifier (Euclidean distance, majority vote) in NumPy.

    Besides ``fit``/``predict`` the class implements the small duck-typed
    estimator protocol (``get_params``, ``set_params``, ``predict_proba``,
    ``score``) that model-selection utilities such as grid search and
    cross-validation call on the final step of a pipeline.

    Parameters
    ----------
    k : int, default=5
        Number of neighbours that vote.

    Attributes (set by ``fit``)
    ---------------------------
    X_train, y_train : the stored training data as arrays.
    classes_         : sorted unique class labels.
    """

    def __init__(self, k=5):
        self.k = k

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict."""
        return {'k': self.k}

    def set_params(self, **params):
        """Set constructor parameters; returns ``self``. Raises ValueError on unknown names."""
        valid = self.get_params()
        for key, value in params.items():
            if key not in valid:
                raise ValueError(f'Invalid parameter {key!r} for {type(self).__name__}')
            setattr(self, key, value)
        return self

    def fit(self, X, y):
        """Store the training data; returns ``self``."""
        self.X_train = np.asarray(X, dtype=float)
        self.y_train = np.asarray(y)
        # Encode labels as 0..n_classes-1 so that voting works for any label
        # type (strings, negative ints, ...), not only non-negative integers.
        self.classes_, encoded = np.unique(self.y_train, return_inverse=True)
        self._y_encoded = encoded.reshape(-1)
        return self

    def predict(self, X):
        """Return the majority label among the k nearest training points of each row."""
        votes = self._neighbor_votes(X)
        # argmax returns the first maximum, so ties go to the smallest label.
        return self.classes_[votes.argmax(axis=1)]

    def predict_proba(self, X):
        """Return the fraction of neighbour votes per class, columns ordered as ``classes_``."""
        votes = self._neighbor_votes(X)
        return votes / votes.sum(axis=1, keepdims=True)

    def score(self, X, y):
        """Return the mean accuracy of ``predict(X)`` against ``y``."""
        return float(np.mean(self.predict(X) == np.asarray(y)))

    def _neighbor_votes(self, X):
        """Count, per query row, how many of its k nearest neighbours belong to each class."""
        distances = self._compute_distances(X)
        indices = np.argsort(distances, axis=1)[:, :self.k]
        neighbor_labels = self._y_encoded[indices]  # shape (n_queries, k)
        votes = np.zeros((neighbor_labels.shape[0], len(self.classes_)), dtype=int)
        for class_index in range(len(self.classes_)):
            votes[:, class_index] = (neighbor_labels == class_index).sum(axis=1)
        return votes

    def _compute_distances(self, X):
        # Broadcasts to (n_queries, n_train, n_features) before reducing, so memory
        # grows with the product of the three sizes.
        X = np.asarray(X, dtype=float)
        dists = np.sqrt(np.sum((X[:, np.newaxis] - self.X_train) ** 2, axis=2))
        return dists

In [None]:
# PCA Implementation from scratch like sklearn

class CustomPCA:
    """
    Principal component analysis via eigen-decomposition of the covariance matrix.

    Parameters
    ----------
    n_components : int
        Number of leading components to keep.

    Attributes (set by ``fit``)
    ---------------------------
    mean_               : per-feature mean of the training data.
    components_         : array of shape (n_features, n_components); each
                          COLUMN is one principal axis (note: transposed
                          relative to scikit-learn's ``PCA.components_``).
    explained_variance_ : eigenvalues belonging to the kept components.
    """

    def __init__(self, n_components):
        self.n_components = n_components

    def fit(self, X):
        """Learn the mean and the principal axes of ``X``; returns ``self``."""
        X = np.asarray(X, dtype=float)

        # Center the data
        self.mean_ = np.mean(X, axis=0)
        X_centered = X - self.mean_

        # Covariance matrix; np.cov squeezes a single feature to a 0-d array,
        # so force it back to 2-D.
        cov_matrix = np.atleast_2d(np.cov(X_centered, rowvar=False))

        # eigh is the solver for symmetric matrices: it always returns real
        # eigenvalues/eigenvectors, whereas eig may return complex output.
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

        # Sort by eigenvalue, largest first, and keep the leading components
        idx = eigenvalues.argsort()[::-1][:self.n_components]
        components = eigenvectors[:, idx]

        # Eigenvectors are only defined up to sign; make the entry with the
        # largest magnitude positive so results are deterministic.
        pivot_rows = np.argmax(np.abs(components), axis=0)
        signs = np.sign(components[pivot_rows, np.arange(components.shape[1])])
        signs[signs == 0] = 1

        self.components_ = components * signs
        self.explained_variance_ = eigenvalues[idx]
        return self

    def transform(self, X):
        """Project ``X`` onto the learned principal axes."""
        X_centered = np.asarray(X, dtype=float) - self.mean_
        X_transformed = np.dot(X_centered, self.components_)

        return X_transformed

    def fit_transform(self, X):
        """Fit on ``X`` and return its projection."""
        return self.fit(X).transform(X)


In [None]:
import time

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.feature_selection import (SelectKBest, SequentialFeatureSelector,
                                       f_classif)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, auc, classification_report,
                             confusion_matrix, f1_score, mean_squared_error,
                             plot_confusion_matrix, precision_recall_curve,
                             precision_recall_fscore_support, precision_score,
                             recall_score, roc_auc_score, roc_curve)
from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                     train_test_split)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, export_graphviz, plot_tree
from xgboost import XGBClassifier



# Load the dataset
raw_cancer = pd.read_csv('../data/data_cancer.csv')
cancer = raw_cancer.copy()


# Split the dataset into features and target variable
X = cancer.drop(columns=['id', 'diagnosis']).values
y = cancer.diagnosis.values

# Seed NumPy's global random generator for reproducibility
np.random.seed(0)

# Encode the target variable as integer labels
le = LabelEncoder()
y = le.fit_transform(y)

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Define the feature selection method
feature_selection = SelectKBest(f_classif)

# Define the PCA method
pca = PCA()

# Hyperparameters of the preprocessing steps, searched for every classifier
preprocessing_grid = {
    'feature_selection__k': [7, 8, 9, 10],
    'pca__n_components': [3, 4, 5, 6, 7]
}


def build_pipeline(classifier):
    """Return scaler -> feature selection -> PCA -> ``classifier`` as a Pipeline."""
    return Pipeline([
        ('scaler', StandardScaler()),
        ('feature_selection', feature_selection),
        ('pca', pca),
        ('classifier', classifier)
    ])


# Define the classifiers and their hyperparameters to tune
classifiers = {
    'Custom KNN': (CustomKNN(), {}),
    'CustumNaiveBayes Classifier': (CustumNaiveBayesClassifier(), {}),
    'Logistic Regression': (LogisticRegression(random_state=0), {
        'classifier__C': [0.01, 0.02],
        'classifier__penalty': ['l2']
    }),
    'Decision Tree': (DecisionTreeClassifier(random_state=0), {
        'classifier__max_depth': [3, 5],
        'classifier__min_samples_split': [2, 3],
        'classifier__min_samples_leaf': [1, 2]
    }),
    'Random Forest': (RandomForestClassifier(random_state=0), {
        'classifier__n_estimators': [10, 20],
        'classifier__max_depth': [3, 5],
        'classifier__min_samples_split': [2, 3],
        'classifier__min_samples_leaf': [1, 2]
    }),
    'SVM': (SVC(random_state=0, probability=True), {
        'classifier__C': [0.01, 0.1, 1],
        'classifier__kernel': ['linear', 'rbf'],
        'classifier__degree':[3, 5]
    }),
    'Naive Bayes': (GaussianNB(), {}),
    'KNN': (KNeighborsClassifier(), {
        'classifier__n_neighbors': [3, 5],
        'classifier__weights':['uniform']
    }),
    'AdaBoost': (AdaBoostClassifier(random_state=0), {
        'classifier__n_estimators': [10, 20],
        'classifier__learning_rate':[1, 2]
    }),
    'Gradient Boosting': (GradientBoostingClassifier(random_state=0), {
        'classifier__n_estimators': [10, 20],
        'classifier__learning_rate':[1, 2]
    }),
    'XGBoost': (XGBClassifier(random_state=0), {
        'classifier__n_estimators':[10, 20],
        'classifier__learning_rate':[0.1, 0.2],
        'classifier__max_depth':[3, 5]
    }),
}

# Create dictionaries to store the evaluation metrics and best settings per classifier
metrics = {}
best_params = {}

# Compare the classifiers
for name, (classifier, params) in classifiers.items():

    pipeline = build_pipeline(classifier)

    # Build a fresh grid instead of `params.update(...)`, so the dictionaries
    # stored in `classifiers` are not mutated as a side effect of the loop.
    param_grid = {**params, **preprocessing_grid}

    # scoring='accuracy' is spelled out: it is what a classifier's own `score`
    # reports, and it only needs `predict`, so estimators without a `score`
    # method (the hand-written ones) can be searched as well.
    grid_search = GridSearchCV(pipeline, param_grid, scoring='accuracy')

    grid_search.fit(X_train, y_train)

    # Store the best hyperparameters for the classifier
    best_params[name] = grid_search.best_params_

    # Print the best hyperparameters for the pipeline
    print(f'{name} Best Hyperparameters: {grid_search.best_params_}')

    # Make predictions on the test set using the best hyperparameters
    y_pred = grid_search.predict(X_test)

    # ROC AUC needs class probabilities; report NaN for estimators that
    # cannot provide them instead of aborting the whole comparison.
    if hasattr(grid_search, 'predict_proba'):
        roc_auc = roc_auc_score(y_test, grid_search.predict_proba(X_test)[:, 1])
    else:
        roc_auc = np.nan

    # Store the evaluation metrics for the classifier
    metrics[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'ROC AUC': roc_auc,
        'MSE': mean_squared_error(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
    }

cross_val_scores = {}

# Perform 10-fold cross-validation using the best hyperparameters for each classifier.
# NOTE(review): this runs on the full X, y, i.e. it includes the rows that were held
# out as the test set above — confirm that this is intended.
for name, (classifier, _) in classifiers.items():

    pipeline = build_pipeline(classifier)
    pipeline.set_params(**best_params[name])

    # Measure the time it takes to perform 10-fold cross-validation on the pipeline
    start_time = time.perf_counter()
    scores = cross_val_score(pipeline, X, y, cv=10, scoring='accuracy')
    elapsed = time.perf_counter() - start_time

    cross_val_scores[name] = {
        'Scores Mean': scores.mean(),
        'Scores Standard Deviation': scores.std(),
        'Time': elapsed
    }

# Create a DataFrame to display the evaluation metrics for each classifier
metrics_df = pd.DataFrame(metrics).T
print(metrics_df)

cross_val_df = pd.DataFrame(cross_val_scores).T
print(cross_val_df)