In [72]:
import numpy as np
import pandas as pd

In [73]:
df=pd.read_csv('cancer_data.csv')

PRE-PROCESSING

In [74]:
df.isnull().sum()

id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

In [75]:
df.drop('id',axis=1,inplace=True)

In [76]:
df['diagnosis']=pd.get_dummies(df['diagnosis'],drop_first=True)

In [77]:
X=df.drop('diagnosis',axis=1)
y=df['diagnosis']

In [78]:
from sklearn.preprocessing import MinMaxScaler
num_cols=df.select_dtypes(include=['int64','float64','int32']).columns

for col in num_cols:
    fill_value=df[col].mean()
    X[col].fillna(fill_value,inplace=True)

minmax=MinMaxScaler()
X[num_cols]=minmax.fit_transform(df[num_cols])

In [79]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Decision Tree


In [80]:
class Node:
    def __init__(self,feature=None,threshold=None,data_left=None,data_right=None,depth=None,gain=None,value=None):
        self.feature = feature
        self.threshold = threshold
        self.data_left = data_left
        self.data_right = data_right
        self.gain = gain
        self.value = value

In [81]:
from collections import Counter

In [82]:
class DecisionTree:
    def __init__(self, min_samples_split=2, max_depth=5):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.root = None
        
    @staticmethod
    def _entropy(s):
        counts = np.bincount(np.array(s, dtype=np.int64))
        percentages = counts / len(s)

        entropy = 0
        for pct in percentages:
            if pct > 0:
                entropy += pct * np.log2(pct)
        return -entropy
    
    def _information_gain(self, parent, left_child, right_child):
        num_left = len(left_child) / len(parent)
        num_right = len(right_child) / len(parent)
        
        return self._entropy(parent) - (num_left * self._entropy(left_child) + num_right * self._entropy(right_child))
    
    def _best_split(self, X, y):
        best_split = {}
        best_info_gain = -1
        n_rows, n_cols = X.shape
        
        
        for f_idx in range(n_cols):
            X_curr = X[:, f_idx]
            
            for threshold in np.unique(X_curr):
                
                df = np.concatenate((X, y.reshape(1, -1).T), axis=1)
                df_left = np.array([row for row in df if row[f_idx] <= threshold])
                df_right = np.array([row for row in df if row[f_idx] > threshold])

                
                if len(df_left) > 0 and len(df_right) > 0:
                    
                    y = df[:, -1]
                    y_left = df_left[:, -1]
                    y_right = df_right[:, -1]

                    
                    gain = self._information_gain(y, y_left, y_right)
                    if gain > best_info_gain:
                        best_split = {
                            'feature_index': f_idx,
                            'threshold': threshold,
                            'df_left': df_left,
                            'df_right': df_right,
                            'gain': gain
                        }
                        best_info_gain = gain
        return best_split
    
    def _build(self, X, y, depth=0):
        
        n_rows, n_cols = X.shape
        
        
        if n_rows >= self.min_samples_split and depth <= self.max_depth:
            
            best = self._best_split(X, y)
            
            if best['gain'] > 0:
               
                left = self._build(
                    X=best['df_left'][:, :-1], 
                    y=best['df_left'][:, -1], 
                    depth=depth + 1
                )
                right = self._build(
                    X=best['df_right'][:, :-1], 
                    y=best['df_right'][:, -1], 
                    depth=depth + 1
                )
                return Node(
                    feature=best['feature_index'], 
                    threshold=best['threshold'], 
                    data_left=left, 
                    data_right=right, 
                    gain=best['gain']
                )
        
        return Node(
            value=Counter(y).most_common(1)[0][0]
        )
    
    def fit(self, X, y):
        
        self.root = self._build(X, y)
        
    def _predict(self, x, tree):
        
        if tree.value != None:
            return tree.value
        feature_value = x[tree.feature]
        
        
        if feature_value <= tree.threshold:
            return self._predict(x=x, tree=tree.data_left)
        
        
        if feature_value > tree.threshold:
            return self._predict(x=x, tree=tree.data_right)
        
    def predict(self, X):
        return [self._predict(x, self.root) for x in X]

In [97]:
X_np=X.to_numpy()
y_np=y.to_numpy()

In [98]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_np, y_np, test_size=0.2, random_state=42)

In [85]:
model = DecisionTree()
model.fit(X_train, y_train)
preds = model.predict(X_test)

In [69]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_test,preds)
print(accuracy)

0.9298245614035088


SVM

In [100]:
class SVM:
    def __init__(self,C=1.0):
        self.C=C
        self.w=0
        self.b=0

    def hingeloss(self,w,b,x,y):
        reg=0.5 * (w*w)
        for i in range(x.shape[0]):
            opt_term = y[i] * ((np.dot(w, x[i])) + b)
            loss= reg+ self.C + max(0,1-opt_term)
        return loss[0][0]
    
    def fit(self,X,y,batch_size=100,lr=0.001,epochs=1000):
        n_features=X.shape[1]
        n_samples=X.shape[0]
        c=self.C
        ids=np.arange(n_samples)
        np.random.shuffle(ids)
        w=np.zeros((1,n_features))
        b=0
        losses=[]

        for i in range(epochs):
            l=self.hingeloss(w,b,X,y)
            losses.append(l)
            for batch_initial in range(0,n_samples,batch_size):
                gradw=0
                gradb=0

                for j in range(batch_initial,batch_initial+batch_size):
                    if j< n_samples:
                        x=ids[j]
                        ti=y[x]* (np.dot(w,X[x].T) +b)

                        if ti>1:
                            gradw+=0
                            gradb+=0
                        
                        else: 
                            gradw += c * y[x] * X[x]
                            gradb += c * y[x]
            
                w = w - lr * w + lr * gradw
                b = b + lr * gradb
        
        self.w = w
        self.b = b
        return self.w, self.b, losses  
    
    def predict(self, X):
        
        prediction = np.dot(X, self.w[0]) + self.b 
        return np.sign(prediction)

In [101]:
svm = SVM()
w, b, losses = svm.fit(X_train, y_train)

In [103]:
predictions=svm.predict(X_test)

In [104]:
print(accuracy_score(y_test,predictions))

0.37719298245614036
