# IMPORT Libraries

In [260]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# READ CSV

In [261]:
# Load the breast-cancer records from the CSV file and preview the first five rows.
my_data = pd.read_csv("Breast_Cancer.csv", sep=",")
my_data.head(5)

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,Alive


# Gain some Information

In [262]:
my_data['Status'].unique()

array(['Alive', 'Dead'], dtype=object)

In [263]:
my_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Age                     4024 non-null   int64 
 1   Race                    4024 non-null   object
 2   Marital Status          4024 non-null   object
 3   T Stage                 4024 non-null   object
 4   N Stage                 4024 non-null   object
 5   6th Stage               4024 non-null   object
 6   differentiate           4024 non-null   object
 7   Grade                   4024 non-null   object
 8   A Stage                 4024 non-null   object
 9   Tumor Size              4024 non-null   int64 
 10  Estrogen Status         4024 non-null   object
 11  Progesterone Status     4024 non-null   object
 12  Regional Node Examined  4024 non-null   int64 
 13  Reginol Node Positive   4024 non-null   int64 
 14  Survival Months         4024 non-null   int64 
 15  Stat

# Create DataFrame

In [264]:
# Work on an independent copy so the frame loaded from disk stays untouched.
# pd.DataFrame(my_data) does not copy a DataFrame input by default, so depending on
# the pandas version the label encoding applied to `df` further down could also
# modify `my_data`.
df = my_data.copy()

# Information about quantity of values for each column

In [265]:
# Print the value frequencies of every column whose dtype is not an integer type.
for col in df.columns:
    if pd.api.types.is_integer_dtype(df[col]):
        continue  # integer columns are skipped
    # The trailing empty string yields the blank separator line after each table.
    print(f"Value counts for {col}:", df[col].value_counts(), "", sep="\n")

Value counts for Race:
White    3413
Other     320
Black     291
Name: Race, dtype: int64

Value counts for Marital Status:
Married      2643
Single        615
Divorced      486
Widowed       235
Separated      45
Name: Marital Status, dtype: int64

Value counts for T Stage :
T2    1786
T1    1603
T3     533
T4     102
Name: T Stage , dtype: int64

Value counts for N Stage:
N1    2732
N2     820
N3     472
Name: N Stage, dtype: int64

Value counts for 6th Stage:
IIA     1305
IIB     1130
IIIA    1050
IIIC     472
IIIB      67
Name: 6th Stage, dtype: int64

Value counts for differentiate:
Moderately differentiated    2351
Poorly differentiated        1111
Well differentiated           543
Undifferentiated               19
Name: differentiate, dtype: int64

Value counts for Grade:
2                        2351
3                        1111
1                         543
 anaplastic; Grade IV      19
Name: Grade, dtype: int64

Value counts for A Stage:
Regional    3932
Distant       92
Nam

# Gain knowledge about numerical features

In [266]:
# Pick the integer/float columns and compute their summary statistics.
numerical_columns = df.select_dtypes(include=['int', 'float']).columns
numerical_stats = df.loc[:, numerical_columns].describe()

print("Descriptive statistics for numerical columns:", numerical_stats, sep="\n")

Descriptive statistics for numerical columns:
               Age   Tumor Size  Regional Node Examined  \
count  4024.000000  4024.000000             4024.000000   
mean     53.972167    30.473658               14.357107   
std       8.963134    21.119696                8.099675   
min      30.000000     1.000000                1.000000   
25%      47.000000    16.000000                9.000000   
50%      54.000000    25.000000               14.000000   
75%      61.000000    38.000000               19.000000   
max      69.000000   140.000000               61.000000   

       Reginol Node Positive  Survival Months  
count            4024.000000      4024.000000  
mean                4.158052        71.297962  
std                 5.109331        22.921430  
min                 1.000000         1.000000  
25%                 1.000000        56.000000  
50%                 2.000000        73.000000  
75%                 5.000000        90.000000  
max                46.000000       107

# Feature selection by Decision Tree feature importance

### Selection in numerical features

In [267]:
# Numerical candidate features; 'Status' is the prediction target.
selected_columns = [
    'Age',
    'Tumor Size',
    'Regional Node Examined',
    'Survival Months',
    'Reginol Node Positive',
]
X, y = df[selected_columns], df['Status']

# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit a decision tree on the training part only.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

DecisionTreeClassifier(random_state=42)

In [268]:
# Get feature importances
importances = clf.feature_importances_
importance_series = pd.Series(importances, index=X.columns).sort_values(ascending=False)

print("Feature Importances from Decision Tree:")
print(importance_series)


Feature Importances from Decision Tree:
Survival Months           0.468858
Age                       0.154544
Tumor Size                0.152395
Regional Node Examined    0.130100
Reginol Node Positive     0.094103
dtype: float64


## "Regional Node Examined" and "Reginol Node Positive" should be dropped

### Selection in categorical features

In [269]:
# Categorical predictors evaluated in this section.
X_columns = [
    'Marital Status', 'differentiate', 'Race', 'T Stage ', 'Progesterone Status',
    'N Stage', '6th Stage', 'Grade', 'A Stage', 'Estrogen Status',
]
# The target is encoded together with the predictors.
categorical_columns = ['Status'] + X_columns

# Replace each categorical column with integer codes and keep the fitted encoder
# per column so the codes can be mapped back to the original labels.
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])
    label_encoders[col] = le
                

In [270]:
# Encoded categorical predictors versus the encoded target.
X = df.loc[:, X_columns]
y = df.loc[:, 'Status']

# Same 70/30 split settings as for the numerical features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit a fresh decision tree on the training part.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

DecisionTreeClassifier(random_state=42)

In [271]:
# Get feature importances
importances = clf.feature_importances_
importance_series = pd.Series(importances, index=X.columns).sort_values(ascending=False)

print("Feature Importances from Decision Tree:")
print(importance_series)


Feature Importances from Decision Tree:
Marital Status         0.218051
6th Stage              0.199337
T Stage                0.151596
Race                   0.090615
Estrogen Status        0.079766
Progesterone Status    0.077521
Grade                  0.070915
N Stage                0.049233
differentiate          0.032803
A Stage                0.030164
dtype: float64


## "Marital Status" and "6th stage" are selected

# Selecting top features
## There are 3 numerical features (Age, Tumor Size, Survival Months), 2 categorical features (Marital Status, 6th Stage), and 1 binary target column (Status)

In [272]:
# Columns that were not selected above.
columns_to_drop = [
    'T Stage ', 'N Stage', 'differentiate', 'Race', 'Grade', 'A Stage',
    'Estrogen Status', 'Progesterone Status',
    'Reginol Node Positive', 'Regional Node Examined',
]
# Rebind `df` to the reduced frame.
df = df.drop(columns=columns_to_drop)


In [273]:
df

Unnamed: 0,Age,Marital Status,6th Stage,Tumor Size,Survival Months,Status
0,68,1,0,4,60,0
1,50,1,2,35,62,0
2,58,0,4,63,75,0
3,58,1,0,18,84,0
4,47,1,1,41,50,0
...,...,...,...,...,...,...
4019,62,1,0,9,49,0
4020,56,0,2,46,69,0
4021,68,1,1,22,69,0
4022,58,0,1,44,72,0


# Categorizing Numerical features

In [274]:
def entropy(y):
    """Return the Shannon entropy (in bits) of the labels in `y`.

    The class probabilities are the relative frequencies of the distinct
    values found in `y`.
    """
    class_counts = np.unique(y, return_counts=True)[1]
    class_probabilities = class_counts / len(y)
    return -(class_probabilities * np.log2(class_probabilities)).sum()

# Function to calculate information gain
def information_gain(y, split_point, feature_values):
    """Return the entropy reduction obtained by thresholding a numeric feature.

    Samples with `feature_values <= split_point` form one group and samples with
    `feature_values > split_point` form the other; the gain is the entropy of `y`
    minus the size-weighted entropy of the two groups.
    """
    below = feature_values <= split_point
    above = feature_values > split_point
    y_below, y_above = y[below], y[above]
    total = len(y)
    children_entropy = (len(y_below) / total) * entropy(y_below) + (len(y_above) / total) * entropy(y_above)
    return entropy(y) - children_entropy

# Function to find the best split point for a numerical feature
def find_best_split_point(y, feature_values):
    """Find the threshold on a numeric feature that maximises information gain.

    Candidate thresholds are the midpoints between consecutive distinct feature
    values. The gain of each threshold is computed inline rather than through the
    global name `information_gain`, because a later cell of this notebook rebinds
    that name to a two-argument function for categorical features; calling it with
    a threshold would then fail.

    Parameters
    ----------
    y : array-like
        Class labels, one per sample.
    feature_values : array-like
        Numeric feature values, aligned with `y`.

    Returns
    -------
    tuple
        `(best_split, best_gain)`. When the feature has fewer than two distinct
        values there is no candidate threshold and `(None, -1)` is returned.
    """
    y = np.asarray(y)
    feature_values = np.asarray(feature_values)

    # np.unique already returns the distinct values in sorted order.
    unique_values = np.unique(feature_values)
    n_samples = len(y)
    parent_entropy = entropy(y)  # identical for every candidate, so computed once

    best_split = None
    best_gain = -1
    for lower, upper in zip(unique_values[:-1], unique_values[1:]):
        split_point = (lower + upper) / 2
        left_labels = y[feature_values <= split_point]
        right_labels = y[feature_values > split_point]
        p_left = len(left_labels) / n_samples
        p_right = len(right_labels) / n_samples
        gain = parent_entropy - (p_left * entropy(left_labels) + p_right * entropy(right_labels))
        # Strict comparison keeps the lowest threshold among equally good ones.
        if gain > best_gain:
            best_gain = gain
            best_split = split_point
    return best_split, best_gain

# Function to categorize a numerical feature
def categorize_numerical_feature(df, feature_name, target_name):
    """Binarise a numeric column at its best information-gain threshold.

    A new column named `<feature_name>_categorized` is added to `df` with the
    labels `'<= t'` and `'> t'`, where `t` is the threshold returned by
    `find_best_split_point` for the target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns; it is modified in place.
    feature_name : str
        Name of the numeric column to categorise.
    target_name : str
        Name of the class-label column.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the new column added.

    Raises
    ------
    ValueError
        If no threshold exists because the feature has fewer than two distinct values.
    """
    feature_values = df[feature_name].values
    target_values = df[target_name].values
    split_point, _ = find_best_split_point(target_values, feature_values)
    if split_point is None:
        # Without this guard the None would be passed on to pd.cut as a bin edge.
        raise ValueError(
            f"Cannot categorize '{feature_name}': it has fewer than two distinct values"
        )

    bin_labels = [f'<= {split_point}', f'> {split_point}']
    # Bins are right-closed by default, which matches the '<=' / '>' labels.
    df[f'{feature_name}_categorized'] = pd.cut(
        feature_values, bins=[-np.inf, split_point, np.inf], labels=bin_labels
    )
    return df


In [275]:
df = categorize_numerical_feature(df, 'Age', 'Status')
df['Age_categorized'].value_counts()

<= 61.5    3027
> 61.5      997
Name: Age_categorized, dtype: int64

In [276]:
df = categorize_numerical_feature(df, 'Tumor Size', 'Status')
df['Tumor Size_categorized'].value_counts()

> 17.5     2848
<= 17.5    1176
Name: Tumor Size_categorized, dtype: int64

In [277]:
df = categorize_numerical_feature(df, 'Survival Months', 'Status')
df['Survival Months_categorized'].value_counts()

> 47.5     3551
<= 47.5     473
Name: Survival Months_categorized, dtype: int64

In [278]:
df

Unnamed: 0,Age,Marital Status,6th Stage,Tumor Size,Survival Months,Status,Age_categorized,Tumor Size_categorized,Survival Months_categorized
0,68,1,0,4,60,0,> 61.5,<= 17.5,> 47.5
1,50,1,2,35,62,0,<= 61.5,> 17.5,> 47.5
2,58,0,4,63,75,0,<= 61.5,> 17.5,> 47.5
3,58,1,0,18,84,0,<= 61.5,> 17.5,> 47.5
4,47,1,1,41,50,0,<= 61.5,> 17.5,> 47.5
...,...,...,...,...,...,...,...,...,...
4019,62,1,0,9,49,0,> 61.5,<= 17.5,> 47.5
4020,56,0,2,46,69,0,<= 61.5,> 17.5,> 47.5
4021,68,1,1,22,69,0,> 61.5,> 17.5,> 47.5
4022,58,0,1,44,72,0,<= 61.5,> 17.5,> 47.5


### 0 means "Alive", 1 means "Dead"

In [279]:
# Count the rows per encoded class (0 and 1); an absent class counts as zero.
status_counts = df['Status'].value_counts()
dead_count = int(status_counts.get(1, 0))
alive_count = int(status_counts.get(0, 0))
dataset_size = dead_count + alive_count
print(f'Dateset size is {dataset_size} \n{alive_count} are alive and {dead_count} are dead')

Dateset size is 4024 
3408 are alive and 616 are dead


# Splitting

In [280]:
df1=df[['Status','Age_categorized','Marital Status','6th Stage','Survival Months_categorized','Tumor Size_categorized']].copy()

In [281]:
df1

Unnamed: 0,Status,Age_categorized,Marital Status,6th Stage,Survival Months_categorized,Tumor Size_categorized
0,0,> 61.5,1,0,> 47.5,<= 17.5
1,0,<= 61.5,1,2,> 47.5,> 17.5
2,0,<= 61.5,0,4,> 47.5,> 17.5
3,0,<= 61.5,1,0,> 47.5,> 17.5
4,0,<= 61.5,1,1,> 47.5,> 17.5
...,...,...,...,...,...,...
4019,0,> 61.5,1,0,> 47.5,<= 17.5
4020,0,<= 61.5,0,2,> 47.5,> 17.5
4021,0,> 61.5,1,1,> 47.5,> 17.5
4022,0,<= 61.5,0,1,> 47.5,> 17.5


In [282]:
X = df1[['Age_categorized','Marital Status','6th Stage','Survival Months_categorized','Tumor Size_categorized']]
X[0:5]

Unnamed: 0,Age_categorized,Marital Status,6th Stage,Survival Months_categorized,Tumor Size_categorized
0,> 61.5,1,0,> 47.5,<= 17.5
1,<= 61.5,1,2,> 47.5,> 17.5
2,<= 61.5,0,4,> 47.5,> 17.5
3,<= 61.5,1,0,> 47.5,> 17.5
4,<= 61.5,1,1,> 47.5,> 17.5


In [283]:
y=df1['Status']
y.value_counts()

0    3408
1     616
Name: Status, dtype: int64

In [284]:
trainset, testset = train_test_split(df1, test_size=0.3, random_state=3)

In [285]:
print(f'Train set size :{len(trainset)}\nTest set size :{len(testset)}')

Train set size :2816
Test set size :1208


# Building tree

### Functions to Calculate IG

In [286]:
def entropy(y):
    """Return the Shannon entropy, in bits, of the label collection `y`."""
    counts = np.unique(y, return_counts=True)[-1]
    p = counts / len(y)
    weighted_logs = p * np.log2(p)
    return -np.sum(weighted_logs)

def information_gain(y, feature_values):
    """Return the information gain of splitting `y` by a categorical feature.

    Every distinct value of `feature_values` forms one group; the gain is the
    entropy of `y` minus the entropy of the groups weighted by their share of
    the samples.
    """
    n_samples = len(y)
    conditional_entropy = 0
    for category in np.unique(feature_values):
        mask = feature_values == category
        conditional_entropy += (np.sum(mask) / n_samples) * entropy(y[mask])

    return entropy(y) - conditional_entropy

### Function to select the best feature to split on

In [287]:
def best_feature_to_split(df, target_name):
    """Return the name of the column with the highest information gain.

    Every column of `df` except `target_name` is a candidate. On equal gains the
    earliest column wins; `None` is returned when there is no candidate column.
    """
    target = df[target_name]
    winner, top_gain = None, -1

    for candidate in df.columns.drop(target_name):
        candidate_gain = information_gain(target, df[candidate])
        if candidate_gain > top_gain:
            winner, top_gain = candidate, candidate_gain

    return winner

### Main function to build the tree

In [288]:
def build_tree(df, target_name):
    """Recursively grow a decision tree from categorical columns.

    Returns either a leaf (a single target value) or a nested dictionary of the
    form `{feature: {feature_value: subtree_or_leaf, ...}}`.
    """
    labels = df[target_name]

    # Pure node: every row carries the same label.
    if len(np.unique(labels)) == 1:
        return labels.iloc[0]

    # Only the target column is left: fall back to the most frequent label.
    if len(df.columns) == 1:
        return labels.mode()[0]

    best_feature = best_feature_to_split(df, target_name)
    if best_feature is None:
        return labels.mode()[0]

    # One branch per observed value; the used feature is removed before recursing.
    branches = {}
    for value in np.unique(df[best_feature]):
        subset = df[df[best_feature] == value].drop(columns=[best_feature])
        branches[value] = build_tree(subset, target_name)

    return {best_feature: branches}

In [289]:
tree = build_tree(trainset, 'Status')

In [290]:
tree

{'Survival Months_categorized': {'<= 47.5': {'Age_categorized': {'<= 61.5': {'6th Stage': {0: {'Marital Status': {0: {'Tumor Size_categorized': {'<= 17.5': 0,
          '> 17.5': 1}},
        1: {'Tumor Size_categorized': {'<= 17.5': 0, '> 17.5': 0}},
        2: 0,
        3: {'Tumor Size_categorized': {'<= 17.5': 0, '> 17.5': 1}},
        4: 0}},
      1: {'Marital Status': {0: {'Tumor Size_categorized': {'> 17.5': 0}},
        1: {'Tumor Size_categorized': {'> 17.5': 0}},
        2: 1,
        3: {'Tumor Size_categorized': {'> 17.5': 1}},
        4: {'Tumor Size_categorized': {'> 17.5': 0}}}},
      2: {'Marital Status': {0: {'Tumor Size_categorized': {'> 17.5': 1}},
        1: {'Tumor Size_categorized': {'<= 17.5': 1, '> 17.5': 1}},
        2: 1,
        3: {'Tumor Size_categorized': {'<= 17.5': 0, '> 17.5': 1}},
        4: {'Tumor Size_categorized': {'<= 17.5': 1, '> 17.5': 0}}}},
      3: {'Marital Status': {0: {'Tumor Size_categorized': {'> 17.5': 0}},
        1: 1,
        3: 1}

# Predict

In [291]:
y_testset=testset['Status'].copy()
y_testset

3027    0
1740    0
2878    1
879     0
658     0
       ..
1327    0
3139    0
2374    0
3013    0
3954    0
Name: Status, Length: 1208, dtype: int32

In [295]:
x_testset=testset.drop(columns=['Status'])
x_testset

Unnamed: 0,Age_categorized,Marital Status,6th Stage,Survival Months_categorized,Tumor Size_categorized
3027,<= 61.5,3,0,> 47.5,<= 17.5
1740,> 61.5,1,0,> 47.5,<= 17.5
2878,<= 61.5,3,4,> 47.5,> 17.5
879,<= 61.5,1,0,> 47.5,> 17.5
658,<= 61.5,1,4,> 47.5,> 17.5
...,...,...,...,...,...
1327,> 61.5,1,0,> 47.5,> 17.5
3139,<= 61.5,1,1,> 47.5,> 17.5
2374,> 61.5,1,0,> 47.5,<= 17.5
3013,<= 61.5,1,1,> 47.5,> 17.5


In [305]:
def predict_single(tree, sample, default=None):
    """Classify one sample by walking the nested-dictionary decision tree.

    Parameters
    ----------
    tree : dict or leaf value
        Either `{feature: {feature_value: subtree_or_leaf}}` or a leaf label.
    sample : mapping
        Anything indexable by feature name (a dict or a DataFrame row).
    default : object, optional
        Value returned when the sample carries a feature value for which the tree
        has no branch. Defaults to `None`, which was the previous behaviour.

    Returns
    -------
    object
        The label stored in the reached leaf, or `default` when the walk meets a
        feature value the tree has no branch for.
    """
    node = tree
    while isinstance(node, dict):
        # Every internal node holds exactly one key: the feature it splits on.
        feature = next(iter(node))
        branches = node[feature]
        value = sample[feature]
        if value not in branches:
            return default
        node = branches[value]
    return node

def dataframe_predict(tree, test_set):
    """Apply `predict_single` to every row of `test_set` and return the results."""
    def _predict_row(row):
        return predict_single(tree, row)

    return test_set.apply(_predict_row, axis=1)

In [304]:
sample = {
    'Survival Months_categorized': '> 47.5',
    'Age_categorized': '> 61.5',
    '6th Stage': 0,
    'Tumor Size_categorized': '> 17.5',
    'Marital Status': 0
}
sample2 = {
    'Survival Months_categorized': '<= 47.5',
    'Age_categorized': '> 61.5',
    '6th Stage': 3,
    'Tumor Size_categorized': '> 17.5',
    'Marital Status': 4
}
sample3 = {
    'Survival Months_categorized': '> 47.5',
    'Age_categorized': '> 61.5',
    '6th Stage': 2,
    'Tumor Size_categorized': '> 17.5',
    'Marital Status': 1
}
# Assuming 'tree' is already defined as your decision tree structure
prediction1 = predict_single(tree, sample)
prediction2 = predict_single(tree, sample2)
prediction3 = predict_single(tree, sample3)

print(f"Predicted class for sample 1: {prediction1}\nPredicted class for sample 2: {prediction2}\nPredicted class for sample 3: {prediction3}")

Predicted class for sample 1: 0
Predicted class for sample 2: 1
Predicted class for sample 3: 0


# Evaluation

In [319]:
predictions = dataframe_predict(tree, testset)
predictions_list = predictions.tolist()
predictions_list[:5]

[0.0, 0.0, 0.0, 0.0, 0.0]

In [317]:
# Rebuild the label list from `testset` instead of converting `y_testset` in place:
# running `y_testset = y_testset.tolist()` a second time would call .tolist() on a
# plain list and fail.
y_testset = testset['Status'].tolist()
predictions_list = predictions.tolist()
y_testset[:5]

[0, 0, 1, 0, 0]

In [320]:
for i in range(5):
    if not(predictions_list[i]==1):
        print (y_testset[i])

0
0
1
0
0


In [324]:
def Accuracy(y_pred, y_actual):
    """Return the fraction of predictions that equal the actual labels.

    The two sequences are paired by position, so lists, arrays and pandas Series
    with any index all work.

    Parameters
    ----------
    y_pred : sequence
        Predicted labels.
    y_actual : sequence
        True labels, in the same order as `y_pred`.

    Returns
    -------
    float
        Number of matching pairs divided by the number of samples.

    Raises
    ------
    ValueError
        If the sequences differ in length or are empty.
    """
    n_samples = len(y_pred)
    if n_samples != len(y_actual):
        raise ValueError(
            f"y_pred has {n_samples} items but y_actual has {len(y_actual)}"
        )
    if n_samples == 0:
        raise ValueError("Accuracy is undefined for empty input")

    # zip pairs the items positionally; indexing with [i] would be a label lookup
    # on a pandas Series.
    T_np = sum(1 for predicted, actual in zip(y_pred, y_actual) if predicted == actual)
    return T_np / n_samples

In [325]:
acc=Accuracy(predictions_list,y_testset)
acc

0.8990066225165563