# IMPORT Libraries

In [124]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# READ CSV

In [125]:
# Load the raw breast-cancer table from the comma-separated file.
my_data = pd.read_csv("Breast_Cancer.csv", sep=",")
# Preview the first five rows.
my_data.head(5)

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,Alive


# Gain some Information

In [126]:
# List the distinct values of the 'Status' column (used as the target `y` further down).
my_data['Status'].unique()

array(['Alive', 'Dead'], dtype=object)

In [127]:
# Summarise the raw frame: column names, non-null counts and dtypes.
my_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Age                     4024 non-null   int64 
 1   Race                    4024 non-null   object
 2   Marital Status          4024 non-null   object
 3   T Stage                 4024 non-null   object
 4   N Stage                 4024 non-null   object
 5   6th Stage               4024 non-null   object
 6   differentiate           4024 non-null   object
 7   Grade                   4024 non-null   object
 8   A Stage                 4024 non-null   object
 9   Tumor Size              4024 non-null   int64 
 10  Estrogen Status         4024 non-null   object
 11  Progesterone Status     4024 non-null   object
 12  Regional Node Examined  4024 non-null   int64 
 13  Reginol Node Positive   4024 non-null   int64 
 14  Survival Months         4024 non-null   int64 
 15  Stat

# Create DataFrame

In [128]:
# Work on an independent copy of the raw data. `pd.DataFrame(my_data)` does not
# copy DataFrame input by default, so the later in-place edits to `df`
# (label encoding, column drops) could leak back into `my_data`.
df = my_data.copy()

# Value counts for each non-integer column

In [129]:
# For every column that is not stored with an integer dtype, print how often
# each distinct value occurs.
non_integer_columns = [
    name for name in df.columns if not pd.api.types.is_integer_dtype(df[name])
]
for col in non_integer_columns:
    counts = df[col].value_counts()
    print(f"Value counts for {col}:")
    print(counts)
    print()  # blank separator line between columns

Value counts for Race:
White    3413
Other     320
Black     291
Name: Race, dtype: int64

Value counts for Marital Status:
Married      2643
Single        615
Divorced      486
Widowed       235
Separated      45
Name: Marital Status, dtype: int64

Value counts for T Stage :
T2    1786
T1    1603
T3     533
T4     102
Name: T Stage , dtype: int64

Value counts for N Stage:
N1    2732
N2     820
N3     472
Name: N Stage, dtype: int64

Value counts for 6th Stage:
IIA     1305
IIB     1130
IIIA    1050
IIIC     472
IIIB      67
Name: 6th Stage, dtype: int64

Value counts for differentiate:
Moderately differentiated    2351
Poorly differentiated        1111
Well differentiated           543
Undifferentiated               19
Name: differentiate, dtype: int64

Value counts for Grade:
2                        2351
3                        1111
1                         543
 anaplastic; Grade IV      19
Name: Grade, dtype: int64

Value counts for A Stage:
Regional    3932
Distant       92
Nam

# Gain some knowledge about numerical features

In [130]:
# Select every numeric column. "number" matches all integer and float widths,
# whereas the 'int' alias depends on the platform's default integer size and
# can miss int64 columns on some platforms / NumPy versions.
numerical_columns = df.select_dtypes(include="number").columns

# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
numerical_stats = df[numerical_columns].describe()

print("Descriptive statistics for numerical columns:")
print(numerical_stats)

Descriptive statistics for numerical columns:
               Age   Tumor Size  Regional Node Examined  \
count  4024.000000  4024.000000             4024.000000   
mean     53.972167    30.473658               14.357107   
std       8.963134    21.119696                8.099675   
min      30.000000     1.000000                1.000000   
25%      47.000000    16.000000                9.000000   
50%      54.000000    25.000000               14.000000   
75%      61.000000    38.000000               19.000000   
max      69.000000   140.000000               61.000000   

       Reginol Node Positive  Survival Months  
count            4024.000000      4024.000000  
mean                4.158052        71.297962  
std                 5.109331        22.921430  
min                 1.000000         1.000000  
25%                 1.000000        56.000000  
50%                 2.000000        73.000000  
75%                 5.000000        90.000000  
max                46.000000       107

# Feature Selection

In [131]:
# Re-check the columns and dtypes of the working frame before selecting features.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Age                     4024 non-null   int64 
 1   Race                    4024 non-null   object
 2   Marital Status          4024 non-null   object
 3   T Stage                 4024 non-null   object
 4   N Stage                 4024 non-null   object
 5   6th Stage               4024 non-null   object
 6   differentiate           4024 non-null   object
 7   Grade                   4024 non-null   object
 8   A Stage                 4024 non-null   object
 9   Tumor Size              4024 non-null   int64 
 10  Estrogen Status         4024 non-null   object
 11  Progesterone Status     4024 non-null   object
 12  Regional Node Examined  4024 non-null   int64 
 13  Reginol Node Positive   4024 non-null   int64 
 14  Survival Months         4024 non-null   int64 
 15  Stat

# Feature selection by Decision Tree feature importance

In [132]:
# Integer-valued candidate features to rank ("Reginol" is the dataset's own spelling).
selected_columns = [
    'Age',
    'Tumor Size',
    'Regional Node Examined',
    'Survival Months',
    'Reginol Node Positive',
]

# Feature matrix and target vector.
X = df.loc[:, selected_columns]
y = df.loc[:, 'Status']

# Hold out 30% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Fit a decision tree (default hyper-parameters, fixed seed) on the training rows.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
clf

DecisionTreeClassifier(random_state=42)

In [133]:
# Pair each importance score with its feature name, then rank largest first.
importances = clf.feature_importances_
unsorted_importances = pd.Series(importances, index=X.columns)
importance_series = unsorted_importances.sort_values(ascending=False)

print("Feature Importances from Decision Tree:", importance_series, sep="\n")


Feature Importances from Decision Tree:
Survival Months           0.468858
Age                       0.154544
Tumor Size                0.152395
Regional Node Examined    0.130100
Reginol Node Positive     0.094103
dtype: float64


## "Regional Node Examined" and "Reginol Node Positive" have the lowest importances, so they should be dropped

In [134]:
# Text-valued columns to convert to integer codes.
# NOTE: 'T Stage ' keeps its trailing space because that is how the column is
# named in the loaded data.
categorical_columns = [
    'Marital Status',
    'differentiate',
    'Race',
    'T Stage ',
    'Progesterone Status',
    'N Stage',
    '6th Stage',
    'Grade',
    'A Stage',
    'Estrogen Status',
]

# One encoder per column, kept so the integer codes can be mapped back later.
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col, le in label_encoders.items():
    # Overwrites the column in `df` with its integer codes.
    df[col] = le.fit_transform(df[col])
                

In [135]:
# Feature matrix built from the label-encoded categorical columns, plus the target.
X = df.loc[:, categorical_columns]
y = df.loc[:, 'Status']

# Same 70/30 split and seed as for the numeric features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Fit a decision tree (default hyper-parameters, fixed seed) on the training rows.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
clf

DecisionTreeClassifier(random_state=42)

In [136]:
# Rank the categorical features by the fitted tree's importance scores, largest first.
importances = clf.feature_importances_
importance_series = (
    pd.Series(data=importances, index=X.columns)
    .sort_values(ascending=False)
)

print("Feature Importances from Decision Tree:")
print(importance_series)


Feature Importances from Decision Tree:
Marital Status         0.218051
6th Stage              0.199337
T Stage                0.151596
Race                   0.090615
Estrogen Status        0.079766
Progesterone Status    0.077521
Grade                  0.070915
N Stage                0.049233
differentiate          0.032803
A Stage                0.030164
dtype: float64


## "Marital Status" and "6th Stage" have the highest importances, so they are selected

# Selecting top features
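## There are 3 numerical columns (Age, Tumor Size, Survival Months), 2 label-encoded categorical columns (Marital Status, 6th Stage), and 1 binary target column (Status)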

In [137]:
# Columns rejected by the two feature-importance rankings above.
# NOTE: 'T Stage ' keeps its trailing space because that is how the column is
# named in the loaded data.
columns_to_drop = [
    'T Stage ',
    'N Stage',
    'differentiate',
    'Race',
    'Grade',
    'A Stage',
    'Estrogen Status',
    'Progesterone Status',
    'Reginol Node Positive',
    'Regional Node Examined',
]

# Rebind instead of mutating in place, and ignore columns that are already gone,
# so re-running this cell does not raise a KeyError.
# Trade-off: a misspelled name in `columns_to_drop` is silently skipped.
df = df.drop(columns=columns_to_drop, errors="ignore")


In [138]:
# Display the reduced frame (the rendered output elides the middle rows).
df

Unnamed: 0,Age,Marital Status,6th Stage,Tumor Size,Survival Months,Status
0,68,1,0,4,60,Alive
1,50,1,2,35,62,Alive
2,58,0,4,63,75,Alive
3,58,1,0,18,84,Alive
4,47,1,1,41,50,Alive
...,...,...,...,...,...,...
4019,62,1,0,9,49,Alive
4020,56,0,2,46,69,Alive
4021,68,1,1,22,69,Alive
4022,58,0,1,44,72,Alive
