In [65]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on Drive can be read by path from the notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [66]:
#Importing the required packages
import plotly
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import seaborn as sns
from scipy.stats import spearmanr

In [67]:
# Load the breast cancer CSV from the mounted Google Drive into a DataFrame
cancer = pd.read_csv(
    "/content/drive/MyDrive/Analysis Projects/Breast Cancer Analysis/Dataset/cancer.csv"
)

In [68]:
# Preview the first 5 rows of the dataset
cancer.head(5)

Unnamed: 0.1,Unnamed: 0,pid,age,meno,size,grade,nodes,pgr,er,hormon,rfstime,status
0,1,132,49,0,18,2,2,0,0,0,1838,0
1,2,1575,55,1,20,3,16,0,0,0,403,1
2,3,1140,56,1,40,3,3,0,0,0,1603,0
3,4,769,45,0,25,3,0,0,4,0,177,0
4,5,130,65,1,30,2,5,0,36,1,1855,0


# **Data Cleaning**

In [69]:
# List the dtype of each column (all columns are still numeric codes at this point)
cancer.dtypes

Unnamed: 0    int64
pid           int64
age           int64
meno          int64
size          int64
grade         int64
nodes         int64
pgr           int64
er            int64
hormon        int64
rfstime       int64
status        int64
dtype: object

In [70]:
# Translate the 0/1 codes of the categorical columns into readable labels.
# Values that are not listed in a mapping are left untouched by replace().
value_labels = {
    "meno": {0: "premenopausal", 1: "postmenopausal"},
    "hormon": {0: "no", 1: "yes"},
    "status": {0: "alive without recurrence", 1: "recurrence or death"},
}
for column, labels in value_labels.items():
    cancer[column] = cancer[column].replace(labels)

In [71]:
# Display the DataFrame; Jupyter shows the first and last rows and elides the middle
cancer

Unnamed: 0.1,Unnamed: 0,pid,age,meno,size,grade,nodes,pgr,er,hormon,rfstime,status
0,1,132,49,premenopausal,18,2,2,0,0,no,1838,alive without recurrence
1,2,1575,55,postmenopausal,20,3,16,0,0,no,403,recurrence or death
2,3,1140,56,postmenopausal,40,3,3,0,0,no,1603,alive without recurrence
3,4,769,45,premenopausal,25,3,0,0,4,no,177,alive without recurrence
4,5,130,65,postmenopausal,30,2,5,0,36,yes,1855,alive without recurrence
...,...,...,...,...,...,...,...,...,...,...,...,...
681,682,586,51,premenopausal,30,3,2,1152,38,yes,1760,alive without recurrence
682,683,1273,64,postmenopausal,26,2,2,1356,1144,yes,1152,alive without recurrence
683,684,1525,57,postmenopausal,35,3,1,1490,209,yes,1342,alive without recurrence
684,685,736,44,premenopausal,21,2,3,1600,70,no,629,alive without recurrence


In [72]:
# Count the missing values in every column
cancer.isna().sum()


Unnamed: 0    0
pid           0
age           0
meno          0
size          0
grade         0
nodes         0
pgr           0
er            0
hormon        0
rfstime       0
status        0
dtype: int64

In [73]:
# Show every row that is an exact repeat of an earlier row (empty result = no duplicates)
cancer.loc[cancer.duplicated()]

Unnamed: 0.1,Unnamed: 0,pid,age,meno,size,grade,nodes,pgr,er,hormon,rfstime,status


In [74]:
# Drop the unwanted 'Unnamed: 0' column (it appears to hold a saved row number).
# errors='ignore' keeps this cell safe to re-run: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
cancer = cancer.drop(columns='Unnamed: 0', errors='ignore')
cancer

Unnamed: 0,pid,age,meno,size,grade,nodes,pgr,er,hormon,rfstime,status
0,132,49,premenopausal,18,2,2,0,0,no,1838,alive without recurrence
1,1575,55,postmenopausal,20,3,16,0,0,no,403,recurrence or death
2,1140,56,postmenopausal,40,3,3,0,0,no,1603,alive without recurrence
3,769,45,premenopausal,25,3,0,0,4,no,177,alive without recurrence
4,130,65,postmenopausal,30,2,5,0,36,yes,1855,alive without recurrence
...,...,...,...,...,...,...,...,...,...,...,...
681,586,51,premenopausal,30,3,2,1152,38,yes,1760,alive without recurrence
682,1273,64,postmenopausal,26,2,2,1356,1144,yes,1152,alive without recurrence
683,1525,57,postmenopausal,35,3,1,1490,209,yes,1342,alive without recurrence
684,736,44,premenopausal,21,2,3,1600,70,no,629,alive without recurrence


In [75]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage
cancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 686 entries, 0 to 685
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   pid      686 non-null    int64 
 1   age      686 non-null    int64 
 2   meno     686 non-null    object
 3   size     686 non-null    int64 
 4   grade    686 non-null    int64 
 5   nodes    686 non-null    int64 
 6   pgr      686 non-null    int64 
 7   er       686 non-null    int64 
 8   hormon   686 non-null    object
 9   rfstime  686 non-null    int64 
 10  status   686 non-null    object
dtypes: int64(8), object(3)
memory usage: 59.1+ KB


In [76]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
cancer.describe()

Unnamed: 0,pid,age,size,grade,nodes,pgr,er,rfstime
count,686.0,686.0,686.0,686.0,686.0,686.0,686.0,686.0
mean,966.061224,53.052478,29.329446,2.116618,5.008746,109.995627,96.252187,1124.489796
std,495.506249,10.120739,14.296217,0.582808,5.476685,202.331552,153.083963,642.791948
min,1.0,21.0,3.0,1.0,0.0,0.0,0.0,8.0
25%,580.75,46.0,20.0,2.0,1.0,7.0,8.0,567.75
50%,1015.5,53.0,25.0,2.0,3.0,32.5,36.0,1084.0
75%,1340.5,61.0,35.0,2.0,7.0,131.75,114.0,1684.75
max,1819.0,80.0,120.0,3.0,51.0,2380.0,1144.0,2659.0


# Exploratory Data Analysis
* What is the distribution of patient ages in the dataset?
* How many patients are premenopausal and how many are postmenopausal?
* What is the tumor size distribution among patients?
* Is there a correlation between tumor grade and number of positive lymph nodes?
* What is the average distribution of progesterone receptor levels among the patients?
* How many patients received hormonal therapy?
* How many patients experienced recurrence or death during the study period?
* Is there a significant difference in recurrence-free survival time between patients who received hormonal therapy and those who did not?


#1. What is the distribution of patient ages in the dataset?

In [77]:
# Histogram of patient ages
age_axis_labels = {"age": "Age", "count": "Frequency"}
figure = px.histogram(
    cancer,
    x="age",
    title="Age Distribution of Patients",
    labels=age_axis_labels,
)
figure.show()

# Observation:
The number of patients shows a distinct pattern across age groups. Middle-aged patients, particularly those between the ages of 46 and 47, have the highest frequency at 64, followed by patients aged 62-63, 48-49, and 64-65, with frequencies of 52, 51, and 50, respectively. Patient counts are relatively low below the age of 30 but rise steadily until 45, and they decline again among patients aged 66 to 80. These findings show that most patients in this dataset are middle-aged, while younger and older patients are less represented; because the histogram only counts patients enrolled in the study, it does not by itself measure how susceptible each age group is to breast cancer.

# 2. How many patients are premenopausal and how many are postmenopausal?

In [78]:
# Count how many patients fall into each menopausal-status category
meno_column = cancer['meno'].value_counts()

# Turn the counts into an explicit two-column frame. Depending on the pandas
# version, the Series returned by value_counts() is named either after the
# source column ("meno") or "count", so referring to it by name inside px.bar
# is fragile; naming the columns here makes the chart version-independent.
meno_counts = meno_column.rename_axis("Type").reset_index(name="Frequency")

# Create a bar chart to display the finding
patients = px.bar(meno_counts,
                  x = "Type",
                  y = "Frequency",
                  title = "Premenopausal and Postmenopausal Patients")
patients.show()

#Observation:
Postmenopausal patients have a frequency of 396, indicating a higher number of cases, while premenopausal patients have a frequency of 290. This suggests that postmenopausal patients are at a higher risk of developing breast cancer than premenopausal patients.

# 3. What is the tumor size distribution among patients?

In [79]:
# Histogram with age on the x-axis and tumour size as the y value that is
# aggregated within each age bin
size_age_labels = {"size": "Tumor Size", "age": "Age Distribution"}
distribution = px.histogram(
    cancer,
    x="age",
    y="size",
    title="Tumor Distribution Among Patients",
    labels=size_age_labels,
)
distribution.show()

#Observation:
The analysis of tumour size distribution among different age groups reveals interesting patterns. Middle-aged patients, particularly those aged 46-47, exhibit the highest sum of tumour size, suggesting a potential correlation between age and tumour growth. This is followed by patients aged 50-51 and 48-49, showing relatively high sums of tumour size. However, the sum of tumour size decreases slightly in patients aged 52-53 and further drops in those aged 56-57. Notably, patients below 30 have a significantly lower sum of tumour size, gradually increasing from ages 30 to 44-45. The decline in tumour size becomes evident in patients aged above 66. Overall, this analysis may depict age as a potential factor influencing tumour growth, with a tendency for larger tumours in middle-aged patients and a decline in tumour size in older age groups. Note, however, that the sum in each age bin also depends on how many patients fall into that bin, so these peaks largely mirror the age distribution of the patients; comparing the average tumour size per age group would be needed to confirm that individual tumours are actually larger in middle-aged patients.

# 4. Is there a correlation between tumor grade and number of positive lymph nodes?

In [80]:
# Spearman rank correlation between tumour grade ('grade') and the number of
# positive lymph nodes ('nodes')
correlation, p_value = spearmanr(cancer['grade'], cancer['nodes'])

# Render both statistics with four decimal places
correlation_formatted, p_value_formatted = (
    f"{statistic:.4f}" for statistic in (correlation, p_value)
)

# Report the formatted coefficient and p-value
print(f"Correlation coefficient: {correlation_formatted}")
print(f"P-value: {p_value_formatted}")

Correlation coefficient: 0.1247
P-value: 0.0011


# Observation:
The correlation coefficient of **(0.1247)** indicates a weak positive correlation between tumour grade and the number of positive lymph nodes, suggesting that as tumour grade increases, there is a slight increase in the number of positive lymph nodes. The statistical significance of the correlation **(p-value of 0.0011)** suggests that the relationship is unlikely to have occurred by chance. However, it's important to remember that correlation does not imply causation, and additional analysis and consideration of other factors are necessary to understand the relationship between these variables better.

#5. What is the average distribution of progesterone receptor levels among the patients?

In [81]:
# Average the two receptor columns per age. The columns are selected BEFORE
# aggregating so that mean() never touches the text columns (meno, hormon,
# status); averaging the whole frame first emits a deprecation warning on
# pandas 1.5 and raises an error on pandas 2.x.
group = cancer.groupby("age")[["pgr", "er"]].mean().reset_index()

# Reshape to long format (one row per age/receptor pair) so that each receptor
# can be drawn as its own coloured line
melted_data = pd.melt(group, id_vars=["age"], value_vars=["pgr", "er"], var_name="variable", value_name="Value")

# Plot the line graph
fig = px.line(melted_data, 
              x="age", 
              y="Value", 
              color="variable", 
              title="Average Distribution of PGR and ER Levels Among Patients",
              line_shape = "spline")
fig.show()


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



#Observation:
The observation reveals that among patients below the age of 68, the average range of progesterone receptors and estrogen receptors is between 0 and 200. Beyond this age, there is a slight increase in both receptor levels. Notably, estrogen receptors remain higher than progesterone receptors among patients older than 68 up to 79. However, patients aged 80 show higher average levels of progesterone receptors (1190) compared to estrogen receptors (515.5). These findings suggest age-related variations in hormone receptor levels, with progesterone receptor levels increasing in older individuals. Further analysis is required to comprehend the implications for individual patients' health.

#6. How many patients received hormonal therapy?

In [82]:
# Donut chart showing the share of patients with and without hormonal therapy
therapy = px.pie(
    cancer,
    names="hormon",
    hole=0.5,
    title="Percentage of Patients who have received hormonal therapy",
)
therapy.show()

#Observation:
Among the 686 patients analyzed, only 35.9% (246 patients) have undergone hormonal therapy, while the majority, 64.1% (440 patients), have not received hormonal therapy. This suggests that a significant portion of the patients in the study population have not utilized hormonal therapy as part of their breast cancer treatment. Further analysis would be required to determine the reasons behind the variation in hormonal therapy utilization and its potential impact on treatment outcomes and patient prognosis.

#7. How many patients experienced recurrence or death during the study period?

In [85]:
# Number of patients per outcome; the non-null 'age' entries serve as the row count
grouped_data = cancer.groupby("status")["age"].count().reset_index()

# Draw the counts as a donut chart, one slice per status
fig = px.pie(
    grouped_data,
    names="status",
    values="age",
    hole=0.5,
    title="Status Distribution",
)

# Render the chart
fig.show()

#Observation:
Approximately 43.6% (299) of the patients in the dataset experienced recurrence or died, indicating a concerning outcome for this subgroup. On the other hand, approximately 56.4% (387) of the patients are reported as alive without recurrence. This could imply that the treatment provided to the patients may not be effective enough in preventing cancer recurrence. It also highlights the importance of continuous monitoring and follow-up of patients treated for breast cancer to detect any signs of recurrence early and provide timely intervention.

#The Correlation Matrix

In [89]:
# Compute the pairwise correlation matrix of the numeric columns.
# The labelled text columns (meno, hormon, status) cannot be correlated, and
# pandas 2.x raises an error instead of silently dropping them, so the frame
# is restricted to numeric dtypes explicitly.
# NOTE(review): pid looks like a patient identifier rather than a measurement;
# consider excluding it from the matrix - confirm with the data dictionary.
correlation_matrix = cancer.select_dtypes(include="number").corr()

# Display the correlation matrix
print(correlation_matrix)

              pid       age      size     grade     nodes       pgr        er  \
pid      1.000000 -0.012263  0.040966  0.017734  0.080657  0.008087  0.018122   
age     -0.012263  1.000000 -0.045412 -0.072318  0.032914  0.084355  0.323132   
size     0.040966 -0.045412  1.000000  0.103488  0.327674 -0.027415 -0.081766   
grade    0.017734 -0.072318  0.103488  1.000000  0.143751 -0.180905 -0.119663   
nodes    0.080657  0.032914  0.327674  0.143751  1.000000 -0.072373 -0.043013   
pgr      0.008087  0.084355 -0.027415 -0.180905 -0.072373  1.000000  0.392601   
er       0.018122  0.323132 -0.081766 -0.119663 -0.043013  0.392601  1.000000   
rfstime -0.287225  0.053958 -0.138376 -0.171505 -0.256301  0.102729  0.065477   

          rfstime  
pid     -0.287225  
age      0.053958  
size    -0.138376  
grade   -0.171505  
nodes   -0.256301  
pgr      0.102729  
er       0.065477  
rfstime  1.000000  






In [93]:
# Render the correlation matrix as a heatmap, labelling both axes with the column names
axis_names = correlation_matrix.columns
fig = px.imshow(
    correlation_matrix,
    labels={"color": "Correlation"},
    x=axis_names,
    y=axis_names,
    title="Correlation Matrix",
)
fig.show()

#Observation:
Based on the correlation matrix, we can make the following observations:

1. Age (age) has a very weak positive correlation with progesterone receptor (pgr) levels (0.084) and a moderate positive correlation with estrogen receptor (er) levels (0.323). This suggests that estrogen receptor levels tend to be higher in older patients, while the association between age and progesterone receptor levels is negligible.

2. Tumor size (size) shows a moderate positive correlation with lymph nodes (nodes) (0.328). This implies that larger tumour sizes are associated with more affected lymph nodes.

3. Tumor grade (grade) exhibits a weak negative correlation with progesterone receptor (pgr) levels (-0.181). This suggests that higher tumour grade is associated with lower progesterone receptor levels.

4. Recurrence-free survival time (rfstime) shows a moderate negative correlation with lymph nodes (nodes) (-0.256). This indicates that patients with more affected lymph nodes tend to have shorter recurrence-free survival times.

These observations provide insights into the relationships and potential dependencies among different variables in the dataset.