In [1]:
#pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import random
import string
import numpy as np

In [4]:
# import dataset
#https://archive.ics.uci.edu/dataset/14/breast+cancer
#from the url- click the import to python button and copied the code given to import the data frame 

from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
breast_cancer = fetch_ucirepo(id=14) 
  
# data (as pandas dataframes) 
X = breast_cancer.data.features 
y = breast_cancer.data.targets 

  
# metadata 
print(breast_cancer.metadata) 
  
# variable information 
print(breast_cancer.variables)

#print(breast_cancer)

{'uci_id': 14, 'name': 'Breast Cancer', 'repository_url': 'https://archive.ics.uci.edu/dataset/14/breast+cancer', 'data_url': 'https://archive.ics.uci.edu/static/public/14/data.csv', 'abstract': 'Breast Cancer Data (Restricted Access)', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 286, 'num_features': 9, 'feature_types': ['Categorical'], 'demographics': ['Age'], 'target_col': ['Class'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1988, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C51P4M', 'creators': ['Matjaz Zwitter', 'Milan Soklic'], 'intro_paper': None, 'additional_info': {'summary': 'This is one of three domains provided by the Oncology Institute that has repeatedly appeared in the machine learning literature. (See also lymphography and primary-tumor.)\r\n\r\nThis data set includes 201 instances of one class and 85 instances of another 

In [5]:
#create a data frame from data set - this creates a usable set of data 

df= pd.DataFrame(data=X, columns=breast_cancer.variables['name']) 
df['Class']= y
print(df)


name                 Class    age menopause tumor-size inv-nodes node-caps  \
0     no-recurrence-events  30-39   premeno      30-34       0-2        no   
1     no-recurrence-events  40-49   premeno      20-24       0-2        no   
2     no-recurrence-events  40-49   premeno      20-24       0-2        no   
3     no-recurrence-events  60-69      ge40      15-19       0-2        no   
4     no-recurrence-events  40-49   premeno        0-4       0-2        no   
..                     ...    ...       ...        ...       ...       ...   
281      recurrence-events  30-39   premeno      30-34       0-2        no   
282      recurrence-events  30-39   premeno      20-24       0-2        no   
283      recurrence-events  60-69      ge40      20-24       0-2        no   
284      recurrence-events  40-49      ge40      30-34     5-Mar        no   
285      recurrence-events  50-59      ge40      30-34     5-Mar        no   

name  deg-malig breast breast-quad irradiat  
0             3  

In [7]:
#1- melt 

breast_cancer_melted= pd.melt(df, id_vars= ['menopause'], value_vars= ['age'],var_name= 'changedVarname', value_name= 'ChangedValuename')

print(breast_cancer_melted)

#alternative way 

columns_to_melt= ['menopause', 'age', 'breast']
df_melted = pd.melt(df, id_vars= columns_to_melt, var_name= "measurement", value_name= 'value') 
print(df_melted)

    menopause changedVarname ChangedValuename
0     premeno            age            30-39
1     premeno            age            40-49
2     premeno            age            40-49
3        ge40            age            60-69
4     premeno            age            40-49
..        ...            ...              ...
281   premeno            age            30-39
282   premeno            age            30-39
283      ge40            age            60-69
284      ge40            age            40-49
285      ge40            age            50-59

[286 rows x 3 columns]
     menopause    age breast measurement                 value
0      premeno  30-39   left       Class  no-recurrence-events
1      premeno  40-49  right       Class  no-recurrence-events
2      premeno  40-49   left       Class  no-recurrence-events
3         ge40  60-69  right       Class  no-recurrence-events
4      premeno  40-49  right       Class  no-recurrence-events
...        ...    ...    ...         ...      

In [11]:
#2- pivot 
df_pivoted= df_melted.pivot_table(index= columns_to_melt, columns= 'measurement', values='value', aggfunc='sum') #because of duplicates in the table 
df_pivoted.reset_index(inplace= True)
new_column_names=['name', 'age', 'menopause', 'tumor-size', 'inv-node', 'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiat']
df_pivoted.columns= new_column_names
print(df_pivoted)

       name    age menopause  \
0      ge40  40-49      left   
1      ge40  40-49     right   
2      ge40  50-59      left   
3      ge40  50-59     right   
4      ge40  60-69      left   
5      ge40  60-69     right   
6      ge40  70-79      left   
7      ge40  70-79     right   
8      lt40  30-39     right   
9      lt40  50-59      left   
10     lt40  50-59     right   
11     lt40  60-69      left   
12  premeno  20-29     right   
13  premeno  30-39      left   
14  premeno  30-39     right   
15  premeno  40-49      left   
16  premeno  40-49     right   
17  premeno  50-59      left   
18  premeno  50-59     right   

                                           tumor-size  \
0   no-recurrence-eventsno-recurrence-eventsno-rec...   
1   no-recurrence-eventsrecurrence-eventsrecurrenc...   
2   no-recurrence-eventsno-recurrence-eventsno-rec...   
3   no-recurrence-eventsno-recurrence-eventsno-rec...   
4   no-recurrence-eventsno-recurrence-eventsno-rec...   
5   no-recurrence

In [25]:
#3- aggregation- basic statistical operations 
#I did use aggregation above because duplicates were seen in the table 
#print(df)
#max tumor size
a=df['tumor-size'].max()
print(a)
#min tumor size
b=df['tumor-size'].min()
print(b)
#inv nodes size
c=df['inv-nodes'].max()
print(c)
#inv nodes size
d=df['inv-nodes'].min()
print(d)

name                 Class    age menopause tumor-size inv-nodes node-caps  \
0     no-recurrence-events  30-39   premeno      30-34       0-2        no   
1     no-recurrence-events  40-49   premeno      20-24       0-2        no   
2     no-recurrence-events  40-49   premeno      20-24       0-2        no   
3     no-recurrence-events  60-69      ge40      15-19       0-2        no   
4     no-recurrence-events  40-49   premeno        0-4       0-2        no   
..                     ...    ...       ...        ...       ...       ...   
281      recurrence-events  30-39   premeno      30-34       0-2        no   
282      recurrence-events  30-39   premeno      20-24       0-2        no   
283      recurrence-events  60-69      ge40      20-24       0-2        no   
284      recurrence-events  40-49      ge40      30-34     5-Mar        no   
285      recurrence-events  50-59      ge40      30-34     5-Mar        no   

name  deg-malig breast breast-quad irradiat  
0             3  

In [28]:
#4- iteration 

for index, row in df.iterrows():
    print(index, row['breast'])

0 left
1 right
2 left
3 right
4 right
5 left
6 left
7 left
8 left
9 right
10 left
11 left
12 left
13 left
14 left
15 left
16 left
17 left
18 left
19 right
20 left
21 left
22 right
23 left
24 right
25 left
26 left
27 right
28 right
29 left
30 right
31 right
32 left
33 right
34 left
35 left
36 left
37 right
38 left
39 left
40 right
41 left
42 left
43 left
44 right
45 left
46 right
47 right
48 right
49 right
50 left
51 left
52 left
53 left
54 right
55 right
56 right
57 right
58 left
59 right
60 left
61 right
62 left
63 right
64 left
65 right
66 right
67 left
68 left
69 left
70 left
71 right
72 left
73 right
74 left
75 left
76 left
77 left
78 left
79 right
80 right
81 left
82 right
83 right
84 left
85 left
86 right
87 left
88 right
89 right
90 right
91 left
92 right
93 right
94 left
95 right
96 left
97 right
98 left
99 left
100 left
101 right
102 right
103 left
104 right
105 right
106 left
107 right
108 right
109 right
110 left
111 right
112 right
113 right
114 right
115 right
116 left
117

In [39]:
#5- groupby 

#print(df)
#group by menopause with average of deg-malig 
f= df.groupby('menopause')['deg-malig'].mean()
print(f)

#group by left breast, age and menopause 
g= df[(df['breast']=='left')].groupby(['age','menopause']).size()
print(g)

menopause
ge40       2.093023
lt40       1.714286
premeno    2.026667
Name: deg-malig, dtype: float64
age    menopause
30-39  premeno      21
40-49  ge40          6
       premeno      35
50-59  ge40         34
       lt40          3
       premeno      19
60-69  ge40         28
       lt40          2
70-79  ge40          4
dtype: int64
