In [5]:
import pandas as pd

In [11]:
breast_cancer_df = pd.read_csv('breast-cancer.data')
breast_cancer_df.head(5)
# I notice there aren't column names, so i am going to add those
column_names = [
    'class', 'age', 'menopause', 'tumor-size', 'inv-nodes', 
    'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiat'
]
breast_cancer_df = pd.read_csv('breast-cancer.data', names=column_names)
breast_cancer_df
# cool now there are column names

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
...,...,...,...,...,...,...,...,...,...,...
281,recurrence-events,30-39,premeno,30-34,0-2,no,2,left,left_up,no
282,recurrence-events,30-39,premeno,20-24,0-2,no,3,left,left_up,yes
283,recurrence-events,60-69,ge40,20-24,0-2,no,1,right,left_up,no
284,recurrence-events,40-49,ge40,30-34,3-5,no,3,left,left_low,no


In [50]:
# Demonstrating Melt
# using various parts of melt syntax like id_vars to name the column that does not melt, value_vars to specify what columns to melt,
# var_name to rename the column of my melted columns, and ignore_index as False to make a new index that reflects the melted dataframe
breast_cancer_df.melt(id_vars='tumor-size', value_vars=['class', 'age', 'menopause'], 
                      var_name= 'Class, Age, Menopause', ignore_index=False)



Unnamed: 0,tumor-size,"Class, Age, Menopause",value
0,30-34,class,no-recurrence-events
1,20-24,class,no-recurrence-events
2,20-24,class,no-recurrence-events
3,15-19,class,no-recurrence-events
4,0-4,class,no-recurrence-events
...,...,...,...
281,30-34,menopause,premeno
282,20-24,menopause,premeno
283,20-24,menopause,ge40
284,30-34,menopause,ge40


In [66]:
# Demonstrating pivot
# used pivot_table as it is more flexible and can handle duplicate values
# it automatically takes the average of the deg-malig values to represent a given cell
breast_cancer_df.pivot_table(index='age', columns='class', values='deg-malig')

class,no-recurrence-events,recurrence-events
age,Unnamed: 1_level_1,Unnamed: 2_level_1
20-29,2.0,
30-39,2.0,2.333333
40-49,1.936508,2.37037
50-59,1.943662,2.44
60-69,1.775,2.470588
70-79,1.6,1.0


In [70]:
# demonstrating aggregation. Here each row is an age group, the columns are tumor sizes, and the values are how many occurences
# are in each age group and tumor size. The count is done via aggragation
breast_cancer_df.pivot_table(index='age', columns='tumor-size', values='class', aggfunc='count')

tumor-size,0-4,10-14,15-19,20-24,25-29,30-34,35-39,40-44,45-49,5-9,50-54
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
20-29,,,,,,,1.0,,,,
30-39,2.0,2.0,5.0,6.0,6.0,7.0,3.0,4.0,,1.0,
40-49,2.0,8.0,5.0,21.0,18.0,20.0,7.0,5.0,1.0,1.0,2.0
50-59,3.0,9.0,10.0,14.0,21.0,20.0,7.0,8.0,,1.0,3.0
60-69,,8.0,9.0,8.0,9.0,13.0,1.0,3.0,2.0,1.0,3.0
70-79,1.0,1.0,1.0,1.0,,,,2.0,,,


In [80]:
# Demonstrating iteration, a slow way
for i, row in breast_cancer_df.iterrows():
    print(i, row['class'])

# a faster way 
for row in breast_cancer_df.itertuples():
    print(row.age)


0 no-recurrence-events
1 no-recurrence-events
2 no-recurrence-events
3 no-recurrence-events
4 no-recurrence-events
5 no-recurrence-events
6 no-recurrence-events
7 no-recurrence-events
8 no-recurrence-events
9 no-recurrence-events
10 no-recurrence-events
11 no-recurrence-events
12 no-recurrence-events
13 no-recurrence-events
14 no-recurrence-events
15 no-recurrence-events
16 no-recurrence-events
17 no-recurrence-events
18 no-recurrence-events
19 no-recurrence-events
20 no-recurrence-events
21 no-recurrence-events
22 no-recurrence-events
23 no-recurrence-events
24 no-recurrence-events
25 no-recurrence-events
26 no-recurrence-events
27 no-recurrence-events
28 no-recurrence-events
29 no-recurrence-events
30 no-recurrence-events
31 no-recurrence-events
32 no-recurrence-events
33 no-recurrence-events
34 no-recurrence-events
35 no-recurrence-events
36 no-recurrence-events
37 no-recurrence-events
38 no-recurrence-events
39 no-recurrence-events
40 no-recurrence-events
41 no-recurrence-events
42

In [None]:
# Demonstrating groupby
# grouping by menopause status, and used the aggregate function to take the average of the degree malignance for each group
breast_cancer_df.groupby('menopause').agg(mean=('deg-malig', 'mean'))

Unnamed: 0_level_0,mean
menopause,Unnamed: 1_level_1
ge40,2.093023
lt40,1.714286
premeno,2.026667
