<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/effective_pandas_ch_16_thur_ch_XX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 16: Dataframes

In [66]:
# libraries needed
import numpy as np
import pandas as pd

# show plots inline
%matplotlib inline

In [67]:
# build a small demo DataFrame by hand from a list of row dicts
# (one dict per row; the dict keys become the column names).
# real data will usually be loaded instead, e.g. pulled into python via sql
df = pd.DataFrame(
    [
    {'growth':.5, 'Name':'Paul'},
    {'growth':.7, 'Name':'George'},
    {'growth':1.2, 'Name':'Ringo'}
    ]
)

print(df)

   growth    Name
0     0.5    Paul
1     0.7  George
2     1.2   Ringo


In [68]:
# 3 rows, 2 columns
print(df.shape)

(3, 2)


In [69]:
# metadata about df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   growth  3 non-null      float64
 1   Name    3 non-null      object 
dtypes: float64(1), object(1)
memory usage: 176.0+ bytes


In [70]:
# column names
df.columns

Index(['growth', 'Name'], dtype='object')

In [71]:
# row slicing
df.iloc[:2]

Unnamed: 0,growth,Name
0,0.5,Paul
1,0.7,George


In [72]:
# access a particular column
df['growth']

0    0.5
1    0.7
2    1.2
Name: growth, dtype: float64

In [73]:
# convert growth column to float32
df['growth'] = df['growth'].astype('float32')

# check the dtype change; info() prints its report itself and returns None,
# so wrapping it in print() only adds a stray "None" line to the output
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   growth  3 non-null      float32
 1   Name    3 non-null      object 
dtypes: float32(1), object(1)
memory usage: 164.0+ bytes
None


In [74]:
# axis 0 refers to the rows
# axis 1 refers to the columns

# sum down the rows; so axis = 0
# NOTE: every column is summed, so the string column 'Name' is concatenated
# ('PaulGeorgeRingo') and the result Series has dtype object
df.sum(axis = 0)

growth                2.4
Name      PaulGeorgeRingo
dtype: object

In [75]:
# better way:
df['growth'].sum()

2.4

# Chapter 17: Similarities with Series and DataFrame
Not much here...showing what's possible with a DataFrame

In [76]:
# load data
url = 'https://github.com/mattharrison/datasets/raw/master/data/siena2018-pres.csv'
df = pd.read_csv(url, index_col = 0)
df.head()

Unnamed: 0,Seq.,President,Party,Bg,Im,Int,IQ,L,WR,AC,...,PL,RC,CAp,HE,EAp,DA,FPA,AM,EV,O
1,1,George Washington,Independent,7,7,1,10,1,6,2,...,18,1,1,1,1,2,2,1,2,1
2,2,John Adams,Federalist,3,13,4,4,24,14,31,...,28,17,4,13,15,19,13,16,10,14
3,3,Thomas Jefferson,Democratic-Republican,2,2,14,1,8,5,14,...,5,5,7,20,4,6,9,7,5,5
4,4,James Madison,Democratic-Republican,4,6,7,3,16,15,6,...,9,10,6,14,7,11,19,11,8,7
5,5,James Monroe,Democratic-Republican,9,14,11,18,6,16,7,...,12,8,11,9,9,10,5,6,9,8


In [77]:
# metadata; info() prints directly and returns None, so no print() wrapper
# (print(df.info()) would append a stray "None" after the report)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44 entries, 1 to 44
Data columns (total 24 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Seq.       44 non-null     object
 1   President  44 non-null     object
 2   Party      44 non-null     object
 3   Bg         44 non-null     int64 
 4   Im         44 non-null     int64 
 5   Int        44 non-null     int64 
 6   IQ         44 non-null     int64 
 7   L          44 non-null     int64 
 8   WR         44 non-null     int64 
 9   AC         44 non-null     int64 
 10  EAb        44 non-null     int64 
 11  LA         44 non-null     int64 
 12  CAb        44 non-null     int64 
 13  OA         44 non-null     int64 
 14  PL         44 non-null     int64 
 15  RC         44 non-null     int64 
 16  CAp        44 non-null     int64 
 17  HE         44 non-null     int64 
 18  EAp        44 non-null     int64 
 19  DA         44 non-null     int64 
 20  FPA        44 non-null     int64 


In [78]:
df.columns

Index(['Seq.', 'President', 'Party', 'Bg', 'Im', 'Int', 'IQ', 'L', 'WR', 'AC',
       'EAb', 'LA', 'CAb', 'OA', 'PL', 'RC', 'CAp', 'HE', 'EAp', 'DA', 'FPA',
       'AM', 'EV', 'O'],
      dtype='object')

In [79]:
# rename columns
df = df.rename(
    columns = {
        'Seq.': 'Seq',
        'Bg': 'Background',
        'Im': 'Imagination'
    }
)

df.head()

Unnamed: 0,Seq,President,Party,Background,Imagination,Int,IQ,L,WR,AC,...,PL,RC,CAp,HE,EAp,DA,FPA,AM,EV,O
1,1,George Washington,Independent,7,7,1,10,1,6,2,...,18,1,1,1,1,2,2,1,2,1
2,2,John Adams,Federalist,3,13,4,4,24,14,31,...,28,17,4,13,15,19,13,16,10,14
3,3,Thomas Jefferson,Democratic-Republican,2,2,14,1,8,5,14,...,5,5,7,20,4,6,9,7,5,5
4,4,James Madison,Democratic-Republican,4,6,7,3,16,15,6,...,9,10,6,14,7,11,19,11,8,7
5,5,James Monroe,Democratic-Republican,9,14,11,18,6,16,7,...,12,8,11,9,9,10,5,6,9,8


In [80]:
# rank each president by the sum of their numeric (int64) score columns.
# numeric_only=True skips the text columns (Seq, President, Party); without it
# pandas >= 2.0 tries to add strings to integers and raises a TypeError.
# nothing is stored here -- the next cell assigns the result to a new column
df.sum(axis = 1, numeric_only = True).rank(method = 'dense')

  


1      1.0
2     13.0
3      5.0
4      7.0
5      8.0
6     18.0
7     19.0
8     25.0
9     38.0
10    37.0
11    11.0
12    30.0
13    39.0
14    41.0
15    43.0
16     3.0
17    44.0
18    24.0
19    32.0
20    29.0
21    34.0
22    23.0
23    36.0
24    20.0
25     4.0
26    22.0
27    12.0
28    40.0
29    31.0
30    35.0
31     2.0
32     9.0
33     6.0
34    10.0
35    16.0
36    28.0
37    26.0
38    27.0
39    14.0
40    21.0
41    15.0
42    33.0
43    17.0
44    42.0
dtype: float64

In [81]:
# create a new column, average_rank, which is based on sum of the numeric variables
# (rank 1 = smallest row total; method = 'dense' gives tied totals the same rank
# and leaves no gaps in the rank sequence)

df['average_rank'] = (
    df
      .select_dtypes(include = 'number')  # keep only the numeric columns
      .sum(axis = 1)                      # add up across each row
      .rank(method = 'dense')             # dense rank of the row totals
)

df['average_rank']

1      1.0
2     13.0
3      5.0
4      7.0
5      8.0
6     18.0
7     19.0
8     25.0
9     38.0
10    37.0
11    11.0
12    30.0
13    39.0
14    41.0
15    43.0
16     3.0
17    44.0
18    24.0
19    32.0
20    29.0
21    34.0
22    23.0
23    36.0
24    20.0
25     4.0
26    22.0
27    12.0
28    40.0
29    31.0
30    35.0
31     2.0
32     9.0
33     6.0
34    10.0
35    16.0
36    28.0
37    26.0
38    27.0
39    14.0
40    21.0
41    15.0
42    33.0
43    17.0
44    42.0
Name: average_rank, dtype: float64

In [82]:
# see pp. 200-201 of the book for the steps above wrapped up in a tweak function, along with an explanation

# Chapter 18: Math Methods in DataFrames

In [83]:
# load data
url = 'https://github.com/mattharrison/datasets/raw/master/data/siena2018-pres.csv'
df = pd.read_csv(url, index_col = 0)

# clean up; copy-pasted from: https://github.com/mattharrison/effective_pandas_book/blob/main/17-30-dataframe-common-code.ipynb
def tweak_siena_pres(df):
    """Tidy the raw Siena 2018 presidential-ranking frame.

    Steps, in order:
      1. rename 'Seq.' to 'Seq' and expand the abbreviated score headers to
         descriptive names, with spaces replaced by underscores;
      2. store 'Party' as a categorical;
      3. cast every int64 column to uint8 (assumes the values fit in 0-255,
         which holds for ranks of a few dozen presidents);
      4. add 'Average_rank', the dense rank of the row-wise sum of the uint8
         columns (stored as uint8), and 'Quartile', a four-way qcut of
         'Average_rank' labelled '1st'..'4th'.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as read from the CSV. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the renamed/retyped columns plus the two derived ones.
    """
    # abbreviation -> descriptive header (underscores are applied below)
    long_names = {
        'Bg': 'Background',
        'PL': 'Party leadership', 'CAb': 'Communication ability',
        'RC': 'Relations with Congress', 'CAp': 'Court appointments',
        'HE': 'Handling of economy', 'L': 'Luck',
        'AC': 'Ability to compromise', 'WR': 'Willing to take risks',
        'EAp': 'Executive appointments', 'OA': 'Overall ability',
        'Im': 'Imagination', 'DA': 'Domestic accomplishments',
        'Int': 'Integrity', 'EAb': 'Executive ability',
        'FPA': 'Foreign policy accomplishments',
        'LA': 'Leadership ability',
        'IQ': 'Intelligence', 'AM': 'Avoid crucial mistakes',
        'EV': "Experts' view", 'O': 'Overall',
    }
    snake_names = {abbr: full.replace(' ', '_')
                   for abbr, full in long_names.items()}

    def downcast_int64(frame):
        # shrink every int64 column to uint8
        int64_cols = frame.select_dtypes('int64').columns
        return frame.astype(dict.fromkeys(int64_cols, 'uint8'))

    def dense_rank_of_totals(frame):
        # total the uint8 score columns per row, then rank the totals densely
        totals = frame.select_dtypes('uint8').sum(axis=1)
        return totals.rank(method='dense').astype('uint8')

    def quartile_of_rank(frame):
        # relies on Average_rank having been assigned just before (same assign call)
        return pd.qcut(frame.Average_rank, 4,
                       labels='1st 2nd 3rd 4th'.split())

    renamed = df.rename(columns={'Seq.': 'Seq'}).rename(columns=snake_names)
    retyped = renamed.astype({'Party': 'category'}).pipe(downcast_int64)
    return retyped.assign(Average_rank=dense_rank_of_totals,
                          Quartile=quartile_of_rank)

pres = tweak_siena_pres(df)
pres.head(n = 5)

Unnamed: 0,Seq,President,Party,Background,Imagination,Integrity,Intelligence,Luck,Willing_to_take_risks,Ability_to_compromise,...,Court_appointments,Handling_of_economy,Executive_appointments,Domestic_accomplishments,Foreign_policy_accomplishments,Avoid_crucial_mistakes,Experts'_view,Overall,Average_rank,Quartile
1,1,George Washington,Independent,7,7,1,10,1,6,2,...,1,1,1,2,2,1,2,1,1,1st
2,2,John Adams,Federalist,3,13,4,4,24,14,31,...,4,13,15,19,13,16,10,14,13,2nd
3,3,Thomas Jefferson,Democratic-Republican,2,2,14,1,8,5,14,...,7,20,4,6,9,7,5,5,5,1st
4,4,James Madison,Democratic-Republican,4,6,7,3,16,15,6,...,6,14,7,11,19,11,8,7,7,1st
5,5,James Monroe,Democratic-Republican,9,14,11,18,6,16,7,...,11,9,9,10,5,6,9,8,8,1st


In [84]:
# part of pres dataframe: all rows and columns from 'Background' thru 'Average_rank'
scores = (
    pres
      .loc[:,'Background':'Average_rank'] # includes 'Background' and 'Average_rank'
)

scores.head(n = 5)

Unnamed: 0,Background,Imagination,Integrity,Intelligence,Luck,Willing_to_take_risks,Ability_to_compromise,Executive_ability,Leadership_ability,Communication_ability,...,Relations_with_Congress,Court_appointments,Handling_of_economy,Executive_appointments,Domestic_accomplishments,Foreign_policy_accomplishments,Avoid_crucial_mistakes,Experts'_view,Overall,Average_rank
1,7,7,1,10,1,6,2,2,1,11,...,1,1,1,1,2,2,1,2,1,1
2,3,13,4,4,24,14,31,21,21,13,...,17,4,13,15,19,13,16,10,14,13
3,2,2,14,1,8,5,14,6,6,4,...,5,7,20,4,6,9,7,5,5,5
4,4,6,7,3,16,15,6,13,17,10,...,10,6,14,7,11,19,11,8,7,7
5,9,14,11,18,6,16,7,10,12,15,...,8,11,9,9,10,5,6,9,8,8


In [85]:
# part of scores: first 3 rows, first 4 columns
s1 = scores.iloc[:3, :4]
s1

Unnamed: 0,Background,Imagination,Integrity,Intelligence
1,7,7,1,10
2,3,13,4,4
3,2,2,14,1


In [86]:
# another part of scores: second row thru 6th row, first 5 columns
s2 = scores.iloc[1:6, :5]
s2

Unnamed: 0,Background,Imagination,Integrity,Intelligence,Luck
2,3,13,4,4,24
3,2,2,14,1,8
4,4,6,7,3,16
5,9,14,11,18,6
6,1,9,6,5,29


In [87]:
# goal is to show index alignment
s1 + s2

Unnamed: 0,Background,Imagination,Integrity,Intelligence,Luck
1,,,,,
2,6.0,26.0,8.0,8.0,
3,4.0,4.0,28.0,2.0,
4,,,,,
5,,,,,
6,,,,,


In [88]:
# section on duplicated index values; probably won't be relevant

# Chapter 19: Looping and Aggregation

In [89]:
# dataframe we're working with
pres.head()

Unnamed: 0,Seq,President,Party,Background,Imagination,Integrity,Intelligence,Luck,Willing_to_take_risks,Ability_to_compromise,...,Court_appointments,Handling_of_economy,Executive_appointments,Domestic_accomplishments,Foreign_policy_accomplishments,Avoid_crucial_mistakes,Experts'_view,Overall,Average_rank,Quartile
1,1,George Washington,Independent,7,7,1,10,1,6,2,...,1,1,1,2,2,1,2,1,1,1st
2,2,John Adams,Federalist,3,13,4,4,24,14,31,...,4,13,15,19,13,16,10,14,13,2nd
3,3,Thomas Jefferson,Democratic-Republican,2,2,14,1,8,5,14,...,7,20,4,6,9,7,5,5,5,1st
4,4,James Madison,Democratic-Republican,4,6,7,3,16,15,6,...,6,14,7,11,19,11,8,7,7,1st
5,5,James Monroe,Democratic-Republican,9,14,11,18,6,16,7,...,11,9,9,10,5,6,9,8,8,1st


In [90]:
# skipped part on iterating

In [93]:
# one row per president; for each president, average score
# NOTE: .loc label slices include both endpoints, so the derived 'Average_rank'
# column is averaged in together with the raw score columns
(
    pres
      .loc[:,'Background':'Average_rank']   # all rows, columns 'Background' thru 'Average_rank'
      .mean(axis = 1)                       # average across columns (1)
)

1      3.681818
2     14.454545
3      6.545455
4      9.636364
5     10.454545
6     17.181818
7     19.590909
8     25.681818
9     36.909091
10    34.409091
11    13.318182
12    29.500000
13    37.454545
14    39.409091
15    42.000000
16     4.045455
17    42.272727
18    24.227273
19    30.136364
20    27.272727
21    31.454545
22    22.181818
23    32.818182
24    19.727273
25     5.227273
26    21.318182
27    13.590909
28    38.772727
29    29.909091
30    31.954545
31     3.909091
32    11.818182
33     9.227273
34    12.727273
35    15.272727
36    26.909091
37    26.000000
38    26.818182
39    14.545455
40    20.818182
41    14.636364
42    30.363636
43    15.818182
44    39.772727
dtype: float64