# Pandas

https://pandas.pydata.org/

10 Minutes to pandas

https://pandas.pydata.org/docs/user_guide/10min.html

Tutorials

https://pandas.pydata.org/docs/getting_started/tutorials.html

In [2]:
import numpy as np
import pandas as pd

**Series**

In [3]:
# Constructor shape: pd.Series(data=[...], index=[...])

# No index is supplied here, so the values get the default integer labels.
ser1 = pd.Series(data=[10, 20, 30])
ser1

0    10
1    20
2    30
dtype: int64

In [6]:
# Replace the default integer labels with string labels by assigning to `.index`.
ser1.index = list('abc')
ser1

a    10
b    20
c    30
dtype: int64

In [7]:
# Built from a dict: the keys become the index labels, the values become the data.
ser2 = pd.Series(data=dict(a=100, b=200, c=300))
ser2

a    100
b    200
c    300
dtype: int64

In [10]:
# Values paired with explicit country labels.
ser3 = pd.Series(
    data=[1, 2, 3, 4],
    index=['USA', 'Germany', 'France', 'Japan'],
)
ser3

USA        1
Germany    2
France     3
Japan      4
dtype: int64

In [11]:
# Same layout as ser3, but with 'Italy' where ser3 has 'France'.
ser4 = pd.Series(
    data=[1, 2, 5, 4],
    index=['USA', 'Germany', 'Italy', 'Japan'],
)
ser4

USA        1
Germany    2
Italy      5
Japan      4
dtype: int64

In [12]:
# Addition aligns on index labels; labels present in only one Series
# (France, Italy) come out as NaN, which is why the result is float64.
ser3.add(ser4)

France     NaN
Germany    4.0
Italy      NaN
Japan      8.0
USA        2.0
dtype: float64

**DataFrame**

In [24]:
# Constructor shape: pd.DataFrame(data=[...], index=[...], columns=[...])

# Load the dataset from CSV, using the file's 'Unnamed: 0' column as the row index.
df = pd.read_csv('gapminder.csv', index_col='Unnamed: 0')
df.head()

Unnamed: 0,year,continent,country,income,life_exp,population
0,2014,asia,Philippines,6598.0,70.7,100102249.0
1,2014,americas,Paraguay,8038.0,74.3,6552584.0
2,2014,asia,Palau,14078.0,,21094.0
3,2014,asia,Pakistan,4619.0,65.6,185546257.0
4,2014,americas,St.-Pierre-et-Miquelon,,,6277.0


In [35]:
# First seven values of the `income` column, selected by position.
df['income'].iloc[:7]

0     6598.0
1     8038.0
2    14078.0
3     4619.0
4        NaN
5    15412.0
6    64020.0
Name: income, dtype: float64

In [50]:
# New column: element-wise product of income and population.
# Rows where either input is missing end up as NaN.
df['gross_income'] = df['income'].mul(df['population'])
df.head()

Unnamed: 0,year,continent,country,income,life_exp,population,gross_income
0,2014,asia,Philippines,6598.0,70.7,100102249.0,660474600000.0
1,2014,americas,Paraguay,8038.0,74.3,6552584.0,52669670000.0
2,2014,asia,Palau,14078.0,,21094.0,296961300.0
3,2014,asia,Pakistan,4619.0,65.6,185546257.0,857038200000.0
4,2014,americas,St.-Pierre-et-Miquelon,,,6277.0,


In [51]:
# drop() returns a new frame without the column; `df` itself is not modified.
df.drop(columns='gross_income')


Unnamed: 0,year,continent,country,income,life_exp,population
0,2014,asia,Philippines,6598.0,70.7,100102249.0
1,2014,americas,Paraguay,8038.0,74.3,6552584.0
2,2014,asia,Palau,14078.0,,21094.0
3,2014,asia,Pakistan,4619.0,65.6,185546257.0
4,2014,americas,St.-Pierre-et-Miquelon,,,6277.0
...,...,...,...,...,...,...
54484,1800,americas,St.-Pierre-et-Miquelon,,,1782.0
54485,1800,europe,Svalbard,,,50.0
54486,1800,asia,Tokelau,,,1009.0
54487,1800,asia,United Korea (former),,,13740000.0


In [52]:
# `gross_income` is still present: the previous drop() only returned a copy.
df.head()

Unnamed: 0,year,continent,country,income,life_exp,population,gross_income
0,2014,asia,Philippines,6598.0,70.7,100102249.0,660474600000.0
1,2014,americas,Paraguay,8038.0,74.3,6552584.0,52669670000.0
2,2014,asia,Palau,14078.0,,21094.0,296961300.0
3,2014,asia,Pakistan,4619.0,65.6,185546257.0,857038200000.0
4,2014,americas,St.-Pierre-et-Miquelon,,,6277.0,


In [53]:
# inplace=True removes the column from `df` itself, so the call displays nothing.
df.drop(columns='gross_income', inplace=True)

In [54]:
# `gross_income` is gone now that the drop was applied in place.
df.head()

Unnamed: 0,year,continent,country,income,life_exp,population
0,2014,asia,Philippines,6598.0,70.7,100102249.0
1,2014,americas,Paraguay,8038.0,74.3,6552584.0
2,2014,asia,Palau,14078.0,,21094.0
3,2014,asia,Pakistan,4619.0,65.6,185546257.0
4,2014,americas,St.-Pierre-et-Miquelon,,,6277.0


Selecting rows and columns

In [55]:
# Label-based lookup: the row whose index label is 3, with every column.
df.loc[3, :]

year                 2014
continent            asia
country          Pakistan
income             4619.0
life_exp             65.6
population    185546257.0
Name: 3, dtype: object

In [61]:
# Single cell: row label 100, column 'country'.
df.loc[ 100, 'country' ]

'Namibia'

In [62]:
# Lists of row labels and column names select a sub-frame.
df.loc[
    [10, 100, 1000],
    ['continent', 'country'],
]

Unnamed: 0,continent,country
10,asia,Papua New Guinea
100,africa,Namibia
1000,africa,Cape Verde


Conditional Selection

In [65]:
# Keep only the rows whose income exceeds 50000 (boolean-mask selection).
df.loc[df['income'].gt(50000)]

Unnamed: 0,year,continent,country,income,life_exp,population
6,2014,europe,Norway,64020.0,82.00,5140311.0
27,2014,asia,"Macao, China",142893.0,80.61,588781.0
53,2014,asia,"Hong Kong, China",52552.0,83.56,7194563.0
56,2014,europe,Luxembourg,88203.0,82.10,556316.0
77,2014,asia,Kuwait,83394.0,80.20,3782450.0
...,...,...,...,...,...,...
15196,1954,asia,Brunei,57771.0,58.83,60120.0
15455,1953,asia,Brunei,56876.0,58.22,56968.0
15629,1952,asia,Brunei,55994.0,57.60,53927.0
15951,1951,asia,Brunei,55126.0,56.99,50961.0


In [71]:
# Boolean masks are combined element-wise with & (and) or | (or).
# Here: income above 50000 AND life expectancy above 80.
df.loc[df['income'].gt(50000) & df['life_exp'].gt(80)]

Unnamed: 0,year,continent,country,income,life_exp,population
6,2014,europe,Norway,64020.0,82.0,5140311.0
27,2014,asia,"Macao, China",142893.0,80.61,588781.0
53,2014,asia,"Hong Kong, China",52552.0,83.56,7194563.0
56,2014,europe,Luxembourg,88203.0,82.1,556316.0
77,2014,asia,Kuwait,83394.0,80.2,3782450.0
153,2014,europe,Switzerland,55776.0,82.9,8229629.0
165,2014,asia,Singapore,78958.0,81.9,5448342.0
264,2013,asia,"Macao, China",136540.0,80.4,575841.0
265,2013,europe,Luxembourg,88850.0,81.9,544721.0
280,2013,europe,Norway,63322.0,81.6,5077101.0


Setting and resetting indices

In [90]:
# Returns a new frame that uses `year` as its index; `df` is left as it was.
df.set_index(keys='year')

Unnamed: 0_level_0,continent,country,income,life_exp,population
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014,asia,Philippines,6598.0,70.7,100102249.0
2014,americas,Paraguay,8038.0,74.3,6552584.0
2014,asia,Palau,14078.0,,21094.0
2014,asia,Pakistan,4619.0,65.6,185546257.0
2014,americas,St.-Pierre-et-Miquelon,,,6277.0
...,...,...,...,...,...
1800,americas,St.-Pierre-et-Miquelon,,,1782.0
1800,europe,Svalbard,,,50.0
1800,asia,Tokelau,,,1009.0
1800,asia,United Korea (former),,,13740000.0


In [111]:
(
    df
    .dropna()       # discard every row that has a missing value
    .reset_index()  # old labels move to an `index` column; rows are renumbered from 0
)

Unnamed: 0,index,year,continent,country,income,life_exp,population
0,0,2014,asia,Philippines,6598.0,70.70,100102249.0
1,1,2014,americas,Paraguay,8038.0,74.30,6552584.0
2,3,2014,asia,Pakistan,4619.0,65.60,185546257.0
3,5,2014,americas,Brazil,15412.0,74.30,204213133.0
4,6,2014,europe,Norway,64020.0,82.00,5140311.0
...,...,...,...,...,...,...,...
40925,54456,1800,americas,Uruguay,1758.0,32.90,55000.0
40926,54457,1800,asia,Uzbekistan,502.0,26.93,1919159.0
40927,54458,1800,asia,Vanuatu,585.0,24.30,27791.0
40928,54459,1800,americas,Venezuela,682.0,32.20,718000.0


Missing Data

In [134]:
# Replace missing income values with 0 and store the result back on `df`.
# fillna() returns a new Series; without the assignment `df` keeps its NaNs,
# yet the grouped statistics and the per-continent frames shown further down
# (income count equal to the row count, minimum of 0.0) only reproduce on a
# fresh kernel when the zeros are actually present in `df`.
df['income'] = df['income'].fillna(value=0)
df['income']

0         6598.0
1         8038.0
2        14078.0
3         4619.0
4            0.0
          ...   
54484        0.0
54485        0.0
54486        0.0
54487        0.0
54488        0.0
Name: income, Length: 54489, dtype: float64

Groupby

In [133]:
# Group the rows by year. Displaying the object only shows its repr;
# the statistics come from the aggregation calls in the following cells.
by_year = df.groupby(by='year')
by_year

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f9917224280>

In [141]:
# Per-year average of the numeric columns, first five years.
# numeric_only=True is spelled out because `df` also holds the string columns
# `continent` and `country`: pandas >= 2.0 raises TypeError on them instead of
# silently dropping them as older versions did.
by_year.mean(numeric_only=True).head()

Unnamed: 0_level_0,income,life_exp,population
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1800,748.834646,31.48602,4149630.0
1801,749.129921,31.448905,4167524.0
1802,751.338583,31.463483,4185563.0
1803,751.133858,31.377413,4203777.0
1804,752.366142,31.446318,4222136.0


In [144]:
# Per-year average of the numeric columns, last five years.
# numeric_only=True is spelled out because `df` also holds the string columns
# `continent` and `country`: pandas >= 2.0 raises TypeError on them instead of
# silently dropping them as older versions did.
by_year.mean(numeric_only=True).tail()

Unnamed: 0_level_0,income,life_exp,population
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010,15040.801688,70.969904,29483600.0
2011,15776.583691,71.324375,30488710.0
2012,15861.715517,71.663077,30857230.0
2013,16017.314655,71.916106,31226100.0
2014,16299.771552,72.088125,31593990.0


In [146]:
# Summary statistics (count, mean, std, min, quartiles, max) of each numeric column, per year.
by_year.describe()

Unnamed: 0_level_0,income,income,income,income,income,income,income,income,life_exp,life_exp,life_exp,life_exp,life_exp,population,population,population,population,population,population,population,population
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1800,254.0,748.834646,589.778786,0.0,425.75,717.5,1004.75,4235.0,201.0,31.486020,...,33.900,42.85,254.0,4.149630e+06,2.320847e+07,0.0,31750.00,396432.5,1989821.00,3.216750e+08
1801,254.0,749.129921,588.657381,0.0,426.00,717.5,1005.50,4161.0,201.0,31.448905,...,33.900,40.30,254.0,4.167524e+06,2.336951e+07,0.0,31750.00,396432.5,1996964.50,3.244089e+08
1802,254.0,751.338583,596.003067,0.0,426.00,718.0,1006.50,4391.0,201.0,31.463483,...,33.900,44.37,254.0,4.185563e+06,2.353204e+07,0.0,31750.00,396432.5,1998352.00,3.271659e+08
1803,254.0,751.133858,592.465757,0.0,426.25,718.5,1008.50,4297.0,201.0,31.377413,...,33.800,44.84,254.0,4.203777e+06,2.369607e+07,0.0,31750.00,396432.5,1999743.50,3.299465e+08
1804,254.0,752.366142,597.725190,0.0,426.25,719.0,1011.50,4502.0,201.0,31.446318,...,33.870,42.83,254.0,4.222136e+06,2.386161e+07,0.0,31750.00,396432.5,2003416.25,3.327506e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010,237.0,15040.801688,19228.864227,0.0,1667.00,7708.0,20479.00,127984.0,208.0,70.969904,...,77.650,84.70,236.0,2.948360e+07,1.236949e+08,50.0,363591.25,4586104.0,17737639.25,1.359755e+09
2011,233.0,15776.583691,20009.992720,0.0,1975.00,8485.0,22333.00,133734.0,208.0,71.324375,...,77.825,84.70,231.0,3.048871e+07,1.261147e+08,796.0,434907.00,5174061.0,20303992.00,1.367480e+09
2012,232.0,15861.715517,20093.672163,0.0,2018.75,8688.5,22414.50,130990.0,208.0,71.663077,...,78.125,84.70,231.0,3.085723e+07,1.272596e+08,804.0,436195.50,5267839.0,20295978.00,1.375199e+09
2013,232.0,16017.314655,20359.353760,0.0,2051.50,8944.0,22535.75,136540.0,208.0,71.916106,...,78.300,84.80,231.0,3.122610e+07,1.283877e+08,801.0,437292.00,5360837.0,19938671.00,1.382793e+09


In [143]:
# Same per-year summary statistics, restricted to the last five years.
by_year.describe().tail()

Unnamed: 0_level_0,income,income,income,income,income,income,income,income,life_exp,life_exp,life_exp,life_exp,life_exp,population,population,population,population,population,population,population,population
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2010,237.0,15040.801688,19228.864227,0.0,1667.0,7708.0,20479.0,127984.0,208.0,70.969904,...,77.65,84.7,236.0,29483600.0,123694900.0,50.0,363591.25,4586104.0,17737639.25,1359755000.0
2011,233.0,15776.583691,20009.99272,0.0,1975.0,8485.0,22333.0,133734.0,208.0,71.324375,...,77.825,84.7,231.0,30488710.0,126114700.0,796.0,434907.0,5174061.0,20303992.0,1367480000.0
2012,232.0,15861.715517,20093.672163,0.0,2018.75,8688.5,22414.5,130990.0,208.0,71.663077,...,78.125,84.7,231.0,30857230.0,127259600.0,804.0,436195.5,5267839.0,20295978.0,1375199000.0
2013,232.0,16017.314655,20359.35376,0.0,2051.5,8944.0,22535.75,136540.0,208.0,71.916106,...,78.3,84.8,231.0,31226100.0,128387700.0,801.0,437292.0,5360837.0,19938671.0,1382793000.0
2014,232.0,16299.771552,20819.536831,0.0,2165.0,9158.0,23142.25,142893.0,208.0,72.088125,...,78.4,84.8,231.0,31593990.0,129497000.0,800.0,438227.0,5448342.0,19587913.0,1390110000.0


In [147]:
# describe() yields two-level columns (variable, statistic); selecting the
# top-level key 'income' leaves only the income statistics per year.
by_year.describe().loc[:, 'income']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1800,254.0,748.834646,589.778786,0.0,425.75,717.5,1004.75,4235.0
1801,254.0,749.129921,588.657381,0.0,426.00,717.5,1005.50,4161.0
1802,254.0,751.338583,596.003067,0.0,426.00,718.0,1006.50,4391.0
1803,254.0,751.133858,592.465757,0.0,426.25,718.5,1008.50,4297.0
1804,254.0,752.366142,597.725190,0.0,426.25,719.0,1011.50,4502.0
...,...,...,...,...,...,...,...,...
2010,237.0,15040.801688,19228.864227,0.0,1667.00,7708.0,20479.00,127984.0
2011,233.0,15776.583691,20009.992720,0.0,1975.00,8485.0,22333.00,133734.0
2012,232.0,15861.715517,20093.672163,0.0,2018.75,8688.5,22414.50,130990.0
2013,232.0,16017.314655,20359.353760,0.0,2051.50,8944.0,22535.75,136540.0


Multi-Index and Index Hierarchy

In [153]:
# Distinct continent values, returned as an array.
df['continent'].unique()

array(['asia', 'americas', 'europe', 'africa'], dtype=object)

In [155]:
# Number of distinct continent values.
df['continent'].nunique()

4

In [157]:
# Row count per continent, most frequent first.
df['continent'].value_counts()

asia        16304
europe      14335
africa      12679
americas    11171
Name: continent, dtype: int64

In [158]:
# NOTE(review): presumably the outer (year) and inner (continent) levels for a
# MultiIndex; neither name is used in the cells visible here. Confirm later use.
outside, inside = df['year'], df['continent']

In [161]:
# One filtered frame per continent, each selected with a boolean mask on `continent`.
asia_df, amer_df, eur_df, afr_df = (
    df[df['continent'] == name]
    for name in ('asia', 'americas', 'europe', 'africa')
)

In [164]:
# Column dtypes and non-null counts for the Asia subset.
asia_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16304 entries, 0 to 54487
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   year        16304 non-null  int64  
 1   continent   16304 non-null  object 
 2   country     16304 non-null  object 
 3   income      16304 non-null  float64
 4   life_exp    12761 non-null  float64
 5   population  16304 non-null  float64
dtypes: float64(3), int64(1), object(2)
memory usage: 891.6+ KB


In [165]:
# Display the Asia subset; the rendered output is truncated to its first and last rows.
asia_df

Unnamed: 0,year,continent,country,income,life_exp,population
0,2014,asia,Philippines,6598.0,70.7,100102249.0
2,2014,asia,Palau,14078.0,,21094.0
3,2014,asia,Pakistan,4619.0,65.6,185546257.0
10,2014,asia,Papua New Guinea,2510.0,60.5,7755785.0
11,2014,asia,Taiwan,41376.0,79.4,23184000.0
...,...,...,...,...,...,...
54477,1800,asia,Norfolk Island,0.0,,1121.0
54478,1800,asia,Northern Mariana Islands,0.0,,3360.0
54480,1800,asia,South Yemen (former),0.0,,551928.0
54486,1800,asia,Tokelau,0.0,,1009.0


In [169]:
# Per-year average of the numeric columns for the Asia subset.
# numeric_only=True is spelled out because the frame also holds the string
# columns `continent` and `country`: pandas >= 2.0 raises TypeError on them
# instead of silently dropping them as older versions did.
asia_df.groupby(by='year').mean(numeric_only=True)

Unnamed: 0_level_0,income,life_exp,population
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1800,673.565789,29.224576,8.571324e+06
1801,673.986842,29.224407,8.613720e+06
1802,674.500000,29.216780,8.656434e+06
1803,674.907895,29.199661,8.699470e+06
1804,675.355263,29.191017,8.742830e+06
...,...,...,...
2010,16421.083333,72.327097,5.751951e+07
2011,17842.014493,72.521129,6.067650e+07
2012,18219.231884,72.676290,6.132942e+07
2013,18571.869565,72.913710,6.197603e+07
