In [1]:
import numpy as np, pandas as pd
import seaborn as sns

In [2]:
# Planets data: the "planets" sample dataset shipped with seaborn.
# `sns` comes from the notebook's import cell at the top, so every dependency
# is declared in one place.
planets = sns.load_dataset("planets")
print(planets.shape, "\n")
print(planets.head(10), "\n")

(1035, 6) 

            method  number  orbital_period   mass  distance  year
0  Radial Velocity       1         269.300   7.10     77.40  2006
1  Radial Velocity       1         874.774   2.21     56.95  2008
2  Radial Velocity       1         763.000   2.60     19.84  2011
3  Radial Velocity       1         326.030  19.40    110.62  2007
4  Radial Velocity       1         516.220  10.50    119.47  2009
5  Radial Velocity       1         185.840   4.80     76.39  2008
6  Radial Velocity       1        1773.400   4.64     18.15  2002
7  Radial Velocity       1         798.500    NaN     21.41  1996
8  Radial Velocity       1         993.300  10.30     73.10  2008
9  Radial Velocity       2         452.800   1.99     74.79  2010 



In [3]:
# Seeded generator so the numbers printed below are reproducible.
rng = np.random.RandomState(42)

# Simple aggregates on a Series: five uniform samples, then their sum and mean.
ser = pd.Series(rng.rand(5))
print(ser, "\n")
print(ser.sum(), ",", ser.mean())

# Two-column frame; the comprehension draws "A" first and "B" second, so the
# random stream is consumed in the same order as the column order.
df = pd.DataFrame({name: rng.rand(5) for name in ("A", "B")})
print(df, "\n")
# Row-wise mean: aggregate across the columns of each row.
print(df.mean(axis=1), "\n")

0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64 

2.811925491708157 , 0.5623850983416314
          A         B
0  0.155995  0.020584
1  0.058084  0.969910
2  0.866176  0.832443
3  0.601115  0.212339
4  0.708073  0.181825 

0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64 



In [4]:
# Common aggregation - describe()
# Gives a quick overview of the overall properties of the data.
# Related aggregation methods: count(), first(), last(), mean(), median(),
# min(), max(), std(), var(), prod(), sum()
# (mad() used to be in this list but was removed in pandas 2.0)
print(
    planets
    .dropna()      # drop every row that has at least one missing value
    .describe(),   # count / mean / std / min / quartiles / max per numeric column
    "\n",
)

          number  orbital_period        mass    distance         year
count  498.00000      498.000000  498.000000  498.000000   498.000000
mean     1.73494      835.778671    2.509320   52.068213  2007.377510
std      1.17572     1469.128259    3.636274   46.596041     4.167284
min      1.00000        1.328300    0.003600    1.350000  1989.000000
25%      1.00000       38.272250    0.212500   24.497500  2005.000000
50%      1.00000      357.000000    1.245000   39.940000  2009.000000
75%      2.00000      999.600000    2.867500   59.332500  2011.000000
max      6.00000    17337.500000   25.000000  354.000000  2014.000000 



In [5]:
# GroupBy: split, apply, combine
df = pd.DataFrame(
    {"key": ["A", "B", "C"] * 2, "data": range(6)},
    columns=["key", "data"],
)
print(df, "\n")

# groupby() -> group the rows by a column
# The GroupBy object itself is lazy: printing it shows only its repr, no result.
print(df.groupby(by="key"), "\n")
# The actual computation happens once an aggregate is requested.
print(df.groupby(by="key").agg("sum"), "\n")

  key  data
0   A     0
1   B     1
2   C     2
3   A     3
4   B     4
5   C     5 

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000027133159780> 

     data
key      
A       3
B       5
C       7 



In [6]:
# GroupBy object
# Grouping a DataFrame yields a DataFrameGroupBy ...
print(planets.groupby(by="method"))
# ... and selecting one column from it yields a SeriesGroupBy; still nothing is computed.
print(planets.groupby(by="method")["orbital_period"], "\n")
# Calling an aggregate produces one value per group.
print(planets.groupby(by="method")["orbital_period"].median(), "\n")

# Iterating over a GroupBy yields (group label, sub-frame) pairs.
for method, group in planets.groupby(by="method"):
    print("{0:30s} shape={1}".format(method, group.shape))
print("\n")

# describe() per group, then unstack() to pivot the result into a single long Series.
print(
    planets.groupby(by="method")["year"]
    .describe()
    .unstack(),
    "\n",
)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000027154358B50>
<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000027154358B50> 

method
Astrometry                         631.180000
Eclipse Timing Variations         4343.500000
Imaging                          27500.000000
Microlensing                      3300.000000
Orbital Brightness Modulation        0.342887
Pulsar Timing                       66.541900
Pulsation Timing Variations       1170.000000
Radial Velocity                    360.200000
Transit                              5.714932
Transit Timing Variations           57.011000
Name: orbital_period, dtype: float64 

Astrometry                     shape=(2, 6)
Eclipse Timing Variations      shape=(9, 6)
Imaging                        shape=(38, 6)
Microlensing                   shape=(23, 6)
Orbital Brightness Modulation  shape=(3, 6)
Pulsar Timing                  shape=(5, 6)
Pulsation Timing Variations    shape=(1, 6)
Radial Velocity                sh

In [7]:
# Aggregate: aggregate(), filter: filter(), transform: transform(), apply: apply()
rng = np.random.RandomState(0)
df = pd.DataFrame({"key": ["A","B","C","A","B","C"],
                   "data1": range(6),
                   "data2": rng.randint(0, 10, 6),},
                  columns=["key","data1","data2"])
print(df, "\n")

# aggregate method
# Aggregations are given by name. Passing the builtins min/max or np.median
# makes pandas >= 2.1 emit a FutureWarning (the callable is silently swapped for
# the pandas method today and will be called directly in a future version);
# the string aliases give the same result and the same column labels.
print(df.groupby("key").aggregate(["min", "median", "max"]), "\n")
# A dict maps each column to the aggregation that should be applied to it.
print(df.groupby("key").aggregate({"data1": "min",
                                   "data2": "max",}), "\n")

# filter method
def filter_func(x):
    """Group predicate: True when the standard deviation of ``data2`` exceeds 4.

    ``x`` is expected to support ``x["data2"].std()`` (e.g. the DataFrame of
    one group passed in by ``GroupBy.filter``).
    """
    spread = x["data2"].std()
    return spread > 4

print(df)
# Per-group standard deviations, shown so the filter result below can be checked by eye.
print(df.groupby("key").std(), "\n")
# filter() keeps only the rows of groups for which filter_func returns True.
print(df.groupby("key").filter(filter_func), "\n")  # group "A" removed

# transform method
# Subtract each group's own mean from its values.
print(df.groupby("key").transform(lambda values: values - values.mean()), "\n")

# apply method
def norm_by_data2(x):
    """Return group frame ``x`` with ``data1`` divided by the sum of its ``data2``.

    Parameters
    ----------
    x : pandas.DataFrame
        The rows of one group; must contain ``data1`` and ``data2`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and column order as ``x`` in which
        ``data1`` has been rescaled. ``x`` itself is not modified: pandas does
        not support mutating the object handed to a groupby UDF, so the result
        is built with ``assign`` instead of an in-place ``/=``.
    """
    return x.assign(data1=x["data1"] / x["data2"].sum())

# norm_by_data2 returns frames with the same index as their input. Since pandas 2.0
# the default group_keys=True would prepend the group label as an extra index level
# for such results; group_keys=False keeps the flat, original row index.
print(df.groupby("key", group_keys=False).apply(norm_by_data2), "\n")

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9 

    data1            data2           
      min median max   min median max
key                                  
A       0    1.5   3     3    4.0   5
B       1    2.5   4     0    3.5   7
C       2    3.5   5     3    6.0   9 

     data1  data2
key              
A        0      5
B        1      7
C        2      9 

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
       data1     data2
key                   
A    2.12132  1.414214
B    2.12132  4.949747
C    2.12132  4.242641 

  key  data1  data2
1   B      1      0
2   C      2      3
4   B      4      7
5   C      5      9 

   data1  data2
0   -1.5    1.0
1   -1.5   -3.5
2   -1.5   -3.0
3    1.5   -1.0
4    1.5    3.5
5    1.5    3.0 

  key     data1  data2
0   A  0.000000      5
1   B  0.142857  

In [8]:
# Specifying the split key
# A plain list with one label per row can serve as the grouping key.
L = [0, 1, 0, 1, 2, 0]
print(df)
# numeric_only=True keeps the string "key" column out of the result; since
# pandas 2.0 the default would otherwise "sum" it by concatenating the strings.
print(df.groupby(L).sum(numeric_only=True), "\n")
# Grouping by the column itself (as a Series) is the verbose form of groupby("key").
print(df)
print(df.groupby(df["key"]).sum(), "\n")

# Mapping index labels to groups
df2 = df.set_index("key")
mapping = {"A": "vowel", "B": "consonant", "C": "consonant"}
print(df2)
print(df2.groupby(mapping).sum(), "\n")

# Any Python function: it is called on each index label and its return value is the group
print(df2)
print(df2.groupby(str.lower).mean(), "\n")

# A list of valid keys: the keys are combined into a multi-level group index
print(df2.groupby([str.lower, mapping]).mean(), "\n")

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
   data1  data2
0      7     17
1      4      3
2      4      7 

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
     data1  data2
key              
A        3      8
B        5      7
C        7     12 

     data1  data2
key              
A        0      5
B        1      0
C        2      3
A        3      3
B        4      7
C        5      9
           data1  data2
key                    
consonant     12     19
vowel          3      8 

     data1  data2
key              
A        0      5
B        1      0
C        2      3
A        3      3
B        4      7
C        5      9
     data1  data2
key              
a      1.5    4.0
b      2.5    3.5
c      3.5    6.0 

               data1  data2
key key                    
a   vowel        1.5    4.0


In [11]:
# Grouping example: total of "number" per detection method and decade
# Floor each year to its decade (e.g. 2006 -> 2000) and label it "2000s".
decade = (10 * (planets["year"] // 10)).astype(str) + "s"
decade = decade.rename("decade")
planets["decade"] = decade

# Sum "number" for every (method, decade) pair, pivot decades into columns,
# and show combinations with no rows as 0 instead of NaN.
(
    planets
    .groupby(["method", "decade"])["number"]
    .sum()
    .unstack()
    .fillna(0)
)

decade,1980s,1990s,2000s,2010s
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Astrometry,0.0,0.0,0.0,2.0
Eclipse Timing Variations,0.0,0.0,5.0,10.0
Imaging,0.0,0.0,29.0,21.0
Microlensing,0.0,0.0,12.0,15.0
Orbital Brightness Modulation,0.0,0.0,0.0,5.0
Pulsar Timing,0.0,9.0,1.0,1.0
Pulsation Timing Variations,0.0,0.0,1.0,0.0
Radial Velocity,1.0,52.0,475.0,424.0
Transit,0.0,0.0,64.0,712.0
Transit Timing Variations,0.0,0.0,0.0,9.0
