### Series객체

In [1]:
import pandas as pd
import numpy as np

m2 = pd.read_csv('employee.csv')
m2

Unnamed: 0,ename,deptname,addr,phone
0,kim,관리직,서울시,010-1111-1111
1,park,관리직,부산시,010-2222-2222
2,lee,영업직,광주시,010-3333-3333
3,min,영업직,광주시,010-4444-4444
4,song,개발직,광주시,010-5555-5555


In [2]:
# 컬럼접근
sr = m2['ename']   # Series 객체
sr

0     kim
1    park
2     lee
3     min
4    song
Name: ename, dtype: object

In [17]:
# str.contains() 메서드를 사용하여 문자열의 조건 검색, 행을 추출
sr = m2['ename'].str.contains('k') # 'ename'컬럼의 데이터가 문자'k'를 포함하고 
                                   # 있는가? True/False값의 Series 객체로 반환
print(sr)    
print(type(sr)) # Series
m2[sr]     #  m2[[True,True,False,False,False]]

0     True
1     True
2    False
3    False
4    False
Name: ename, dtype: bool
<class 'pandas.core.series.Series'>


Unnamed: 0,ename,deptname,addr,phone
0,kim,관리직,서울시,010-1111-1111
1,park,관리직,부산시,010-2222-2222


In [13]:
# 'deptname' 이 '영업직'인 행만 추출
m2[m2['deptname'].str.contains('영업직')]

Unnamed: 0,ename,deptname,addr,phone
2,lee,영업직,광주시,010-3333-3333
3,min,영업직,광주시,010-4444-4444


In [14]:
# 'addr' 가 '광주시'인 행만 추출
m2[m2['addr'].str.contains('광주시')]

Unnamed: 0,ename,deptname,addr,phone
2,lee,영업직,광주시,010-3333-3333
3,min,영업직,광주시,010-4444-4444
4,song,개발직,광주시,010-5555-5555


In [15]:
# 조건식 2개 사용
# 'deptname' 이 '영업직'이고 'addr' 가 '광주시'인 행만 추출
m2[m2['deptname'].str.contains('영업직') & m2['addr'].str.contains('광주시')]

Unnamed: 0,ename,deptname,addr,phone
2,lee,영업직,광주시,010-3333-3333
3,min,영업직,광주시,010-4444-4444


### Series 객체의 통계 메서드

In [29]:
score_table = { '영어': [10,20,30,40,50],
                '수학': [70,80,90,30,20]}
df = pd.DataFrame(score_table)
sr = df['영어']  # Series 객체
sr.sum()
sr.max()
sr.min()
sr.mean()
sr.median()
sr.std()
sr.var()
sr.count()
sr.describe()

count     5.000000
mean     30.000000
std      15.811388
min      10.000000
25%      20.000000
50%      30.000000
75%      40.000000
max      50.000000
Name: 영어, dtype: float64

### 누락된 데이터(결측치,NaN,NaT) 다루기

In [58]:
df = pd.read_csv('WHO_first9cols.csv')
len(df) - df.count()  # 결측치의 갯수:전체데이터수 - 결측치를 제외한 데이터의 갯수

df2 = df[['Country',df.columns[-2]]] 
df2

# pd.isnull().pd.notnull()
pd.isnull(df2)    # 결측치는 True로 반환, 나머지는 False로 반환
pd.notnull(df2)   # 결측치는 False로 반환, 나머지는 True로 반환

pd.isnull(df2).sum()  # 23:-2번 컬럼의 결측치의 갯수
pd.isnull(df).sum()   # 전체 컬럼의 결측치의 갯수

# fillna() : 결측치를 특정 값으로 채움
# 결측치를 0으로 채움 : 평균,표준 편차에 영향
df3 = df2.fillna(0)
pd.isnull(df3).sum()  # 0  , 결측치의 갯수는 0

# replace(a,b)
df4 = df2.replace(np.nan,0)  # fillna(0)와 동일한 결과

sr = df2[df2.columns[-1]].replace(np.nan,0)  # 한개의 컬럼만 결측치를 제거
sr

# 결측치를 평균으로 채움 : 평균에 영향이 없음
df5 = df2.fillna(df2.mean())
df5

# df2.mean()  #  85.698324
print(df2.describe())  # 결측치 제거전
print(df4.describe())  # 평균값이 다름
print(df5.describe())  # 평균값이 동일

       Net primary school enrolment ratio male (%)
count                                   179.000000
mean                                     85.698324
std                                      15.451212
min                                      11.000000
25%                                      79.500000
50%                                      90.000000
75%                                      96.000000
max                                     100.000000
       Net primary school enrolment ratio male (%)
count                                   202.000000
mean                                     75.940594
std                                      30.921123
min                                       0.000000
25%                                      73.000000
50%                                      89.500000
75%                                      96.000000
max                                     100.000000
       Net primary school enrolment ratio male (%)
count                          

### Series 객체의 index 사용 및 결측치 제어

In [64]:
index_table = ['tv','vtr','phone']
sr = pd.Series([10,20,30],index=index_table)
print(sr)
print(sr.index)

r = sr.index.str.contains('v')  # index에서 문자열을 검출
sr[r]

tv       10
vtr      20
phone    30
dtype: int64
Index(['tv', 'vtr', 'phone'], dtype='object')


tv     10
vtr    20
dtype: int64

In [77]:
# Series 객체의 요소의 추가
sr2 = sr.append(pd.Series([40,50,60],index=['audio','iron','notebook']))
sr2

# 테스트를 위한 결측치를 추가
sr3 = sr2.append(pd.Series([np.nan],index=['aircon']))
print(sr3)
print(sr3.mean())  # 35, NaN 을 제외하고 평균을 구함

# a = np.array([np.nan,1,2,3]) # ndarray는 nan관 연산하면 결과값이 nan이다
# a.mean()  # nan

# 결측치를 처리(제거)
sr4 = sr3.fillna(0)
print(sr4)
print(sr4.mean()) # 30,  0으로 채우면 평균값이 감소

sr5 = sr3.fillna(sr3.mean())
print(sr5)
print(sr5.mean()) # 35,  평균으로 채우면 평균값 변동이 없음

tv          10.0
vtr         20.0
phone       30.0
audio       40.0
iron        50.0
notebook    60.0
aircon       NaN
dtype: float64
35.0
tv          10.0
vtr         20.0
phone       30.0
audio       40.0
iron        50.0
notebook    60.0
aircon       0.0
dtype: float64
30.0
tv          10.0
vtr         20.0
phone       30.0
audio       40.0
iron        50.0
notebook    60.0
aircon      35.0
dtype: float64
35.0


### groupby()

In [86]:
# import numpy as np
# np.random.seed(55)
# np.random.rand(10,2)

from numpy.random import seed
from numpy.random import rand
from numpy.random import randint
seed(42)

df = pd.DataFrame({'Weather' : ['cold', 'hot', 'cold', 'hot', 'cold', 'hot', 'cold'],
                   'Food' : ['soup', 'soup', 'icecream', 'chocolate', 'icecream', 'icecream', 'soup'],
                   'Price' : 10 * rand(7),       # 0~10 사이,7개의 난수, 실수
                   'Number' : randint(1, 9, 7)}) # 1~9 사이,7개의 난수, 정수
df

Unnamed: 0,Weather,Food,Price,Number
0,cold,soup,3.745401,8
1,hot,soup,9.507143,5
2,cold,icecream,7.319939,4
3,hot,chocolate,5.986585,8
4,cold,icecream,1.560186,8
5,hot,icecream,1.559945,3
6,cold,soup,0.580836,6


In [106]:
weather_group = df.groupby('Weather')  
print(weather_group) # DataFrameGroupBy 객체

i = 0
for name,group in weather_group: # 2개
    i += 1
    print(i,name,'\n',group)
    print(type(group))   # DataFrame
    print('-'*50)

print(weather_group.first()) # 각 그룹(cold와 hot)의 첫번째 행
print('-'*50)
print(weather_group.last())  # 각 그룹(cold와 hot)의 마지막 행
print('-'*50)
print(weather_group.mean())  # 각 그룹(cold와 hot)의 평균
print('-'*50)
print(weather_group.count())  # 각 그룹(cold와 hot)의  총 갯수
print('-'*50)
print(weather_group.describe())  # 각 그룹(cold와 hot)의 통계 요약
print('-'*50)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000022276A27CC8>
1 cold 
   Weather      Food     Price  Number
0    cold      soup  3.745401       8
2    cold  icecream  7.319939       4
4    cold  icecream  1.560186       8
6    cold      soup  0.580836       6
<class 'pandas.core.frame.DataFrame'>
--------------------------------------------------
2 hot 
   Weather       Food     Price  Number
1     hot       soup  9.507143       5
3     hot  chocolate  5.986585       8
5     hot   icecream  1.559945       3
<class 'pandas.core.frame.DataFrame'>
--------------------------------------------------
         Food     Price  Number
Weather                        
cold     soup  3.745401       8
hot      soup  9.507143       5
--------------------------------------------------
             Food     Price  Number
Weather                            
cold         soup  0.580836       6
hot      icecream  1.559945       3
--------------------------------------------------
         

In [132]:
wf_group = df.groupby(['Weather','Food']) 
print(wf_group) # DataFrameGroupBy
print(wf_group.groups) # dict

wf_group.agg([np.mean,np.sum,np.min,np.max])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000222764433C8>
{('cold', 'icecream'): Int64Index([2, 4], dtype='int64'), ('cold', 'soup'): Int64Index([0, 6], dtype='int64'), ('hot', 'chocolate'): Int64Index([3], dtype='int64'), ('hot', 'icecream'): Int64Index([5], dtype='int64'), ('hot', 'soup'): Int64Index([1], dtype='int64')}


Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Price,Price,Price,Number,Number,Number,Number
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,sum,amin,amax,mean,sum,amin,amax
Weather,Food,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
cold,icecream,4.440063,8.880126,1.560186,7.319939,6,12,4,8
cold,soup,2.163119,4.326237,0.580836,3.745401,7,14,6,8
hot,chocolate,5.986585,5.986585,5.986585,5.986585,8,8,8,8
hot,icecream,1.559945,1.559945,1.559945,1.559945,3,3,3,3
hot,soup,9.507143,9.507143,9.507143,9.507143,5,5,5,5


In [133]:
gr_df = wf_group.agg([np.mean,np.sum,np.min,np.max])
print(type(gr_df))  # DataFrame (인덱스와 컬럼이 MultiIndex인 데이터 프레임)
print(gr_df.index)  # MultiIndex

print(gr_df.columns) # MultiIndex

# MultiIndex형식의 컬럼 접근 방법-1
df1 = pd.DataFrame(gr_df['Price']['mean'])
df2 = pd.DataFrame(gr_df['Number']['mean'])
df3 = pd.concat([df1,df2],axis=1)
df3

# MultiIndex형식의 컬럼 접근 방법-2
gr_df.loc[:,[('Price', 'mean'),('Number', 'mean')]]

<class 'pandas.core.frame.DataFrame'>
MultiIndex([('cold',  'icecream'),
            ('cold',      'soup'),
            ( 'hot', 'chocolate'),
            ( 'hot',  'icecream'),
            ( 'hot',      'soup')],
           names=['Weather', 'Food'])
MultiIndex([( 'Price', 'mean'),
            ( 'Price',  'sum'),
            ( 'Price', 'amin'),
            ( 'Price', 'amax'),
            ('Number', 'mean'),
            ('Number',  'sum'),
            ('Number', 'amin'),
            ('Number', 'amax')],
           )


Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Number
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean
Weather,Food,Unnamed: 2_level_2,Unnamed: 3_level_2
cold,icecream,4.440063,6
cold,soup,2.163119,7
hot,chocolate,5.986585,8
hot,icecream,1.559945,3
hot,soup,9.507143,5


### pivot_table() 함수

In [136]:
pd.pivot_table(df,columns=['Weather'],aggfunc=np.sum) # DataFrame

Weather,cold,hot
Number,26.0,16.0
Price,13.206363,17.053673


In [143]:
pd.pivot_table(df,columns=['Food'],aggfunc=np.sum)  # DataFrame

Food,chocolate,icecream,soup
Number,8.0,15.0,19.0
Price,5.986585,10.440071,13.83338


In [145]:
pd.pivot_table(df,columns=['Weather','Food'],aggfunc=np.sum) # Series

        Weather  Food     
Price   cold     icecream      8.880126
                 soup          4.326237
        hot      chocolate     5.986585
                 icecream      1.559945
                 soup          9.507143
Number  cold     icecream     12.000000
                 soup         14.000000
        hot      chocolate     8.000000
                 icecream      3.000000
                 soup          5.000000
dtype: float64