# 누락 데이터 확인

In [1]:
import seaborn as sns

df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [7]:
# value_counts()를 통해 데이터의 개수를 확인 / dropna=False 옵션을 통해 NAN값도 출력
nan_deck = df['deck'].value_counts(dropna=False)
print(nan_deck)

NaN    688
C       59
B       47
D       33
E       32
A       15
F       13
G        4
Name: deck, dtype: int64


In [8]:
# null 값이면 True 아니면 False
print(df.head().isnull())

   survived  pclass    sex    age  sibsp  parch   fare  embarked  class  \
0     False   False  False  False  False  False  False     False  False   
1     False   False  False  False  False  False  False     False  False   
2     False   False  False  False  False  False  False     False  False   
3     False   False  False  False  False  False  False     False  False   
4     False   False  False  False  False  False  False     False  False   

     who  adult_male   deck  embark_town  alive  alone  
0  False       False   True        False  False  False  
1  False       False  False        False  False  False  
2  False       False   True        False  False  False  
3  False       False  False        False  False  False  
4  False       False   True        False  False  False  


In [9]:
# null 이면 False 아니면 True
print(df.head().notnull())

   survived  pclass   sex   age  sibsp  parch  fare  embarked  class   who  \
0      True    True  True  True   True   True  True      True   True  True   
1      True    True  True  True   True   True  True      True   True  True   
2      True    True  True  True   True   True  True      True   True  True   
3      True    True  True  True   True   True  True      True   True  True   
4      True    True  True  True   True   True  True      True   True  True   

   adult_male   deck  embark_town  alive  alone  
0        True  False         True   True   True  
1        True   True         True   True   True  
2        True  False         True   True   True  
3        True   True         True   True   True  
4        True  False         True   True   True  


In [10]:
# isnull() 메소드로 누락 데이터 개수 구하기
print(df.head().isnull().sum(axis=0))

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           3
embark_town    0
alive          0
alone          0
dtype: int64


# 누락 데이터 제거

In [16]:
missing_df = df.isnull()
for col in missing_df.columns :
    missing_count = missing_df[col].value_counts()
    try :
        print(col,':',missing_count[True])
    except :
        print(col,':',0)

survived : 0
pclass : 0
sex : 0
age : 177
sibsp : 0
parch : 0
fare : 0
embarked : 2
class : 0
who : 0
adult_male : 0
deck : 688
embark_town : 2
alive : 0
alone : 0


# 누락 데이터 제거

In [17]:
# NAN 값이 500개 이상인 열을 모두 삭제
df_thresh = df.dropna(axis=1, thresh=500)
print(df_thresh.columns)

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive',
       'alone'],
      dtype='object')


In [18]:
# age 열에 나이 데이터가 없는 모든 행 삭제 - age 열(891개 중 177개의 NAN 값)
df_age = df.dropna(subset=['age'],how='any',axis=0)
print(len(df_age))

714


# 누락 데이터 치환
## 대체할 값으로는 데이터의 분포와 특성을 잘 나타낼 수 있는 평균값, 최빈값 등을 활용한다.

In [22]:
print(df['age'].head(10))
print('\n')

mean_age =df['age'].mean(axis=0)
df['age'].fillna(mean_age, inplace=True)

print(df['age'].head(10))

0    22.000000
1    38.000000
2    26.000000
3    35.000000
4    35.000000
5    29.699118
6    54.000000
7     2.000000
8    27.000000
9    14.000000
Name: age, dtype: float64


0    22.000000
1    38.000000
2    26.000000
3    35.000000
4    35.000000
5    29.699118
6    54.000000
7     2.000000
8    27.000000
9    14.000000
Name: age, dtype: float64


In [27]:
df = sns.load_dataset('titanic')
print(df['embark_town'][825:830])
print('\n')

# value_counts() 메소드를 사용하여 승선도시별 승객 수를 찾고, idxmax() 메소드로 가장 큰 값을 갖는 도시를 찾는다
most_freq = df['embark_town'].value_counts(dropna=True).idxmax()
print(most_freq)
print('\n')

df['embark_town'].fillna(most_freq, inplace=True)

print(df['embark_town'][825:830])

825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829            NaN
Name: embark_town, dtype: object


Southampton


825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829    Southampton
Name: embark_town, dtype: object


In [29]:
df = sns.load_dataset('titanic')
print(df['embark_town'][825:830])
print('\n')

df['embark_town'].fillna(method='ffill',inplace=True)
print(df['embark_town'][825:830])

825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829            NaN
Name: embark_town, dtype: object


825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829     Queenstown
Name: embark_town, dtype: object


# 중복 데이터 처리

In [30]:
# 중복 데이터 확인
import pandas as pd

df = pd.DataFrame({'c1':['a','a','b','a','b'],'c2':[1,1,1,2,2],'c3':[1,1,2,2,2]})
print(df)
print('\n')

df_dup = df.duplicated()
print(df_dup)

  c1  c2  c3
0  a   1   1
1  a   1   1
2  b   1   2
3  a   2   2
4  b   2   2


0    False
1     True
2    False
3    False
4    False
dtype: bool


In [31]:
col_dup = df['c2'].duplicated()
print(col_dup)

0    False
1     True
2     True
3    False
4     True
Name: c2, dtype: bool


In [33]:
# 중복 데이터 제거
df2 = df.drop_duplicates()
print(df2)

  c1  c2  c3
0  a   1   1
2  b   1   2
3  a   2   2
4  b   2   2


In [34]:
df3 = df.drop_duplicates(subset=['c2','c3'])
print(df3)

  c1  c2  c3
0  a   1   1
2  b   1   2
3  a   2   2


# 데이터 표준화

In [36]:
# 단위 환산
df = pd.read_csv('C:\\Users\\rladl\\Jupyter.study\\05000266\\part5\\auto-mpg.csv', header=None)

df.columns = ['mpg','cylinders','displacemnet','horsepower','weight','acceleration','model year','origin','name']
print(df.head(3))
print('\n')
mpg_to_kpl = 1.60934/3.78541

df['kpl'] = df['mpg']*mpg_to_kpl
print(df.head(3))
print('\n')

df['kpl'] = df['kpl'].round(2)
print(df.head(3))

    mpg  cylinders  displacemnet horsepower  weight  acceleration  model year  \
0  18.0          8         307.0      130.0  3504.0          12.0          70   
1  15.0          8         350.0      165.0  3693.0          11.5          70   
2  18.0          8         318.0      150.0  3436.0          11.0          70   

   origin                       name  
0       1  chevrolet chevelle malibu  
1       1          buick skylark 320  
2       1         plymouth satellite  


    mpg  cylinders  displacemnet horsepower  weight  acceleration  model year  \
0  18.0          8         307.0      130.0  3504.0          12.0          70   
1  15.0          8         350.0      165.0  3693.0          11.5          70   
2  18.0          8         318.0      150.0  3436.0          11.0          70   

   origin                       name       kpl  
0       1  chevrolet chevelle malibu  7.652571  
1       1          buick skylark 320  6.377143  
2       1         plymouth satellite  7.65257

# 자료형 변환

In [37]:
# 자료형 확인
print(df.dtypes)

mpg             float64
cylinders         int64
displacemnet    float64
horsepower       object
weight          float64
acceleration    float64
model year        int64
origin            int64
name             object
kpl             float64
dtype: object


In [38]:
# horsepower 열의 고유값 확인
print(df['horsepower'].unique())

['130.0' '165.0' '150.0' '140.0' '198.0' '220.0' '215.0' '225.0' '190.0'
 '170.0' '160.0' '95.00' '97.00' '85.00' '88.00' '46.00' '87.00' '90.00'
 '113.0' '200.0' '210.0' '193.0' '?' '100.0' '105.0' '175.0' '153.0'
 '180.0' '110.0' '72.00' '86.00' '70.00' '76.00' '65.00' '69.00' '60.00'
 '80.00' '54.00' '208.0' '155.0' '112.0' '92.00' '145.0' '137.0' '158.0'
 '167.0' '94.00' '107.0' '230.0' '49.00' '75.00' '91.00' '122.0' '67.00'
 '83.00' '78.00' '52.00' '61.00' '93.00' '148.0' '129.0' '96.00' '71.00'
 '98.00' '115.0' '53.00' '81.00' '79.00' '120.0' '152.0' '102.0' '108.0'
 '68.00' '58.00' '149.0' '89.00' '63.00' '48.00' '66.00' '139.0' '103.0'
 '125.0' '133.0' '138.0' '135.0' '142.0' '77.00' '62.00' '132.0' '84.00'
 '64.00' '74.00' '116.0' '82.00']


## 누락 데이터가 NAN으로 표시되지 않은 경우
### df.replace('?',np.nan,inplace=True)

In [39]:
import numpy as np
df['horsepower'].replace('?',np.nan, inplace=True)
df.dropna(subset=['horsepower'],axis=0, inplace=True)
df['horsepower'] = df['horsepower'].astype('float')

print(df['horsepower'].dtypes)

float64


In [41]:
print(df['origin'].unique())

df['origin'].replace({1:'USA',2:'EU',3:'JPN'}, inplace=True)

print(df['origin'].unique())
print(df['origin'].dtypes)

['USA' 'JPN' 'EU']
['USA' 'JPN' 'EU']
object


### 자료형 변환 astype() / int, float, str, category

# 범주형(카테고리) 데이터 처리 / 구간 분할

In [43]:
df = pd.read_csv('C:\\Users\\rladl\\Jupyter.study\\05000266\\part5\\auto-mpg.csv', header=None)

df.columns = ['mpg','cylinders','displacemnet','horsepower','weight','acceleration','model year','origin','name']

df['horsepower'].replace('?',np.nan, inplace=True)
df.dropna(subset=['horsepower'],axis=0, inplace=True)
df['horsepower'] = df['horsepower'].astype('float')

count, bin_dividers = np.histogram(df['horsepower'],bins=3)
print(bin_dividers)

[ 46.         107.33333333 168.66666667 230.        ]


In [45]:
# 3개의 bin에 이름 지정
bin_names = ['저출력','보통출력','고출력']

# pd.cut 함수로 각 데이터를 3개의 bin에 할당
df['hp_bin'] = pd.cut(x=df['horsepower'],
                     bins=bin_dividers, labels=bin_names,include_lowest=True)
print(df[['horsepower','hp_bin']].head(10))

   horsepower hp_bin
0       130.0   보통출력
1       165.0   보통출력
2       150.0   보통출력
3       150.0   보통출력
4       140.0   보통출력
5       198.0    고출력
6       220.0    고출력
7       215.0    고출력
8       225.0    고출력
9       190.0    고출력


# 더미 변수
*   컴퓨터가 인식 할 수 있는 값으로 변환 / 숫자 0 , 1로 표현

In [46]:
horsepower_dummies = pd.get_dummies(df['hp_bin'])
print(horsepower_dummies)

     저출력  보통출력  고출력
0      0     1    0
1      0     1    0
2      0     1    0
3      0     1    0
4      0     1    0
..   ...   ...  ...
393    1     0    0
394    1     0    0
395    1     0    0
396    1     0    0
397    1     0    0

[392 rows x 3 columns]


## 시계열 데이터

In [1]:
import pandas as pd

# 파일 불러오기
df = pd.read_csv(r"C:\Users\rladl\Jupyter.study\05000266\part5\stock-data.csv")

# 데이터 내용 및 자료형 확인
print(df.head())
print('\n')
print(df.info())

         Date  Close  Start   High    Low  Volume
0  2018-07-02  10100  10850  10900  10000  137977
1  2018-06-29  10700  10550  10900   9990  170253
2  2018-06-28  10400  10900  10950  10150  155769
3  2018-06-27  10900  10800  11050  10500  133548
4  2018-06-26  10800  10900  11000  10700   63039


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    20 non-null     object
 1   Close   20 non-null     int64 
 2   Start   20 non-null     int64 
 3   High    20 non-null     int64 
 4   Low     20 non-null     int64 
 5   Volume  20 non-null     int64 
dtypes: int64(5), object(1)
memory usage: 1.1+ KB
None


In [2]:
# 문자열 데이터를 판다스 Timestamp로 변환
df['new_Date'] = pd.to_datetime(df['Date'])

print(df.head())
print('\n')
print(df.info())
print('\n')
print(type(df['new_Date'][0]))

         Date  Close  Start   High    Low  Volume   new_Date
0  2018-07-02  10100  10850  10900  10000  137977 2018-07-02
1  2018-06-29  10700  10550  10900   9990  170253 2018-06-29
2  2018-06-28  10400  10900  10950  10150  155769 2018-06-28
3  2018-06-27  10900  10800  11050  10500  133548 2018-06-27
4  2018-06-26  10800  10900  11000  10700   63039 2018-06-26


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Date      20 non-null     object        
 1   Close     20 non-null     int64         
 2   Start     20 non-null     int64         
 3   High      20 non-null     int64         
 4   Low       20 non-null     int64         
 5   Volume    20 non-null     int64         
 6   new_Date  20 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 1.2+ KB
None


<class 'pandas._libs.tslibs.timestamps.T

In [3]:
# 시계열 값으로 변환된 열을 새로운 행 인덱스로 지정
df.set_index('new_Date', inplace=True)
df.drop('Date', axis=1, inplace=True)

# 데이터 내용 및 자료형 확인
print(df.head())
print('\n')
print(df.info())

            Close  Start   High    Low  Volume
new_Date                                      
2018-07-02  10100  10850  10900  10000  137977
2018-06-29  10700  10550  10900   9990  170253
2018-06-28  10400  10900  10950  10150  155769
2018-06-27  10900  10800  11050  10500  133548
2018-06-26  10800  10900  11000  10700   63039


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 20 entries, 2018-07-02 to 2018-06-01
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Close   20 non-null     int64
 1   Start   20 non-null     int64
 2   High    20 non-null     int64
 3   Low     20 non-null     int64
 4   Volume  20 non-null     int64
dtypes: int64(5)
memory usage: 960.0 bytes
None


## Timestamp를 Period로 변환

In [5]:
# 날짜 형식의 문자열로 구성되는 리스트 정의
dates = ['2019-01-01', '2020-03-01', '2021-06-01']

# 문자열의 배열을 판다스 Timestamp로 변환
ts_dates = pd.to_datetime(dates)
print(ts_dates)
print('\n')

# Timestamp를 Period로 변환
pr_day = ts_dates.to_period(freq='D')
print(pr_day)
pr_month = ts_dates.to_period(freq='M')
print(pr_month)
pr_year = ts_dates.to_period(freq='A')
print(pr_year)

DatetimeIndex(['2019-01-01', '2020-03-01', '2021-06-01'], dtype='datetime64[ns]', freq=None)


PeriodIndex(['2019-01-01', '2020-03-01', '2021-06-01'], dtype='period[D]')
PeriodIndex(['2019-01', '2020-03', '2021-06'], dtype='period[M]')
PeriodIndex(['2019', '2020', '2021'], dtype='period[A-DEC]')


## Timestamp 배열 만들기

In [6]:
ts_ms = pd.date_range(start='2019-01-01',
                    end=None,
                    periods=6,
                    freq='MS',
                    tz='Asia/Seoul')
print(ts_ms)

DatetimeIndex(['2019-01-01 00:00:00+09:00', '2019-02-01 00:00:00+09:00',
               '2019-03-01 00:00:00+09:00', '2019-04-01 00:00:00+09:00',
               '2019-05-01 00:00:00+09:00', '2019-06-01 00:00:00+09:00'],
              dtype='datetime64[ns, Asia/Seoul]', freq='MS')


In [7]:
ts_me = pd.date_range(start='2019-01-01',
                    end=None,
                    periods=6,
                    freq='M',
                    tz='Asia/Seoul')
print(ts_me)

DatetimeIndex(['2019-01-31 00:00:00+09:00', '2019-02-28 00:00:00+09:00',
               '2019-03-31 00:00:00+09:00', '2019-04-30 00:00:00+09:00',
               '2019-05-31 00:00:00+09:00', '2019-06-30 00:00:00+09:00'],
              dtype='datetime64[ns, Asia/Seoul]', freq='M')


In [8]:
pr_m = pd.date_range(start='2019-01-01',
                    end=None,
                    periods=3,
                    freq='M',)
print(pr_m)

DatetimeIndex(['2019-01-31', '2019-02-28', '2019-03-31'], dtype='datetime64[ns]', freq='M')


## 날짜 데이터 분리

In [11]:
df = pd.read_csv(r"C:\Users\rladl\Jupyter.study\05000266\part5\stock-data.csv")

df['new_Date'] = pd.to_datetime(df['Date'])
print(df.head())

         Date  Close  Start   High    Low  Volume   new_Date
0  2018-07-02  10100  10850  10900  10000  137977 2018-07-02
1  2018-06-29  10700  10550  10900   9990  170253 2018-06-29
2  2018-06-28  10400  10900  10950  10150  155769 2018-06-28
3  2018-06-27  10900  10800  11050  10500  133548 2018-06-27
4  2018-06-26  10800  10900  11000  10700   63039 2018-06-26


In [12]:
df['Year'] = df['new_Date'].dt.year
df['Month'] = df['new_Date'].dt.month
df['Day'] = df['new_Date'].dt.day
print(df.head())

         Date  Close  Start   High    Low  Volume   new_Date  Year  Month  Day
0  2018-07-02  10100  10850  10900  10000  137977 2018-07-02  2018      7    2
1  2018-06-29  10700  10550  10900   9990  170253 2018-06-29  2018      6   29
2  2018-06-28  10400  10900  10950  10150  155769 2018-06-28  2018      6   28
3  2018-06-27  10900  10800  11050  10500  133548 2018-06-27  2018      6   27
4  2018-06-26  10800  10900  11000  10700   63039 2018-06-26  2018      6   26


In [13]:
df['Date_yr'] = df['new_Date'].dt.to_period(freq='A')
df['Date_m'] = df['new_Date'].dt.to_period(freq='M')
df.head()

Unnamed: 0,Date,Close,Start,High,Low,Volume,new_Date,Year,Month,Day,Date_yr,Date_m
0,2018-07-02,10100,10850,10900,10000,137977,2018-07-02,2018,7,2,2018,2018-07
1,2018-06-29,10700,10550,10900,9990,170253,2018-06-29,2018,6,29,2018,2018-06
2,2018-06-28,10400,10900,10950,10150,155769,2018-06-28,2018,6,28,2018,2018-06
3,2018-06-27,10900,10800,11050,10500,133548,2018-06-27,2018,6,27,2018,2018-06
4,2018-06-26,10800,10900,11000,10700,63039,2018-06-26,2018,6,26,2018,2018-06


In [14]:
df.set_index('Date_m', inplace=True)
df.head()

Unnamed: 0_level_0,Date,Close,Start,High,Low,Volume,new_Date,Year,Month,Day,Date_yr
Date_m,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-07,2018-07-02,10100,10850,10900,10000,137977,2018-07-02,2018,7,2,2018
2018-06,2018-06-29,10700,10550,10900,9990,170253,2018-06-29,2018,6,29,2018
2018-06,2018-06-28,10400,10900,10950,10150,155769,2018-06-28,2018,6,28,2018
2018-06,2018-06-27,10900,10800,11050,10500,133548,2018-06-27,2018,6,27,2018
2018-06,2018-06-26,10800,10900,11000,10700,63039,2018-06-26,2018,6,26,2018


# 그룹 연산
## groupby() 는 데이터프레임의 특정 열을 기준으로 데이터프레임을 분할하여 그룹 객체를 반환한다

In [1]:
import pandas as pd
import seaborn as sns

titanic = sns.load_dataset('titanic')
df = titanic.loc[:,['age','sex','class','fare','survived']]

print(df.head())

    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
1  38.0  female  First  71.2833         1
2  26.0  female  Third   7.9250         1
3  35.0  female  First  53.1000         1
4  35.0    male  Third   8.0500         0


In [2]:
grouped = df.groupby(['class'])
print(grouped)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000218A2FA28B0>


In [4]:
for key, group in grouped :
    print('* key :',key)
    print('* number :', len(group))
    print(group.head(),'\n')

* key : First
* number : 216
     age     sex  class     fare  survived
1   38.0  female  First  71.2833         1
3   35.0  female  First  53.1000         1
6   54.0    male  First  51.8625         0
11  58.0  female  First  26.5500         1
23  28.0    male  First  35.5000         1 

* key : Second
* number : 184
     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
17   NaN    male  Second  13.0000         1
20  35.0    male  Second  26.0000         0
21  34.0    male  Second  13.0000         1 

* key : Third
* number : 491
    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
2  26.0  female  Third   7.9250         1
4  35.0    male  Third   8.0500         0
5   NaN    male  Third   8.4583         0
7   2.0    male  Third  21.0750         0 



In [5]:
# 그룹 연산
average = grouped.mean()
print(average)

              age       fare  survived
class                                 
First   38.233441  84.154687  0.629630
Second  29.877630  20.662183  0.472826
Third   25.140620  13.675550  0.242363


## get_group() 특정 그룹만을 선택할 수 있다.

In [7]:
group3 = grouped.get_group('Third')
print(group3.head())

    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
2  26.0  female  Third   7.9250         1
4  35.0    male  Third   8.0500         0
5   NaN    male  Third   8.4583         0
7   2.0    male  Third  21.0750         0


## 여러 열을 기준으로 그룹화

In [8]:
grouped_two = df.groupby(['class','sex'])

for key, group in grouped_two :
    print('* key :',key)
    print('* number :', len(group))
    print(group.head(),'\n')

* key : ('First', 'female')
* number : 94
     age     sex  class      fare  survived
1   38.0  female  First   71.2833         1
3   35.0  female  First   53.1000         1
11  58.0  female  First   26.5500         1
31   NaN  female  First  146.5208         1
52  49.0  female  First   76.7292         1 

* key : ('First', 'male')
* number : 122
     age   sex  class      fare  survived
6   54.0  male  First   51.8625         0
23  28.0  male  First   35.5000         1
27  19.0  male  First  263.0000         0
30  40.0  male  First   27.7208         0
34  28.0  male  First   82.1708         0 

* key : ('Second', 'female')
* number : 76
     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
41  27.0  female  Second  21.0000         0
43   3.0  female  Second  41.5792         1
53  29.0  female  Second  26.0000         1 

* key : ('Second', 'male')
* number : 108
     age   sex   class  fare  survived
17   Na

In [9]:
average_two = grouped_two.mean()
print(average_two)

                     age        fare  survived
class  sex                                    
First  female  34.611765  106.125798  0.968085
       male    41.281386   67.226127  0.368852
Second female  28.722973   21.970121  0.921053
       male    30.740707   19.741782  0.157407
Third  female  21.750000   16.118810  0.500000
       male    26.507589   12.661633  0.135447


In [10]:
group3f = grouped_two.get_group(('Third','female'))
print(group3f.head())

     age     sex  class     fare  survived
2   26.0  female  Third   7.9250         1
8   27.0  female  Third  11.1333         1
10   4.0  female  Third  16.7000         1
14  14.0  female  Third   7.8542         0
18  31.0  female  Third  18.0000         0


In [11]:
std_all = grouped.std()
print(std_all)
print('\n')

std_fare = grouped.fare.std()
print(std_fare)

              age       fare  survived
class                                 
First   14.802856  78.380373  0.484026
Second  14.001077  13.417399  0.500623
Third   12.495398  11.778142  0.428949


class
First     78.380373
Second    13.417399
Third     11.778142
Name: fare, dtype: float64


## agg() 사용자 정의 함수를 그룹 객체에 적용 or 그룹별 데이터에 연산을 위한 함수를 구분 적용

*   (1) 각각의 열에 여러 개의 함수를 일괄 적용할 때는 리스트 형태로 인수를 전달
*   (2) 열마다 다른 종류의 함수를 적용하려면 {열:함수} 형태의 딕셔너리를 전달

In [13]:
def min_max(x):
    return x.max() - x.min()
agg_minmax = grouped.agg(min_max)
print(agg_minmax)

          age      fare  survived
class                            
First   79.08  512.3292         1
Second  69.33   73.5000         1
Third   73.58   69.5500         1


  results[key] = self.aggregate(func)


## agg() : 판다스 객체의 개별 원소를 특정 함수에 일대일로 매핑한다.

In [14]:
agg_all = grouped.agg(['min','max'])
print(agg_all.head())
print('\n')

agg_sep = grouped.agg({'fare':['min','max'],'age':'mean'})
print(agg_sep.head())

         age           sex       fare           survived    
         min   max     min   max  min       max      min max
class                                                       
First   0.92  80.0  female  male  0.0  512.3292        0   1
Second  0.67  70.0  female  male  0.0   73.5000        0   1
Third   0.42  74.0  female  male  0.0   69.5500        0   1


       fare                  age
        min       max       mean
class                           
First   0.0  512.3292  38.233441
Second  0.0   73.5000  29.877630
Third   0.0   69.5500  25.140620


## transform() 그룹별로 구분하여 각 원소에 함수를 적용하지만, 그룹별 집계 대신 각 원소의 본래 행 인덱스와 열 이름을 기준으로 연산 결과를 반환한다. => 데이터 프레임에 그룹 단위 통계량 칼럼 추가

In [16]:
def z_score(x):
    return (x - x.mean()/x.std())

age_zscore = grouped.age.transform(z_score)
print(age_zscore.loc[[1,9,0]])
print('\n')
print(len(age_zscore))
print('\n')
print(age_zscore.loc[0:9])

1    35.417158
9    11.866048
0    19.988010
Name: age, dtype: float64


891


0    19.988010
1    35.417158
2    23.988010
3    32.417158
4    32.988010
5          NaN
6    51.417158
7    -0.011990
8    24.988010
9    11.866048
Name: age, dtype: float64


## 그룹 객체 필터링 filter() : 조건식을 가진 함수를 전달하면 조건이 참인 그룹만을 남긴다

In [18]:
grouped_filter = grouped.filter(lambda x : len(x) > 200)
print(grouped_filter.head())

    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
1  38.0  female  First  71.2833         1
2  26.0  female  Third   7.9250         1
3  35.0  female  First  53.1000         1
4  35.0    male  Third   8.0500         0


In [19]:
age_filter = grouped.filter(lambda x: x.age.mean() < 30)
print(age_filter.tail())

      age     sex   class    fare  survived
884  25.0    male   Third   7.050         0
885  39.0  female   Third  29.125         0
886  27.0    male  Second  13.000         0
888   NaN  female   Third  23.450         0
890  32.0    male   Third   7.750         0


In [20]:
agg_grouped = grouped.apply(lambda x: x.describe())
print(agg_grouped)

                     age        fare    survived
class                                           
First  count  186.000000  216.000000  216.000000
       mean    38.233441   84.154687    0.629630
       std     14.802856   78.380373    0.484026
       min      0.920000    0.000000    0.000000
       25%     27.000000   30.923950    0.000000
       50%     37.000000   60.287500    1.000000
       75%     49.000000   93.500000    1.000000
       max     80.000000  512.329200    1.000000
Second count  173.000000  184.000000  184.000000
       mean    29.877630   20.662183    0.472826
       std     14.001077   13.417399    0.500623
       min      0.670000    0.000000    0.000000
       25%     23.000000   13.000000    0.000000
       50%     29.000000   14.250000    0.000000
       75%     36.000000   26.000000    1.000000
       max     70.000000   73.500000    1.000000
Third  count  355.000000  491.000000  491.000000
       mean    25.140620   13.675550    0.242363
       std     12.49

In [22]:
age_zscore = grouped.age.apply(z_score)
print(age_zscore.head())

0    19.988010
1    35.417158
2    23.988010
3    32.417158
4    32.988010
Name: age, dtype: float64
