### Pandas 패키지와 Series 객체 

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Build a Series from five evenly spaced floats between 0 and 1 (both ends included).
# No index is passed, so pandas assigns the default integer labels.
evenly_spaced=np.linspace(0,1,num=5)
data=pd.Series(evenly_spaced)
data

0    0.00
1    0.25
2    0.50
3    0.75
4    1.00
dtype: float64

In [5]:
# Underlying values of the Series; the output below shows them as a NumPy array.
data.values

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [6]:
# Confirm the container type returned by .values.
type(data.values)

numpy.ndarray

In [7]:
# Row labels of the Series. No index was supplied at construction, and the
# output shows a RangeIndex running from 0 up to (not including) 5.
data.index

RangeIndex(start=0, stop=5, step=1)

In [8]:
# Select a single element with the key 1 (0.25 here).
# With the default integer index, label 1 and position 1 coincide.
data[1]

0.25

In [9]:
# Integer slice: the output contains rows 2 and 3 only, i.e. the stop value 4 is excluded.
data[2:4]

2    0.50
3    0.75
dtype: float64

In [10]:
# Boolean-mask selection: keep the values strictly between 0.1 and 0.6.
# Each comparison is parenthesized because & binds tighter than > and <.
data[(data>0.1)&(data<0.6)]

1    0.25
2    0.50
dtype: float64

In [11]:
# Fancy indexing with a list of keys: picks the entries labelled 2 and 4.
data[[2,4]]

2    0.5
4    1.0
dtype: float64

In [13]:
# keys() exposes the index labels, mirroring the dict interface.
list(data.keys())

[0, 1, 2, 3, 4]

In [14]:
# items() yields (index label, value) pairs, mirroring the dict interface.
list(data.items())

[(0, 0.0), (1, 0.25), (2, 0.5), (3, 0.75), (4, 1.0)]

In [15]:
# Replace the default integer labels with the string labels 'a' through 'e'.
# The assignment relabels the existing Series in place; the values are untouched.
data.index=list('abcde')
data

a    0.00
b    0.25
c    0.50
d    0.75
e    1.00
dtype: float64

### Series 객체와 loc 인덱서, iloc 인덱서의 활용

**`loc` 인덱서** : 명시적 인덱스(레이블) 사용 — 슬라이싱 시 끝 레이블을 포함<br><br>
**`iloc` 인덱서** : 암묵적 인덱스(0부터 시작하는 정수 위치) 사용 — 슬라이싱 시 끝 위치를 제외

In [16]:
# loc looks up by explicit label.
data.loc['a']

0.0

In [18]:
# Label slice with loc: the output includes the end label 'c'.
data.loc['a':'c']

a    0.00
b    0.25
c    0.50
dtype: float64

In [20]:
# loc with a list of labels returns just those entries.
data.loc[['a','c']]

a    0.0
c    0.5
dtype: float64

In [21]:
# loc also accepts a boolean mask: keep the values greater than 0.7.
data.loc[data>0.7]

d    0.75
e    1.00
dtype: float64

In [22]:
# iloc looks up by integer position; position 0 is the entry labelled 'a'.
data.iloc[0]

0.0

In [24]:
# Positional slice with iloc: the stop position 3 is excluded (entries a, b, c).
data.iloc[0:3]

a    0.00
b    0.25
c    0.50
dtype: float64

In [25]:
# iloc with a list of positions returns just those entries.
data.iloc[[0,2]]

a    0.0
c    0.5
dtype: float64

### Pandas 패키지와 DataFrame 객체

DataFrame은 행 인덱스와 열 이름으로 구성된 2차원 구조

In [33]:
# Seed NumPy's global generator so the random integers are the same on every run.
np.random.seed(0)

# 3x4 frame of integers drawn from 0..9, with columns named col1..col4.
column_names=[f'col{i}' for i in range(1,5)]
random_digits=np.random.randint(10,size=(3,4))
df = pd.DataFrame(random_digits,columns=column_names)
df

Unnamed: 0,col1,col2,col3,col4
0,5,0,3,3
1,7,9,3,5
2,2,4,7,6


In [34]:
# Select one column by name with bracket syntax; the result is a Series named 'col2'.
df['col2']

0    0
1    9
2    4
Name: col2, dtype: int32

In [35]:
# Attribute-style access to the same column; the output matches df['col2'].
df.col2

0    0
1    9
2    4
Name: col2, dtype: int32

### DataFrame 객체와 loc 인덱서, iloc 인덱서의 활용

In [36]:
# Scalar row label plus a column label slice -> a Series for row 0.
# Both end columns ('col2' and 'col3') appear in the output.
df.loc[0,'col2':'col3']

col2    0
col3    3
Name: 0, dtype: int32

In [37]:
# Using the row slice 0:0 instead of a scalar keeps the result two-dimensional
# (a one-row DataFrame rather than a Series).
df.loc[0:0,'col2':'col3']

Unnamed: 0,col2,col3
0,0,3


In [38]:
# A one-element list of row labels likewise returns a one-row DataFrame.
df.loc[[0],'col2':'col3']

Unnamed: 0,col2,col3
0,0,3


In [39]:
# Label slices on both axes; row label 2 and column 'col3' are included in the output.
df.loc[0:2,'col2':'col3']

Unnamed: 0,col2,col3
0,0,3
1,9,3
2,4,7


In [40]:
# Boolean row mask combined with a column label slice:
# keep the rows where col2 > 2 and col3 < 5, and show only col2..col3.
df.loc[(df['col2']>2)&(df['col3']<5),'col2':'col3']

Unnamed: 0,col2,col3
1,9,3


In [41]:
# Positional counterpart of df.loc[0,'col2':'col3']:
# row position 0, column positions 1 and 2 (the stop position 3 is excluded).
df.iloc[0,1:3]

col2    0
col3    3
Name: 0, dtype: int32

In [42]:
# The positional row slice 0:1 selects only row 0 but keeps the result a DataFrame.
df.iloc[0:1,1:3]

Unnamed: 0,col2,col3
0,0,3


In [43]:
# A one-element list of row positions also returns a one-row DataFrame.
df.iloc[[0],1:3]

Unnamed: 0,col2,col3
0,0,3


In [44]:
# Positional slices on both axes: row positions 0-2 and column positions 1-2
# (the stop values are excluded).
df.iloc[0:3,1:3]

Unnamed: 0,col2,col3
0,0,3
1,9,3
2,4,7


### DataFrame 객체의 행과 열 제거

In [45]:
# Show the frame as it stands before any column or row is added or removed.
df

Unnamed: 0,col1,col2,col3,col4
0,5,0,3,3
1,7,9,3,5
2,2,4,7,6


In [46]:
# Add a 'total' column holding the sum of each row.
# Any existing 'total' column is left out of the sum, so re-running this cell
# does not fold the previous total into the new one (errors='ignore' makes the
# drop a no-op on the first run, when 'total' does not exist yet).
df['total']=df.drop(columns='total',errors='ignore').sum(axis=1)
df

Unnamed: 0,col1,col2,col3,col4,total
0,5,0,3,3,11
1,7,9,3,5,24
2,2,4,7,6,19


In [47]:
# Remove the 'col4' and 'total' columns and rebind df to the result.
# columns= already names the axis, so the redundant axis=1 argument is omitted.
df=df.drop(columns=['col4','total'])
df

Unnamed: 0,col1,col2,col3
0,5,0,3
1,7,9,3
2,2,4,7


In [48]:
# Remove the row labelled 1 and rebind df to the result.
# index= already names the axis, so the redundant axis=0 argument is omitted.
df=df.drop(index=1)
df

Unnamed: 0,col1,col2,col3
0,5,0,3
2,2,4,7


### DataFrame 객체의 널값 연산 

In [55]:
# Small 4x3 frame with one missing value in column 0 and one in column 1.
# Note that this rebinds df to a new frame, replacing the one used above.
rows=[
    [1,2,3],
    [4,5,6],
    [np.nan,8,9],
    [10,np.nan,12],
]
df=pd.DataFrame(rows)
df

Unnamed: 0,0,1,2
0,1.0,2.0,3
1,4.0,5.0,6
2,,8.0,9
3,10.0,,12


In [56]:
# Drop every row that contains a missing value; rows 2 and 3 are absent from the output.
# The result is only displayed, df itself is not reassigned.
df.dropna(axis=0)

Unnamed: 0,0,1,2
0,1.0,2.0,3
1,4.0,5.0,6


In [57]:
# Replace each missing value with the mean of its own column
# (df.mean(axis=0) yields one mean per column; both gaps become 5.0 here).
# The result is only displayed, df itself is not reassigned.
df.fillna(df.mean(axis=0))

Unnamed: 0,0,1,2
0,1.0,2.0,3
1,4.0,5.0,6
2,5.0,8.0,9
3,10.0,5.0,12


누락된 값 처리 방법 비교: `dropna`로 제거하기 vs `fillna`로 대체하기

### DataFrame 객체의 조인 

In [58]:
# Employee -> department table (one row per employee).
df1=pd.DataFrame(
    [('이순신','연구개발'),('강감찬','영업'),('을지문덕','연구개발'),('김유신','인사')],
    columns=['name','dept'],
)
# Employee -> project assignments; the same employee may appear more than once.
df2=pd.DataFrame(
    [('강감찬','S'),('을지문덕','D'),('이순신','A'),('이순신','S')],
    columns=['emp_name','project'],
)

In [59]:
# Inner join (merge's default) matching df1.name against df2.emp_name;
# the now-duplicated key column emp_name is dropped from the result.
df1.merge(df2,left_on='name',right_on='emp_name').drop(columns='emp_name')

Unnamed: 0,name,dept,project
0,이순신,연구개발,A
1,이순신,연구개발,S
2,강감찬,영업,S
3,을지문덕,연구개발,D


In [60]:
# Outer join on the same keys: employees without a project are kept as well,
# with a missing value in the project column (see the last row of the output).
df1.merge(df2,how='outer',left_on='name',right_on='emp_name').drop(columns='emp_name')

Unnamed: 0,name,dept,project
0,이순신,연구개발,A
1,이순신,연구개발,S
2,강감찬,영업,S
3,을지문덕,연구개발,D
4,김유신,인사,


### DataFrame 객체의 정렬 

In [62]:
import seaborn as sns
# Load the 'titanic' example dataset through seaborn and preview its first rows.
titanic = sns.load_dataset('titanic')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [63]:
# Sort by fare (highest first); rows with equal fare are ordered by sex ascending.
# Only the first rows of the sorted frame are shown.
(
    titanic
    .sort_values(by=['fare','sex'],ascending=[False,True])
    .head()
)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
258,1,1,female,35.0,0,0,512.3292,C,First,woman,False,,Cherbourg,yes,True
679,1,1,male,36.0,0,1,512.3292,C,First,man,True,B,Cherbourg,yes,False
737,1,1,male,35.0,0,0,512.3292,C,First,man,True,B,Cherbourg,yes,True
88,1,1,female,23.0,3,2,263.0,S,First,woman,False,C,Southampton,yes,False
341,1,1,female,24.0,3,2,263.0,S,First,woman,False,C,Southampton,yes,False


### DataFrame 객체의 그룹 연산 

In [64]:
# Mean of the survived column per sex; the double brackets keep the result a
# one-column DataFrame instead of a Series.
(
    titanic
    .groupby('sex')[['survived']]
    .agg('mean')
)

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


**평균 중심화** : 그룹별 평균에서 그 평균들의 평균을 빼서 결과를 0 중심으로 이동 (표준편차로 나누지 않으므로 엄밀한 의미의 표준화는 아님)

In [65]:
# Mean survival rate per sex, then each column is shifted by its own mean so
# that the group means are centered on zero.
(
    titanic
    .groupby('sex')[['survived']]
    .aggregate('mean')
    .apply(lambda col: col-col.mean())
)

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,0.276565
male,-0.276565


### DataFrame 객체와 피벗테이블 

In [66]:
# Mean survival rate per (sex, class); unstack() pivots the class level into columns.
# NOTE(review): 'class' is presumably a categorical column in seaborn's titanic
# data - confirm with titanic.dtypes. observed=False is spelled out so the
# result does not depend on the pandas version's default for categorical
# groupers (and no FutureWarning is raised about that default).
titanic.groupby(['sex','class'],observed=False)['survived'].aggregate('mean').unstack()

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [67]:
# Same sex-by-class table built in one call; pivot_table aggregates with the
# mean when no aggfunc is given, so the output matches the groupby/unstack result.
# NOTE(review): 'class' is presumably categorical - confirm with titanic.dtypes.
# observed=False is spelled out so the result does not depend on the pandas
# version's default for categorical groupers.
titanic.pivot_table('survived',index='sex',columns='class',observed=False)

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447
