### pandas
* 특정 데이터들을 DataFrame 형식으로 변환해 준다
* DataFrame은 엑셀의 표(시트)처럼 행과 열로 이루어진 구조라고 생각하면 된다
* 낭낭하게 살장 :)


In [1]:
import pandas as pd

In [2]:
# Load the sample CSV into a DataFrame; the path is relative to the working directory.
df = pd.read_csv('data/1.pandas.csv')

### 데이터 조회
* DataFrame에서 특정데이터만 뽑고자 하는 경우.

#### Series
* 1차원 배열형태의 값으로 표현
* DataFrame형식에서 특정 컬럼을 사용하여 Series형식의 특정데이터를 가져온다
* []대괄호 하나로 표현하여 조회


In [3]:
# Single brackets with one column label return that column as a Series.
df['name']

0     Kim
1     Cho
2    Choi
3      jo
Name: name, dtype: object

#### DataFrame조회
* DataFrame의 데이터를 Series형식이 아닌 DataFrame형식으로 가져오고자 하는 경우 대괄호 2개를 사용한다.
    - [[값1, 값2 ...]]
    - 여러개의 값 조회시 사용

In [4]:
# Double brackets (a list of column labels) return a DataFrame with those columns.
df[['name', 'score']]

Unnamed: 0,name,score
0,Kim,100.0
1,Cho,95.0
2,Choi,90.5
3,jo,85.7


In [5]:
# An integer slice inside [] selects rows by position; the stop value is excluded.
df[0:3]

Unnamed: 0,name,score,grade,age
0,Kim,100.0,A,20
1,Cho,95.0,A,30
2,Choi,90.5,B,40


#### loc : 데이터프레임.loc[행(index) 조건식, 열(column) 조건식]
* 라벨(label)을 기준으로 행과 열 데이터를 가져온다.
* 슬라이스(예: 1:3)를 사용하면 끝 라벨도 결과에 포함된다.


In [6]:
# .loc slices by index label, so both 1 and 3 are included; the column list keeps the result a DataFrame.
df.loc[1:3, ['name']]

Unnamed: 0,name
1,Cho
2,Choi
3,jo


In [7]:
# All rows, and every column from 'name' through 'grade' (label slices include the end).
df.loc[:,'name':'grade']

Unnamed: 0,name,score,grade
0,Kim,100.0,A
1,Cho,95.0,A
2,Choi,90.5,B
3,jo,85.7,B


In [8]:
# Select every row and every column, spelling out both selectors explicitly.
df.loc[:, :]

Unnamed: 0,name,score,grade,age
0,Kim,100.0,A,20
1,Cho,95.0,A,30
2,Choi,90.5,B,40
3,jo,85.7,B,50


In [9]:
# A boolean mask inside [] keeps only the rows where the mask is True.
df[df['grade'] == 'A']


Unnamed: 0,name,score,grade,age
0,Kim,100.0,A,20
1,Cho,95.0,A,30


In [10]:
# The comparison on its own yields the boolean Series used as the mask above.
df['grade'] =='A'

0     True
1     True
2    False
3    False
Name: grade, dtype: bool

### DataFrame 삭제
* drop()으로 인덱스(라벨) 기준으로 행을 삭제한다
* 기본적으로는 삭제된 결과를 새 DataFrame으로 반환하고, inplace=True를 지정하면 원본 df가 직접 변경된다

In [11]:
# drop() returns a new DataFrame without the row labelled 2; df itself is not modified.
df.drop(2)

Unnamed: 0,name,score,grade,age
0,Kim,100.0,A,20
1,Cho,95.0,A,30
3,jo,85.7,B,50


In [12]:
# inplace=True removes the row labelled 1 from df itself (the call returns None).
df.drop(1, inplace=True)

In [13]:
# Display df: row 1 is gone, while row 2 (dropped without inplace earlier) is still present.
df

Unnamed: 0,name,score,grade,age
0,Kim,100.0,A,20
2,Choi,90.5,B,40
3,jo,85.7,B,50


#### axis
* 열 또는 행의 기준을 잡아준다.
* axis = 0 : index(행) 기준
* axis = 1 : column(열) 기준

In [14]:
# axis=0 drops by index label: remove the row labelled 0 from df in place.
df.drop(0, axis=0, inplace=True)
# An in-place drop returns None, so display df explicitly to see the result.
df

Unnamed: 0,name,score,grade,age
2,Choi,90.5,B,40
3,jo,85.7,B,50


In [16]:
# axis=1 drops by column label: remove the 'name' column from df in place, then display df.
df.drop('name', axis=1,inplace=True)
df

Unnamed: 0,score,grade,age
2,90.5,B,40
3,85.7,B,50


### Boolean 처리
* 컬럼에 비교 연산(<, == 등)을 하면 각 행마다 True/False 값을 갖는 boolean Series가 만들어진다
* 이 boolean Series를 DataFrame의 [] 또는 loc에 전달하면 True인 위치의 행만 가져온다

In [17]:
# Reload the CSV so df again contains the rows and column that were dropped in place.
df =pd.read_csv('data/1.pandas.csv')
df

Unnamed: 0,name,score,grade,age
0,Kim,100.0,A,20
1,Cho,95.0,A,30
2,Choi,90.5,B,40
3,jo,85.7,B,50


In [18]:
# Element-wise comparison: one True/False per row, aligned with df's index.
df['score'] < 95

0    False
1    False
2     True
3     True
Name: score, dtype: bool

In [19]:
# Keep only the rows whose score is below 95.
df[df['score'] < 95]

Unnamed: 0,name,score,grade,age
2,Choi,90.5,B,40
3,jo,85.7,B,50


In [21]:
# Filter rows with the mask first, then select 'name' as a DataFrame from the filtered result.
df[ df['score']<95][['name']]

Unnamed: 0,name
2,Choi
3,jo


* loc 사용

In [23]:
# .loc accepts the same boolean mask as its row selector.
df.loc[ df['score']<95]

Unnamed: 0,name,score,grade,age
2,Choi,90.5,B,40
3,jo,85.7,B,50


In [27]:
# Row mask and column list in a single .loc call: filter rows and pick columns together.
df.loc[ df['score']<95, ['name','score']]

Unnamed: 0,name,score
2,Choi,90.5
3,jo,85.7


In [31]:
# After filtering, .loc[:3, ...] slices the remaining rows by label up to and including 3,
# and keeps the columns from 'name' through 'grade'.
df[df['score']<95].loc[:3, 'name':'grade']

Unnamed: 0,name,score,grade
2,Choi,90.5,B
3,jo,85.7,B


### DataFrame 명령어

In [37]:
# columns holds the column labels; values returns the data as a NumPy array.
print(df.columns)
df.values

Index(['name', 'score', 'grade', 'age'], dtype='object')


array([['Kim', 100.0, 'A', 20],
       ['Cho', 95.0, 'A', 30],
       ['Choi', 90.5, 'B', 40],
       ['jo', 85.7, 'B', 50]], dtype=object)

* dtypes
    - object : 문자열
    - float : 실수형
    - int : 정수형

In [39]:
# dtypes lists the data type of each column.
df.dtypes

name      object
score    float64
grade     object
age        int64
dtype: object

* isin
    - 각 값이 주어진 목록에 포함되는지 확인하여 True/False(boolean Series)로 반환

In [41]:
# Inspect the 'grade' column before filtering on it.
df['grade']

0    A
1    A
2    B
3    B
Name: grade, dtype: object

In [49]:
# isin() marks the rows whose grade is in the given list; .loc keeps the rows marked True.
df.loc[df['grade'].isin(['A'])]
# With a single value this selects the same rows as the mask df['grade'] == 'A'.

Unnamed: 0,name,score,grade,age
0,Kim,100.0,A,20
1,Cho,95.0,A,30


In [50]:
# Replace the default integer index with custom string labels, one per row.
index_name = ['111','222', '333', '홍길동']
df.index = index_name
df

Unnamed: 0,name,score,grade,age
111,Kim,100.0,A,20
222,Cho,95.0,A,30
333,Choi,90.5,B,40
홍길동,jo,85.7,B,50


In [51]:
# Slicing [] with string labels selects rows by index label; both end labels are included.
df['222':'333']

Unnamed: 0,name,score,grade,age
222,Cho,95.0,A,30
333,Choi,90.5,B,40


In [52]:
# Load the quiz CSV, replacing the previous df, and display it.
df =pd.read_csv('data/1.quiz.csv')
df

Unnamed: 0,경부선KTX,호남선KTX,경전선KTX,전라선KTX,동해선KTX,경부선등급
0,39060,7313,3627,309,300,B
1,39896,6967,4168,1771,350,B
2,42005,6873,4088,1954,450,A
3,43621,6626,4424,2244,560,A
4,41702,8675,4606,3146,1300,A
5,41266,10622,4984,3945,2395,A
6,32427,9228,5570,5766,3786,C


In [53]:
# Use the strings '2011'..'2017' (presumably years) as row labels so rows can be selected by label.
index_list = ['2011', '2012', '2013', '2014', '2015', '2016', '2017']
df.index = index_list
df

Unnamed: 0,경부선KTX,호남선KTX,경전선KTX,전라선KTX,동해선KTX,경부선등급
2011,39060,7313,3627,309,300,B
2012,39896,6967,4168,1771,350,B
2013,42005,6873,4088,1954,450,A
2014,43621,6626,4424,2244,560,A
2015,41702,8675,4606,3146,1300,A
2016,41266,10622,4984,3945,2395,A
2017,32427,9228,5570,5766,3786,C


In [54]:
# List the column labels of the quiz data.
df.columns

Index(['경부선KTX', '호남선KTX', '경전선KTX', '전라선KTX', '동해선KTX', '경부선등급'], dtype='object')

In [55]:
# Single brackets: one column returned as a Series.
df['경부선KTX']

2011    39060
2012    39896
2013    42005
2014    43621
2015    41702
2016    41266
2017    32427
Name: 경부선KTX, dtype: int64

In [59]:
# Double brackets: two columns returned as a DataFrame.
df[['경부선KTX','호남선KTX']]

Unnamed: 0,경부선KTX,호남선KTX
2011,39060,7313
2012,39896,6967
2013,42005,6873
2014,43621,6626
2015,41702,8675
2016,41266,10622
2017,32427,9228


In [115]:
# An integer slice in [] is positional even with a string index: rows at positions 1 through 4.
df[1:5]

Unnamed: 0,경부선KTX,호남선KTX,경전선KTX,전라선KTX,동해선KTX,경부선등급
2012,39896,6967,4168,1771,350,B
2013,42005,6873,4088,1954,450,A
2014,43621,6626,4424,2244,560,A
2015,41702,8675,4606,3146,1300,A


In [116]:
# A string slice in [] is label-based and includes the end label '2015'.
df['2012':'2015']

Unnamed: 0,경부선KTX,호남선KTX,경전선KTX,전라선KTX,동해선KTX,경부선등급
2012,39896,6967,4168,1771,350,B
2013,42005,6873,4088,1954,450,A
2014,43621,6626,4424,2244,560,A
2015,41702,8675,4606,3146,1300,A


In [64]:
# All rows of the single column, as a DataFrame, in one .loc call
# (the previous chained form ended in a no-op .loc[:]).
df.loc[:, ["경부선KTX"]]

Unnamed: 0,경부선KTX
2011,39060
2012,39896
2013,42005
2014,43621
2015,41702
2016,41266
2017,32427


In [88]:
# Rows from the start through label '2011' (inclusive), with every column.
df.loc[:'2011', :]

Unnamed: 0,경부선KTX,호남선KTX,경전선KTX,전라선KTX,동해선KTX,경부선등급
2011,39060,7313,3627,309,300,B


In [89]:
# Rows labelled '2013' through '2016', inclusive, with all columns.
df.loc['2013':'2016']

Unnamed: 0,경부선KTX,호남선KTX,경전선KTX,전라선KTX,동해선KTX,경부선등급
2013,42005,6873,4088,1954,450,A
2014,43621,6626,4424,2244,560,A
2015,41702,8675,4606,3146,1300,A
2016,41266,10622,4984,3945,2395,A


In [98]:
# Select the column as a DataFrame first, then slice rows by label (end label included).
df[["경부선KTX"]].loc['2012':'2014']

Unnamed: 0,경부선KTX
2012,39896
2013,42005
2014,43621


In [117]:
# Same result in the opposite order: slice rows by label, then select the column as a DataFrame.
df['2012':'2014'][['경부선KTX']]

Unnamed: 0,경부선KTX
2012,39896
2013,42005
2014,43621


In [100]:
# Rows '2015' through '2017' (inclusive) and three chosen columns, selected in a single
# .loc[rows, columns] call instead of chaining [[...]] with .loc.
df.loc['2015':'2017', ["경부선KTX", "호남선KTX", "동해선KTX"]]

Unnamed: 0,경부선KTX,호남선KTX,동해선KTX
2015,41702,8675,1300
2016,41266,10622,2395
2017,32427,9228,3786


In [109]:
# Keep the rows whose 경부선등급 value is 'B' or 'C'.
df.loc[df["경부선등급"].isin(["B","C"])]

Unnamed: 0,경부선KTX,호남선KTX,경전선KTX,전라선KTX,동해선KTX,경부선등급
2011,39060,7313,3627,309,300,B
2012,39896,6967,4168,1771,350,B
2017,32427,9228,5570,5766,3786,C


In [114]:
# Rows whose 경부선등급 is 'B' or 'C', restricted to the 경부선KTX column, using one
# .loc[mask, columns] call rather than indexing the filtered result a second time.
df.loc[df["경부선등급"].isin(["B", "C"]), ["경부선KTX"]]

Unnamed: 0,경부선KTX
2011,39060
2012,39896
2017,32427
