# Pandas 101 (with SQL)
이 자료는 데잇걸즈 3기 수업자료로 만들어졌고, [Pandas 10분 완성](https://dataitgirls2.github.io/10minutes2pandas/)을 참고했습니다.  

In [4]:
import pandas as pd
import numpy as np

# TOC
1. Object Creation (객체 생성)
2. Viewing Data (데이터 확인하기)
3. Selection (선택)
4. Missing Data (결측치)

# 1. Object Creation (객체 생성)

## Series

In [5]:
# Create a Series from a plain list; np.nan marks a missing value.
s = pd.Series(
    data=[1, 2, 3, 4, 5, np.nan, 8],
)

In [6]:
s

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    NaN
6    8.0
dtype: float64

In [9]:
# Series s의 길이
len(s)

7

## Dataframe

In [7]:
# Build a DataFrame from a dict of columns: the scalars under 'A' and 'B'
# are repeated on every row, while the Series under 'C' and 'D' supply one
# value per row.
df = pd.DataFrame(
    {
        'A': 1,
        'B': 'String',
        'C': s,
        'D': pd.Series([2, 3, 4, 5, 6, 7, 8]),
    }
)

In [8]:
df

Unnamed: 0,A,B,C,D
0,1,String,1.0,2
1,1,String,2.0,3
2,1,String,3.0,4
3,1,String,4.0,5
4,1,String,5.0,6
5,1,String,,7
6,1,String,8.0,8


In [10]:
# DataFrame df의 길이
len(df)

7

In [11]:
df.dtypes

A      int64
B     object
C    float64
D      int64
dtype: object

## Read data from outside

In [12]:
house_price = pd.read_csv('https://raw.githubusercontent.com/dataitgirls3/Data/master/house_price_sample.csv')

In [14]:
len(house_price)

1460

# 2. Viewing Data (데이터 확인하기)

## 데이터의 일부를 눈으로 확인하기

In [18]:
# df.head()
# df.head(2)
df.tail()

Unnamed: 0,A,B,C,D
2,1,String,3.0,4
3,1,String,4.0,5
4,1,String,5.0,6
5,1,String,,7
6,1,String,8.0,8


In [34]:
# house_price.head()

### 데이터 프레임의 컬럼만 가져오기

In [20]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [32]:
# house_price.columns

### 데이터 프레임의 값만 가져오기

In [21]:
df.values

array([[1, 'String', 1.0, 2],
       [1, 'String', 2.0, 3],
       [1, 'String', 3.0, 4],
       [1, 'String', 4.0, 5],
       [1, 'String', 5.0, 6],
       [1, 'String', nan, 7],
       [1, 'String', 8.0, 8]], dtype=object)

### 간단 요약통계

In [22]:
df.describe()

Unnamed: 0,A,C,D
count,7.0,6.0,7.0
mean,1.0,3.833333,5.0
std,0.0,2.483277,2.160247
min,1.0,1.0,2.0
25%,1.0,2.25,3.5
50%,1.0,3.5,5.0
75%,1.0,4.75,6.5
max,1.0,8.0,8.0


In [31]:
# house_price.describe()

### 데이터 프레임 전치

In [23]:
df.T

Unnamed: 0,0,1,2,3,4,5,6
A,1,1,1,1,1,1,1
B,String,String,String,String,String,String,String
C,1,2,3,4,5,,8
D,2,3,4,5,6,7,8


### 데이터 정렬하기

In [26]:
df.sort_index(axis=0, ascending=False)
# df.sort_index(axis=1, ascending=False)

Unnamed: 0,A,B,C,D
6,1,String,8.0,8
5,1,String,,7
4,1,String,5.0,6
3,1,String,4.0,5
2,1,String,3.0,4
1,1,String,2.0,3
0,1,String,1.0,2


In [29]:
'''sql
SELECT *
FROM df
ORDER BY df.D DESC
'''

df.sort_values(by='D', ascending=False)

Unnamed: 0,A,B,C,D
6,1,String,8.0,8
5,1,String,,7
4,1,String,5.0,6
3,1,String,4.0,5
2,1,String,3.0,4
1,1,String,2.0,3
0,1,String,1.0,2


In [37]:
# house_price.sort_values(by='SalePrice', ascending=False)

# 3. Selection (선택)

In [39]:
'''sql
SELECT df.C
FROM df
'''

df['C']

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    NaN
6    8.0
Name: C, dtype: float64

## Selection by Label

In [47]:
# With the default integer index, selection by label is easy to confuse
# with selection by position, so replace the index with dates first.
# This is how a range of dates is created in pandas: one consecutive day
# per row, starting at 2019-08-01. Using len(df) instead of a literal row
# count keeps the new index the same length as the frame.

df.index = pd.date_range('20190801', periods=len(df))

In [50]:
dates = df.index

In [55]:
# Print the first entry of dates; !s applies str() exactly as before.
print(f'dates안의 첫번째 데이터: {dates[0]!s}')

dates안의 첫번째 데이터: 2019-08-01 00:00:00


In [60]:
'''
df.loc[index, column]
'''

# df.loc[dates[0], :]
df.loc[dates[0]]

A         1
B    String
C         1
D         2
Name: 2019-08-01 00:00:00, dtype: object

In [64]:
df.loc['20190801':'20190803', ['A', 'B']]

Unnamed: 0,A,B
2019-08-01,1,String
2019-08-02,1,String
2019-08-03,1,String


In [63]:
# df.loc[:,'A']
df.loc[:,['A', 'B']]

Unnamed: 0,A,B
2019-08-01,1,String
2019-08-02,1,String
2019-08-03,1,String
2019-08-04,1,String
2019-08-05,1,String
2019-08-06,1,String
2019-08-07,1,String


In [67]:
%timeit df.loc['20190801', 'A']
%timeit df.at['20190801', 'A']

172 µs ± 27.9 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
133 µs ± 4.97 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## Selection by Position
Selection by Label에서 loc, at을 사용했다면, 데이터의 위치(0부터 시작하는 정수 순번)를 이용해 selection을 할 때에는 iloc, iat을 사용합니다. 앞에 붙은 i는 integer(정수) 위치를 뜻합니다.

In [70]:
df.iloc[0]  # first row by position; evaluated, but only the cell's last expression is displayed
df.iloc[1]  # second row by position; this is the result shown below

A         1
B    String
C         2
D         3
Name: 2019-08-02 00:00:00, dtype: object

In [73]:
df.iloc[0:2, 0:2]
df.iloc[[0, 1], [0, 1]]

Unnamed: 0,A,B
2019-08-01,1,String
2019-08-02,1,String


In [76]:
%timeit df.iloc[0, 0]
%timeit df.iat[0, 0]

9.27 µs ± 473 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
5.62 µs ± 98.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## Boolean Indexing

In [78]:
# Boolean Indexing 공부를 위해서 기존의 데이터프레임에 컬럼을 하나 추가 해보겠습니다.
df['F'] = ['Sydney', 'Brisbane', 'Brisbane', 'Melbourne', 'Sydney', 'Sydney', 'Sydney']

In [82]:
'''sql
SELECT *
FROM df
WHERE df.F = 'Sydney'
'''

df['F'] == 'Sydney'
# df[df['F'] == 'Sydney']

2019-08-01     True
2019-08-02    False
2019-08-03    False
2019-08-04    False
2019-08-05     True
2019-08-06     True
2019-08-07     True
Freq: D, Name: F, dtype: bool

In [84]:
'''sql
SELECT *
FROM df
WHERE df.F IN ('Sydney', 'Melbourne')
'''

# Series.isin builds a boolean mask that is True where F is one of the
# listed values; indexing df with that mask keeps only the matching rows.
df[df['F'].isin(['Sydney', 'Melbourne'])]

Unnamed: 0,A,B,C,D,F
2019-08-01,1,String,1.0,2,Sydney
2019-08-04,1,String,4.0,5,Melbourne
2019-08-05,1,String,5.0,6,Sydney
2019-08-06,1,String,,7,Sydney
2019-08-07,1,String,8.0,8,Sydney


In [85]:
'''sql
SELECT *
FROM df
WHERE df.D < 5
'''

df[df['D'] < 5]

Unnamed: 0,A,B,C,D,F
2019-08-01,1,String,1.0,2,Sydney
2019-08-02,1,String,2.0,3,Brisbane
2019-08-03,1,String,3.0,4,Brisbane


# 4. Missing Data (결측치)

In [87]:
df.dropna()

Unnamed: 0,A,B,C,D,F
2019-08-01,1,String,1.0,2,Sydney
2019-08-02,1,String,2.0,3,Brisbane
2019-08-03,1,String,3.0,4,Brisbane
2019-08-04,1,String,4.0,5,Melbourne
2019-08-05,1,String,5.0,6,Sydney
2019-08-07,1,String,8.0,8,Sydney


In [88]:
df.fillna('This was null')

Unnamed: 0,A,B,C,D,F
2019-08-01,1,String,1,2,Sydney
2019-08-02,1,String,2,3,Brisbane
2019-08-03,1,String,3,4,Brisbane
2019-08-04,1,String,4,5,Melbourne
2019-08-05,1,String,5,6,Sydney
2019-08-06,1,String,This was null,7,Sydney
2019-08-07,1,String,8,8,Sydney


In [90]:
df

Unnamed: 0,A,B,C,D,F
2019-08-01,1,String,1.0,2,Sydney
2019-08-02,1,String,2.0,3,Brisbane
2019-08-03,1,String,3.0,4,Brisbane
2019-08-04,1,String,4.0,5,Melbourne
2019-08-05,1,String,5.0,6,Sydney
2019-08-06,1,String,,7,Sydney
2019-08-07,1,String,8.0,8,Sydney


# 복습은
[Pandas 10분 완성](https://dataitgirls2.github.io/10minutes2pandas/)으로 해주세요.