# 004. Basic Pandas

## Pandas 는 Series data type 과 DataFrame data type 으로 구성된다.

### Series (1 차원) : numpy array 와 유사. 
- 차이점 - numpy 와 달리 Series 는 각 element 에 index label 을 부여할 수 있다. 즉, numpy 와 같이 숫자 위치로만 indexing 하는 것이 아니라 label 명으로도 indexing 을 할 수 있다. 또한 숫자 뿐 아니라 임의의 Python object 를 모두 element 로 가질 수 있다.


### DataFrame (2차원, table)
- Python program 안의 Excel

### Series vs DataFrame
<img src="series-and-dataframe.width-1200.png" width="600">


<img src="base_01_pandas_5_0.png" width="600">

In [1]:
import numpy as np
import pandas as pd

### Series 생성

- list, numpy array, dictionary 를 모두 Series 로 변환할 수 있다. 
- dictionary 의 경우 key 가 label, value 가 value 로 변환된다.

In [17]:
# Build a Series from a dictionary: the keys become the index labels and
# the values become the data.  The variable is deliberately not named
# `dict`, which would shadow Python's built-in dict type for every
# statement that runs afterwards.
data_dict = {'a': 10, 'b': 20, 'c': 30, 'd': 40}

series1 = pd.Series(data_dict)    # dictionary
series1

a    10
b    20
c    30
d    40
dtype: int64

In [19]:
# Build a Series from a plain list, supplying the index labels explicitly.
labels = ['a', 'b', 'c', 'd']
my_list = [10, 20, 30, 40]

series2 = pd.Series(my_list, index=labels)     # list
series2

a    10
b    20
c    30
d    40
dtype: int64

In [20]:
# Build a Series from a numpy array.  No index is supplied, so the
# elements are labelled with the default integers 0, 1, 2, 3.
arr = np.array([10, 20, 30, 40])

series3 = pd.Series(data=arr)      # numpy array
series3

0    10
1    20
2    30
3    40
dtype: int64

### Series 의 indexing

In [21]:
# Label-based slice: both end points are included, so 'b' and 'c' are returned.
series1['b':'c']

b    20
c    30
dtype: int64

In [22]:
# Integer slice on the default index: returns the entries labelled 1 and 2;
# the stop value 3 is excluded.
series3[1:3]

1    20
2    30
dtype: int64

### Series  간의 연산

In [24]:
# Display both Series together (as a tuple); they share the labels a-d.
series1, series2

(a    10
 b    20
 c    30
 d    40
 dtype: int64,
 a    10
 b    20
 c    30
 d    40
 dtype: int64)

In [25]:
# Element-wise addition; values are paired by their index labels (a-d).
series1 + series2

a    20
b    40
c    60
d    80
dtype: int64

### DataFrame 

DataFrame 은 여러 개의 Series 를 같은 index 기준으로 모아 Table 을 만든 것이다.

In [62]:
import pandas as pd
import numpy as np

In [63]:
# Fix the random seed so the sample values are reproducible.
np.random.seed(101)
# 5 x 4 array of draws from the standard normal distribution.
data = np.random.randn(5, 4)

In [64]:
# Wrap the 5 x 4 array in a DataFrame: rows are labelled A-E and
# columns are labelled W-Z.
df = pd.DataFrame(
    data=data,
    index=list('ABCDE'),
    columns=list('WXYZ'),
)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


### dataframe 의 column 명

In [65]:
# Column labels of the DataFrame, returned as an Index object.
df.columns

Index(['W', 'X', 'Y', 'Z'], dtype='object')

### column 별 unique value 의 개수

In [66]:
# Number of distinct values in each column.
df.nunique()

W    5
X    5
Y    5
Z    5
dtype: int64

In [67]:
# How many times each distinct value occurs in column W.
df['W'].value_counts()

-2.018168    1
 0.651118    1
 0.188695    1
 2.706850    1
 0.190794    1
Name: W, dtype: int64

In [68]:
# Print a summary of the frame: index range, per-column non-null counts
# and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, A to E
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   W       5 non-null      float64
 1   X       5 non-null      float64
 2   Y       5 non-null      float64
 3   Z       5 non-null      float64
dtypes: float64(4)
memory usage: 200.0+ bytes


In [69]:
# Per-column summary statistics: count, mean, std, min, quartiles, max.
df.describe()

Unnamed: 0,W,X,Y,Z
count,5.0,5.0,5.0,5.0
mean,0.343858,0.453764,0.452287,0.431871
std,1.681131,1.061385,1.454516,0.594708
min,-2.018168,-0.758872,-0.933237,-0.589001
25%,0.188695,-0.319318,-0.848077,0.503826
50%,0.190794,0.628133,0.528813,0.605965
75%,0.651118,0.740122,0.907969,0.683509
max,2.70685,1.978757,2.605967,0.955057


### DataFrame indexing

In [70]:
# Select a single column by name.
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [71]:
# A single selected column is a pandas Series.
type(df['W'])

pandas.core.series.Series

In [72]:
# Passing a list of column names returns a DataFrame with just those columns.
df[['W', 'Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [73]:
# Attribute-style column access; gives the same result as df['W'].
df.W

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

### 조건에 따른 slicing

In [74]:
# Boolean mask: keep only the rows where column W is positive.
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [75]:
# Combine two masks with | (element-wise OR): keep rows where W or Y is
# positive.  Each comparison needs its own parentheses because | binds
# more tightly than >.
df[(df['W'] > 0) | (df['Y'] > 0)]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


### index values

In [76]:
# The row labels as a numpy array.
df.index.values

array(['A', 'B', 'C', 'D', 'E'], dtype=object)

### new column 추가/삭제

In [38]:
# Add a column named 'new' holding the element-wise sum of W and X.
df['new'] = df['W'] + df['X']
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177
E,0.190794,1.978757,2.605967,0.683509,2.169552


In [39]:
# axis=1 drops a column.  Without inplace=True the result is a new
# DataFrame and df itself is left unchanged.
df.drop('new', axis=1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [40]:
# df still contains the 'new' column: the drop above did not modify it.
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177
E,0.190794,1.978757,2.605967,0.683509,2.169552


In [41]:
# inplace=True removes the column from df itself.
df.drop('new', axis=1, inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [42]:
# axis=0 drops a row, identified by its index label ('D'), again in place.
df.drop('D', axis=0, inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
E,0.190794,1.978757,2.605967,0.683509


In [43]:
# (number of rows, number of columns) after row D has been dropped.
df.shape

(4, 4)

### Dictionary Example

In [52]:
# Build a DataFrame from a dictionary of equal-length lists: each key
# becomes a column name and each list supplies that column's values.
# NOTE: despite the variable name, the records describe books.
songs = {
    "book": [
        "춘향전", "무기여 잘있거라", "소나기",
        "해리포터", "왕좌의 게임", "알고리즘파이썬",
    ],
    "year": [1600, 1930, 1940, 1990, 2000, 2010],
    "pages": [50, 100, 100, 1000, 2000, 300],
}

songs_df = pd.DataFrame(data=songs)
songs_df

Unnamed: 0,book,year,pages
0,춘향전,1600,50
1,무기여 잘있거라,1930,100
2,소나기,1940,100
3,해리포터,1990,1000
4,왕좌의 게임,2000,2000
5,알고리즘파이썬,2010,300


- page 가 많은 순서로 sort

In [54]:
# Sort the rows by 'pages', largest first, modifying songs_df in place.
# Each row keeps its original index label.
songs_df.sort_values('pages', ascending=False, inplace=True)
songs_df

Unnamed: 0,book,year,pages
4,왕좌의 게임,2000,2000
3,해리포터,1990,1000
5,알고리즘파이썬,2010,300
1,무기여 잘있거라,1930,100
2,소나기,1940,100
0,춘향전,1600,50


In [55]:
# Select the 'book' and 'year' columns into a separate DataFrame.
y = songs_df[['book', 'year']]
y

Unnamed: 0,book,year
4,왕좌의 게임,2000
3,해리포터,1990
5,알고리즘파이썬,2010
1,무기여 잘있거라,1930
2,소나기,1940
0,춘향전,1600


In [56]:
# Position-based selection: rows at positions 1-2 and columns at positions
# 0-1 (stop positions are excluded).  Positions follow the current sorted
# order, not the index labels.
songs_df.iloc[1:3, 0:2]

Unnamed: 0,book,year
3,해리포터,1990
5,알고리즘파이썬,2010


In [57]:
# Rows at positions 1 and 2, with all columns.
songs_df.iloc[1:3, :]

Unnamed: 0,book,year,pages
3,해리포터,1990,1000
5,알고리즘파이썬,2010,300


In [59]:
# Single value at row position 1, column position 1.
songs_df.iloc[1,1]

1990

### Missing Data 처리

- missing data 가 있는 row 혹은 column 을 완전히 삭제 : dropna()
- 임의의 data 로 대체 : fillna()

In [44]:
import pandas as pd
import numpy as np

In [53]:
# Small frame with missing values: column A has one NaN, column B has two,
# and column C has none.
df = pd.DataFrame(
    {
        'A': [1, 2, np.nan],
        'B': [5, np.nan, np.nan],
        'C': [1, 2, 3],
    }
)

In [46]:
# Display the frame together with its missing entries.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


### missing value 를 포함하고 있는 모든 row 삭제 - default

In [47]:
# Default behaviour: drop every row that contains at least one missing
# value.  Only row 0 is complete, so only it remains.
df.dropna()

Unnamed: 0,A,B,C
0,1.0,5.0,1


### missing value 를 포함하고 있는 모든 column 삭제

In [48]:
# axis=1: drop every column that contains a missing value; only C remains.
df.dropna(axis=1)

Unnamed: 0,C
0,1
1,2
2,3


### missing value 대체

In [49]:
# Replace every missing value with 0.
df.fillna(value=0)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,0.0,2
2,0.0,0.0,3


In [50]:
# Fill the missing entry in column A with the mean of its non-missing
# values (1 and 2, so 1.5).
df['A'].fillna(value=df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

### naive Python 과 Pandas 의 속도 비교

In [37]:
# Series of 10,000 random integers drawn from [0, 1000); the values differ
# between runs because no seed is set here.
s = pd.Series(np.random.randint(0, 1000, 10000))
# Show the first five elements.
s.head()

0     31
1    602
2    353
3    590
4    952
dtype: int32

In [25]:
%%timeit -n 100
# Naive approach: accumulate the total one element at a time in a Python loop.
summary = 0
for item in s:
    summary += item

1.44 ms ± 152 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [26]:
%%timeit -n 100
# Same total computed with a single np.sum call instead of a Python loop.
summary = np.sum(s)

113 µs ± 22.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### loop 보다 pandas 의 vectorized (벡터화) 연산이 훨씬 빠르다

In [53]:
%%time
s = pd.Series(np.random.randint(0, 100, 10000))
# Deliberately slow element-by-element update, shown for contrast with a
# vectorized `s += 2`.  Series.iteritems() was deprecated in pandas 1.5 and
# removed in pandas 2.0; Series.items() yields the same (label, value) pairs.
for idx, val in s.items():
    s.loc[idx] = val + 2

Wall time: 630 ms


In [54]:
%%time
s = pd.Series(np.random.randint(0, 100, 10000))
# Vectorized update: add 2 to every element in a single operation.
s += 2

Wall time: 994 µs
