# 26. Pandas Basic

## Pandas 는 Series data type 과 DataFrame data type 으로 구성된다.

### Series (1 차원) : numpy array 와 유사. 
- 차이점 - numpy 와 달리 Series 는 index 에 label 을 부여할 수 있다. 즉, numpy 와 같이 숫자로만 indexing 하는 것이 아니라 label 명으로도 indexing 을 할 수 있다. 또한 숫자뿐 아니라 임의의 Python object 를 모두 element 로 가질 수 있다.


### DataFrame (2차원, table)
- Python program 안의 Excel

### Series vs DataFrame
<img src="series-and-dataframe.width-1200.png" width="400">


<img src="base_01_pandas_5_0.png" width="400">

In [1]:
import pandas as pd
import numpy as np

## DataFrame 만들기

DataFrame 은 여러 개의 Series 를 같은 index 기준으로 모아 Table 을 만든 것이다.

- numpy array 를 이용한 DataFrame 생성

In [2]:
# Fix the global NumPy seed so the random table is reproducible.
np.random.seed(101)
# 5 rows x 4 columns of standard-normal samples.
data = np.random.standard_normal(size=(5, 4))

In [3]:
# Attach row labels A-E and column labels W-Z to the random array.
row_labels = list('ABCDE')
col_labels = list('WXYZ')
df = pd.DataFrame(data, index=row_labels, columns=col_labels)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


- Dictionary를 이용하여 DataFrame 생성

In [4]:
# Column-oriented data: each key becomes a column, each list holds its values.
songs = dict(
    book=["춘향전", "무기여 잘있거라", "소나기", "해리포터",
          "왕좌의 게임", "알고리즘파이썬"],
    year=[1600, 1930, 1940, 1990, 2000, 2010],
    pages=[50, 100, 100, 1000, 2000, 300],
)

# No index is given, so rows are labelled 0..5.
songs_df = pd.DataFrame(data=songs)
songs_df

Unnamed: 0,book,year,pages
0,춘향전,1600,50
1,무기여 잘있거라,1930,100
2,소나기,1940,100
3,해리포터,1990,1000
4,왕좌의 게임,2000,2000
5,알고리즘파이썬,2010,300


- page 가 많은 순서로 sort

In [5]:
# Sort by page count, largest first. inplace=True reorders songs_df itself
# (the call returns None); rows keep their original index labels.
songs_df.sort_values('pages', ascending=False, inplace=True)
songs_df

Unnamed: 0,book,year,pages
4,왕좌의 게임,2000,2000
3,해리포터,1990,1000
5,알고리즘파이썬,2010,300
1,무기여 잘있거라,1930,100
2,소나기,1940,100
0,춘향전,1600,50


### DataFrame indexing

- 단일 column을 이용한 indexing $\rightarrow$ Series 반환

In [6]:
songs_df['pages']

4    2000
3    1000
5     300
1     100
2     100
0      50
Name: pages, dtype: int64

In [7]:
type(songs_df['pages'])

pandas.core.series.Series

- 다중 column을 이용한 indexing $\rightarrow$ DataFrame 반환

In [8]:
y = songs_df[['book', 'year']]
y

Unnamed: 0,book,year
4,왕좌의 게임,2000
3,해리포터,1990
5,알고리즘파이썬,2010
1,무기여 잘있거라,1930
2,소나기,1940
0,춘향전,1600


## DataFrame의 slicing

- 순서 index를 이용한 slicing

In [9]:
df[:3]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001


- index 값(label)을 이용한 slicing — 순서 index 를 이용한 slicing 과 달리 끝 label 도 결과에 포함된다

In [10]:
df['A':'B']

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965


### loc - index value와 column 이름을 이용한 slicing

In [11]:
df.loc['B':'C', ['W', 'X']]

Unnamed: 0,W,X
B,0.651118,-0.319318
C,-2.018168,0.740122


### iloc - index 순서와 column 순서를 이용한 slicing

In [12]:
df.iloc[1:3, 0:2]

Unnamed: 0,W,X
B,0.651118,-0.319318
C,-2.018168,0.740122


In [13]:
df.iloc[1:3, :]

Unnamed: 0,W,X,Y,Z
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001


- 단일 value 선택 (행 순서, 열 순서를 각각 하나씩 지정)

In [14]:
df.iloc[1,1]

-0.31931804459303326

### 조건에 따른 slicing 
$\rightarrow$ boolean indexing

In [15]:
[df['W'] > 0]

[A     True
 B     True
 C    False
 D     True
 E     True
 Name: W, dtype: bool]

In [16]:
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [17]:
[(df['W'] > 0) | (df['Y'] > 0)]

[A    True
 B    True
 C    True
 D    True
 E    True
 dtype: bool]

- or 조건

In [18]:
df[(df['W'] > 0) | (df['Y'] > 0)]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


- and 조건

In [19]:
df[(df['W'] > 0) & (df['Y'] < 0)]

Unnamed: 0,W,X,Y,Z
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057


### index values

In [20]:
df.index.values

array(['A', 'B', 'C', 'D', 'E'], dtype=object)

### dataframe 의 column 명

In [21]:
df.columns

Index(['W', 'X', 'Y', 'Z'], dtype='object')

### column 별 unique value 의 개수

In [22]:
df.nunique()

W    5
X    5
Y    5
Z    5
dtype: int64

In [23]:
df['W'].unique()

array([ 2.70684984,  0.65111795, -2.01816824,  0.18869531,  0.19079432])

In [24]:
df['W'].value_counts()

 2.706850    1
 0.651118    1
-2.018168    1
 0.188695    1
 0.190794    1
Name: W, dtype: int64

- DataFrame 정보

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, A to E
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   W       5 non-null      float64
 1   X       5 non-null      float64
 2   Y       5 non-null      float64
 3   Z       5 non-null      float64
dtypes: float64(4)
memory usage: 372.0+ bytes


- 기술 통계

In [26]:
df.describe()

Unnamed: 0,W,X,Y,Z
count,5.0,5.0,5.0,5.0
mean,0.343858,0.453764,0.452287,0.431871
std,1.681131,1.061385,1.454516,0.594708
min,-2.018168,-0.758872,-0.933237,-0.589001
25%,0.188695,-0.319318,-0.848077,0.503826
50%,0.190794,0.628133,0.528813,0.605965
75%,0.651118,0.740122,0.907969,0.683509
max,2.70685,1.978757,2.605967,0.955057


### 새로운 column 추가/삭제

- column  추가

In [27]:
df['new'] = df['W'] + df['X']
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177
E,0.190794,1.978757,2.605967,0.683509,2.169552


- column 삭제

In [28]:
# Without inplace=True, drop() returns a new DataFrame and leaves df unchanged,
# so df still contains the 'new' column after this cell.
df.drop('new', axis=1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [29]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177
E,0.190794,1.978757,2.605967,0.683509,2.169552


In [30]:
df.drop('new', axis=1, inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


- 특정 index value 삭제

In [31]:
df.drop('D', axis=0, inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
E,0.190794,1.978757,2.605967,0.683509


- DataFrame의 형상

In [32]:
df.shape

(4, 4)

- 임의의 column value 일부를 missing value 로 변경

In [33]:
df.loc['A':'B', ['Y']] = np.nan
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,,0.503826
B,0.651118,-0.319318,,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
E,0.190794,1.978757,2.605967,0.683509


### missing value 를 포함하고 있는 모든 row 삭제 - default

In [34]:
df.dropna()

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001
E,0.190794,1.978757,2.605967,0.683509


### missing value 를 포함하고 있는 모든 column 삭제

In [35]:
df.dropna(axis=1)

Unnamed: 0,W,X,Z
A,2.70685,0.628133,0.503826
B,0.651118,-0.319318,0.605965
C,-2.018168,0.740122,-0.589001
E,0.190794,1.978757,0.683509


### missing value 대체

- missing value를 0 으로 채우기

In [36]:
df.fillna(value=0)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.0,0.503826
B,0.651118,-0.319318,0.0,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
E,0.190794,1.978757,2.605967,0.683509


- missing value를 특정 column의 평균값으로 채우기

In [37]:
# NOTE: a scalar value fills NaN in every column, not only 'Y'. The result is
# the same here because 'Y' is the only column that contains NaN; pass a dict
# such as value={'Y': ...} to restrict the fill to one column.
df.fillna(value=df['Y'].mean())

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,1.56739,0.503826
B,0.651118,-0.319318,1.56739,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
E,0.190794,1.978757,2.605967,0.683509


### csv file 읽기

In [38]:
df = pd.read_csv("winequality-red.csv", sep=";")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


### DataFrame 을 csv file 로 쓰기

In [39]:
# Keep three columns and write them to a new CSV file.
# to_csv() writes the index as an unnamed first column by default, which is
# why the file is read back with index_col=0.
df2 = df.loc[:, ['quality', 'alcohol', 'pH']]
df2.to_csv('winequality-2.csv')

In [40]:
pd.read_csv('winequality-2.csv', index_col=0).head()

Unnamed: 0,quality,alcohol,pH
0,5,9.4,3.51
1,5,9.8,3.2
2,5,9.8,3.26
3,6,9.8,3.16
4,5,9.4,3.51


### DataFrame 에 함수 적용 - df.apply
#### df.apply + lambda 함수

In [41]:
# axis=1 calls the lambda once per row; x is that row as a Series, so x[...]
# looks up a value by column name. The result is one value per row.
df.apply(lambda x :  x["fixed acidity"] + x["citric acid"] , axis = 1 )

0        7.40
1        7.80
2        7.84
3       11.76
4        7.40
        ...  
1594     6.28
1595     6.00
1596     6.43
1597     6.02
1598     6.47
Length: 1599, dtype: float64

In [42]:
df["New_val"] = df.apply(lambda x :  x["fixed acidity"] + x["citric acid"] , axis = 1 )
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,New_val
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,7.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,7.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,7.84
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,11.76
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,7.4


#### lambda 보다 더 복잡한 내용의 함수 적용

In [43]:
def custom(x):
    """Return a rescaled pH for one row, chosen by the row's alcohol value.

    x is any object indexable by 'alcohol' and 'pH' (a DataFrame row when
    called through df.apply(..., axis=1)). The pH is multiplied by 1.5 when
    alcohol is below 10, and by -1 otherwise.
    """
    factor = 1.5 if x['alcohol'] < 10 else -1
    return x['pH'] * factor

df["New_pH"] = df.apply(custom, axis = 1 )
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,New_val,New_pH
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,7.4,5.265
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,7.8,4.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,7.84,4.89
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,11.76,4.74
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,7.4,5.265


### read_sql을 이용한 SQL DB access

In [44]:
import sqlite3

# Open the SQLite database file; con is what the read_sql calls receive.
con = sqlite3.connect('emaildb.sqlite')
# NOTE(review): cur is not used by the read_sql cells that follow, and con is
# never closed in the cells visible here — confirm whether later cells need them.
cur = con.cursor()

In [45]:
df = pd.read_sql('select * from Counts', con=con)
df.head()

Unnamed: 0,email,count
0,stephen.marquard@uct.ac.za,2
1,louis@media.berkeley.edu,3
2,zqian@umich.edu,4
3,rjlowe@iupui.edu,2
4,cwen@iupui.edu,5


In [46]:
# Load only the 10 rows with the highest count, sorted in descending order.
df = pd.read_sql('SELECT email, count FROM Counts ORDER BY count DESC LIMIT 10', con=con)
# tail() shows the last five of those ten rows, i.e. the lowest counts returned.
df.tail()

Unnamed: 0,email,count
5,stephen.marquard@uct.ac.za,2
6,rjlowe@iupui.edu,2
7,wagnermr@iupui.edu,1
8,antranig@caret.cam.ac.uk,1
9,gopal.ramasammycook@gmail.com,1
