# pandas

- 구조화된 데이터 처리를 지원하는 Python 라이브러리
- 고성능 array 계산 라이브러리 numpy와 통합하여 강력한 속도 및 처리 기능 제공
- 데이터 처리 및 통계 분석을 위해 사용

- 데이터 전체: Data Table, Sample
- 상단: attribute, field, feature, column
- 데이터 record: instance, tuple, row
- column 하나에 속한 값들의 모음: feature vector
- 값 하나: data, value

In [1]:
import pandas as pd

In [4]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data

--2024-07-04 21:13:25--  https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘housing.data’

housing.data            [  <=>               ]  47.93K   164KB/s    in 0.3s    

2024-07-04 21:13:27 (164 KB/s) - ‘housing.data’ saved [49082]



In [5]:
# The file is whitespace-delimited and has no header row. The separator is a
# raw string so that "\s" is passed to pandas as a regex instead of being
# treated as an (invalid) string escape sequence by Python.
df_data = pd.read_csv("housing.data", sep=r"\s+", header=None)

In [6]:
# Display the parsed frame; with header=None the columns are labelled 0-13.
df_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


## Series
- DataFrame 중 하나의 Column에 해당하는 데이터의 모음 Object
- 기존 데이터를 불러와서 DataFrame 생성

In [11]:
from pandas import Series

# Index labels do not have to be unique: "a" labels the first two entries.
example_obj = Series(range(1, 6), index=["a", "a", "c", "d", "e"])
example_obj

a    1
a    2
c    3
d    4
e    5
dtype: int64

In [13]:
# Index labels of the Series (the duplicate "a" is kept as-is).
example_obj.index

Index(['a', 'a', 'c', 'd', 'e'], dtype='object')

In [14]:
# The stored values as a NumPy array, without the index.
example_obj.values

array([1, 2, 3, 4, 5])

In [15]:
# "a" labels two entries, so this assignment overwrites both of them.
example_obj["a"] = 10

In [18]:
# Convert the values to float and rebind the name to the converted Series.
example_obj = example_obj.astype(float)
example_obj

a    10.0
a    10.0
c     3.0
d     4.0
e     5.0
dtype: float64

In [19]:
# `indexes` lists two more labels ("f", "g") than the dict provides values for.
indexes = list("abcdefg")
dict_data_1 = {label: number for number, label in enumerate(indexes[:5], start=1)}

# Build the Series from the dict, but laid out along the longer label list.
example_obj_2 = Series(data=dict_data_1, index=indexes)

In [20]:
# Labels with no entry in dict_data_1 ("f", "g") show up as NaN; the result is float64.
example_obj_2

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
f    NaN
g    NaN
dtype: float64

# DataFrame

- column이 지정됨
- 각 column별로 data type은 다를 수 있음
- Series를 모아서 만든 Data Table = 2차원

In [22]:
# Sample records, written one person per row and transposed into the
# column-name -> list-of-values mapping used to build the DataFrame.
raw_data = {
    field: list(values)
    for field, values in zip(
        ("first_name", "last_name", "age", "city"),
        zip(
            ("Jason", "Miller", 42, "San Francisco"),
            ("Molly", "Jacobson", 52, "Baltimore"),
            ("Tina", "Ali", 36, "Miami"),
            ("Jake", "Milner", 24, "Douglas"),
            ("Amy", "Cooze", 73, "Boston"),
        ),
    )
}

In [25]:
# Column order follows `columns`. "debt" has no entry in raw_data, so it is
# created as a column with no values (NaN in every row).
df = pd.DataFrame(raw_data, columns=["first_name", "last_name", "age", "city", "debt"])
df

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,
1,Molly,Jacobson,52,Baltimore,
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,
4,Amy,Cooze,73,Boston,


In [26]:
# Attribute-style column access; the column comes back as a Series.
df.first_name

0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object

In [27]:
# Bracket-style column access; gives the same Series as df.first_name.
df["first_name"]

0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object

In [28]:
# Select the row whose index label is 1; the result is a Series keyed by column name.
df.loc[1]

first_name        Molly
last_name      Jacobson
age                  52
city          Baltimore
debt                NaN
Name: 1, dtype: object

In [29]:
# Positional lookup inside the age column: the element at position 1.
df["age"].iloc[1]

52

In [32]:
import numpy as np

# Integer labels that deliberately differ from positions (49..45, then 1..5),
# so that label-based and position-based slicing give different results.
s = pd.Series(np.nan, index=[*range(49, 44, -1), *range(1, 6)])
s

49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
dtype: float64

In [33]:
# Label-based slice: starts at the entry labelled 1 and runs to the end.
s.loc[1:]

1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
dtype: float64

In [34]:
# Position-based slice: skips only the first entry, whatever its label is.
s.iloc[1:]

48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
dtype: float64

In [36]:
# Element-wise comparison; the result is a boolean Series.
df.age > 40

0     True
1     True
2    False
3    False
4     True
Name: age, dtype: bool

In [37]:
# Store the boolean mask in the debt column. Bracket assignment is used because
# attribute assignment (df.debt = ...) only reaches a column that already
# exists; for a new name it would set a plain Python attribute instead.
df["debt"] = df["age"] > 40
df

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,True
1,Molly,Jacobson,52,Baltimore,True
2,Tina,Ali,36,Miami,False
3,Jake,Milner,24,Douglas,False
4,Amy,Cooze,73,Boston,True


In [38]:
# Transpose: column names become the row labels and row labels become the columns.
df.T

Unnamed: 0,0,1,2,3,4
first_name,Jason,Molly,Tina,Jake,Amy
last_name,Miller,Jacobson,Ali,Milner,Cooze
age,42,52,36,24,73
city,San Francisco,Baltimore,Miami,Douglas,Boston
debt,True,True,False,False,True


In [39]:
# All cell values as a 2-D NumPy array (dtype is object for this frame).
df.values

array([['Jason', 'Miller', 42, 'San Francisco', True],
       ['Molly', 'Jacobson', 52, 'Baltimore', True],
       ['Tina', 'Ali', 36, 'Miami', False],
       ['Jake', 'Milner', 24, 'Douglas', False],
       ['Amy', 'Cooze', 73, 'Boston', True]], dtype=object)

In [40]:
# Row index of the frame.
df.index

RangeIndex(start=0, stop=5, step=1)

In [41]:
# Remove the debt column from df itself (in place).
del df["debt"]

In [42]:
# Show df now that the debt column has been deleted.
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami
3,Jake,Milner,24,Douglas
4,Amy,Cooze,73,Boston


In [44]:
# Drop the age column (axis=1 targets columns). The result is only displayed;
# df is not reassigned, so it still has the age column afterwards.
df.drop("age", axis=1)

Unnamed: 0,first_name,last_name,city
0,Jason,Miller,San Francisco
1,Molly,Jacobson,Baltimore
2,Tina,Ali,Miami
3,Jake,Milner,Douglas
4,Amy,Cooze,Boston


## Selection & Drop

In [46]:
# First three values of the age column.
df["age"].head(3)

0    42
1    52
2    36
Name: age, dtype: int64

In [47]:
# A list of column names selects several columns and returns a DataFrame.
df[["first_name", "age"]].head(3)

Unnamed: 0,first_name,age
0,Jason,42
1,Molly,52
2,Tina,36


In [49]:
# Rebuild df from raw_data so the (empty) debt column is present again.
df = pd.DataFrame(raw_data, columns=["first_name", "last_name", "age", "city", "debt"])
df

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,
1,Molly,Jacobson,52,Baltimore,
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,
4,Amy,Cooze,73,Boston,


In [51]:
# A slice inside [] selects rows: here the first three.
df[:3]

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,
1,Molly,Jacobson,52,Baltimore,
2,Tina,Ali,36,Miami,


In [53]:
# Select several entries of the column by passing a list of indices.
df["first_name"][[0,1,2]]

0    Jason
1    Molly
2     Tina
Name: first_name, dtype: object

In [56]:
# The list may reorder or repeat indices; the result follows the list order.
df["first_name"][[1, 3, 3, 2]]

1    Molly
3     Jake
3     Jake
2     Tina
Name: first_name, dtype: object

In [57]:
# Boolean mask: keep only the rows where age is greater than 50.
df[df["age"] > 50]

Unnamed: 0,first_name,last_name,age,city,debt
1,Molly,Jacobson,52,Baltimore,
4,Amy,Cooze,73,Boston,


## basic, loc, iloc

In [58]:
# Basic indexing: choose two columns, then slice the first two rows.
df[["first_name", "last_name"]][:2]

Unnamed: 0,first_name,last_name
0,Jason,Miller
1,Molly,Jacobson


In [60]:
# loc: rows by index label, columns by name, in a single call.
df.loc[[0,1], ["first_name", "last_name"]]

Unnamed: 0,first_name,last_name
0,Jason,Miller
1,Molly,Jacobson


In [62]:
# iloc: purely positional; first two rows and first column.
df.iloc[:2, :1]

Unnamed: 0,first_name
0,Jason
1,Molly


In [63]:
# The displayed result has the old index moved into an "index" column;
# df itself is not reassigned here.
df.reset_index()

Unnamed: 0,index,first_name,last_name,age,city,debt
0,0,Jason,Miller,42,San Francisco,
1,1,Molly,Jacobson,52,Baltimore,
2,2,Tina,Ali,36,Miami,
3,3,Jake,Milner,24,Douglas,
4,4,Amy,Cooze,73,Boston,


In [64]:
# Drop the row with index label 1. The result is only displayed; df keeps all rows.
df.drop(1)

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,
4,Amy,Cooze,73,Boston,


In [65]:
# A list of labels drops several rows at once.
df.drop([1, 3])

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,
2,Tina,Ali,36,Miami,
4,Amy,Cooze,73,Boston,


- `drop`은 기본적으로 새로운 DataFrame을 반환하며 원본 df는 바뀌지 않음. df 자체에 결과를 반영하려면 `df = df.drop(...)`처럼 다시 할당하거나 `inplace=True`를 지정해야 함

# DataFrame Operations

## Series operation

In [66]:
# The two Series are aligned on their labels before adding. s2 carries the
# label "e" twice, and labels found on only one side ("a", "f") yield NaN.
s1 = Series(data=range(1, 6), index=list("abced"))
s2 = Series(data=range(5, 11), index=list("bcedef"))

s1 + s2

a     NaN
b     7.0
c     9.0
d    13.0
e    11.0
e    13.0
f     NaN
dtype: float64

In [67]:
# Two frames of different shape (3x3 and 4x4) whose columns overlap on a, b, c;
# the values simply count up row by row.
d1 = pd.DataFrame(np.arange(3 * 3).reshape(3, -1), columns=list("abc"))
d2 = pd.DataFrame(np.arange(4 * 4).reshape(4, -1), columns=list("abcd"))

In [68]:
# + aligns on both row index and columns; cells present in only one frame become NaN.
d1 + d2

Unnamed: 0,a,b,c,d
0,0.0,2.0,4.0,
1,7.0,9.0,11.0,
2,14.0,16.0,18.0,
3,,,,


In [69]:
# fill_value=0 stands in for a cell that is missing from one of the two frames,
# so those positions keep the other frame's value instead of becoming NaN.
d1.add(d2, fill_value=0)

Unnamed: 0,a,b,c,d
0,0.0,2.0,4.0,3.0
1,7.0,9.0,11.0,7.0
2,14.0,16.0,18.0,11.0
3,12.0,13.0,14.0,15.0


In [70]:
# With +, the Series labels (0..3) are matched against d2's column names (a..d).
# Nothing matches, so the result holds the union of both label sets, all NaN.
s = Series(np.arange(10, 14))
d2+s

Unnamed: 0,a,b,c,d,0,1,2,3
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,,,,,,,,


In [74]:
# axis=0 matches the Series labels against d2's row index instead, so each
# row of d2 is increased by the corresponding element of s.
d2.add(s, axis=0)

Unnamed: 0,a,b,c,d
0,10,11,12,13
1,15,16,17,18
2,20,21,22,23
3,25,26,27,28


## lambda, map, apply

In [75]:
# map applies the given function to every element of the Series.
s1 = Series(np.arange(10))
s1.map(lambda x: x**2)

0     0
1     1
2     4
3     9
4    16
5    25
6    36
7    49
8    64
9    81
dtype: int64

- map: dict를 넘기면 각 값을 dict의 key에 대응하는 값으로 mapping 가능 (dict에 없는 값은 NaN이 됨)
- replace: mapping 중 데이터 변환만 담당 (dict에 없는 값은 원래 값 그대로 유지)

In [77]:
def f(x):
    """Return ``x`` raised to the power of two."""
    squared = pow(x, 2)
    return squared

# map also accepts a named function; the result matches the lambda version.
s1.map(f)

0     0
1     1
2     4
3     9
4    16
5    25
6    36
7    49
8    64
9    81
dtype: int64

In [78]:
# Mapping with a dict: values that are not keys of the dict become NaN.
x = {1: "A", 2: "B"}

s1.map(x)

0    NaN
1      A
2      B
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
dtype: object

In [79]:
# replace swaps only the values that are keys of the dict and keeps the rest unchanged.
s1.replace(x)

0    0
1    A
2    B
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: object

In [81]:
# Keep only the two name columns for the apply example.
df_info = df[["first_name", "last_name"]]

In [82]:
# Show the two-column selection.
df_info

Unnamed: 0,first_name,last_name
0,Jason,Miller
1,Molly,Jacobson
2,Tina,Ali
3,Jake,Milner
4,Amy,Cooze


In [84]:
# Lower-case every first name with apply. The lambda is passed inline rather
# than bound to `f`, which would silently replace the squaring function `f`
# defined earlier in the notebook.
df_info["first_name"].apply(lambda name: name.lower())

0    jason
1    molly
2     tina
3     jake
4      amy
Name: first_name, dtype: object

# pandas built-in functions

In [87]:
# Summary statistics; for this frame only the numeric age column is reported.
df.describe()

Unnamed: 0,age
count,5.0
mean,45.4
std,18.460769
min,24.0
25%,36.0
50%,42.0
75%,52.0
max,73.0


In [90]:
# Distinct values of the age column, returned as an array.
df['age'].unique()

array([42, 52, 36, 24, 73])

In [91]:
# Total of the age column.
df['age'].sum()

227

In [93]:
# Number of missing values per column (isnull marks them, sum counts the True entries).
df.isnull().sum()

first_name    0
last_name     0
age           0
city          0
debt          5
dtype: int64

In [94]:
# Sort rows by age first, then by city for rows with equal age.
df.sort_values(["age", "city"])

Unnamed: 0,first_name,last_name,age,city,debt
3,Jake,Milner,24,Douglas,
2,Tina,Ali,36,Miami,
0,Jason,Miller,42,San Francisco,
1,Molly,Jacobson,52,Baltimore,
4,Amy,Cooze,73,Boston,


In [96]:
# Correlation of the age column with itself, which is 1.0.
df["age"].corr(df["age"])

1.0

In [97]:
# Covariance of the age column with itself, i.e. its variance.
df["age"].cov(df["age"])

340.8