# pandas
- '엑셀'처럼 표 모양
- https://pandas.pydata.org/
- 파이썬에서 사용하는 데이터 분석 라이브러리
- 행과 열로 이루어진 데이터 객체를 만들어 다룰 수 있으며, 대용량 데이터를 보다 안정적으로 처리하는 데 매우 편리하다
---

### Series

In [1]:
# A Series handles a single column of data.

import pandas as pd

# Even numbers 2..10; no index is given, so the default 0..4 labels are used.
obj = pd.Series(range(2, 11, 2))
print(obj)

0     2
1     4
2     6
3     8
4    10
dtype: int64


In [3]:
# Inspect the three parts of a Series, each followed by its Python type.
# The raw values (a NumPy array):
print(obj.values)
print(type(obj.values))

# The row labels (a RangeIndex when no index was supplied):
print(obj.index)
print(type(obj.index))

# The element data type:
print(obj.dtype)
print(type(obj.dtype))

[ 2  4  6  8 10]
<class 'numpy.ndarray'>
RangeIndex(start=0, stop=5, step=1)
<class 'pandas.core.indexes.range.RangeIndex'>
int64
<class 'numpy.dtypes.Int64DType'>


In [6]:
# Odd numbers 1..9 labeled with the letters a..e instead of the default 0..n-1.
obj = pd.Series(range(1, 10, 2), index=list("abcde"))
print(obj)

a    1
b    3
c    5
d    7
e    9
dtype: int64


In [7]:
# Build a Series from a dict: the keys become the index labels and the
# values become the data.
dic_data = dict(x=100, y=200, z=300)
obj = pd.Series(data=dic_data)
print(obj)

x    100
y    200
z    300
dtype: int64


In [9]:
# Relabel the Series by assigning a new list of labels to .index;
# the values themselves are unchanged.
obj.index = ["Q", "W", "E"]
print(obj)

Q    100
W    200
E    300
dtype: int64


In [10]:
# Attribute access with "." walks down into nested objects
# (obj -> its index -> the index's name).
# The index name is printed above the labels; the Series name is printed in
# the footer next to the dtype.
obj.index.name = "idx"
obj.name = "my_data"
print(obj)

idx
Q    100
W    200
E    300
Name: my_data, dtype: int64


### DataFrame

In [14]:
# Handle data as rows and columns, i.e. in table form --
# much like an Excel sheet.

# Each key is a column name; each list holds that column's values, one per row.
data = dict(
    name=["A", "B", "C", "D"],
    age=[20, 27, 35, 40],
    blood=["b", "a", "o", "ab"],
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,name,age,blood
0,A,20,b
1,B,27,a
2,C,35,o
3,D,40,ab


In [15]:
# Inspect the parts of a DataFrame: row labels, column labels, and the
# underlying values (a 2-D NumPy array, one inner row per DataFrame row).
print(df.index)
print(df.columns)
print(df.values)
print(type(df.values))

RangeIndex(start=0, stop=4, step=1)
Index(['name', 'age', 'blood'], dtype='object')
[['A' 20 'b']
 ['B' 27 'a']
 ['C' 35 'o']
 ['D' 40 'ab']]
<class 'numpy.ndarray'>


In [17]:
# Both axes can carry a name: "No." titles the row index and "Info" titles
# the column header when the frame is displayed.
df.index.name = "No."
df.columns.name = "Info"
df

Info,name,age,blood
No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,A,20,b
1,B,27,a
2,C,35,o
3,D,40,ab


In [18]:
# Rebuild the frame with an explicit column list and custom row labels 1..4.
# "MBTI" is not a key in `data`, so that column is created filled with NaN.
df = pd.DataFrame(data, columns=["name", "age", "blood", "MBTI"],
                  index=[1, 2, 3, 4])
print(df)

  name  age blood MBTI
1    A   20     b  NaN
2    B   27     a  NaN
3    C   35     o  NaN
4    D   40    ab  NaN


In [21]:
# Summary statistics (count, mean, std, min, quartiles, max).
print(df.describe()) # age is the only numeric column, so it is the only one summarized

             age
count   4.000000
mean   30.500000
std     8.812869
min    20.000000
25%    25.250000
50%    31.000000
75%    36.250000
max    40.000000


In [26]:
# Two ways to select one column as a Series: bracket lookup and attribute
# access. Both print the same result here.
print(df["name"])
print(df.name)

1    A
2    B
3    C
4    D
Name: name, dtype: object
1    A
2    B
3    C
4    D
Name: name, dtype: object


In [24]:
# Passing a list of column names selects several columns and returns a DataFrame.
print(df[["name", "MBTI"]])

  name MBTI
1    A  NaN
2    B  NaN
3    C  NaN
4    D  NaN


In [27]:
# Assigning a scalar to a new column name creates the column and fills
# every row with that value.
df["point"] = 0
print(df)

  name  age blood MBTI  point
1    A   20     b  NaN      0
2    B   27     a  NaN      0
3    C   35     o  NaN      0
4    D   40    ab  NaN      0


In [29]:
# Assigning a list overwrites the column with one value per row, in row order.
df["point"] = [100, 200, 300, 0]
print(df)

  name  age blood MBTI  point
1    A   20     b  NaN    100
2    B   27     a  NaN    200
3    C   35     o  NaN    300
4    D   40    ab  NaN      0


In [30]:
import numpy as np

# A NumPy array works as column data too; np.arange(4) supplies 0, 1, 2, 3.
df["np_idx"] = np.arange(4)
print(df)

  name  age blood MBTI  point  np_idx
1    A   20     b  NaN    100       0
2    B   27     a  NaN    200       1
3    C   35     o  NaN    300       2
4    D   40    ab  NaN      0       3


In [34]:
# Assigning a Series joins on the index labels, not on position.
# The Series below has fewer entries than the frame has rows: labels 2, 3, 4
# receive their values, and row 1 (no matching label) ends up as NaN.
val = pd.Series([-1.2, -1.5, -1.7], index=[2, 3, 4])
df["minus"] = val
print(df)

  name  age blood MBTI  point  np_idx  minus
1    A   20     b  NaN    100       0    NaN
2    B   27     a  NaN    200       1   -1.2
3    C   35     o  NaN    300       2   -1.5
4    D   40    ab  NaN      0       3   -1.7


In [36]:
# Overwrite an existing column with the values of another column.
df["np_idx"] = df["age"]
print(df)

  name  age blood MBTI  point  np_idx  minus
1    A   20     b  NaN    100      20    NaN
2    B   27     a  NaN    200      27   -1.2
3    C   35     o  NaN    300      35   -1.5
4    D   40    ab  NaN      0      40   -1.7


In [37]:
# Element-wise comparison yields a boolean column: True where age is even.
df["bool_test"] = df["age"] % 2 == 0
print(df)

  name  age blood MBTI  point  np_idx  minus  bool_test
1    A   20     b  NaN    100      20    NaN       True
2    B   27     a  NaN    200      27   -1.2      False
3    C   35     o  NaN    300      35   -1.5      False
4    D   40    ab  NaN      0      40   -1.7       True


In [39]:
# Slicing a DataFrame with [] cuts by row.
# 0:2 is positional here: it returns the first two rows (labels 1 and 2),
# with the end position excluded.
print(df[0:2])

  name  age blood MBTI  point  np_idx  minus  bool_test
1    A   20     b  NaN    100      20    NaN       True
2    B   27     a  NaN    200      27   -1.2      False


In [41]:
# Switch to string row labels, then slice by label.
df.index = ["one", "two", "three", "four"]
print(df)
print(df["two":"three"]) # unlike a positional slice, a label slice includes its end label

      name  age blood MBTI  point  np_idx  minus  bool_test
one      A   20     b  NaN    100      20    NaN       True
two      B   27     a  NaN    200      27   -1.2      False
three    C   35     o  NaN    300      35   -1.5      False
four     D   40    ab  NaN      0      40   -1.7       True
      name  age blood MBTI  point  np_idx  minus  bool_test
two      B   27     a  NaN    200      27   -1.2      False
three    C   35     o  NaN    300      35   -1.5      False


In [42]:
# .loc selects by label.
# A single label returns that row as a Series (column names become its index).
print(df.loc["two"])
print("="*20)
# A label slice returns a DataFrame and includes both end labels.
print(df.loc["two":"three"])

name             B
age             27
blood            a
MBTI           NaN
point          200
np_idx          27
minus         -1.2
bool_test    False
Name: two, dtype: object
      name  age blood MBTI  point  np_idx  minus  bool_test
two      B   27     a  NaN    200      27   -1.2      False
three    C   35     o  NaN    300      35   -1.5      False


In [44]:
# .loc[row_labels, column_labels]
# Rows "two" through "three", single column "point" -> a Series.
print(df.loc["two":"three", "point"])
print("="*20)
# All rows (":"), columns "name" through "blood" inclusive -> a DataFrame.
print(df.loc[:, "name":"blood"])

two      200
three    300
Name: point, dtype: int64
      name  age blood
one      A   20     b
two      B   27     a
three    C   35     o
four     D   40    ab


In [45]:
# `del` removes a column from the frame in place.
del df["np_idx"]
print(df)

      name  age blood MBTI  point  minus  bool_test
one      A   20     b  NaN    100    NaN       True
two      B   27     a  NaN    200   -1.2      False
three    C   35     o  NaN    300   -1.5      False
four     D   40    ab  NaN      0   -1.7       True


In [47]:
# Insert a new row: assigning through .loc to a label that does not exist yet
# ("five") appends it. The list supplies one value per column, in column order.
# NOTE: in the printed result, age and point are shown as floats (20.0, 100.0)
# after this assignment, where they were integers before.
df.loc["five", :] = ["E", 30, 'ab', 'INTJ', 0, -1, False]
print(df)

      name   age blood  MBTI  point  minus bool_test
one      A  20.0     b   NaN  100.0    NaN      True
two      B  27.0     a   NaN  200.0   -1.2     False
three    C  35.0     o   NaN  300.0   -1.5     False
four     D  40.0    ab   NaN    0.0   -1.7      True
five     E  30.0    ab  INTJ    0.0   -1.0     False


In [48]:
# .iloc selects by integer position ("index location"), ignoring the labels.

# Position 1 is the second row ("two"), returned as a Series.
print(df.iloc[1])
print("=" * 20)
# A positional slice excludes its end, so 1:2 is a one-row DataFrame.
print(df.iloc[1:2])

name             B
age           27.0
blood            a
MBTI           NaN
point        200.0
minus         -1.2
bool_test    False
Name: two, dtype: object
    name   age blood MBTI  point  minus bool_test
two    B  27.0     a  NaN  200.0   -1.2     False


In [51]:
# .iloc[row_positions, column_positions]
# First two rows, first two columns.
print(df.iloc[0:2, 0:2])
# Lists pick arbitrary positions: rows 0, 1, 3 and columns 0, 3 (name, MBTI).
print(df.iloc[[0, 1, 3], [0, 3]])
# All rows, columns at positions 1..3 (age, blood, MBTI).
print(df.iloc[:, 1:4])

    name   age
one    A  20.0
two    B  27.0
     name MBTI
one     A  NaN
two     B  NaN
four    D  NaN
        age blood  MBTI
one    20.0     b   NaN
two    27.0     a   NaN
three  35.0     o   NaN
four   40.0    ab   NaN
five   30.0    ab  INTJ


In [54]:
# Comparing a column to a scalar gives a boolean Series, one True/False per row.
print(df["age"] < 22)

one       True
two      False
three    False
four     False
five     False
Name: age, dtype: bool


In [55]:
# Pass the boolean Series to .loc as the row selector: only the rows where
# the condition is True are kept (":" keeps every column).
print(df.loc[df["age"] < 22, :])

    name   age blood MBTI  point  minus bool_test
one    A  20.0     b  NaN  100.0    NaN      True


In [60]:
# Filter rows with a condition and pick specific columns in the same .loc call.
print(df.loc[df["name"] == "A", ["name", "age"]])
print("=" * 20)
# Combining conditions (each one wrapped in parentheses):
# |  means OR  (union of the matching rows)
# &  means AND (intersection of the matching rows)
print(df.loc[(df["name"] == "A") | (df["name"] == "B"), ["name", "age"]])

    name   age
one    A  20.0
    name   age
one    A  20.0
two    B  27.0


In [61]:
# Conditional update: on every row where point equals 0, set point to 10000.
df.loc[df["point"] == 0, "point"] = 10000
print(df)

      name   age blood  MBTI    point  minus bool_test
one      A  20.0     b   NaN    100.0    NaN      True
two      B  27.0     a   NaN    200.0   -1.2     False
three    C  35.0     o   NaN    300.0   -1.5     False
four     D  40.0    ab   NaN  10000.0   -1.7      True
five     E  30.0    ab  INTJ  10000.0   -1.0     False


### data

In [63]:
# Fresh 6x4 frame of random values from np.random.randn; with no labels given,
# both rows and columns get default integer labels. Values differ on every run.
df = pd.DataFrame(np.random.randn(6, 4))
df

Unnamed: 0,0,1,2,3
0,0.432102,-1.341641,-0.827634,-0.130513
1,-0.757702,-2.365341,0.104112,-0.122465
2,-0.719475,-1.499598,-0.315091,-0.562294
3,-1.932665,-0.446101,-0.035566,-0.229672
4,1.344641,0.33748,1.108901,-0.124109
5,-0.600142,-1.735893,-0.132964,-0.367987


In [65]:
# Dates as the row index.
# Name the columns A..D and label the rows with six consecutive days
# starting at 2026-01-01 (a DatetimeIndex with daily frequency).
df.columns = ["A", "B", "C", "D"]
df.index = pd.date_range("20260101", periods=6)
print(df.index)
print("=" * 100)
print(df)

DatetimeIndex(['2026-01-01', '2026-01-02', '2026-01-03', '2026-01-04',
               '2026-01-05', '2026-01-06'],
              dtype='datetime64[ns]', freq='D')
                   A         B         C         D
2026-01-01  0.432102 -1.341641 -0.827634 -0.130513
2026-01-02 -0.757702 -2.365341  0.104112 -0.122465
2026-01-03 -0.719475 -1.499598 -0.315091 -0.562294
2026-01-04 -1.932665 -0.446101 -0.035566 -0.229672
2026-01-05  1.344641  0.337480  1.108901 -0.124109
2026-01-06 -0.600142 -1.735893 -0.132964 -0.367987


In [67]:
# Missing values can also be inserted explicitly with np.nan.
df["F"] = [1.0, np.nan, 3.5, 6.1, np.nan, 7.0]
df

Unnamed: 0,A,B,C,D,F
2026-01-01,0.432102,-1.341641,-0.827634,-0.130513,1.0
2026-01-02,-0.757702,-2.365341,0.104112,-0.122465,
2026-01-03,-0.719475,-1.499598,-0.315091,-0.562294,3.5
2026-01-04,-1.932665,-0.446101,-0.035566,-0.229672,6.1
2026-01-05,1.344641,0.33748,1.108901,-0.124109,
2026-01-06,-0.600142,-1.735893,-0.132964,-0.367987,7.0


In [68]:
# Handling missing values.
# how="any": drop every row that contains at least one NaN.
# The original frame is left untouched -- the second print still shows all rows.

print(df.dropna(how="any"))
print("=" * 100)
print(df)

                   A         B         C         D    F
2026-01-01  0.432102 -1.341641 -0.827634 -0.130513  1.0
2026-01-03 -0.719475 -1.499598 -0.315091 -0.562294  3.5
2026-01-04 -1.932665 -0.446101 -0.035566 -0.229672  6.1
2026-01-06 -0.600142 -1.735893 -0.132964 -0.367987  7.0
                   A         B         C         D    F
2026-01-01  0.432102 -1.341641 -0.827634 -0.130513  1.0
2026-01-02 -0.757702 -2.365341  0.104112 -0.122465  NaN
2026-01-03 -0.719475 -1.499598 -0.315091 -0.562294  3.5
2026-01-04 -1.932665 -0.446101 -0.035566 -0.229672  6.1
2026-01-05  1.344641  0.337480  1.108901 -0.124109  NaN
2026-01-06 -0.600142 -1.735893 -0.132964 -0.367987  7.0


In [69]:
# how="all": drop a row only when every value in it is NaN.
# No row here is entirely NaN, so nothing is removed.
print(df.dropna(how="all"))
print("=" * 100)
print(df)

                   A         B         C         D    F
2026-01-01  0.432102 -1.341641 -0.827634 -0.130513  1.0
2026-01-02 -0.757702 -2.365341  0.104112 -0.122465  NaN
2026-01-03 -0.719475 -1.499598 -0.315091 -0.562294  3.5
2026-01-04 -1.932665 -0.446101 -0.035566 -0.229672  6.1
2026-01-05  1.344641  0.337480  1.108901 -0.124109  NaN
2026-01-06 -0.600142 -1.735893 -0.132964 -0.367987  7.0
                   A         B         C         D    F
2026-01-01  0.432102 -1.341641 -0.827634 -0.130513  1.0
2026-01-02 -0.757702 -2.365341  0.104112 -0.122465  NaN
2026-01-03 -0.719475 -1.499598 -0.315091 -0.562294  3.5
2026-01-04 -1.932665 -0.446101 -0.035566 -0.229672  6.1
2026-01-05  1.344641  0.337480  1.108901 -0.124109  NaN
2026-01-06 -0.600142 -1.735893 -0.132964 -0.367987  7.0


In [70]:
# Fill instead of drop: replace every NaN with the given value (0.5).
print(df.fillna(value=0.5))

                   A         B         C         D    F
2026-01-01  0.432102 -1.341641 -0.827634 -0.130513  1.0
2026-01-02 -0.757702 -2.365341  0.104112 -0.122465  0.5
2026-01-03 -0.719475 -1.499598 -0.315091 -0.562294  3.5
2026-01-04 -1.932665 -0.446101 -0.035566 -0.229672  6.1
2026-01-05  1.344641  0.337480  1.108901 -0.124109  0.5
2026-01-06 -0.600142 -1.735893 -0.132964 -0.367987  7.0


In [71]:
# Boolean frame of the same shape: True wherever the value is missing.
print(df.isnull())

                A      B      C      D      F
2026-01-01  False  False  False  False  False
2026-01-02  False  False  False  False   True
2026-01-03  False  False  False  False  False
2026-01-04  False  False  False  False  False
2026-01-05  False  False  False  False   True
2026-01-06  False  False  False  False  False


In [73]:
# Select the rows whose F value is missing. Read it inside-out:
#   df.isnull()        -> boolean frame, True where a value is NaN
#   df.isnull()["F"]   -> just the F column of that frame (a boolean Series)
#   df.loc[mask, :]    -> keeps the rows where the mask is True, all columns
# (Author's note: this one was confusing -- worth revisiting.)
print(df.loc[df.isnull()["F"], :])

                   A         B         C         D   F
2026-01-02 -0.757702 -2.365341  0.104112 -0.122465 NaN
2026-01-05  1.344641  0.337480  1.108901 -0.124109 NaN


In [74]:
# Parse a date string into a pandas Timestamp (midnight of that day).
pd.to_datetime("20260102")

Timestamp('2026-01-02 00:00:00')

In [75]:
# drop() with a row label returns a frame without that row.
print(df.drop(pd.to_datetime("20260102")))
print("=" * 100)
# A list of labels drops several rows at once (01-02 and 01-04 here).
df.drop([pd.to_datetime("20260102"), pd.to_datetime("20260104")])

                   A         B         C         D    F
2026-01-01  0.432102 -1.341641 -0.827634 -0.130513  1.0
2026-01-03 -0.719475 -1.499598 -0.315091 -0.562294  3.5
2026-01-04 -1.932665 -0.446101 -0.035566 -0.229672  6.1
2026-01-05  1.344641  0.337480  1.108901 -0.124109  NaN
2026-01-06 -0.600142 -1.735893 -0.132964 -0.367987  7.0


Unnamed: 0,A,B,C,D,F
2026-01-01,0.432102,-1.341641,-0.827634,-0.130513,1.0
2026-01-03,-0.719475,-1.499598,-0.315091,-0.562294,3.5
2026-01-05,1.344641,0.33748,1.108901,-0.124109,
2026-01-06,-0.600142,-1.735893,-0.132964,-0.367987,7.0


In [76]:
# axis=1 makes drop() remove a column instead of a row.
# The second print shows that df itself still has column F.
print(df.drop("F", axis=1))
print(df)

                   A         B         C         D
2026-01-01  0.432102 -1.341641 -0.827634 -0.130513
2026-01-02 -0.757702 -2.365341  0.104112 -0.122465
2026-01-03 -0.719475 -1.499598 -0.315091 -0.562294
2026-01-04 -1.932665 -0.446101 -0.035566 -0.229672
2026-01-05  1.344641  0.337480  1.108901 -0.124109
2026-01-06 -0.600142 -1.735893 -0.132964 -0.367987
                   A         B         C         D    F
2026-01-01  0.432102 -1.341641 -0.827634 -0.130513  1.0
2026-01-02 -0.757702 -2.365341  0.104112 -0.122465  NaN
2026-01-03 -0.719475 -1.499598 -0.315091 -0.562294  3.5
2026-01-04 -1.932665 -0.446101 -0.035566 -0.229672  6.1
2026-01-05  1.344641  0.337480  1.108901 -0.124109  NaN
2026-01-06 -0.600142 -1.735893 -0.132964 -0.367987  7.0


In [77]:
# axis=0 targets rows; here the row label is given as a plain date string.
# The second print shows that df itself still has the 2026-01-01 row.
print(df.drop("20260101", axis=0))
print(df)

                   A         B         C         D    F
2026-01-02 -0.757702 -2.365341  0.104112 -0.122465  NaN
2026-01-03 -0.719475 -1.499598 -0.315091 -0.562294  3.5
2026-01-04 -1.932665 -0.446101 -0.035566 -0.229672  6.1
2026-01-05  1.344641  0.337480  1.108901 -0.124109  NaN
2026-01-06 -0.600142 -1.735893 -0.132964 -0.367987  7.0
                   A         B         C         D    F
2026-01-01  0.432102 -1.341641 -0.827634 -0.130513  1.0
2026-01-02 -0.757702 -2.365341  0.104112 -0.122465  NaN
2026-01-03 -0.719475 -1.499598 -0.315091 -0.562294  3.5
2026-01-04 -1.932665 -0.446101 -0.035566 -0.229672  6.1
2026-01-05  1.344641  0.337480  1.108901 -0.124109  NaN
2026-01-06 -0.600142 -1.735893 -0.132964 -0.367987  7.0


In [None]:
# Pass a list with axis=1 to drop several columns (A and F) in one call,
# then print df again for comparison.
print(df.drop(["A", "F"], axis=1))
print("=" * 100)
print(df)

### 함수

In [78]:
# Small 4x2 frame with deliberate gaps (NaN): one inner list per row,
# rows labeled a..d and columns named "one" and "two".
data = [
    [1.4, np.nan],
    [7.1, -4.5],
    [np.nan, np.nan],
    [0.75, -1.3],
]
df = pd.DataFrame(data, index=list("abcd"), columns=["one", "two"])
print(df)

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3


In [79]:
# Show only the first 2 rows.
print(df.head(2))

   one  two
a  1.4  NaN
b  7.1 -4.5


In [80]:
# Structure overview: index range, per-column non-null counts, dtypes, memory.
# NOTE: info() writes its report itself and the value handed to print() is
# None, which is why the output ends with a stray "None" line.
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, a to d
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   one     3 non-null      float64
 1   two     2 non-null      float64
dtypes: float64(2)
memory usage: 96.0+ bytes
None


In [None]:
# axis=0: one total per column.
print(df.sum(axis=0))
# axis=1: one total per row. NaN is skipped, so the all-NaN row "c" sums to 0.
print(df.sum(axis=1))
# skipna=False: do not skip NaN -- any row containing a NaN sums to NaN.
print(df.sum(axis=1, skipna=False))

one    9.25
two   -5.80
dtype: float64
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64
a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64


In [84]:
# 6x4 frame of random values (np.random.randn), columns A..D, rows labeled
# with six consecutive days starting at 2026-01-01.
df = pd.DataFrame(np.random.randn(6, 4),
                  columns=["A", "B", "C", "D"],
                  index=pd.date_range("20260101", periods=6))

print(df)
print("=" * 100)
# Scramble the frame so there is something to sort afterwards: shuffle the
# row labels, then reindex so rows follow the shuffled order and columns are
# rearranged to D, B, C, A. The values travel with their labels.
dates = df.index
random_dates = np.random.permutation(dates)
df = df.reindex(index=random_dates, columns=["D", "B", "C", "A"])
print(df)

                   A         B         C         D
2026-01-01  0.740978  0.164794 -0.712688 -2.070679
2026-01-02  0.342140 -0.376441  1.893490 -1.864039
2026-01-03  0.456616  1.374367 -0.240985 -0.524825
2026-01-04 -0.295223  0.112463  0.893490  0.881018
2026-01-05 -0.389622 -0.271304 -0.173253 -1.441069
2026-01-06  0.135871  0.352796  1.025553 -1.939911
                   D         B         C         A
2026-01-02 -1.864039 -0.376441  1.893490  0.342140
2026-01-03 -0.524825  1.374367 -0.240985  0.456616
2026-01-05 -1.441069 -0.271304 -0.173253 -0.389622
2026-01-06 -1.939911  0.352796  1.025553  0.135871
2026-01-04  0.881018  0.112463  0.893490 -0.295223
2026-01-01 -2.070679  0.164794 -0.712688  0.740978


In [None]:
# Sort the rows by their index labels (axis=0); for this frame that puts the
# dates back in chronological order. Column order is untouched.
print(df.sort_index(axis=0))
print("=" * 100)
# Sort the columns by their labels (axis=1): A, B, C, D. Row order is untouched.
print(df.sort_index(axis=1))
print("=" * 100)
# Same column sort, but in descending label order: D, C, B, A.
print(df.sort_index(axis=1, ascending=False))

# Sort along both axes by chaining the two calls: rows first (axis=0), then
# columns (axis=1). The second call must use axis=1 -- repeating axis=0 would
# only sort the rows twice and leave the columns in their shuffled order.
print(df.sort_index(axis=0).sort_index(axis=1))

                   D         B         C         A
2026-01-01 -2.070679  0.164794 -0.712688  0.740978
2026-01-02 -1.864039 -0.376441  1.893490  0.342140
2026-01-03 -0.524825  1.374367 -0.240985  0.456616
2026-01-04  0.881018  0.112463  0.893490 -0.295223
2026-01-05 -1.441069 -0.271304 -0.173253 -0.389622
2026-01-06 -1.939911  0.352796  1.025553  0.135871
                   A         B         C         D
2026-01-02  0.342140 -0.376441  1.893490 -1.864039
2026-01-03  0.456616  1.374367 -0.240985 -0.524825
2026-01-05 -0.389622 -0.271304 -0.173253 -1.441069
2026-01-06  0.135871  0.352796  1.025553 -1.939911
2026-01-04 -0.295223  0.112463  0.893490  0.881018
2026-01-01  0.740978  0.164794 -0.712688 -2.070679
                   D         C         B         A
2026-01-02 -1.864039  1.893490 -0.376441  0.342140
2026-01-03 -0.524825 -0.240985  1.374367  0.456616
2026-01-05 -1.441069 -0.173253 -0.271304 -0.389622
2026-01-06 -1.939911  1.025553  0.352796  0.135871
2026-01-04  0.881018  0.893490 

In [87]:
# Sort the rows by the values in column D (ascending) rather than by label.
print(df.sort_values(by="D"))

                   D         B         C         A
2026-01-01 -2.070679  0.164794 -0.712688  0.740978
2026-01-06 -1.939911  0.352796  1.025553  0.135871
2026-01-02 -1.864039 -0.376441  1.893490  0.342140
2026-01-05 -1.441069 -0.271304 -0.173253 -0.389622
2026-01-03 -0.524825  1.374367 -0.240985  0.456616
2026-01-04  0.881018  0.112463  0.893490 -0.295223


In [None]:
# Add a column of random integers (E) and a column of repeated strings (F)
# so that a sort key can contain duplicate values.
df["E"] = np.random.randint(0, 6, size=6)
df["F"] = ["alpha", "beta", "gamma", "gamma", "alpha", "gamma"]
print(df)
print("=" * 100)
# Sort by several columns: E is the primary key, F orders rows that share
# the same E value.
print(df.sort_values(by=['E', 'F']))