# Object creation (2024-01-26)

In [2]:
import numpy as np
import pandas as pd

In [3]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
dates = pd.date_range("20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.150375,0.282225,0.813313,0.95463
2013-01-02,0.442253,-1.206131,1.178163,-0.728062
2013-01-03,-0.106463,1.508199,-1.051491,1.17193
2013-01-04,2.734634,0.085089,-0.753946,-0.585387
2013-01-05,-2.141621,0.882776,-0.236388,0.443722
2013-01-06,-1.61553,0.989847,0.581488,-0.254552


In [6]:
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1,index=list(range(4)),dtype="float32"),
        "D": np.array([3]*4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo", #특이한게 ,로 끝내도 되고 아닌 것으로 끝내도 되네?
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [7]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [8]:
#df2.<TAB> # noqa: E225, E999

#df2.까지 입력후 탭누르라는 의미, vscode에서는 탭안눌러도 자동으로 표시 단 컬럼은 안 표시됨

# Viewing data (2024-01-27)

In [9]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.150375,0.282225,0.813313,0.95463
2013-01-02,0.442253,-1.206131,1.178163,-0.728062
2013-01-03,-0.106463,1.508199,-1.051491,1.17193
2013-01-04,2.734634,0.085089,-0.753946,-0.585387
2013-01-05,-2.141621,0.882776,-0.236388,0.443722


In [10]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,2.734634,0.085089,-0.753946,-0.585387
2013-01-05,-2.141621,0.882776,-0.236388,0.443722
2013-01-06,-1.61553,0.989847,0.581488,-0.254552


In [11]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [12]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [13]:
df.to_numpy()

array([[ 0.15037501,  0.28222547,  0.8133131 ,  0.95463029],
       [ 0.44225336, -1.20613143,  1.17816304, -0.72806165],
       [-0.10646277,  1.50819946, -1.05149137,  1.17192997],
       [ 2.73463412,  0.08508892, -0.75394638, -0.58538735],
       [-2.14162089,  0.88277645, -0.23638817,  0.44372198],
       [-1.61553049,  0.98984699,  0.58148758, -0.25455238]])

In [14]:
df.values

array([[ 0.15037501,  0.28222547,  0.8133131 ,  0.95463029],
       [ 0.44225336, -1.20613143,  1.17816304, -0.72806165],
       [-0.10646277,  1.50819946, -1.05149137,  1.17192997],
       [ 2.73463412,  0.08508892, -0.75394638, -0.58538735],
       [-2.14162089,  0.88277645, -0.23638817,  0.44372198],
       [-1.61553049,  0.98984699,  0.58148758, -0.25455238]])

In [15]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [16]:
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [17]:
df2.values

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [18]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.089392,0.423668,0.088523,0.167047
std,1.723734,0.948518,0.902287,0.806443
min,-2.141621,-1.206131,-1.051491,-0.728062
25%,-1.238264,0.134373,-0.624557,-0.502679
50%,0.021956,0.582501,0.17255,0.094585
75%,0.369284,0.963079,0.755357,0.826903
max,2.734634,1.508199,1.178163,1.17193


In [19]:
df.T
#행과 열을 바꿔주는데 미리 계산되어 있음
#메소드가 아닌 속성임

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.150375,0.442253,-0.106463,2.734634,-2.141621,-1.61553
B,0.282225,-1.206131,1.508199,0.085089,0.882776,0.989847
C,0.813313,1.178163,-1.051491,-0.753946,-0.236388,0.581488
D,0.95463,-0.728062,1.17193,-0.585387,0.443722,-0.254552


In [20]:
df.T.index
#값만 바꿔서 나타내는가해서 확인용으로 해줬는데 인덱스도 확실히 바뀌어있다

Index(['A', 'B', 'C', 'D'], dtype='object')

In [21]:
df.sort_index(axis=1, ascending=False)
#axis = 1 하면 열끼리 정렬하는 것

Unnamed: 0,D,C,B,A
2013-01-01,0.95463,0.813313,0.282225,0.150375
2013-01-02,-0.728062,1.178163,-1.206131,0.442253
2013-01-03,1.17193,-1.051491,1.508199,-0.106463
2013-01-04,-0.585387,-0.753946,0.085089,2.734634
2013-01-05,0.443722,-0.236388,0.882776,-2.141621
2013-01-06,-0.254552,0.581488,0.989847,-1.61553


In [22]:
df.sort_values(by="B")
#df안에 있는 특정 열의 값에 대해서 정렬해줄 수 있음

Unnamed: 0,A,B,C,D
2013-01-02,0.442253,-1.206131,1.178163,-0.728062
2013-01-04,2.734634,0.085089,-0.753946,-0.585387
2013-01-01,0.150375,0.282225,0.813313,0.95463
2013-01-05,-2.141621,0.882776,-0.236388,0.443722
2013-01-06,-1.61553,0.989847,0.581488,-0.254552
2013-01-03,-0.106463,1.508199,-1.051491,1.17193


# Selection (2024-01-28)

## Getitem ([])

In [23]:
df["A"]

2013-01-01    0.150375
2013-01-02    0.442253
2013-01-03   -0.106463
2013-01-04    2.734634
2013-01-05   -2.141621
2013-01-06   -1.615530
Freq: D, Name: A, dtype: float64

In [24]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.150375,0.282225,0.813313,0.95463
2013-01-02,0.442253,-1.206131,1.178163,-0.728062
2013-01-03,-0.106463,1.508199,-1.051491,1.17193


In [25]:
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-02,0.442253,-1.206131,1.178163,-0.728062
2013-01-03,-0.106463,1.508199,-1.051491,1.17193
2013-01-04,2.734634,0.085089,-0.753946,-0.585387


## Selection by label

In [26]:
df.loc[dates[0]]

A    0.150375
B    0.282225
C    0.813313
D    0.954630
Name: 2013-01-01 00:00:00, dtype: float64

In [27]:
df.loc[dates[0],:]

A    0.150375
B    0.282225
C    0.813313
D    0.954630
Name: 2013-01-01 00:00:00, dtype: float64

In [28]:
df.loc[:,["A","B"]]

Unnamed: 0,A,B
2013-01-01,0.150375,0.282225
2013-01-02,0.442253,-1.206131
2013-01-03,-0.106463,1.508199
2013-01-04,2.734634,0.085089
2013-01-05,-2.141621,0.882776
2013-01-06,-1.61553,0.989847


In [29]:
df.loc["20130102":"20130104",["A","B"]]

Unnamed: 0,A,B
2013-01-02,0.442253,-1.206131
2013-01-03,-0.106463,1.508199
2013-01-04,2.734634,0.085089


In [30]:
df.loc["20130102":"20130104",["B","A"]]

Unnamed: 0,B,A
2013-01-02,-1.206131,0.442253
2013-01-03,1.508199,-0.106463
2013-01-04,0.085089,2.734634


In [31]:
df.loc[dates[0],"A"]

0.1503750063476736

In [32]:
df.at[dates[0],"A"]

0.1503750063476736

## Selection by position

In [33]:
df.iloc[3]

A    2.734634
B    0.085089
C   -0.753946
D   -0.585387
Name: 2013-01-04 00:00:00, dtype: float64

In [34]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,2.734634,0.085089
2013-01-05,-2.141621,0.882776


In [35]:
#비교 loc는 슬라이싱 할때 끝부분까지로 인식(끝부분 포함됨) iloc는 끝부분제외
#df.loc[3:5, 0:2] 아 맞다 이런 식으로 숫자 인덱스로 안됬다
df.loc["20130104":"20130105", "A":"B"]

Unnamed: 0,A,B
2013-01-04,2.734634,0.085089
2013-01-05,-2.141621,0.882776


In [36]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,0.442253,1.178163
2013-01-03,-0.106463,-1.051491
2013-01-05,-2.141621,-0.236388


In [37]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,0.442253,-1.206131,1.178163,-0.728062
2013-01-03,-0.106463,1.508199,-1.051491,1.17193


In [38]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,0.282225,0.813313
2013-01-02,-1.206131,1.178163
2013-01-03,1.508199,-1.051491
2013-01-04,0.085089,-0.753946
2013-01-05,0.882776,-0.236388
2013-01-06,0.989847,0.581488


In [39]:
df.iloc[1,1]

-1.2061314273940253

In [40]:
df.iat[1,1]

-1.2061314273940253

## Boolean indexing (조건에 따른 컬럼 선택)

In [41]:
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.150375,0.282225,0.813313,0.95463
2013-01-02,0.442253,-1.206131,1.178163,-0.728062
2013-01-04,2.734634,0.085089,-0.753946,-0.585387


In [42]:
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,0.150375,0.282225,0.813313,0.95463
2013-01-02,0.442253,,1.178163,
2013-01-03,,1.508199,,1.17193
2013-01-04,2.734634,0.085089,,
2013-01-05,,0.882776,,0.443722
2013-01-06,,0.989847,0.581488,


In [43]:
df2=df.copy()
df2["E"]=["one", "one", "two", "three", "four", "three"]
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,0.150375,0.282225,0.813313,0.95463,one
2013-01-02,0.442253,-1.206131,1.178163,-0.728062,one
2013-01-03,-0.106463,1.508199,-1.051491,1.17193,two
2013-01-04,2.734634,0.085089,-0.753946,-0.585387,three
2013-01-05,-2.141621,0.882776,-0.236388,0.443722,four
2013-01-06,-1.61553,0.989847,0.581488,-0.254552,three


In [44]:
df2[df2["E"].isin(["two", "four"])]
#isin()안에 []로 넣어야하는 듯 (한 개의 경우에도)

Unnamed: 0,A,B,C,D,E
2013-01-03,-0.106463,1.508199,-1.051491,1.17193,two
2013-01-05,-2.141621,0.882776,-0.236388,0.443722,four


## Setting

In [45]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range("20130102", periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [46]:
df["F"]=s1

In [47]:
df.at[dates[0],"A"] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.282225,0.813313,0.95463,
2013-01-02,0.442253,-1.206131,1.178163,-0.728062,1.0
2013-01-03,-0.106463,1.508199,-1.051491,1.17193,2.0
2013-01-04,2.734634,0.085089,-0.753946,-0.585387,3.0
2013-01-05,-2.141621,0.882776,-0.236388,0.443722,4.0
2013-01-06,-1.61553,0.989847,0.581488,-0.254552,5.0


In [48]:
df.iat[0, 1] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.813313,0.95463,
2013-01-02,0.442253,-1.206131,1.178163,-0.728062,1.0
2013-01-03,-0.106463,1.508199,-1.051491,1.17193,2.0
2013-01-04,2.734634,0.085089,-0.753946,-0.585387,3.0
2013-01-05,-2.141621,0.882776,-0.236388,0.443722,4.0
2013-01-06,-1.61553,0.989847,0.581488,-0.254552,5.0


In [49]:
len(df)

6

In [50]:
df.loc[:, "D"] = np.array([5]*len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.813313,5.0,
2013-01-02,0.442253,-1.206131,1.178163,5.0,1.0
2013-01-03,-0.106463,1.508199,-1.051491,5.0,2.0
2013-01-04,2.734634,0.085089,-0.753946,5.0,3.0
2013-01-05,-2.141621,0.882776,-0.236388,5.0,4.0
2013-01-06,-1.61553,0.989847,0.581488,5.0,5.0


In [51]:
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.813313,-5.0,
2013-01-02,-0.442253,-1.206131,-1.178163,-5.0,-1.0
2013-01-03,-0.106463,-1.508199,-1.051491,-5.0,-2.0
2013-01-04,-2.734634,-0.085089,-0.753946,-5.0,-3.0
2013-01-05,-2.141621,-0.882776,-0.236388,-5.0,-4.0
2013-01-06,-1.61553,-0.989847,-0.581488,-5.0,-5.0


# Missing data (2024-01-29)

In [52]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns)+["E"])
df.loc[dates[0]:dates[1], "E"] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,0.813313,5.0,,
2013-01-02,0.442253,-1.206131,1.178163,5.0,1.0,
2013-01-03,-0.106463,1.508199,-1.051491,5.0,2.0,
2013-01-04,2.734634,0.085089,-0.753946,5.0,3.0,


In [53]:
df1.dropna(how="any") #결측치 있으면 다 제거하는 옵션인듯?

Unnamed: 0,A,B,C,D,F,E


In [54]:
df1.fillna(value=5) #결측치를 특정 값으로 다 바꿔서 채우는 방법

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,0.813313,5.0,5.0,5.0
2013-01-02,0.442253,-1.206131,1.178163,5.0,1.0,5.0
2013-01-03,-0.106463,1.508199,-1.051491,5.0,2.0,5.0
2013-01-04,2.734634,0.085089,-0.753946,5.0,3.0,5.0


In [55]:
pd.isna(df1) #결측치가 있는가에 대한 T/F판단 있으면 T

CPU times: total: 0 ns
Wall time: 0 ns


Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,True
2013-01-02,False,False,False,False,False,True
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


자동으로 commit되게 yml추가해준 것 테스트용도