# Object creation (2024-01-26)

In [1]:
import numpy as np
import pandas as pd

In [2]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
dates = pd.date_range("20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.081367,1.164689,0.999465,-0.395687
2013-01-02,-0.151036,-1.22919,-0.927264,0.666521
2013-01-03,-0.853439,0.036023,-2.219067,-0.852728
2013-01-04,-0.466445,0.132922,0.749504,0.793348
2013-01-05,-0.630435,0.682507,-1.121106,-0.697655
2013-01-06,0.257132,-0.661018,1.078451,1.87895


In [5]:
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1,index=list(range(4)),dtype="float32"),
        "D": np.array([3]*4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo", #특이한게 ,로 끝내도 되고 아닌 것으로 끝내도 되네?
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [6]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [7]:
#df2.<TAB> # noqa: E225, E999

#df2.까지 입력후 탭누르라는 의미, vscode에서는 탭안눌러도 자동으로 표시 단 컬럼은 안 표시됨

# Viewing data (2024-01-27)

In [8]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-1.081367,1.164689,0.999465,-0.395687
2013-01-02,-0.151036,-1.22919,-0.927264,0.666521
2013-01-03,-0.853439,0.036023,-2.219067,-0.852728
2013-01-04,-0.466445,0.132922,0.749504,0.793348
2013-01-05,-0.630435,0.682507,-1.121106,-0.697655


In [9]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-0.466445,0.132922,0.749504,0.793348
2013-01-05,-0.630435,0.682507,-1.121106,-0.697655
2013-01-06,0.257132,-0.661018,1.078451,1.87895


In [10]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [11]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:
df.to_numpy()

array([[-1.08136667,  1.16468939,  0.99946502, -0.39568667],
       [-0.15103599, -1.22918968, -0.92726437,  0.66652055],
       [-0.85343854,  0.03602325, -2.21906666, -0.85272788],
       [-0.46644527,  0.13292174,  0.74950377,  0.79334795],
       [-0.63043479,  0.68250664, -1.12110624, -0.69765468],
       [ 0.25713156, -0.66101813,  1.07845108,  1.87894997]])

In [13]:
df.values

array([[-1.08136667,  1.16468939,  0.99946502, -0.39568667],
       [-0.15103599, -1.22918968, -0.92726437,  0.66652055],
       [-0.85343854,  0.03602325, -2.21906666, -0.85272788],
       [-0.46644527,  0.13292174,  0.74950377,  0.79334795],
       [-0.63043479,  0.68250664, -1.12110624, -0.69765468],
       [ 0.25713156, -0.66101813,  1.07845108,  1.87894997]])

In [14]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [15]:
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [16]:
df2.values

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [17]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.487598,0.020989,-0.240003,0.232125
std,0.484798,0.870229,1.372522,1.063132
min,-1.081367,-1.22919,-2.219067,-0.852728
25%,-0.797688,-0.486758,-1.072646,-0.622163
50%,-0.54844,0.084472,-0.08888,0.135417
75%,-0.229888,0.54511,0.936975,0.761641
max,0.257132,1.164689,1.078451,1.87895


In [18]:
df.T
#행과 열을 바꿔주는데 미리 계산되어 있음
#메소드가 아닌 속성임

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-1.081367,-0.151036,-0.853439,-0.466445,-0.630435,0.257132
B,1.164689,-1.22919,0.036023,0.132922,0.682507,-0.661018
C,0.999465,-0.927264,-2.219067,0.749504,-1.121106,1.078451
D,-0.395687,0.666521,-0.852728,0.793348,-0.697655,1.87895


In [19]:
df.T.index
#값만 바꿔서 나타내는가해서 확인용으로 해줬는데 인덱스도 확실히 바뀌어있다

Index(['A', 'B', 'C', 'D'], dtype='object')

In [20]:
df.sort_index(axis=1, ascending=False)
#axis = 1 하면 열끼리 정렬하는 것

Unnamed: 0,D,C,B,A
2013-01-01,-0.395687,0.999465,1.164689,-1.081367
2013-01-02,0.666521,-0.927264,-1.22919,-0.151036
2013-01-03,-0.852728,-2.219067,0.036023,-0.853439
2013-01-04,0.793348,0.749504,0.132922,-0.466445
2013-01-05,-0.697655,-1.121106,0.682507,-0.630435
2013-01-06,1.87895,1.078451,-0.661018,0.257132


In [21]:
df.sort_values(by="B")
#df안에 있는 특정 열의 값에 대해서 정렬해줄 수 있음

Unnamed: 0,A,B,C,D
2013-01-02,-0.151036,-1.22919,-0.927264,0.666521
2013-01-06,0.257132,-0.661018,1.078451,1.87895
2013-01-03,-0.853439,0.036023,-2.219067,-0.852728
2013-01-04,-0.466445,0.132922,0.749504,0.793348
2013-01-05,-0.630435,0.682507,-1.121106,-0.697655
2013-01-01,-1.081367,1.164689,0.999465,-0.395687


# Selection (2024-01-28)

## Getitem ([])

In [22]:
df["A"]

2013-01-01   -1.081367
2013-01-02   -0.151036
2013-01-03   -0.853439
2013-01-04   -0.466445
2013-01-05   -0.630435
2013-01-06    0.257132
Freq: D, Name: A, dtype: float64

In [23]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-1.081367,1.164689,0.999465,-0.395687
2013-01-02,-0.151036,-1.22919,-0.927264,0.666521
2013-01-03,-0.853439,0.036023,-2.219067,-0.852728


In [24]:
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-02,-0.151036,-1.22919,-0.927264,0.666521
2013-01-03,-0.853439,0.036023,-2.219067,-0.852728
2013-01-04,-0.466445,0.132922,0.749504,0.793348


## Selection by label

In [25]:
df.loc[dates[0]]

A   -1.081367
B    1.164689
C    0.999465
D   -0.395687
Name: 2013-01-01 00:00:00, dtype: float64

In [26]:
df.loc[dates[0],:]

A   -1.081367
B    1.164689
C    0.999465
D   -0.395687
Name: 2013-01-01 00:00:00, dtype: float64

In [27]:
df.loc[:,["A","B"]]

Unnamed: 0,A,B
2013-01-01,-1.081367,1.164689
2013-01-02,-0.151036,-1.22919
2013-01-03,-0.853439,0.036023
2013-01-04,-0.466445,0.132922
2013-01-05,-0.630435,0.682507
2013-01-06,0.257132,-0.661018


In [28]:
df.loc["20130102":"20130104",["A","B"]]

Unnamed: 0,A,B
2013-01-02,-0.151036,-1.22919
2013-01-03,-0.853439,0.036023
2013-01-04,-0.466445,0.132922


In [29]:
df.loc["20130102":"20130104",["B","A"]]

Unnamed: 0,B,A
2013-01-02,-1.22919,-0.151036
2013-01-03,0.036023,-0.853439
2013-01-04,0.132922,-0.466445


In [30]:
df.loc[dates[0],"A"]

-1.081366673299389

In [31]:
df.at[dates[0],"A"]

-1.081366673299389

## Selection by position

In [32]:
df.iloc[3]

A   -0.466445
B    0.132922
C    0.749504
D    0.793348
Name: 2013-01-04 00:00:00, dtype: float64

In [33]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.466445,0.132922
2013-01-05,-0.630435,0.682507


In [34]:
#비교 loc는 슬라이싱 할때 끝부분까지로 인식(끝부분 포함됨) iloc는 끝부분제외
#df.loc[3:5, 0:2] 아 맞다 이런 식으로 숫자 인덱스로 안됬다
df.loc["20130104":"20130105", "A":"B"]

Unnamed: 0,A,B
2013-01-04,-0.466445,0.132922
2013-01-05,-0.630435,0.682507


In [35]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-0.151036,-0.927264
2013-01-03,-0.853439,-2.219067
2013-01-05,-0.630435,-1.121106


In [36]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,-0.151036,-1.22919,-0.927264,0.666521
2013-01-03,-0.853439,0.036023,-2.219067,-0.852728


In [37]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,1.164689,0.999465
2013-01-02,-1.22919,-0.927264
2013-01-03,0.036023,-2.219067
2013-01-04,0.132922,0.749504
2013-01-05,0.682507,-1.121106
2013-01-06,-0.661018,1.078451


In [38]:
df.iloc[1,1]

-1.229189677971982

In [39]:
df.iat[1,1]

-1.229189677971982

## Boolean indexing (조건에 따른 컬럼 선택)

In [40]:
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2013-01-06,0.257132,-0.661018,1.078451,1.87895


In [41]:
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,,1.164689,0.999465,
2013-01-02,,,,0.666521
2013-01-03,,0.036023,,
2013-01-04,,0.132922,0.749504,0.793348
2013-01-05,,0.682507,,
2013-01-06,0.257132,,1.078451,1.87895


In [42]:
df2=df.copy()
df2["E"]=["one", "one", "two", "three", "four", "three"]
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.081367,1.164689,0.999465,-0.395687,one
2013-01-02,-0.151036,-1.22919,-0.927264,0.666521,one
2013-01-03,-0.853439,0.036023,-2.219067,-0.852728,two
2013-01-04,-0.466445,0.132922,0.749504,0.793348,three
2013-01-05,-0.630435,0.682507,-1.121106,-0.697655,four
2013-01-06,0.257132,-0.661018,1.078451,1.87895,three


In [43]:
df2[df2["E"].isin(["two", "four"])]
#isin()안에 []로 넣어야하는 듯 (한 개의 경우에도)

Unnamed: 0,A,B,C,D,E
2013-01-03,-0.853439,0.036023,-2.219067,-0.852728,two
2013-01-05,-0.630435,0.682507,-1.121106,-0.697655,four


## Setting

In [44]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range("20130102", periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [45]:
df["F"]=s1

In [46]:
df.at[dates[0],"A"] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,1.164689,0.999465,-0.395687,
2013-01-02,-0.151036,-1.22919,-0.927264,0.666521,1.0
2013-01-03,-0.853439,0.036023,-2.219067,-0.852728,2.0
2013-01-04,-0.466445,0.132922,0.749504,0.793348,3.0
2013-01-05,-0.630435,0.682507,-1.121106,-0.697655,4.0
2013-01-06,0.257132,-0.661018,1.078451,1.87895,5.0


In [47]:
df.iat[0, 1] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.999465,-0.395687,
2013-01-02,-0.151036,-1.22919,-0.927264,0.666521,1.0
2013-01-03,-0.853439,0.036023,-2.219067,-0.852728,2.0
2013-01-04,-0.466445,0.132922,0.749504,0.793348,3.0
2013-01-05,-0.630435,0.682507,-1.121106,-0.697655,4.0
2013-01-06,0.257132,-0.661018,1.078451,1.87895,5.0


In [48]:
len(df)

6

In [49]:
df.loc[:, "D"] = np.array([5]*len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.999465,5.0,
2013-01-02,-0.151036,-1.22919,-0.927264,5.0,1.0
2013-01-03,-0.853439,0.036023,-2.219067,5.0,2.0
2013-01-04,-0.466445,0.132922,0.749504,5.0,3.0
2013-01-05,-0.630435,0.682507,-1.121106,5.0,4.0
2013-01-06,0.257132,-0.661018,1.078451,5.0,5.0


In [50]:
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.999465,-5.0,
2013-01-02,-0.151036,-1.22919,-0.927264,-5.0,-1.0
2013-01-03,-0.853439,-0.036023,-2.219067,-5.0,-2.0
2013-01-04,-0.466445,-0.132922,-0.749504,-5.0,-3.0
2013-01-05,-0.630435,-0.682507,-1.121106,-5.0,-4.0
2013-01-06,-0.257132,-0.661018,-1.078451,-5.0,-5.0


# Missing data (2024-01-29)

In [53]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns)+["E"])
df.loc[dates[0]:dates[1], "E"] = 1
df1

Unnamed: 0,A,B,C,D,F,E,E.1
2013-01-01,0.0,0.0,0.999465,5.0,,1.0,1.0
2013-01-02,-0.151036,-1.22919,-0.927264,5.0,1.0,1.0,1.0
2013-01-03,-0.853439,0.036023,-2.219067,5.0,2.0,,
2013-01-04,-0.466445,0.132922,0.749504,5.0,3.0,,


In [54]:
df1.dropna(how="any") #결측치 있으면 다 제거하는 옵션인듯?

Unnamed: 0,A,B,C,D,F,E,E.1
2013-01-02,-0.151036,-1.22919,-0.927264,5.0,1.0,1.0,1.0


In [56]:
df1.fillna(value=5) #결측치를 특정 값으로 다 바꿔서 채우는 방법

Unnamed: 0,A,B,C,D,F,E,E.1
2013-01-01,0.0,0.0,0.999465,5.0,5.0,1.0,1.0
2013-01-02,-0.151036,-1.22919,-0.927264,5.0,1.0,1.0,1.0
2013-01-03,-0.853439,0.036023,-2.219067,5.0,2.0,5.0,5.0
2013-01-04,-0.466445,0.132922,0.749504,5.0,3.0,5.0,5.0


In [57]:
pd.isna(df1) #결측치가 있는가에 대한 T/F판단 있으면 T

Unnamed: 0,A,B,C,D,F,E,E.1
2013-01-01,False,False,False,False,True,False,False
2013-01-02,False,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True,True
2013-01-04,False,False,False,False,False,True,True


자동으로 commit되게 yml추가해준 것 테스트용도