# Object creation (2024-01-26)

In [1]:
import numpy as np
import pandas as pd

In [2]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
dates = pd.date_range("20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.716914,0.273037,-2.441265,0.707984
2013-01-02,0.302274,-0.585663,-0.074834,-0.3598
2013-01-03,-0.014538,-0.238168,-0.314965,-0.766924
2013-01-04,-1.002812,-0.612252,-0.22893,0.427566
2013-01-05,-0.939889,-1.714086,-0.923844,0.843706
2013-01-06,0.014749,0.97266,0.666272,1.20451


In [5]:
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1,index=list(range(4)),dtype="float32"),
        "D": np.array([3]*4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo", #특이한게 ,로 끝내도 되고 아닌 것으로 끝내도 되네?
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [6]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [7]:
#df2.<TAB> # noqa: E225, E999

#df2.까지 입력후 탭누르라는 의미, vscode에서는 탭안눌러도 자동으로 표시 단 컬럼은 안 표시됨

# Viewing data (2024-01-27)

In [8]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.716914,0.273037,-2.441265,0.707984
2013-01-02,0.302274,-0.585663,-0.074834,-0.3598
2013-01-03,-0.014538,-0.238168,-0.314965,-0.766924
2013-01-04,-1.002812,-0.612252,-0.22893,0.427566
2013-01-05,-0.939889,-1.714086,-0.923844,0.843706


In [9]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-1.002812,-0.612252,-0.22893,0.427566
2013-01-05,-0.939889,-1.714086,-0.923844,0.843706
2013-01-06,0.014749,0.97266,0.666272,1.20451


In [10]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [11]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:
df.to_numpy()

array([[ 0.71691391,  0.273037  , -2.44126487,  0.70798387],
       [ 0.30227352, -0.58566318, -0.07483423, -0.35980033],
       [-0.01453763, -0.23816834, -0.31496491, -0.76692421],
       [-1.0028117 , -0.61225181, -0.22893047,  0.42756568],
       [-0.93988949, -1.71408598, -0.92384433,  0.84370636],
       [ 0.01474919,  0.97266048,  0.66627247,  1.2045101 ]])

In [13]:
df.values

array([[ 0.71691391,  0.273037  , -2.44126487,  0.70798387],
       [ 0.30227352, -0.58566318, -0.07483423, -0.35980033],
       [-0.01453763, -0.23816834, -0.31496491, -0.76692421],
       [-1.0028117 , -0.61225181, -0.22893047,  0.42756568],
       [-0.93988949, -1.71408598, -0.92384433,  0.84370636],
       [ 0.01474919,  0.97266048,  0.66627247,  1.2045101 ]])

In [14]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [15]:
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [16]:
df2.values

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [17]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.153884,-0.317412,-0.552928,0.34284
std,0.68596,0.908712,1.056411,0.756192
min,-1.002812,-1.714086,-2.441265,-0.766924
25%,-0.708552,-0.605605,-0.771624,-0.162959
50%,0.000106,-0.411916,-0.271948,0.567775
75%,0.230392,0.145236,-0.113358,0.809776
max,0.716914,0.97266,0.666272,1.20451


In [18]:
df.T
#행과 열을 바꿔주는데 미리 계산되어 있음
#메소드가 아닌 속성임

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.716914,0.302274,-0.014538,-1.002812,-0.939889,0.014749
B,0.273037,-0.585663,-0.238168,-0.612252,-1.714086,0.97266
C,-2.441265,-0.074834,-0.314965,-0.22893,-0.923844,0.666272
D,0.707984,-0.3598,-0.766924,0.427566,0.843706,1.20451


In [19]:
df.T.index
#값만 바꿔서 나타내는가해서 확인용으로 해줬는데 인덱스도 확실히 바뀌어있다

Index(['A', 'B', 'C', 'D'], dtype='object')

In [20]:
df.sort_index(axis=1, ascending=False)
#axis = 1 하면 열끼리 정렬하는 것

Unnamed: 0,D,C,B,A
2013-01-01,0.707984,-2.441265,0.273037,0.716914
2013-01-02,-0.3598,-0.074834,-0.585663,0.302274
2013-01-03,-0.766924,-0.314965,-0.238168,-0.014538
2013-01-04,0.427566,-0.22893,-0.612252,-1.002812
2013-01-05,0.843706,-0.923844,-1.714086,-0.939889
2013-01-06,1.20451,0.666272,0.97266,0.014749


In [21]:
df.sort_values(by="B")
#df안에 있는 특정 열의 값에 대해서 정렬해줄 수 있음

Unnamed: 0,A,B,C,D
2013-01-05,-0.939889,-1.714086,-0.923844,0.843706
2013-01-04,-1.002812,-0.612252,-0.22893,0.427566
2013-01-02,0.302274,-0.585663,-0.074834,-0.3598
2013-01-03,-0.014538,-0.238168,-0.314965,-0.766924
2013-01-01,0.716914,0.273037,-2.441265,0.707984
2013-01-06,0.014749,0.97266,0.666272,1.20451


# Selection (2024-01-28)

## Getitem ([])

In [22]:
df["A"]

2013-01-01    0.716914
2013-01-02    0.302274
2013-01-03   -0.014538
2013-01-04   -1.002812
2013-01-05   -0.939889
2013-01-06    0.014749
Freq: D, Name: A, dtype: float64

In [23]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.716914,0.273037,-2.441265,0.707984
2013-01-02,0.302274,-0.585663,-0.074834,-0.3598
2013-01-03,-0.014538,-0.238168,-0.314965,-0.766924


In [24]:
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-02,0.302274,-0.585663,-0.074834,-0.3598
2013-01-03,-0.014538,-0.238168,-0.314965,-0.766924
2013-01-04,-1.002812,-0.612252,-0.22893,0.427566


## Selection by label

In [25]:
df.loc[dates[0]]

A    0.716914
B    0.273037
C   -2.441265
D    0.707984
Name: 2013-01-01 00:00:00, dtype: float64

In [26]:
df.loc[dates[0],:]

A    0.716914
B    0.273037
C   -2.441265
D    0.707984
Name: 2013-01-01 00:00:00, dtype: float64

In [27]:
df.loc[:,["A","B"]]

Unnamed: 0,A,B
2013-01-01,0.716914,0.273037
2013-01-02,0.302274,-0.585663
2013-01-03,-0.014538,-0.238168
2013-01-04,-1.002812,-0.612252
2013-01-05,-0.939889,-1.714086
2013-01-06,0.014749,0.97266


In [28]:
df.loc["20130102":"20130104",["A","B"]]

Unnamed: 0,A,B
2013-01-02,0.302274,-0.585663
2013-01-03,-0.014538,-0.238168
2013-01-04,-1.002812,-0.612252


In [29]:
df.loc["20130102":"20130104",["B","A"]]

Unnamed: 0,B,A
2013-01-02,-0.585663,0.302274
2013-01-03,-0.238168,-0.014538
2013-01-04,-0.612252,-1.002812


In [30]:
df.loc[dates[0],"A"]

0.716913911084348

In [31]:
df.at[dates[0],"A"]

0.716913911084348

## Selection by position

In [32]:
df.iloc[3]

A   -1.002812
B   -0.612252
C   -0.228930
D    0.427566
Name: 2013-01-04 00:00:00, dtype: float64

In [33]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-1.002812,-0.612252
2013-01-05,-0.939889,-1.714086


In [34]:
#비교 loc는 슬라이싱 할때 끝부분까지로 인식(끝부분 포함됨) iloc는 끝부분제외
#df.loc[3:5, 0:2] 아 맞다 이런 식으로 숫자 인덱스로 안됬다
df.loc["20130104":"20130105", "A":"B"]

Unnamed: 0,A,B
2013-01-04,-1.002812,-0.612252
2013-01-05,-0.939889,-1.714086


In [35]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,0.302274,-0.074834
2013-01-03,-0.014538,-0.314965
2013-01-05,-0.939889,-0.923844


In [36]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,0.302274,-0.585663,-0.074834,-0.3598
2013-01-03,-0.014538,-0.238168,-0.314965,-0.766924


In [37]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,0.273037,-2.441265
2013-01-02,-0.585663,-0.074834
2013-01-03,-0.238168,-0.314965
2013-01-04,-0.612252,-0.22893
2013-01-05,-1.714086,-0.923844
2013-01-06,0.97266,0.666272


In [38]:
df.iloc[1,1]

-0.5856631798327985

In [39]:
df.iat[1,1]

-0.5856631798327985

## Boolean indexing (조건에 따른 컬럼 선택)

In [40]:
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.716914,0.273037,-2.441265,0.707984
2013-01-02,0.302274,-0.585663,-0.074834,-0.3598
2013-01-06,0.014749,0.97266,0.666272,1.20451


In [41]:
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,0.716914,0.273037,,0.707984
2013-01-02,0.302274,,,
2013-01-03,,,,
2013-01-04,,,,0.427566
2013-01-05,,,,0.843706
2013-01-06,0.014749,0.97266,0.666272,1.20451


In [42]:
df2=df.copy()
df2["E"]=["one", "one", "two", "three", "four", "three"]
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,0.716914,0.273037,-2.441265,0.707984,one
2013-01-02,0.302274,-0.585663,-0.074834,-0.3598,one
2013-01-03,-0.014538,-0.238168,-0.314965,-0.766924,two
2013-01-04,-1.002812,-0.612252,-0.22893,0.427566,three
2013-01-05,-0.939889,-1.714086,-0.923844,0.843706,four
2013-01-06,0.014749,0.97266,0.666272,1.20451,three


In [43]:
df2[df2["E"].isin(["two", "four"])]
#isin()안에 []로 넣어야하는 듯 (한 개의 경우에도)

Unnamed: 0,A,B,C,D,E
2013-01-03,-0.014538,-0.238168,-0.314965,-0.766924,two
2013-01-05,-0.939889,-1.714086,-0.923844,0.843706,four


## Setting

In [44]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range("20130102", periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [45]:
df["F"]=s1

In [46]:
df.at[dates[0],"A"] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.273037,-2.441265,0.707984,
2013-01-02,0.302274,-0.585663,-0.074834,-0.3598,1.0
2013-01-03,-0.014538,-0.238168,-0.314965,-0.766924,2.0
2013-01-04,-1.002812,-0.612252,-0.22893,0.427566,3.0
2013-01-05,-0.939889,-1.714086,-0.923844,0.843706,4.0
2013-01-06,0.014749,0.97266,0.666272,1.20451,5.0


In [47]:
df.iat[0, 1] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-2.441265,0.707984,
2013-01-02,0.302274,-0.585663,-0.074834,-0.3598,1.0
2013-01-03,-0.014538,-0.238168,-0.314965,-0.766924,2.0
2013-01-04,-1.002812,-0.612252,-0.22893,0.427566,3.0
2013-01-05,-0.939889,-1.714086,-0.923844,0.843706,4.0
2013-01-06,0.014749,0.97266,0.666272,1.20451,5.0


In [48]:
len(df)

6

In [49]:
df.loc[:, "D"] = np.array([5]*len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-2.441265,5.0,
2013-01-02,0.302274,-0.585663,-0.074834,5.0,1.0
2013-01-03,-0.014538,-0.238168,-0.314965,5.0,2.0
2013-01-04,-1.002812,-0.612252,-0.22893,5.0,3.0
2013-01-05,-0.939889,-1.714086,-0.923844,5.0,4.0
2013-01-06,0.014749,0.97266,0.666272,5.0,5.0


In [50]:
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-2.441265,-5.0,
2013-01-02,-0.302274,-0.585663,-0.074834,-5.0,-1.0
2013-01-03,-0.014538,-0.238168,-0.314965,-5.0,-2.0
2013-01-04,-1.002812,-0.612252,-0.22893,-5.0,-3.0
2013-01-05,-0.939889,-1.714086,-0.923844,-5.0,-4.0
2013-01-06,-0.014749,-0.97266,-0.666272,-5.0,-5.0


# Missing data (2024-01-29)

In [51]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns)+["E"])
df.loc[dates[0]:dates[1], "E"] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-2.441265,5.0,,
2013-01-02,0.302274,-0.585663,-0.074834,5.0,1.0,
2013-01-03,-0.014538,-0.238168,-0.314965,5.0,2.0,
2013-01-04,-1.002812,-0.612252,-0.22893,5.0,3.0,


In [52]:
df1.dropna(how="any") #결측치 있으면 다 제거하는 옵션인듯?

Unnamed: 0,A,B,C,D,F,E


In [53]:
df1.fillna(value=5) #결측치를 특정 값으로 다 바꿔서 채우는 방법

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-2.441265,5.0,5.0,5.0
2013-01-02,0.302274,-0.585663,-0.074834,5.0,1.0,5.0
2013-01-03,-0.014538,-0.238168,-0.314965,5.0,2.0,5.0
2013-01-04,-1.002812,-0.612252,-0.22893,5.0,3.0,5.0


In [54]:
pd.isna(df1) #결측치가 있는가에 대한 T/F판단 있으면 T

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,True
2013-01-02,False,False,False,False,False,True
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


# Operations (2024-01-30)

## Stats

In [55]:
df

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-2.441265,5.0,,1.0
2013-01-02,0.302274,-0.585663,-0.074834,5.0,1.0,1.0
2013-01-03,-0.014538,-0.238168,-0.314965,5.0,2.0,
2013-01-04,-1.002812,-0.612252,-0.22893,5.0,3.0,
2013-01-05,-0.939889,-1.714086,-0.923844,5.0,4.0,
2013-01-06,0.014749,0.97266,0.666272,5.0,5.0,


In [56]:
df.mean() #NaN값있으면 합산, 갯수에서 제외하고 평균 내는 듯

A   -0.273369
B   -0.362918
C   -0.552928
D    5.000000
F    3.000000
E    1.000000
dtype: float64

In [57]:
print(df.mean(axis=1)) 
# 마찬가지로 NaN값있으면 합산, 갯수에서 제외하고 평균 내는 듯, 단 axis=1이면 행 별로 모든 컬럼의 값의 평균 내줌
# axis = 1말고 그냥 인자로 1만 줘도 동일한 결과가 나오는 듯 함

2013-01-01    0.711747
2013-01-02    1.106963
2013-01-03    1.286466
2013-01-04    1.231201
2013-01-05    1.084436
2013-01-06    2.330736
Freq: D, dtype: float64


In [58]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates)
s

2013-01-01    1.0
2013-01-02    3.0
2013-01-03    5.0
2013-01-04    NaN
2013-01-05    6.0
2013-01-06    8.0
Freq: D, dtype: float64

In [59]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

>https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html#pandas.Series     
Series의 메소드인 shift가 뭐하는 녀석인지 몰라서 문서에 들어가보니 아래와 같이 서술되어있었다   
shift([periods, freq, axis, fill_value, suffix])    
Shift index by desired number of periods with an optional time freq.    
자세한 내용은 모르겠지만 대강 값들을 index를 기준으로 준 값만큼 이동시키는 듯하다   
1일과 2일의 경우 이전 인덱스에 해당하는 데이터가 없으니 NaN로 결측치가 된 것이고    
3,4,5일의 경우 인덱스상으로 2이전의 값들이 존재했으니 해당값들로 바뀌었고   
6일의 경우 인덱스상으로 2이전의 값이 NaN로 결측치 였기에 결측치로 되었는 듯하다.



In [60]:
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F,E
2013-01-01,,,,,,
2013-01-02,,,,,,
2013-01-03,-1.014538,-1.238168,-1.314965,4.0,1.0,
2013-01-04,-4.002812,-3.612252,-3.22893,2.0,0.0,
2013-01-05,-5.939889,-6.714086,-5.923844,0.0,-1.0,
2013-01-06,,,,,,


>https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html#pandas.DataFrame
이번엔 df의 메소드인 sub에 대하여 설명이 없어서 찾아보니 아래와 같이 메소드로 설명되어있었고 잘 이해가 되지 않아 좀더 자세한 독스에 들어갔다    
sub(other[, axis, level, fill_value])       
Get Subtraction of dataframe and other, element-wise (binary operator sub).     
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sub.html#pandas.DataFrame.sub   
DataFrame.sub(other, axis='columns', level=None, fill_value=None)   
대강 구조를 보니 df에 대하여 빼기를 해주는데 sub()안에 들어있는 값으로 빼주는 것으로 들어올 수 있는 other로는 scalar, sequence, Series, dict or DataFrame이 가능하다고 한다 이중 sequence는 정확히 뭔지 모르겠어서 찾아보니     
https://wikidocs.net/84391      
내용을 참고하였다 대강 리스트, 문자열, 튜플 이 세가지 자료형이라고 생각하면 될듯하다    
그리고 axis에 대하여 0 or index , 1 or columns의 옵션이 있는데 기본값은 colunms로 되어있다  
fill_value는 결측치를 만났을 때 특정값으로 바꿔넣어줄 것인지 선택하는 옵션으로 기본값은 none로 되어있다     
level 옵션에 대해서는 잘모르겠다 예제도 한문제 뿐인데 잘 이해가 되지 않는다 대강 멀티인덱스를 사용할 경우에 사용하는 느낌인데 예제를 아래 그대로 가져와서 해보고 이해해보도록 하겠다

### level 예제

In [61]:
level_example_df = pd.DataFrame({'angles': [0, 3, 4],
                   'degrees': [360, 180, 360]},
                  index=['circle', 'triangle', 'rectangle'])
level_example_df

Unnamed: 0,angles,degrees
circle,0,360
triangle,3,180
rectangle,4,360


In [62]:
level_example_df_multindex = pd.DataFrame({'angles': [0, 3, 4, 4, 5, 6],
                             'degrees': [360, 180, 360, 360, 540, 720]},
                            index=[['A', 'A', 'A', 'B', 'B', 'B'],
                                   ['circle', 'triangle', 'rectangle',
                                    'square', 'pentagon', 'hexagon']])
level_example_df_multindex

Unnamed: 0,Unnamed: 1,angles,degrees
A,circle,0,360
A,triangle,3,180
A,rectangle,4,360
B,square,4,360
B,pentagon,5,540
B,hexagon,6,720


In [63]:
print(level_example_df.div(level_example_df_multindex, level=1))
#원래 예제에는 ,fill_value=0으로 결측치 0으로 바꿔주는 옵션이 추가되어있었다
#뭔가 알듯말듯한데 chatGPT를 통해 설명을 들으니 알 것 같았다
#https://chat.openai.com/share/73654015-96b4-4fb3-b6e3-243526afb670

             angles  degrees
A circle        NaN      1.0
  triangle      1.0      1.0
  rectangle     1.0      1.0
B square        NaN      NaN
  pentagon      NaN      NaN
  hexagon       NaN      NaN


## User defined functions

In [64]:
display(df)
print(df.mean(),'\n')
print(df.mean()*5.6)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-2.441265,5.0,,1.0
2013-01-02,0.302274,-0.585663,-0.074834,5.0,1.0,1.0
2013-01-03,-0.014538,-0.238168,-0.314965,5.0,2.0,
2013-01-04,-1.002812,-0.612252,-0.22893,5.0,3.0,
2013-01-05,-0.939889,-1.714086,-0.923844,5.0,4.0,
2013-01-06,0.014749,0.97266,0.666272,5.0,5.0,


A   -0.273369
B   -0.362918
C   -0.552928
D    5.000000
F    3.000000
E    1.000000
dtype: float64 

A    -1.530868
B    -2.032342
C    -3.096395
D    28.000000
F    16.800000
E     5.600000
dtype: float64


In [65]:
df.agg(lambda x: np.mean(x)*5.6)

A    -1.530868
B    -2.032342
C    -3.096395
D    28.000000
F    16.800000
E     5.600000
dtype: float64

In [66]:
df*101.2

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-247.056004,506.0,,101.2
2013-01-02,30.59008,-59.269114,-7.573224,506.0,101.2,101.2
2013-01-03,-1.471208,-24.102636,-31.874449,506.0,202.4,
2013-01-04,-101.484544,-61.959883,-23.167763,506.0,303.6,
2013-01-05,-95.116816,-173.465501,-93.493046,506.0,404.8,
2013-01-06,1.492618,98.43324,67.426774,506.0,506.0,


In [67]:
df.transform(lambda x: x*101.2)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-247.056004,506.0,,101.2
2013-01-02,30.59008,-59.269114,-7.573224,506.0,101.2,101.2
2013-01-03,-1.471208,-24.102636,-31.874449,506.0,202.4,
2013-01-04,-101.484544,-61.959883,-23.167763,506.0,303.6,
2013-01-05,-95.116816,-173.465501,-93.493046,506.0,404.8,
2013-01-06,1.492618,98.43324,67.426774,506.0,506.0,


agg는 대강은 알듯한데 transform은 agg와 뭐가 다른지 모르겠다    
추가로 유사한 것에 apply도 있던 것으로 기억하는데 여기는 왜 빠져있고 차이가 뭔지도 궁금하다     
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.agg.html#pandas.DataFrame.agg   
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.transform.html#pandas.DataFrame.transform   
https://pandas.pydata.org/pandas-docs/stable/user_guide/gotchas.html#gotchas-udf-mutation   
chatGPT에게 차이를 물어본 내용:
https://chat.openai.com/share/654feb01-8d50-481e-8c7a-ca160e1e74a4

## Value Counts

In [68]:
s = pd.Series(np.random.randint(0,7,size=10))
s

0    4
1    2
2    4
3    3
4    1
5    3
6    0
7    5
8    3
9    1
dtype: int32

In [69]:
s.value_counts()

3    3
4    2
1    2
2    1
0    1
5    1
Name: count, dtype: int64

## String Methods

In [70]:
s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"])
print(s)
s.str.lower()

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object


0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

pandas.Series:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html#pandas.Series     
Vectorized String Methods:
https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html#text-string-methods

# Merge (2024-01-31)

## Concat

In [71]:
df = pd.DataFrame(np.random.randn(10,4)) 
#np.random.randn(m,n) : 평균0, 표준편차1의 가우시안 표준정규분포 난수를 matrix array(m,n) 생성
df

Unnamed: 0,0,1,2,3
0,-1.170889,0.487327,0.707648,1.26617
1,-0.913657,-0.232166,0.813729,0.094852
2,1.023905,-0.674453,0.379961,0.970089
3,0.955911,-1.407659,-0.501972,-1.287975
4,1.124505,-0.321647,-2.719159,0.488784
5,0.782004,0.058737,-2.559779,0.020607
6,-1.432279,1.998106,0.834083,-2.637372
7,-0.664076,-0.520119,1.090442,-0.316528
8,1.269289,0.832964,0.586688,0.539146
9,0.311237,-0.808292,-0.902303,0.632382


np.random.randn(m,n) https://nittaku.tistory.com/443    
docs : https://numpy.org/doc/stable/reference/random/generated/numpy.random.randn.html

In [72]:
pieces = [df[:3], df[3:7], df[7:]]
pieces

[          0         1         2         3
 0 -1.170889  0.487327  0.707648  1.266170
 1 -0.913657 -0.232166  0.813729  0.094852
 2  1.023905 -0.674453  0.379961  0.970089,
           0         1         2         3
 3  0.955911 -1.407659 -0.501972 -1.287975
 4  1.124505 -0.321647 -2.719159  0.488784
 5  0.782004  0.058737 -2.559779  0.020607
 6 -1.432279  1.998106  0.834083 -2.637372,
           0         1         2         3
 7 -0.664076 -0.520119  1.090442 -0.316528
 8  1.269289  0.832964  0.586688  0.539146
 9  0.311237 -0.808292 -0.902303  0.632382]

In [73]:
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,-1.170889,0.487327,0.707648,1.26617
1,-0.913657,-0.232166,0.813729,0.094852
2,1.023905,-0.674453,0.379961,0.970089
3,0.955911,-1.407659,-0.501972,-1.287975
4,1.124505,-0.321647,-2.719159,0.488784
5,0.782004,0.058737,-2.559779,0.020607
6,-1.432279,1.998106,0.834083,-2.637372
7,-0.664076,-0.520119,1.090442,-0.316528
8,1.269289,0.832964,0.586688,0.539146
9,0.311237,-0.808292,-0.902303,0.632382


## Join

In [74]:
left = pd.DataFrame({"key": ["foo", "foo"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo", "foo"], "rval": [4, 5]})
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [75]:
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [76]:
pd.merge(left, right, on="key")

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


In [77]:
left = pd.DataFrame({"key": ["foo", "bar"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo", "bar"], "rval": [4, 5]})
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [78]:
right

Unnamed: 0,key,rval
0,foo,4
1,bar,5


In [79]:
pd.merge(left, right, on="key")

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


# Grouping (2024-02-01)

Splitting the data into groups based on some criteria

Applying a function to each group independently

Combining the results into a data structure

In [80]:
df = pd.DataFrame(
    {"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], 
     "B": ["one", "one", "two", "three", "two", "two", "one", "three"], 
     "C": np.random.randn(8),
     "D": np.random.randn(8),}
)
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.848679,0.962346
1,bar,one,0.245838,-1.355163
2,foo,two,-0.328744,-0.628344
3,bar,three,-0.364245,-0.077179
4,foo,two,0.678883,-0.008561
5,bar,two,-1.205159,-0.96372
6,foo,one,-0.991294,-1.38674
7,foo,three,-1.766805,0.860827


In [81]:
df.groupby("A")[["C", "D"]].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-1.323566,-2.396062
foo,-3.256638,-0.200471


In [82]:
df.groupby(["A", "B"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.245838,-1.355163
bar,three,-0.364245,-0.077179
bar,two,-1.205159,-0.96372
foo,one,-1.839973,-0.424394
foo,three,-1.766805,0.860827
foo,two,0.350139,-0.636905


# Reshaping (2024-02-02)

## Stack

In [83]:
arrays = [
    ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], 
    ["one", "two", "one", "two", "one", "two", "one", "two"],
]
index = pd.MultiIndex.from_arrays(arrays, names = ["first", "second"])
df = pd.DataFrame(np.random.randn(8,2), index=index, columns=["A", "B"])
df2 = df[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.101207,-1.084516
bar,two,0.674288,-1.562086
baz,one,0.260933,1.287588
baz,two,0.550672,0.035656


In [84]:
stacked = df2.stack() #future_stack=True
stacked #왜인지는 모르겠지만 저 옵션 빼줘야 제대로 실행됨 결과는 그리 다르지 않은 것으로 보임 나중에 질문해보기

first  second   
bar    one     A   -1.101207
               B   -1.084516
       two     A    0.674288
               B   -1.562086
baz    one     A    0.260933
               B    1.287588
       two     A    0.550672
               B    0.035656
dtype: float64

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.stack.html#pandas.DataFrame.stack   
부분을 보니 pandas2.0에서 pandas 3.0의 방식을 사용할 때 해주던 방식으로 유추되는데

In [85]:
pd.__version__

'2.0.3'

2.0버전 쓰고 있는 것이 맞는데 왜 안되는 거지..?

In [86]:
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.101207,-1.084516
bar,two,0.674288,-1.562086
baz,one,0.260933,1.287588
baz,two,0.550672,0.035656


In [87]:
stacked.unstack(1) # 행열 바꿔서 해제하는듯?

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,-1.101207,0.674288
bar,B,-1.084516,-1.562086
baz,A,0.260933,0.550672
baz,B,1.287588,0.035656


In [88]:
stacked.unstack(0) # 무슨 방식으로 바꾸는지는 모르겠지만 째든 바꿔서 해제하는듯

Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-1.101207,0.260933
one,B,-1.084516,1.287588
two,A,0.674288,0.550672
two,B,-1.562086,0.035656


## Pivot tables

In [89]:
df = pd.DataFrame(
    {
        "A": ["one", "one", "two", "three"] * 3,
        "B": ["A", "B", "C"] * 4,
        "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 2,
        "D": np.random.randn(12),
        "E": np.random.randn(12),
    }
)
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,0.037157,0.614642
1,one,B,foo,0.941722,0.484342
2,two,C,foo,-1.416276,1.61607
3,three,A,bar,0.036867,0.425058
4,one,B,bar,-0.726201,0.01549
5,one,C,bar,1.750834,-1.764452
6,two,A,foo,1.084953,-0.382373
7,three,B,foo,0.447517,-0.409641
8,one,C,foo,-0.427234,-1.706278
9,one,A,bar,-1.717741,-1.124292


In [90]:
pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-1.717741,0.037157
one,B,-0.726201,0.941722
one,C,1.750834,-0.427234
three,A,0.036867,
three,B,,0.447517
three,C,-0.17413,
two,A,,1.084953
two,B,0.295695,
two,C,,-1.416276


# Time series (2024-02-03)

시계열 데이터에서 1초 마다 측정된 데이터를 5분 마다 측정된 데이터의 형태로 바꾸고 싶을 때 어떻게 해야하는지     
그리고 그 시계열 단위인 주기(frequency)를 다시 샘플링 할 수 있음

In [91]:
rng = pd.date_range("1/1/2012", periods=100, freq ="s") #대강 시작기준날짜, 몇개, (데이터간 차이나는)단위 느낌인듯하다
ts = pd.Series(np.random.randint(0,500,len(rng)), index=rng)
ts.resample("5Min").sum()

2012-01-01    24069
Freq: 5T, dtype: int32

#### resample에 대하여

In [92]:
idx = pd.date_range("2018-01-01", periods=5, freq="h")
ts2 = pd.Series(range(len(idx)), index=idx)
ts2

2018-01-01 00:00:00    0
2018-01-01 01:00:00    1
2018-01-01 02:00:00    2
2018-01-01 03:00:00    3
2018-01-01 04:00:00    4
Freq: H, dtype: int64

In [93]:
ts2.resample("2h").mean()

2018-01-01 00:00:00    0.5
2018-01-01 02:00:00    2.5
2018-01-01 04:00:00    4.0
Freq: 2H, dtype: float64

In [94]:
print(type(ts2))

<class 'pandas.core.series.Series'>


https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.resample.html#pandas.Series.resample   
우선 2.0.0버전 이후로는 사라지게 될 듯한 기능이며 후에는 frame.T.resample을 대신 사용하라고 한다    
(지금 내가 사용하는 pandas 버전이 2.0.3인데 아직 존재는 하는듯하다?)    
resample: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-resampling     
흠 frame.T.resample이 뭔지 궁금해서 검색했는데 따로 나오지는 않은 것같다    
(Dataframe.T정도는 있긴한데 다른내용으로 행과 열을 바꿔주는 transpose를 의미하는 듯하다)   
그리고 그냥 series, df, df의 groupby했을때 series와 df 모두에 사용가능한 듯하다     
그런데 쓰이는 파라미터의 rule에 대해서 어떤 것을 쓸 수 있는 지 궁금해서 알고 싶은데 그냥 DataOffset, Timedelta or str로 적혀있다    
현재 예제들로 확인 된 것은 str으로 적을 때 숫자+ 시간은 'h', 초는 's', 분은 month의 m과 중복되서 그런지 'min'로 쓰이는 것을 확인하였다      
관련한 듯한 내용: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects   
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.tseries.offsets.DateOffset.html#pandas.tseries.offsets.DateOffset      
timedelta: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Timedelta.html#pandas.Timedelta   
GPT한테 물어봐도 https://chat.openai.com/share/4e050491-79e0-44a5-bd1d-38a4f2d1e92f     
크게 다르진 않는 듯하다     
그냥 관련한 듯한 내용에 적힌 string부분을 참고하는 것만으로도 충분한듯 하다

In [95]:
rng = pd.date_range("3/6/2012 00:00", periods=5, freq="D")
ts = pd.Series(np.random.random(len(rng)),rng)
ts

2012-03-06    0.399281
2012-03-07    0.503173
2012-03-08    0.484813
2012-03-09    0.289874
2012-03-10    0.403512
Freq: D, dtype: float64

In [96]:
ts_utc = ts.tz_localize("UTC")
ts_utc

2012-03-06 00:00:00+00:00    0.399281
2012-03-07 00:00:00+00:00    0.503173
2012-03-08 00:00:00+00:00    0.484813
2012-03-09 00:00:00+00:00    0.289874
2012-03-10 00:00:00+00:00    0.403512
Freq: D, dtype: float64

In [97]:
ts_utc.tz_convert("US/Eastern")

2012-03-05 19:00:00-05:00    0.399281
2012-03-06 19:00:00-05:00    0.503173
2012-03-07 19:00:00-05:00    0.484813
2012-03-08 19:00:00-05:00    0.289874
2012-03-09 19:00:00-05:00    0.403512
Freq: D, dtype: float64

In [98]:
rng

DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
               '2012-03-10'],
              dtype='datetime64[ns]', freq='D')

In [99]:
rng + pd.offsets.BusinessDay(5)

DatetimeIndex(['2012-03-13', '2012-03-14', '2012-03-15', '2012-03-16',
               '2012-03-16'],
              dtype='datetime64[ns]', freq=None)

# Categoricals (2024-02-04)

In [101]:
df = pd.DataFrame(
    {"id": [1,2,3,4,5,6], "raw_grade": ["a", "b", "b", "a", "a", "e"]}
)
df

Unnamed: 0,id,raw_grade
0,1,a
1,2,b
2,3,b
3,4,a
4,5,a
5,6,e


In [102]:
df["grade"] = df["raw_grade"].astype("category") # 타입을 category로 변경하여 새로운 컬럼으로써 추가
df["grade"]

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']

In [106]:
new_categorics = ["very good", "good", "very bad"] # 변경해줄 새로운 카테고리들
df["grade"] = df["grade"].cat.rename_categories(new_categorics) # 기존 카테고리를 새로운 카테고리로 변경

In [107]:
df

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bad


In [108]:
df["grade"] = df["grade"].cat.set_categories(
    ["very bad", "bad", "medium", "good", "very good"]
)
df["grade"]

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): ['very bad', 'bad', 'medium', 'good', 'very good']

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.cat.html#pandas.Series.cat 
.cat.rename_categories(new_categorics)는 기존 카테고리를 새로운 카테고리로 변경하는 것이고  
.cat.set_categories(new_categorics)는 기존 카테고리를 새로운 카테고리로 변경하면서 추가로 없던 카테고리도 추가해줄 수 있다(카테고리 갯수를 늘릴 수 있다)

In [109]:
df.sort_values(by="grade") # 대신 정렬의 기준은 어휘적 순서가 아닌 범주에서 매겨진 값의 순서대로 순서가 매겨짐

Unnamed: 0,id,raw_grade,grade
5,6,e,very bad
1,2,b,good
2,3,b,good
0,1,a,very good
3,4,a,very good
4,5,a,very good


In [110]:
display(df.groupby("grade", observed=False).size()) 
# size로 크기를 구할 수 있기 때문에 이 방식을 통해 각 범주에 해당되는 값의 빈도수도 확인할 수 있다
# observed=False하면 카테고리에 대해 0인 것도 표시가 된다

grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64