### Pandas
> Quick gothrough

In [1]:
!pip install pandas -qU


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


#### data class
<pre>
    - Series: 1d arr to hold any data type
    - Dataframe: 2d data structure to handle row or col
</pre>

In [2]:
import pandas as pd
import numpy as np

In [3]:
## series
s = pd.Series([11, 22, 33])
s

0    11
1    22
2    33
dtype: int64

In [4]:
## df
# create datetime index with date_range
dt = pd.date_range("20240105", periods=8)
print(dt)
print()
# create df with numpy arr and dt as index
df = pd.DataFrame(np.random.randn(8, 4), index=dt, columns=list("abcd"))
print(df)

DatetimeIndex(['2024-01-05', '2024-01-06', '2024-01-07', '2024-01-08',
               '2024-01-09', '2024-01-10', '2024-01-11', '2024-01-12'],
              dtype='datetime64[ns]', freq='D')

                   a         b         c         d
2024-01-05 -1.211673  0.941568 -1.561177 -0.547745
2024-01-06  0.013368 -0.106951 -2.315565 -0.478239
2024-01-07 -0.596637  0.267147  2.130864 -0.515118
2024-01-08  2.399555  0.779150 -1.533278  0.380306
2024-01-09 -0.382796  1.197773 -0.038580  1.877805
2024-01-10 -2.625035 -1.551799  1.715700 -1.148536
2024-01-11 -1.103701 -0.229666 -0.870151  0.543942
2024-01-12  0.636748 -0.574155 -1.181298 -1.137014


In [6]:
## create df with dict
df_2 = pd.DataFrame(
    {
        "a": 1,
        "b": pd.Timestamp("20240105"),
        "c": pd.Series(1, index = list(range(5)), dtype="float16"),
        "d": pd.Categorical(["red", "green", "blue", "purple", "black"]),
        "e": "color",
    }
)
print(df_2)
print("-"*12)
print(df_2.dtypes)

   a          b    c       d      e
0  1 2024-01-05  1.0     red  color
1  1 2024-01-05  1.0   green  color
2  1 2024-01-05  1.0    blue  color
3  1 2024-01-05  1.0  purple  color
4  1 2024-01-05  1.0   black  color
------------
a            int64
b    datetime64[s]
c          float16
d         category
e           object
dtype: object


In [9]:
## view df
print(df.head()) # first 5 (default)
print("-"*50)
print(df.tail(2)) # last 2
print("-"*50)
print(df.index) # indexs
print("-"*50)
print(df.columns) # col names
print("-"*50)
print(df.describe) # summary of data

                   a         b         c         d
2024-01-05 -1.211673  0.941568 -1.561177 -0.547745
2024-01-06  0.013368 -0.106951 -2.315565 -0.478239
2024-01-07 -0.596637  0.267147  2.130864 -0.515118
2024-01-08  2.399555  0.779150 -1.533278  0.380306
2024-01-09 -0.382796  1.197773 -0.038580  1.877805
--------------------------------------------------
                   a         b         c         d
2024-01-11 -1.103701 -0.229666 -0.870151  0.543942
2024-01-12  0.636748 -0.574155 -1.181298 -1.137014
--------------------------------------------------
DatetimeIndex(['2024-01-05', '2024-01-06', '2024-01-07', '2024-01-08',
               '2024-01-09', '2024-01-10', '2024-01-11', '2024-01-12'],
              dtype='datetime64[ns]', freq='D')
--------------------------------------------------
Index(['a', 'b', 'c', 'd'], dtype='object')
--------------------------------------------------
<bound method NDFrame.describe of                    a         b         c         d
2024-01-05 -1.211

In [10]:
## converts to np
df.to_numpy()

array([[-1.21167273,  0.94156809, -1.56117741, -0.5477446 ],
       [ 0.01336849, -0.10695109, -2.31556527, -0.47823925],
       [-0.59663657,  0.26714704,  2.1308638 , -0.51511805],
       [ 2.39955494,  0.77915024, -1.53327771,  0.38030592],
       [-0.38279636,  1.19777276, -0.03857967,  1.87780503],
       [-2.62503493, -1.55179926,  1.71569991, -1.1485359 ],
       [-1.10370118, -0.22966645, -0.87015126,  0.54394231],
       [ 0.6367482 , -0.57415534, -1.18129811, -1.13701358]])

In [12]:
## sorting
print(df_2.sort_index(axis=1, ascending=True))
print("-"*50)
print(df_2.sort_values(by="d")) # according to val of col d

   a          b    c       d      e
0  1 2024-01-05  1.0     red  color
1  1 2024-01-05  1.0   green  color
2  1 2024-01-05  1.0    blue  color
3  1 2024-01-05  1.0  purple  color
4  1 2024-01-05  1.0   black  color
--------------------------------------------------
   a          b    c       d      e
4  1 2024-01-05  1.0   black  color
2  1 2024-01-05  1.0    blue  color
1  1 2024-01-05  1.0   green  color
3  1 2024-01-05  1.0  purple  color
0  1 2024-01-05  1.0     red  color


In [13]:
## access data
print(df["a"]) # col a
print("-"*50)
print(df[0:2]) # row 0 and 1

2024-01-05   -1.211673
2024-01-06    0.013368
2024-01-07   -0.596637
2024-01-08    2.399555
2024-01-09   -0.382796
2024-01-10   -2.625035
2024-01-11   -1.103701
2024-01-12    0.636748
Freq: D, Name: a, dtype: float64
--------------------------------------------------
                   a         b         c         d
2024-01-05 -1.211673  0.941568 -1.561177 -0.547745
2024-01-06  0.013368 -0.106951 -2.315565 -0.478239


In [19]:
## select
df_2.loc[:, ["a"]]

Unnamed: 0,a
0,1
1,1
2,1
3,1
4,1


In [26]:
df.loc["20240105":"20240108", ["b", "c"]]

Unnamed: 0,b,c
2024-01-05,0.941568,-1.561177
2024-01-06,-0.106951,-2.315565
2024-01-07,0.267147,2.130864
2024-01-08,0.77915,-1.533278


In [32]:
df_2.at[0, "a"]

1

In [34]:
df.iloc[2] #index 2

a   -0.596637
b    0.267147
c    2.130864
d   -0.515118
Name: 2024-01-07 00:00:00, dtype: float64

In [37]:
df_2.iat[2, 2] # index 2 

1.0

In [38]:
df_2.iat[2, 3]

'blue'