# Pandas Refresher & Missing Values Treatment

### Loading Libraries

In [3]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import plotly.io as pio
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# OS
import os

from pathlib import Path

# Notebook Optimizer
from tqdm.auto import tqdm

# Itertools
from itertools import cycle

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# !pip install -U kaleido

In [7]:
# Make "plotly_white" the default template for Plotly figures
pio.templates.default = "plotly_white"

In [10]:
# Uncomment to auto-reload edited modules (the extension must be loaded before it is configured)
# %load_ext autoreload
# %autoreload 2

In [11]:
# Register tqdm with pandas (enables the `progress_*` methods, e.g. `progress_apply`)
tqdm.pandas()

# Fix the global NumPy seed so random draws are reproducible after a kernel restart.
# (np.random.seed() with no argument re-seeds from OS entropy, i.e. differently on every run.)
SEED = 42
np.random.seed(SEED)

In [12]:
# Toggle: set to True to persist generated figures to disk.
SAVE_FIGURES = False
if SAVE_FIGURES:
    # Create the output folder (and any missing parents); no error if it already exists.
    Path("imgs/chap_02").mkdir(parents=True, exist_ok=True)

## Loading Data

### Pandas Datetime Operations, Indexing, & Slicing – A refresher

In [14]:
# Read the workbook straight from the UCI ML repository.
# The first spreadsheet row is skipped, so the column names come from the second row.
df = pd.read_excel(
    io="https://archive.ics.uci.edu/ml/machine-learning-databases/00247/data_akbilgic.xlsx",
    skiprows=1,
)

# Preview the first rows
df.head()


Unknown extension is not supported and will be removed



Unnamed: 0,date,ISE,ISE.1,SP,DAX,FTSE,NIKKEI,BOVESPA,EU,EM
0,2009-01-05,0.035754,0.038376,-0.004679,0.002193,0.003894,0.0,0.03119,0.012698,0.028524
1,2009-01-06,0.025426,0.031813,0.007787,0.008455,0.012866,0.004162,0.01892,0.011341,0.008773
2,2009-01-07,-0.028862,-0.026353,-0.030469,-0.017833,-0.028735,0.017293,-0.035899,-0.017073,-0.020015
3,2009-01-08,-0.062208,-0.084716,0.003391,-0.011726,-0.000466,-0.040061,0.028283,-0.005561,-0.019424
4,2009-01-09,0.00986,0.009658,-0.021533,-0.019873,-0.01271,-0.004474,-0.009764,-0.010989,-0.007802


### Converting date columns to `pd.Timestamp` / `DatetimeIndex`

In [15]:
# 13 cannot be a month, so the parser reads the string as day-month-year
parsed_date = pd.to_datetime("13-4-1987")
parsed_date.strftime("%d, %B %Y")





'13, April 1987'

In [16]:
# Ambiguous string: by default the first number is taken as the month
parsed_date = pd.to_datetime("4-1-1987")
parsed_date.strftime("%d, %B %Y")

'01, April 1987'

In [17]:
# dayfirst=True resolves the ambiguity the other way: the first number is the day
parsed_date = pd.to_datetime("4-1-1987", dayfirst=True)
parsed_date.strftime("%d, %B %Y")

'04, January 1987'

In [18]:
# Unusual separators: spell out the layout with an explicit format string
parsed_date = pd.to_datetime("4|1|1987", format="%d|%m|%Y")
parsed_date.strftime("%d, %B %Y")

'04, January 1987'

In [19]:
# Parse the `date` column as datetimes; yearfirst=True makes the parser read an
# ambiguous leading field as the year.
df['date'] = pd.to_datetime(df['date'], yearfirst=True)
# Show the resulting dtype of the column
df['date'].dtype

dtype('<M8[ns]')

In [20]:
# Earliest and latest dates present in the data
date_col = df["date"]
(date_col.min(), date_col.max())

(Timestamp('2009-01-05 00:00:00'), Timestamp('2011-02-22 00:00:00'))

#### The `.dt` Accessor and Datetime Properties

In [21]:
# Preview the frame again before using the `.dt` accessor on `date`
df.head()

Unnamed: 0,date,ISE,ISE.1,SP,DAX,FTSE,NIKKEI,BOVESPA,EU,EM
0,2009-01-05,0.035754,0.038376,-0.004679,0.002193,0.003894,0.0,0.03119,0.012698,0.028524
1,2009-01-06,0.025426,0.031813,0.007787,0.008455,0.012866,0.004162,0.01892,0.011341,0.008773
2,2009-01-07,-0.028862,-0.026353,-0.030469,-0.017833,-0.028735,0.017293,-0.035899,-0.017073,-0.020015
3,2009-01-08,-0.062208,-0.084716,0.003391,-0.011726,-0.000466,-0.040061,0.028283,-0.005561,-0.019424
4,2009-01-09,0.00986,0.009658,-0.021533,-0.019873,-0.01271,-0.004474,-0.009764,-0.010989,-0.007802


In [22]:
# Read individual calendar fields of the first row through the `.dt` accessor
# (dayofweek counts from Monday=0), then print them as one report.
first_row_report = f"""
Date: {df.date.iloc[0]}
Day of year: {df.date.dt.day_of_year.iloc[0]}
Day of week: {df.date.dt.dayofweek.iloc[0]}
Month: {df.date.dt.month.iloc[0]}
Month Name: {df.date.dt.month_name().iloc[0]}
Quarter: {df.date.dt.quarter.iloc[0]}
Year: {df.date.dt.year.iloc[0]}
ISO Week: {df.date.dt.isocalendar().week.iloc[0]}
"""
print(first_row_report)


Date: 2009-01-05 00:00:00
Day of year: 5
Day of week: 0
Month: 1
Month Name: January
Quarter: 1
Year: 2009
ISO Week: 2



#### Slicing and Indexing

In [23]:
# Make `date` the index so rows can be selected with (partial) date strings.
# Guarded so the cell can be re-run: once `date` is the index it is no longer a
# column, and an unguarded set_index("date") would raise KeyError.
if "date" in df.columns:
    df = df.set_index("date")

# NOTE: only the last expression of a cell is displayed; the earlier slices are
# evaluated and discarded. String slices on a DatetimeIndex include BOTH endpoints.

# Select all data from 2010-01-04 onwards (inclusive)
df.loc["2010-01-04":]

# Select all data between 2010-01-04 and 2010-02-06 (both inclusive)
df.loc["2010-01-04":"2010-02-06"]

# Select data up to and including the whole of 2010
df.loc[:"2010"]

# Select data between 2010-01 and 2010-06 (both months fully included)
df.loc["2010-01":"2010-06"]

Unnamed: 0_level_0,ISE,ISE.1,SP,DAX,FTSE,NIKKEI,BOVESPA,EU,EM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-01-04,0.010229,0.014478,0.015916,0.000000,0.016018,0.000000,0.000000,0.016778,0.008399
2010-01-05,0.013898,0.024019,0.003111,-0.002722,0.004028,0.002535,0.002780,0.001316,0.008067
2010-01-06,0.007957,0.005706,0.000545,0.000409,0.001357,0.004635,0.006938,0.000586,0.005330
2010-01-07,0.007772,0.007498,0.003993,-0.002484,-0.000597,-0.004650,-0.003938,-0.000360,-0.006375
2010-01-08,-0.003189,0.000835,0.002878,0.003027,0.001356,0.010862,-0.002672,0.003554,0.001229
...,...,...,...,...,...,...,...,...,...
2010-06-24,0.000166,-0.008024,-0.016946,-0.014455,-0.015236,0.000467,-0.018948,-0.017801,-0.006717
2010-06-25,-0.000920,-0.003591,0.002855,-0.007366,-0.010585,-0.019411,0.013778,-0.007954,-0.002311
2010-06-28,0.010132,0.016951,-0.002036,0.014168,0.004981,-0.004481,-0.009283,0.009125,0.000540
2010-06-29,-0.021816,-0.028443,-0.031508,-0.033893,-0.031547,-0.012798,-0.035613,-0.034903,-0.021033


#### Sequences and Offsets of Dates

In [24]:
# Daily dates between an explicit start and end (both endpoints included)
daily_range = pd.date_range(start="2018-01-20", end="2018-01-23", freq="D")
daily_range.strftime("%Y-%m-%d").tolist()

['2018-01-20', '2018-01-21', '2018-01-22', '2018-01-23']

In [25]:
# Same range, defined by a start date and the number of daily periods to generate
daily_range = pd.date_range(start="2018-01-20", periods=4, freq="D")
daily_range.strftime("%Y-%m-%d").tolist()

['2018-01-20', '2018-01-21', '2018-01-22', '2018-01-23']

In [26]:
# Four dates spaced two days apart
two_day_range = pd.date_range(start="2018-01-20", periods=4, freq="2D")
two_day_range.strftime("%Y-%m-%d").tolist()

['2018-01-20', '2018-01-22', '2018-01-24', '2018-01-26']

In [27]:
# Monthly sequence anchored on month starts; the start date is not a month
# start, so the first value is the next one (2018-02-01)
month_start_range = pd.date_range(start="2018-01-20", periods=4, freq="MS")
month_start_range.strftime("%Y-%m-%d").tolist()

['2018-02-01', '2018-03-01', '2018-04-01', '2018-05-01']

In [28]:
# Shift every date in the range forward by four days
base_range = pd.date_range(start="2018-01-20", end="2018-01-23", freq="D")
shifted_range = base_range + pd.Timedelta(4, unit="D")
shifted_range.strftime("%Y-%m-%d").tolist()

['2018-01-24', '2018-01-25', '2018-01-26', '2018-01-27']

In [29]:
# Shift every date in the range forward by four weeks
base_range = pd.date_range(start="2018-01-20", end="2018-01-23", freq="D")
shifted_range = base_range + pd.Timedelta(4, unit="W")
shifted_range.strftime("%Y-%m-%d").tolist()

['2018-02-17', '2018-02-18', '2018-02-19', '2018-02-20']

### Resampling, Shifting, Rolling Window, & Expanding Window Operations

#### Resampling