<div style="line-height:0.5">
<h1 style="color:lightcoral"> Pandas basics 6 </h1>
</div>
<div style="line-height:1.5">
<div style="margin-top: -8px;">
<span style="display: inline-block;">
    <h3 style="color: lightblue; display: inline;">Keywords:</h3> time series + pd.from_dict() + np.nanmean() + pd.Timestamp() + pd.ffill()
</span>
</div>
</div>

In [7]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

<h2 style="color:lightcoral"> Example #1</h2>

In [2]:
# Create sample dataframe
data = {'DATE': ['08.06.2023', '07.06.2023', '06.06.2023', '05.06.2023', '04.06.2023', '22.12.2022', '08.06.2023', '07.06.2023', 
                '06.06.2023', '05.06.2023', '04.06.2023', '16.11.2022', '08.06.2023', '07.06.2023','06.06.2023', '05.06.2023','04.06.2023', '21.10.2021'],
        'CODE': ['APPLE', 'APPLE', 'APPLE', 'APPLE', 'APPLE', 'APPLE', 'HARLEY', 'HARLEY', 'HARLEY', 'HARLEY', 'HARLEY', 'HARLEY', 
                'META', 'META', 'META', 'META', 'META', 'META'],
        'CLOSE PRICE': [8.5, 7, 9.3, 12, 3, 25, 4, 2, 1, 5, 9.5, 4, 1.1, 2, 1, 15, 65, 5]}
df = pd.DataFrame(data)
df

Unnamed: 0,DATE,CODE,CLOSE PRICE
0,08.06.2023,APPLE,8.5
1,07.06.2023,APPLE,7.0
2,06.06.2023,APPLE,9.3
3,05.06.2023,APPLE,12.0
4,04.06.2023,APPLE,3.0
5,22.12.2022,APPLE,25.0
6,08.06.2023,HARLEY,4.0
7,07.06.2023,HARLEY,2.0
8,06.06.2023,HARLEY,1.0
9,05.06.2023,HARLEY,5.0


In [3]:
# Convert DATE column to datetime
df['DATE'] = pd.to_datetime(df['DATE'], format='%d.%m.%Y')

### Start dates for each code
start_dates = {'APPLE': datetime(2023, 6, 5),
                'HARLEY': datetime(2023, 6, 4),
                'META': datetime(2023, 6, 4)}

In [4]:
################## Calculate returns for each code and each time horizon
results = {}
for code in start_dates.keys():
    start_date = start_dates[code]
    filtered_df = df.loc[df['CODE']==code]
    if not filtered_df.empty:
        prices = filtered_df.loc[filtered_df['DATE']>=start_date, 'CLOSE PRICE'].values
        
        if len(prices) > 0:
            thirty_day_prices = prices[0:min(len(prices),30)]
            sixty_day_prices = prices[0:min(len(prices),60)]
            ninety_day_prices = prices[0:min(len(prices),90)]
            thirty_day_tot = (thirty_day_prices[-1] / thirty_day_prices[0]) - 1 if len(thirty_day_prices) > 1 else np.nan
            sixty_day_tot = (sixty_day_prices[-1] / sixty_day_prices[0]) - 1 if len(sixty_day_prices) > 1 else np.nan
            ninety_day_tot = (ninety_day_prices[-1] / ninety_day_prices[0]) - 1 if len(ninety_day_prices) > 1 else np.nan
            avg_thirty_day_return = np.nanmean(thirty_day_tot)
            avg_sixty_day_return = np.nanmean(sixty_day_tot)
            avg_ninety_day_return = np.nanmean(ninety_day_tot)
            results[code] = {'START DATE': start_date.strftime('%d.%m.%Y'),
                            'AVERAGE GAIN FOR 30 DAY FROM START DATE': f'{avg_thirty_day_return:.0%}' if not np.isnan(avg_thirty_day_return) else 'No data',
                            'AVERAGE GAIN FOR 60 DAY FROM START DATE': f'{avg_sixty_day_return:.0%}' if not np.isnan(avg_sixty_day_return) else 'No data',
                            'AVERAGE GAIN FOR 90 DAY FROM START DATE': f'{avg_ninety_day_return:.0%}' if not np.isnan(avg_ninety_day_return) else 'No data'}
        else:
            print(f"No data available for code {code} and start date {start_date.strftime('%d.%m.%Y')}")
    else:
        print(f"No data available for code {code}")

In [5]:
#### Create a new dataframe with the results
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df.index.name = 'CODE'
results_df

Unnamed: 0_level_0,START DATE,AVERAGE GAIN FOR 30 DAY FROM START DATE,AVERAGE GAIN FOR 60 DAY FROM START DATE,AVERAGE GAIN FOR 90 DAY FROM START DATE
CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
APPLE,05.06.2023,41%,41%,41%
HARLEY,04.06.2023,138%,138%,138%
META,04.06.2023,5809%,5809%,5809%


In [6]:
df.head()

<h2 style="color:lightcoral"> Example #2</h2>

In [33]:
# Create a time series
ts = pd.Timestamp('2022-01-01 00:00')
se1 = pd.Series(10000, index=[ts])
se1

2022-01-01    10000
dtype: int64

In [34]:
se1.index

DatetimeIndex(['2022-01-01'], dtype='datetime64[ns]', freq=None)

In [35]:
# Create Series with random values 
dts = pd.date_range('2022-01-01', '2022-12-31', freq='B')   #'B' to generate a datetime index with only business days (weekdays only).
ts = pd.Series(np.random.randn(len(dts)), index=dts)
ts.head()

2022-01-03    0.74
2022-01-04   -1.60
2022-01-05    0.30
2022-01-06   -0.75
2022-01-07   -0.31
Freq: B, dtype: float64

Available Frequency specifiers (for Pandas date_range)

- 'B' - Business days 
- 'D' - Calendar days
- 'W' - Weekly
- 'M' - End of month
- 'Q' - Quarter end
- 'A' - Year end

In [43]:
pd.set_option('display.max_rows', None)
ts

2022-01-03    7.38e-01
2022-01-04   -1.60e+00
2022-01-05    2.96e-01
2022-01-06   -7.55e-01
2022-01-07   -3.09e-01
2022-01-10   -1.55e+00
2022-01-11    1.67e+00
2022-01-12   -1.68e+00
2022-01-13    1.39e+00
2022-01-14   -9.03e-02
2022-01-17   -9.76e-01
2022-01-18    8.34e-01
2022-01-19    1.40e+00
2022-01-20    7.74e-02
2022-01-21    1.22e-01
2022-01-24   -2.61e-01
2022-01-25   -2.09e-02
2022-01-26    7.28e-02
2022-01-27    1.08e-01
2022-01-28   -8.27e-01
2022-01-31   -8.56e-01
2022-02-01   -2.01e-01
2022-02-02   -9.78e-01
2022-02-03   -5.83e-01
2022-02-04   -1.23e+00
2022-02-07    1.10e+00
2022-02-08    4.07e-01
2022-02-09    3.42e-01
2022-02-10    1.83e+00
2022-02-11   -5.21e-01
2022-02-14   -4.46e-01
2022-02-15   -4.71e-01
2022-02-16   -2.28e-01
2022-02-17   -9.15e-01
2022-02-18   -1.97e-01
2022-02-21   -2.49e-01
2022-02-22    1.25e+00
2022-02-23    1.26e-01
2022-02-24   -8.23e-01
2022-02-25   -5.13e-01
2022-02-28   -2.89e-01
2022-03-01   -1.87e+00
2022-03-02   -3.94e-01
2022-03-03 

In [42]:
for value in ts:
    print(value)

0.7378419954031566
-1.6001956301560334
0.29619811492582976
-0.7548011145190191
-0.30936192692618864
-1.5527383574945635
1.6658879849017847
-1.6849152228165876
1.3924371669740983
-0.09025444439196346
-0.9761042219995802
0.8344808333989452
1.3997409506241165
0.07739786829478416
0.12220268449997712
-0.2605832191766817
-0.020891708546271873
0.07276293385666244
0.10809097453525428
-0.8274051222126795
-0.8562819406742406
-0.20074634959158194
-0.9783463291052805
-0.5830775062201504
-1.2346856603973153
1.0956119451763395
0.40682286260189365
0.3424257610929548
1.8340316081929902
-0.5208007922077216
-0.44620745139212303
-0.47078063684482274
-0.22763466377018612
-0.9146998732862563
-0.19657939622279375
-0.24902246733242986
1.2469474958470603
0.12580922868708366
-0.8231458722480252
-0.513226120613669
-0.2885837910810507
-1.8696442279073813
-0.39390359145994147
0.22222147486044083
-0.010870010702883386
0.032466328444181305
0.20393501958280186
-0.7429556061238792
2.2671133289023833
0.600467636090705

In [49]:
# Select
sel_date = ts['2022-03-21']
sel_mon = ts['2022-04'].head()
sel_yea = ts['2022'].head()

print(sel_date)
print()
print(sel_mon)
print()
print(sel_yea)

-0.25488239318510275

2022-04-01    1.31
2022-04-04   -0.52
2022-04-05    0.77
2022-04-06   -0.02
2022-04-07    1.08
Freq: B, dtype: float64

2022-01-03    0.74
2022-01-04   -1.60
2022-01-05    0.30
2022-01-06   -0.75
2022-01-07   -0.31
Freq: B, dtype: float64


Leading and lagging are concepts used in time series analysis to look at relationships between the current value of a time series and past or future values.

+ Leading indicators are used to predict changes before they happen.
+ Lagging indicators are used to predict changes after they happen, like shifting the time series backwards in time. 
    + A lagged time series is a delayed version of the original series.

In [53]:
ts2k_jan = ts['2022-01'].copy()
ts2k_jan


2022-01-03    0.74
2022-01-04   -1.60
2022-01-05    0.30
2022-01-06   -0.75
2022-01-07   -0.31
2022-01-10   -1.55
2022-01-11    1.67
2022-01-12   -1.68
2022-01-13    1.39
2022-01-14   -0.09
2022-01-17   -0.98
2022-01-18    0.83
2022-01-19    1.40
2022-01-20    0.08
2022-01-21    0.12
2022-01-24   -0.26
2022-01-25   -0.02
2022-01-26    0.07
2022-01-27    0.11
2022-01-28   -0.83
2022-01-31   -0.86
Freq: B, dtype: float64

In [55]:
lags = ts2k_jan.shift(1)
lags

2022-01-03     NaN
2022-01-04    0.74
2022-01-05   -1.60
2022-01-06    0.30
2022-01-07   -0.75
2022-01-10   -0.31
2022-01-11   -1.55
2022-01-12    1.67
2022-01-13   -1.68
2022-01-14    1.39
2022-01-17   -0.09
2022-01-18   -0.98
2022-01-19    0.83
2022-01-20    1.40
2022-01-21    0.08
2022-01-24    0.12
2022-01-25   -0.26
2022-01-26   -0.02
2022-01-27    0.07
2022-01-28    0.11
2022-01-31   -0.83
Freq: B, dtype: float64

In [56]:
leads = ts2k_jan.shift(-1) 
leads

2022-01-03   -1.60
2022-01-04    0.30
2022-01-05   -0.75
2022-01-06   -0.31
2022-01-07   -1.55
2022-01-10    1.67
2022-01-11   -1.68
2022-01-12    1.39
2022-01-13   -0.09
2022-01-14   -0.98
2022-01-17    0.83
2022-01-18    1.40
2022-01-19    0.08
2022-01-20    0.12
2022-01-21   -0.26
2022-01-24   -0.02
2022-01-25    0.07
2022-01-26    0.11
2022-01-27   -0.83
2022-01-28   -0.86
2022-01-31     NaN
Freq: B, dtype: float64

In [54]:
# Lag => Show first and last few rows
ts2k_jan.shift(1).iloc[[0, 1, 2, -2, -1]] 

2022-01-03     NaN
2022-01-04    0.74
2022-01-05   -1.60
2022-01-28    0.11
2022-01-31   -0.83
dtype: float64

In [57]:
# Lag with date offset
lags2 = ts2k_jan.shift(2, freq=pd.Timedelta(days=1))
lags2

2022-01-05    0.74
2022-01-06   -1.60
2022-01-07    0.30
2022-01-08   -0.75
2022-01-09   -0.31
2022-01-12   -1.55
2022-01-13    1.67
2022-01-14   -1.68
2022-01-15    1.39
2022-01-16   -0.09
2022-01-19   -0.98
2022-01-20    0.83
2022-01-21    1.40
2022-01-22    0.08
2022-01-23    0.12
2022-01-26   -0.26
2022-01-27   -0.02
2022-01-28    0.07
2022-01-29    0.11
2022-01-30   -0.83
2022-02-02   -0.86
dtype: float64

In [58]:
# Lag day-to-day percentage change
lag_change = ts2k_jan.pct_change(1)
lag_change

2022-01-03     NaN
2022-01-04   -3.17
2022-01-05   -1.19
2022-01-06   -3.55
2022-01-07   -0.59
2022-01-10    4.02
2022-01-11   -2.07
2022-01-12   -2.01
2022-01-13   -1.83
2022-01-14   -1.06
2022-01-17    9.82
2022-01-18   -1.85
2022-01-19    0.68
2022-01-20   -0.94
2022-01-21    0.58
2022-01-24   -3.13
2022-01-25   -0.92
2022-01-26   -4.48
2022-01-27    0.49
2022-01-28   -8.65
2022-01-31    0.03
Freq: B, dtype: float64

In [60]:
# Correlate series with various lags:
corrs = [ts2k_jan.corr(ts2k_jan.shift(i)) for i in range(1,5)]
corrs

[-0.4471580565650897,
 0.2574152340351504,
 -0.08190493143447103,
 0.15284998614927153]

<h2 style="color:lightcoral"> Example #3 </h2>

In [61]:
# Create Time Series with random values 
ts3 = pd.Timestamp('2023-01-01 00:00')
se3 = pd.Series(10000, index=[ts3])
dts3 = pd.date_range('2023-01-01', '2023-12-31', freq='D')
ts3 = pd.Series(np.random.randn(len(dts3)), index=dts3)
ts3.head()

2023-01-01   -0.52
2023-01-02   -1.42
2023-01-03   -1.08
2023-01-04    0.97
2023-01-05   -0.23
Freq: D, dtype: float64

In [62]:
# Resampling 
ts31 = pd.Series(np.random.randn(len(dts3)), index=dts3)
ts31

2023-01-01    7.00e-02
2023-01-02    1.69e+00
2023-01-03   -1.42e+00
2023-01-04   -8.65e-02
2023-01-05    3.51e-01
2023-01-06    2.48e+00
2023-01-07    6.57e-01
2023-01-08    3.87e-01
2023-01-09    7.11e-01
2023-01-10    2.76e+00
2023-01-11   -2.15e-01
2023-01-12    1.91e-01
2023-01-13    1.31e+00
2023-01-14    7.00e-01
2023-01-15    1.36e+00
2023-01-16   -8.33e-01
2023-01-17    5.63e-01
2023-01-18    5.63e-01
2023-01-19   -6.60e-01
2023-01-20   -4.53e-01
2023-01-21    4.52e-01
2023-01-22    9.53e-01
2023-01-23    6.30e-01
2023-01-24   -7.60e-01
2023-01-25   -5.94e-01
2023-01-26    1.90e+00
2023-01-27    1.18e+00
2023-01-28    2.56e-01
2023-01-29    1.59e-01
2023-01-30   -9.01e-01
2023-01-31   -3.67e-01
2023-02-01   -8.45e-01
2023-02-02    8.63e-01
2023-02-03   -1.69e+00
2023-02-04    2.03e+00
2023-02-05    6.08e-01
2023-02-06    1.06e+00
2023-02-07   -7.66e-01
2023-02-08    1.75e+00
2023-02-09    4.34e-01
2023-02-10    1.54e-01
2023-02-11    1.00e+00
2023-02-12   -2.89e-02
2023-02-13 

In [70]:
grp3 = ts31.resample('M')

grp3, grp3.mean()

(<pandas.core.resample.DatetimeIndexResampler object at 0x7f39bb940340>,
 2023-01-31    4.20e-01
 2023-02-28    2.25e-01
 2023-03-31   -3.23e-02
 2023-04-30    2.68e-01
 2023-05-31   -2.06e-01
 2023-06-30    3.51e-01
 2023-07-31    2.61e-01
 2023-08-31    2.01e-01
 2023-09-30    2.55e-01
 2023-10-31    7.85e-03
 2023-11-30    2.19e-01
 2023-12-31    1.59e-01
 Freq: M, dtype: float64)

In [71]:
grp3.agg(
    monthly_mean='mean',
    monthly_std='std')

Unnamed: 0,monthly_mean,monthly_std
2023-01-31,0.42,0.98
2023-02-28,0.225,0.98
2023-03-31,-0.0323,1.18
2023-04-30,0.268,0.97
2023-05-31,-0.206,0.92
2023-06-30,0.351,1.09
2023-07-31,0.261,0.88
2023-08-31,0.201,0.93
2023-09-30,0.255,0.97
2023-10-31,0.00785,0.88


In [72]:
""" Filling Missing Data """

def make_series_32():
    dts = pd.date_range('2000-01-02', '2000-01-07', freq='D')
    rv = pd.Series(np.random.randn(len(dts)), index=dts)
    rv.iloc[[0, 2, 5]] = np.nan
    return rv

ts32 = make_series_32()
ts32

2000-01-02     NaN
2000-01-03   -0.90
2000-01-04     NaN
2000-01-05    0.66
2000-01-06    0.55
2000-01-07     NaN
Freq: D, dtype: float64

In [78]:
# Filling NaN Data 
ts33_ok = ts32.ffill()
ts33_ok

2000-01-02     NaN
2000-01-03   -0.90
2000-01-04   -0.90
2000-01-05    0.66
2000-01-06    0.55
2000-01-07    0.55
Freq: D, dtype: float64

In [79]:
# Filling NaN Backwards
ts33_ok = ts32.bfill()
ts33_ok

2000-01-02   -0.90
2000-01-03   -0.90
2000-01-04    0.66
2000-01-05    0.66
2000-01-06    0.55
2000-01-07     NaN
Freq: D, dtype: float64

In [None]:
# Filling specifying limits
def make_series_35():
    dts = pd.date_range('2000-01-01', '2000-01-05', freq='D')
    rv = pd.Series(np.random.randn(len(dts)), index=dts)
    rv[1:] = np.nan
    return rv

ts5 = make_series_35()
ts5

<h2 style="color:lightcoral"> Example #4 </h2>

In [8]:
pd.set_option('display.precision', 2)

In [13]:
# Read csv:
    # "parse_dates" specifies which columns should be parsed as datetime objects.
    # "usecols" specifies a subset of columns to load from the CSV. 
df2 = pd.read_csv('./data_pandas/speakers.csv', usecols=['date', 'adj_close'], parse_dates=['date'])
df2.head(20)

Unnamed: 0,date,adj_close
0,2016-04-12,2061.72
1,2016-04-11,2041.99
2,2016-04-08,2047.6
3,2016-04-07,2041.91
4,2016-04-06,2066.66
5,2016-04-05,2045.17
6,2016-04-04,2066.13
7,2016-04-01,2072.78
8,2016-03-31,2059.74
9,2016-03-30,2063.95


In [14]:
# Create a pandas Timestamp 
pd.Timestamp('2003-04-30 16:00')

Timestamp('2003-04-30 16:00:00')

In [15]:
""" Generate a an Index of 5 calendar days starting on January 1, 2006. """
pd.date_range(start='2006-01-01', periods=5, freq='D') 

DatetimeIndex(['2006-01-01', '2006-01-02', '2006-01-03', '2006-01-04',
               '2006-01-05'],
              dtype='datetime64[ns]', freq='D')

In [16]:
""" Get all days of the week starting on January 1, 2006 and ending on January 30, 2006. """
pd.date_range(start='2006-01-01', end='2006-01-30', freq='B')

DatetimeIndex(['2006-01-02', '2006-01-03', '2006-01-04', '2006-01-05',
               '2006-01-06', '2006-01-09', '2006-01-10', '2006-01-11',
               '2006-01-12', '2006-01-13', '2006-01-16', '2006-01-17',
               '2006-01-18', '2006-01-19', '2006-01-20', '2006-01-23',
               '2006-01-24', '2006-01-25', '2006-01-26', '2006-01-27',
               '2006-01-30'],
              dtype='datetime64[ns]', freq='B')

In [25]:
""" Create a Pandas Series named serieA from the adj_close column, using date as the index. """

serieA = pd.Series(df2.adj_close.array, index=df2.date, name='adj_close').sort_index() #n.b. => sort_index()!

type(serieA), serieA.head()

(pandas.core.series.Series,
 date
 1950-01-03    16.66
 1950-01-04    16.85
 1950-01-05    16.93
 1950-01-06    16.98
 1950-01-09    17.08
 Name: adj_close, dtype: float64)

In [19]:
len(serieA)

16676

In [21]:
""" Select January 20, 2006 from serieA. """

selection_way_1 = serieA['19950103'] 
# or
selection_way_2 = serieA['1995-01-03'] 
# or
selection_way_3 = serieA[datetime(1995, 1, 3)] 


selection_way_1, selection_way_2, selection_way_3

(459.109985, 459.109985, 459.109985)

In [22]:
mar_95 = serieA['1995-03']
mar_95

date
1995-03-01    485.65
1995-03-02    485.13
1995-03-03    485.42
1995-03-06    485.63
1995-03-07    482.12
1995-03-08    483.14
1995-03-09    483.16
1995-03-10    489.57
1995-03-13    490.05
1995-03-14    492.89
1995-03-15    491.88
1995-03-16    495.41
1995-03-17    495.52
1995-03-20    496.14
1995-03-21    495.07
1995-03-22    495.67
1995-03-23    495.95
1995-03-24    500.97
1995-03-27    503.20
1995-03-28    503.90
1995-03-29    503.12
1995-03-30    502.22
1995-03-31    500.71
Name: adj_close, dtype: float64

In [24]:
y_95 = serieA['1995']
y_95.head()

date
1995-01-03    459.11
1995-01-04    460.71
1995-01-05    460.34
1995-01-06    460.68
1995-01-09    460.83
Name: adj_close, dtype: float64

In [27]:
""" Calculate the day-over-day percent change in the values. 
shift() data by a specified number of periods.
"""
serie_day = serieA / serieA.shift(1) - 1
serie_day

date
1950-01-03         NaN
1950-01-04    1.14e-02
1950-01-05    4.75e-03
1950-01-06    2.95e-03
1950-01-09    5.89e-03
                ...   
2016-04-06    1.05e-02
2016-04-07   -1.20e-02
2016-04-08    2.79e-03
2016-04-11   -2.74e-03
2016-04-12    9.66e-03
Name: adj_close, Length: 16676, dtype: float64

In [29]:
""" Resample the data from daily to monthly  """
serie_mon = serie_day.resample('M').mean()
serie_mon

date
1950-01-31    1.18e-03
1950-02-28    5.63e-04
1950-03-31    1.87e-04
1950-04-30    2.02e-03
1950-05-31    2.05e-03
                ...   
2015-12-31   -7.39e-04
2016-01-31   -2.63e-03
2016-02-29   -1.42e-04
2016-03-31    2.93e-03
2016-04-30    1.52e-04
Freq: M, Name: adj_close, Length: 796, dtype: float64