# Computing relative dates in pandas

In [4]:
import pandas as pd
import numpy as np
from pandas.util.testing import makeTimeDataFrame

In [5]:
# Dummy dataframe with a business-day date index and a random ID column.
# The index is built directly with pd.date_range rather than through
# makeTimeDataFrame: that helper comes from the deprecated pandas.util.testing
# module, and its four random value columns (A-D) were dropped right away.
SEED = 42
N_DAYS = 30

dates = pd.date_range(start='2000-01-03', periods=N_DAYS, freq='B')

# Seeded generator so the IDs (1, 2 or 3; the upper bound is exclusive)
# come out the same on every run
rng = np.random.default_rng(SEED)
df = pd.DataFrame({'ID': rng.integers(1, 4, size=N_DAYS)}, index=dates)
df.head()

Unnamed: 0,ID
2000-01-03,1
2000-01-04,3
2000-01-05,2
2000-01-06,1
2000-01-07,3


In [7]:
# Drop 30% of the rows by keeping a random 70% sample.
# random_state pins the sample so that a fresh run selects the same rows.
# NOTE: this cell overwrites its own input, so running it a second time
# without re-creating df shrinks the frame again.
df = df.sample(frac=0.7, random_state=42)
len(df)

15

In [8]:
# Copy the date index into its own 'date' column, then swap the index for a
# plain 0..n-1 range (drop=True discards the old index instead of keeping it
# as a second copy of the dates).
df = df.assign(date=df.index).reset_index(drop=True)
df.head()

Unnamed: 0,ID,date
0,2,2000-01-13
1,1,2000-01-24
2,2,2000-01-20
3,3,2000-01-27
4,3,2000-01-17


In [9]:
# Order the rows chronologically, earliest date first
df = df.sort_values('date')

In [10]:
def do_date_differences(row, col, date_col='date', frame=None):
    """Return how far a row's date lies after the earliest date of its group.

    The group consists of every row in ``frame`` whose ``col`` value equals
    the one in ``row``.

    Parameters
    ----------
    row : pandas.Series
        A single row providing at least ``col`` and ``date_col``.
    col : str
        Name of the column that identifies the group (e.g. ``'ID'``).
    date_col : str, default 'date'
        Name of the column holding the dates.
    frame : pandas.DataFrame, optional
        Frame that is searched for the group's dates. When omitted, the
        notebook-level ``df`` is used.

    Returns
    -------
    The value of ``row[date_col]`` minus the group's earliest date. For a
    datetime column this is a Timedelta; it is NaT when no row in ``frame``
    shares the row's key.
    """
    if frame is None:
        # Fall back to the notebook's global frame for existing callers
        frame = df

    # isin (rather than ==) so that a missing key still matches other
    # missing keys
    same_group = frame[col].isin([row[col]])

    # min() instead of "first row" so the result does not depend on the
    # frame having been sorted by date beforehand
    earliest = frame.loc[same_group, date_col].min()
    return row[date_col] - earliest


# axis=1 makes apply hand each row (rather than each column) to the function;
# 'ID' and 'date' are forwarded as its col and date_col arguments
df['difference'] = df.apply(do_date_differences, axis=1, args=('ID', 'date'))
df

Unnamed: 0,ID,date,difference
13,1,2000-01-03,0 days
12,3,2000-01-04,0 days
11,2,2000-01-05,0 days
14,3,2000-01-10,6 days
0,2,2000-01-13,8 days
7,2,2000-01-14,9 days
4,3,2000-01-17,13 days
10,1,2000-01-18,15 days
8,3,2000-01-19,15 days
2,2,2000-01-20,15 days


In [12]:
# Show only the rows that belong to ID 1
df.loc[df['ID'].eq(1)]

Unnamed: 0,ID,date,difference
13,1,2000-01-03,0 days
10,1,2000-01-18,15 days
1,1,2000-01-24,21 days
6,1,2000-02-10,38 days
5,1,2000-02-11,39 days


In [16]:
# Show only the rows whose difference is strictly greater than 15 days
df.loc[df['difference'] > pd.Timedelta('15 days')]

Unnamed: 0,ID,date,difference
1,1,2000-01-24,21 days
3,3,2000-01-27,23 days
9,3,2000-02-09,36 days
6,1,2000-02-10,38 days
5,1,2000-02-11,39 days


In [17]:
# Print a summary of the frame: index, columns, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15 entries, 13 to 5
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype          
---  ------      --------------  -----          
 0   ID          15 non-null     int64          
 1   date        15 non-null     datetime64[ns] 
 2   difference  15 non-null     timedelta64[ns]
dtypes: datetime64[ns](1), int64(1), timedelta64[ns](1)
memory usage: 480.0 bytes
