<a href="https://colab.research.google.com/github/jadhav-rakesh/ML/blob/main/ds7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Converting Strings to Dates

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Problem: given a vector of strings that represent dates and times,
# convert each of them into a pandas Timestamp.

date_strings = np.array([
    '03-04-2005 11:35 PM',
    '23-05-2010 12:01 AM',
    '04-09-2009 09:09 PM',
])

# Format: day-month-year, followed by a 12-hour clock time with an AM/PM marker.
[
    pd.to_datetime(date_string, format='%d-%m-%Y %I:%M %p')
    for date_string in date_strings
]

[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2010-05-23 00:01:00'),
 Timestamp('2009-09-04 21:09:00')]

In [3]:
# Same conversion, but errors="coerce" turns any string that cannot be parsed
# into NaT instead of raising. Every string here is valid, so the result is
# unchanged.
[pd.to_datetime(date, format='%d-%m-%Y %I:%M %p', errors="coerce") for date in date_strings]

[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2010-05-23 00:01:00'),
 Timestamp('2009-09-04 21:09:00')]

# Handling Time Zones

In [4]:
# Problem: you have time series data and want to add or change time zone information.

# A time zone can be attached when the Timestamp is created, via the tz argument.
pd.Timestamp('2017-05-01 06:00:00', tz="Europe/London")

Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London')

In [5]:
# Start from a naive timestamp, i.e. one created without a time zone.
date = pd.Timestamp("2017-05-01 06:00:00")

# tz_localize attaches a zone to the existing wall-clock time.
date_in_london = date.tz_localize(tz="Europe/London")
date_in_london

Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London')

In [8]:
# tz_convert re-expresses the same instant in another zone:
# 06:00 at +01:00 in London is 05:00 at +00:00 in Abidjan.
date_in_london.tz_convert("Africa/Abidjan")

Timestamp('2017-05-01 05:00:00+0000', tz='Africa/Abidjan')

In [10]:
# Three consecutive month-end dates. An explicit MonthEnd offset is used
# instead of the "M" frequency alias: pandas has deprecated "M" in favour of
# "ME" and emits a FutureWarning for it, while the offset object yields the
# same dates without depending on which alias the installed version accepts.
dates = pd.Series(pd.date_range('2/2/2002', periods=3, freq=pd.offsets.MonthEnd()))

# On a Series, time zone methods are reached through the .dt accessor.
dates.dt.tz_localize('Africa/Abidjan')

  dates = pd.Series(pd.date_range('2/2/2002', periods=3, freq='M'))


Unnamed: 0,0
0,2002-02-28 00:00:00+00:00
1,2002-03-31 00:00:00+00:00
2,2002-04-30 00:00:00+00:00


In [11]:
# all_timezones is pytz's list of time zone name strings.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible in one place.
from pytz import all_timezones

# Show only the first five names.
all_timezones[0:5]

['Africa/Abidjan',
 'Africa/Accra',
 'Africa/Addis_Ababa',
 'Africa/Algiers',
 'Africa/Asmara']

# Selecting Dates and Times

In [15]:
# Problem: you have a vector of dates and want to select one or more of them.

# 100,000 hourly timestamps, starting at midnight on 2001-01-01.
df = pd.DataFrame({"date": pd.date_range('1/1/2001', periods=100000, freq='h')})

# Keep the rows strictly after 01:00 and up to and including 04:00 on 2002-01-01.
df[df['date'].between('2002-1-1 01:00:00', '2002-1-1 04:00:00', inclusive='right')]

Unnamed: 0,date
8762,2002-01-01 02:00:00
8763,2002-01-01 03:00:00
8764,2002-01-01 04:00:00


In [16]:
# Index the frame by its own date column. drop=False keeps "date" as a regular
# column as well, so it still appears next to the index.
df = df.set_index("date", drop=False)

# Label-based slicing on the datetime index includes both endpoints.
df.loc['2002-1-1 01:00:00':'2002-1-1 04:00:00']

Unnamed: 0_level_0,date
date,Unnamed: 1_level_1
2002-01-01 01:00:00,2002-01-01 01:00:00
2002-01-01 02:00:00,2002-01-01 02:00:00
2002-01-01 03:00:00,2002-01-01 03:00:00
2002-01-01 04:00:00,2002-01-01 04:00:00


# Breaking Up Date Data into Multiple Features

In [19]:
# 150 dates spaced one week apart.
df = pd.DataFrame({"date": pd.date_range('1/1/2001', periods=150, freq='W')})

# Split every date into separate calendar and clock features via the .dt accessor.
df = df.assign(
    year=df["date"].dt.year,
    month=df["date"].dt.month,
    day=df["date"].dt.day,
    hour=df["date"].dt.hour,
    minute=df["date"].dt.minute,
)

df.head()

Unnamed: 0,date,year,month,day,hour,minute
0,2001-01-07,2001,1,7,0,0
1,2001-01-14,2001,1,14,0,0
2,2001-01-21,2001,1,21,0,0
3,2001-01-28,2001,1,28,0,0
4,2001-02-04,2001,2,4,0,0


# Calculating the Difference Between Dates

In [24]:
# Problem: you have two datetime features and want to calculate the
# time between them for each observation.

df = pd.DataFrame({
    'Arrived': [pd.Timestamp('01-01-2017'), pd.Timestamp('01-04-2017')],
    'Left': [pd.Timestamp('01-01-2017'), pd.Timestamp('01-06-2017')],
})

# Subtracting one datetime column from another gives one duration per row.
df['Left'] - df['Arrived']

Unnamed: 0,0
0,0 days
1,2 days


In [25]:
# Whole days per row as plain integers. The vectorized .dt.days accessor
# replaces a Python-level loop over the individual Timedelta values and gives
# the same numbers.
(df["Left"] - df["Arrived"]).dt.days

Unnamed: 0,0
0,0
1,2


# Encoding Days of the Week

In [26]:
# Problem: you have a vector of dates and want to know the day of the week for each date.

# Three consecutive month-end dates. An explicit MonthEnd offset is used
# instead of the "M" frequency alias, which pandas has deprecated in favour of
# "ME" and which triggers a FutureWarning.
dates = pd.Series(pd.date_range("2/2/2002", periods=3, freq=pd.offsets.MonthEnd()))

# Weekday names as strings.
dates.dt.day_name()

  dates = pd.Series(pd.date_range("2/2/2002", periods=3, freq="M"))


Unnamed: 0,0
0,Thursday
1,Sunday
2,Tuesday


In [27]:
# Numeric encoding of the same weekdays, counting from Monday = 0
# (so Thursday -> 3, Sunday -> 6, Tuesday -> 1).
dates.dt.weekday

Unnamed: 0,0
0,3
1,6
2,1


# Creating a Lagged Feature

In [29]:
# Problem: you want to create a feature that is lagged by n time periods.

# Five consecutive days with one price per day.
df = pd.DataFrame({
    "dates": pd.date_range("1/1/2001", periods=5, freq="D"),
    "stock_price": [1.1, 2.2, 3.3, 4.4, 5.5],
})

# Shift the prices down by one row so each row also carries the previous
# day's value; the first row has no predecessor and becomes NaN.
df["previous_days_stock_price"] = df["stock_price"].shift(periods=1)
df

Unnamed: 0,dates,stock_price,previous_days_stock_price
0,2001-01-01,1.1,
1,2001-01-02,2.2,1.1
2,2001-01-03,3.3,2.2
3,2001-01-04,4.4,3.3
4,2001-01-05,5.5,4.4


# Using Rolling Time Windows

In [30]:
# Problem: given time series data, you want to calculate a statistic over a rolling window.

# Five consecutive month-end timestamps. An explicit MonthEnd offset is used
# instead of the "M" frequency alias, which pandas has deprecated in favour of
# "ME" and which triggers a FutureWarning.
time_index = pd.date_range("01/01/2010", periods=5, freq=pd.offsets.MonthEnd())

df = pd.DataFrame(index=time_index)

df["Stock_Price"] = [1,2,3,4,5]

# Mean over each row and the one before it; the first row has no complete
# two-row window, so its result is NaN.
df.rolling(window=2).mean()

  time_index = pd.date_range("01/01/2010", periods=5, freq="M")


Unnamed: 0,Stock_Price
2010-01-31,
2010-02-28,1.5
2010-03-31,2.5
2010-04-30,3.5
2010-05-31,4.5


# Handling Missing Data in Time Series

In [32]:
# Five consecutive month-end timestamps. An explicit MonthEnd offset is used
# instead of the "M" frequency alias, which pandas has deprecated in favour of
# "ME" and which triggers a FutureWarning.
time_index = pd.date_range('01/01/2010', periods=5, freq=pd.offsets.MonthEnd())

df = pd.DataFrame(index=time_index)

# Two consecutive months are missing in the middle of the series.
df["Sales"] = [1.0, 2.0, np.nan, np.nan, 5.0]

# Default (linear) interpolation fills the gap with evenly spaced values.
df.interpolate()

  time_index = pd.date_range('01/01/2010', periods=5, freq="M")


Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.0
2010-04-30,4.0
2010-05-31,5.0


In [34]:
# Forward-fill: each missing value takes the last known value before it (2.0 here).
df.ffill()

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,2.0
2010-04-30,2.0
2010-05-31,5.0


In [35]:
# Back-fill: each missing value takes the next known value after it (5.0 here).
df.bfill()

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,5.0
2010-04-30,5.0
2010-05-31,5.0


In [36]:
# Quadratic interpolation fits a curve through the known points, so the filled
# values are no longer the evenly spaced 3.0 / 4.0 of the linear default.
# NOTE(review): pandas hands non-linear methods such as this one to SciPy,
# so SciPy needs to be installed - confirm for the target environment.
df.interpolate(method="quadratic")

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.059808
2010-04-30,4.038069
2010-05-31,5.0


In [38]:
# Fill at most one consecutive missing value, working forward from the last
# known value; the second missing month therefore stays NaN.
df.interpolate(limit=1, limit_direction="forward")

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.0
2010-04-30,
2010-05-31,5.0
