# Data Programming in Python | BAIS:6040
# Data Manipulation & Dates in Pandas 

### Adding Columns - Feature Engineering

In [None]:
import pandas as pd
import numpy as np

# 100 rows x 2 columns drawn from a standard normal distribution (mean 0, sd 1)
df = pd.DataFrame(
    np.random.normal(loc=0, scale=1, size=(100, 2)),
    columns=['Num1', 'Num2'],
)
df.head()

#### Add a Boolean Column

In [None]:
# element-wise comparison: True on the rows where Num1 is strictly greater than Num2
df['Num1Larger'] = df['Num1'].gt(df['Num2'])

df.head()

#### Add New Column Using Vectorization

In [None]:
# absolute difference of the two columns, computed on whole columns at once
df['Difference'] = (df['Num1'] - df['Num2']).abs()

df.head()

#### Add New Column Using List Comprehension

In [None]:
# same absolute difference, built one row at a time with a list comprehension;
# zip walks both columns in lockstep, so no index lookups are needed and the
# result does not depend on the dataframe having a default 0..n-1 index
df['Difference2'] = [abs(num1 - num2) for num1, num2 in zip(df['Num1'], df['Num2'])]

df.head()

#### Add New Conditional as String

https://numpy.org/doc/1.18/reference/generated/numpy.where.html

In [None]:
# np.where(condition, value_if_true, value_if_false) picks a label per row
df['LargerCategory'] = np.where(
    df['Num1'].ge(df['Num2']),
    "Num1Bigger",
    "Num2Bigger",
)

df.head()

In [None]:
df.info()

#### Add New Conditional Column as Pandas Category

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Categorical.html

In [None]:
# same labels as before, but wrapped in pd.Categorical so the column gets the category dtype
df['LargerCategory'] = pd.Categorical(
    np.where(df['Num1'].ge(df['Num2']), "Num1Bigger", "Num2Bigger")
)

df.head()

In [None]:
df.info()

### Example of Widening the Data into Tidy / Panel Data 

- Oftentimes you have data that is not in a Tidy or Panel Data format.
- This generally happens when using SQL to query data from a relational database that is normalized.

https://en.wikipedia.org/wiki/Tidy_data

In [None]:
import pandas as pd
import os

# the csv is expected in a 'data' folder under the current working directory
filepath = os.path.join(os.getcwd(), 'data', 'pie_rates.csv')
df = pd.read_csv(filepath_or_buffer=filepath)

df.head(n=10)

#### Pandas pivot_table - Similar to spread() in R

- Organizing the data around the type of pie with a separate Rate column for each Country
- Reset the index in-place so that Pie is its own column and not an index
- Setting the row and column index names to None so they don't display

In [None]:
# one row per Pie and one Rate column per Country (pivot_table averages duplicates by default)
df1 = df.pivot_table(index='Pie', columns='Country', values='Rate')

# move Pie out of the index so that it is its own column
df1 = df1.reset_index()

# clear the row and column index names so they don't display
df1.columns.name = None
df1.index.name = None

# notice the null values created
df1

### Example of Narrowing the Data 
#### Pandas Melt - Similar to gather() in R

- Gather the data back to the original so that Pie and Country are columns
- Drop the rows with nulls (in-place) to get back to the original

In [None]:
# Pie stays as the identifier; every other column name goes into 'Country'
# and its value into 'Rate'
df2 = df1.melt(id_vars='Pie', var_name='Country', value_name='Rate')

# notice the null values are still present
df2.head(n=10)

In [None]:
# remove every row that contains a null to get back to the original long format
df2 = df2.dropna()

df2.head(n=10)

### New Pie Rates Dataset with a Date Added

In [None]:
import pandas as pd
import numpy as np
import os

# the csv is expected in a 'data' folder under the current working directory
filepath = os.path.join(os.getcwd(), 'data', 'pie_rates2.csv')
df = pd.read_csv(filepath_or_buffer=filepath)

# make sure the row index has no name so nothing extra is displayed
df.index.name = None
df.head(n=5)

### Use Aggregations When Widening
* Can use several aggregation functions like mean, median, max, min
* Several options for which columns you want to persist and what you want the index column to be

In [None]:
# find the mean Rate by Date in each Country across all Pies
# (the string 'mean' is used instead of np.mean: recent pandas versions emit a
# FutureWarning when a NumPy callable is passed and recommend the string alias)
df1 = pd.pivot_table(df, index='Date', columns='Country', values='Rate', aggfunc='mean')

# reset the index in-place so that Date becomes a regular column
df1.reset_index(inplace=True)

# clear the row and column index names so they don't display
df1.index.name = None
df1.columns.name = None

df1

In [None]:
# find the mean Rate by Date for each Pie across all Countries
# (the string 'mean' is used instead of np.mean: recent pandas versions emit a
# FutureWarning when a NumPy callable is passed and recommend the string alias)
df1 = pd.pivot_table(df, index='Date', columns='Pie', values='Rate', aggfunc='mean')

# reset the index in-place so that Date becomes a regular column
df1.reset_index(inplace=True)

# clear the row and column index names so they don't display
df1.index.name = None
df1.columns.name = None

df1

In [None]:
# find the mean Rate by Pie in each Country, over all Dates
# (the string 'mean' is used instead of np.mean: recent pandas versions emit a
# FutureWarning when a NumPy callable is passed and recommend the string alias)
df1 = pd.pivot_table(df, index='Pie', columns='Country', values='Rate', aggfunc='mean')

# reset the index in-place so that Pie becomes a regular column
df1.reset_index(inplace=True)

# clear the row and column index names so they don't display
df1.index.name = None
df1.columns.name = None

df1

In [None]:
# find the mean Rate by Country for each Pie, over all Dates
# (the string 'mean' is used instead of np.mean: recent pandas versions emit a
# FutureWarning when a NumPy callable is passed and recommend the string alias)
df1 = pd.pivot_table(df, index='Country', columns='Pie', values='Rate', aggfunc='mean')

# the index is deliberately NOT reset here: Country stays as the row index

# clear the row and column index names so they don't display
df1.index.name = None
df1.columns.name = None

df1

#### Multilevel Indices
* Axis 1 now has two indices; one for Country, and one for Pie
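* Note: You cannot change the individual index names in place; they are stored in a FrozenList. To clear them, assign a whole new list (e.g. `df1.columns.names = [None, None]`).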

In [None]:
# find the mean Rate by Date in each Country for each Pie
# a list for `columns` produces a two-level column index (Country, then Pie)
# (the string 'mean' is used instead of np.mean: recent pandas versions emit a
# FutureWarning when a NumPy callable is passed and recommend the string alias)
df1 = pd.pivot_table(df, index='Date', columns=['Country', 'Pie'], values='Rate', aggfunc='mean')

# the index is deliberately NOT reset here: Date stays as the row index

df1

### Appending & Concatenation

In [None]:
import pandas as pd

# single-column dataframe 'A' with string row labels a-d
df1 = pd.DataFrame({'A': [100, 200, 300, 400]}, index=['a', 'b', 'c', 'd'])

df1

In [None]:
# single-column dataframe 'B' with string row labels f, b, d
df2 = pd.DataFrame({'B': [200, 150, 500]}, index=['f', 'b', 'd'])

df2

#### Append data from df2 to df1 as new rows

- Returns a new dataframe, does not change df1
- Because the two dataframes do not have the same column indices (columns), it creates NaN's

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.append.html

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat returns the same result
pd.concat([df1, df2], sort=False)  # keeps the same row indices

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat returns the same result
pd.concat([df1, df2], sort=False, ignore_index=True)  # ignores the existing row indices and creates new default indices

#### Concatenate two dataframes along the row axis (axis = 0)

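- Same result as using append (note: `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0, so `pd.concat` is the supported way)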
- Because the two dataframes do not have the same column indices (columns), it creates NaN's
- axis = 0 is the default

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html

In [None]:
pd.concat([df1, df2], axis=0, sort=False)  # stacks the rows and keeps the same row indices

In [None]:
pd.concat([df1, df2], axis=0, sort=False, ignore_index=True)  # ignores the existing row indices and creates new default indices

#### Append & Concatenate data from df2 to df1 as new rows

- Returns a new dataframe, does not change df1
- Because the two dataframes have the same <b>column indices (columns)</b>, it does NOT create NaN's

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.append.html

In [None]:
import numpy as np

# 100 rows drawn from N(mean=10, sd=2)
df1 = pd.DataFrame(
    np.random.normal(loc=10, scale=2, size=(100, 2)),
    columns=['Num1', 'Num2'],
)

# 10 rows drawn from N(mean=0, sd=1), same column names as df1
df2 = pd.DataFrame(
    np.random.normal(loc=0, scale=1, size=(10, 2)),
    columns=['Num1', 'Num2'],
)

In [None]:
# (rows, columns) of each dataframe, one per line
print(df1.shape, df2.shape, sep='\n')

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat returns the same result
pd.concat([df1, df2], sort=False)

In [None]:
pd.concat((df1, df2))  # the dataframes can be passed as a tuple (as here) or as a list

#### Concatenate two dataframes along the column axis (axis = 1)

- Returns a new dataframe with columns of each dataframe
- If the two dataframes have the same row indices, it does not create NaN's by default
- You can specify the type of join. If outer join, then if all the row indices don't match, it creates NaN's

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html

In [None]:
import numpy as np

# 10 rows drawn from N(mean=10, sd=2)
df1 = pd.DataFrame(
    np.random.normal(loc=10, scale=2, size=(10, 2)),
    columns=['Num1', 'Num2'],
)

# 10 rows drawn from N(mean=0, sd=1), with different column names
df2 = pd.DataFrame(
    np.random.normal(loc=0, scale=1, size=(10, 2)),
    columns=['Num3', 'Num4'],
)

In [None]:
# (rows, columns) of each dataframe, one per line
print(df1.shape, df2.shape, sep='\n')

In [None]:
df1

In [None]:
df2

In [None]:
pd.concat([df1, df2], axis='columns', join='inner', sort=False)  # all the row indices match

In [None]:
import numpy as np

# 10 rows drawn from N(mean=10, sd=2)
df1 = pd.DataFrame(
    np.random.normal(loc=10, scale=2, size=(10, 2)),
    columns=['Num1', 'Num2'],
)

# only 5 rows this time, drawn from N(mean=0, sd=1)
df2 = pd.DataFrame(
    np.random.normal(loc=0, scale=1, size=(5, 2)),
    columns=['Num3', 'Num4'],
)

In [None]:
pd.concat([df1, df2], axis='columns', sort=False)  # not all row indices match and join = 'outer' by default

In [None]:
pd.concat([df1, df2], axis='columns', join='inner', sort=False)  # not all row indices match, but forcing inner join

### Joining

- Join two dataframes similar to joining two tables in SQL
- Can specify the columns to 'join on' or default and use row indices
- Joins can be left, inner, or outer : left is the default

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.join.html

In [None]:
import numpy as np
import pandas as pd

# 10 rows drawn from N(mean=10, sd=2)
df1 = pd.DataFrame(
    np.random.normal(loc=10, scale=2, size=(10, 2)),
    columns=['Num1', 'Num2'],
)

# 5 rows drawn from N(mean=0, sd=1)
df2 = pd.DataFrame(
    np.random.normal(loc=0, scale=1, size=(5, 2)),
    columns=['Num3', 'Num4'],
)

In [None]:
df1.join(df2, how='left')  # the rows from df1 are the base and left join to df2 creates NaN's where row indices don't match

In [None]:
df2.join(df1, how='left')  # the rows from df2 are the base and left join to df1

In [None]:
df1.join(other=df2, how='inner')  # only the rows with matching indices are returned

In [None]:
df1.join(other=df2, how='outer')  # all rows from both dataframes returned with matching indices consolidated

### Merging

- Typically takes place on a column that is shared between dataframes
- Can, however, be used in place of Join with columns that do not match
- The 'how' parameter works the same as in Join

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html

In [None]:
import numpy as np

# two frames of different lengths: 10 rows from N(10, 2) and 5 rows from N(0, 1)
df1 = pd.DataFrame(
    np.random.normal(loc=10, scale=2, size=(10, 2)),
    columns=['Num1', 'Num2'],
)
df2 = pd.DataFrame(
    np.random.normal(loc=0, scale=1, size=(5, 2)),
    columns=['Num3', 'Num4'],
)

# shared key column holding 0-9; assigning a Series aligns on the row index,
# so the 5-row df2 only receives the values 0-4
num5 = pd.Series(np.arange(10))

df1['Num5'] = num5
df2['Num5'] = num5

In [None]:
df1

In [None]:
df2

In [None]:
df1.merge(df2, how='inner')  # matches on the values of the shared column (Num5); inner is the default 'how', so only keys present in both are kept

In [None]:
df1.merge(df2, how='left')  # matches on the shared column; keeps every row of df1 and creates NaN's where df2 has no matching key

In [None]:
df1.merge(df2, how='outer')  # matches on the shared column; keeps the keys of both dataframes & creates NaN's where one side has no match

In [None]:
# order of the dataframes dictates order of columns in the output
df2.merge(df1, how='outer')  # matches on the shared column; keeps the keys of both dataframes & creates NaN's where one side has no match

### Working with Dates in Pandas
* index_col parameter sets the index to the column reference
* parse_dates parameter indicates which column(s) to try and create date objects from
* squeeze parameter indicates whether to represent as a single series if possible (removed from `read_csv` in pandas 2.0; call `.squeeze("columns")` on the result instead)

In [None]:
import pandas as pd
import os

# the csv is expected in a 'data' folder under the current working directory
filepath = os.path.join(os.getcwd(), 'data', 'daily-minimum-temperatures.csv')
df = pd.read_csv(filepath_or_buffer=filepath)

df.head(n=5)

In [None]:
# notice the data types - Date column is just an Object
df.info()

In [None]:
import pandas as pd
import os

filepath = os.path.join(os.getcwd(), 'data', 'daily-minimum-temperatures.csv')

# parse_dates lists the column(s) pandas should convert to datetimes while loading
df = pd.read_csv(filepath_or_buffer=filepath, parse_dates=['Date'])

df.head(n=5)

In [None]:
# notice the data types - Date column is now a datetime object
df.info()

#### Setting Additional Parameters on the Load

- Setting the index_col parameter to the column that should be in the index. In this case Date
- Setting the parse_dates to a boolean instead of a column indicates the operation should apply to the index
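- Setting the squeeze parameter to True squeezes the one column dataframe down to type Series (the parameter was removed in pandas 2.0; `.squeeze("columns")` on the loaded dataframe does the same)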

In [None]:
import pandas as pd
import os

filepath = os.path.join(os.getcwd(), 'data', 'daily-minimum-temperatures.csv')

# index_col=0 makes the first column of the file the row index (the Date column here)
# parse_dates=True (a boolean rather than a column list) asks pandas to parse that index as dates
df = pd.read_csv(filepath, parse_dates=True, index_col=0, header=0)

df.head(n=5)

In [None]:
# notice the data types - the index is a datetime object
df.info()

In [None]:
import pandas as pd
import os

filepath = os.path.join(os.getcwd(), 'data', 'daily-minimum-temperatures.csv')

# read_csv's squeeze=True parameter was deprecated in pandas 1.4 and removed in 2.0;
# DataFrame.squeeze("columns") does the same job: a one column dataframe is
# squeezed down to type Series
df = pd.read_csv(filepath, header=0, index_col=0, parse_dates=True).squeeze("columns")

df.head()

In [None]:
# notice what class df is (prints the type of the object returned by the load above)
print(type(df))

# notice the dtype of the index of the Series
# (.values exposes the index as a NumPy array, whose repr includes the dtype)
df.index.values

### New Dataset with Timestamp values

In [None]:
import pandas as pd
import os

filepath = os.path.join(os.getcwd(), 'data', 'sampleTs.csv')

# ask pandas to convert the TimeStamp column to datetimes while loading
df = pd.read_csv(filepath_or_buffer=filepath, parse_dates=['TimeStamp'])

df.head(n=5)

In [None]:
# notice the dtype of the TimeStamp column
df.info()

#### Each datetime object has the following properties
* month
* year
* hour
* day
* minute
* second

In [None]:
# print a heading followed by the first two values of each datetime component
for label, field in (
    ("Month", "month"),
    ("Year", "year"),
    ("Hour", "hour"),
    ("Day", "day"),
    ("Minute", "minute"),
    ("Second", "second"),
):
    print(label)
    print(getattr(df.TimeStamp.dt, field).head(2))

#### Can also find the minimum and maximum values

In [None]:
print('Maximum Month: ',df.TimeStamp.dt.month.max())

print("Minimum Timestamp: ",df.TimeStamp.min())

In [None]:
df.TimeStamp.dt.date.max()

#### Feature Engineering
* Add Month, Day, Year, Hour, Weekday (as integer), and BusDay (as boolean) Features
* Using List Comprehension

In [None]:
import numpy as np

# element-by-element feature extraction with list comprehensions
# iterating over the column yields one Timestamp per row, so no positional
# lookups are needed and the date values are not rebuilt for every row
df['Month'] = [ts.month for ts in df.TimeStamp]
df['Day'] = [ts.day for ts in df.TimeStamp]
df['Year'] = [ts.year for ts in df.TimeStamp]
df['Hour'] = [ts.hour for ts in df.TimeStamp]
# weekday() returns an integer with Monday as 0
df['Weekday'] = [ts.weekday() for ts in df.TimeStamp]
# boolean flag from NumPy's business-day calendar
df['BusDay'] = np.is_busday(df.TimeStamp.dt.date)


In [None]:
df.head()

In [None]:
%%time
# timed version of the list-comprehension approach
# iterating over the column yields one Timestamp per row, so no positional
# lookups are needed and the date values are not rebuilt for every row
df['Month'] = [ts.month for ts in df.TimeStamp]
df['Day'] = [ts.day for ts in df.TimeStamp]
df['Year'] = [ts.year for ts in df.TimeStamp]
df['Hour'] = [ts.hour for ts in df.TimeStamp]
# weekday() returns an integer with Monday as 0
df['Weekday'] = [ts.weekday() for ts in df.TimeStamp]

#### Using vectorization

In [None]:
%%time
# the .dt accessor computes each component for the whole column at once
timestamps = df['TimeStamp'].dt
df['Month2'] = timestamps.month
df['Day2'] = timestamps.day
df['Year2'] = timestamps.year
df['Hour2'] = timestamps.hour
df['Weekday2'] = timestamps.weekday

In [None]:
df.head()

#### Add WeekdayText Feature as a Categorical
* Using List Comprehension

In [None]:
%%time

# weekdays as a tuple
# Monday is day 0, matching the integer returned by weekday()
days = ("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday")

# look up the day name for each Timestamp; iterating the column directly avoids
# rebuilding the date values for every row
df['WeekdayText'] = pd.Categorical([days[ts.weekday()] for ts in df.TimeStamp])

#### Using day_name() & month_name()

In [None]:
%%time
# day_name() returns the weekday name of every timestamp; stored with the category dtype
df['WeekdayText2'] = pd.Categorical(df['TimeStamp'].dt.day_name())

In [None]:
# the '%A' format code is the full weekday name; stored with the category dtype
df['WeekdayText3'] = pd.Categorical(df['TimeStamp'].dt.strftime('%A'))

In [None]:
# month_name() returns the month name of every timestamp (left as plain strings, not a category)
df['MonthText'] = df['TimeStamp'].dt.month_name()

In [None]:
df.head()

#### Add Qtr and QtrText Features

In [None]:
# quarter labels; quarter numbers run 1-4, so label = qtrs[quarter - 1]
qtrs = ("First","Second","Third","Fourth")
df['Qtr'] = pd.Categorical(df.TimeStamp.dt.quarter)
# iterate the quarter numbers directly instead of indexing by position and
# rebuilding the date values on every iteration
df['QtrText'] = pd.Categorical([qtrs[quarter - 1] for quarter in df.TimeStamp.dt.quarter])

In [None]:
# notice the dtype of WeekdayText compared to MonthText
print(df.info())
df.head()

#### Working with Time Zones of a datetime
* First step is to localize - which means defining what time zone the original data is in
* A timezone's offset refers to how many hours the timezone is from Coordinated Universal Time (UTC)
* A naive datetime object contains no timezone information. 
* Check tzinfo of a datetime object. tzinfo will be set to None if the object is naive.

In [None]:
import pandas as pd
import os

filepath = os.path.join(os.getcwd(), 'data', 'sampleTs.csv')

# ask pandas to convert the TimeStamp column to datetimes while loading
df = pd.read_csv(filepath_or_buffer=filepath, parse_dates=['TimeStamp'])

In [None]:
# notice the datetime is naive
print(df.TimeStamp[0].tzinfo)

In [None]:
# declare that the naive timestamps are in the US Central Time Zone
df['TimeStamp'] = df['TimeStamp'].dt.tz_localize('US/Central')

# now the datetime is aware
df['TimeStamp'][0].tzinfo

In [None]:
# notice the attributes of TimeStamp
df.info()

In [None]:
# new column holding the same timestamps converted to UTC
df['TimeStamp_UTC'] = df.TimeStamp.dt.tz_convert('UTC')

In [None]:
df.head(2)

#### Create a datetime index using date_range

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.date_range.html

In [None]:
# 3 hour frequency
# (lowercase 'h' is the current alias; uppercase 'H' is deprecated since pandas 2.2)
pd.date_range(start = '2019-01-01', end = '2019-01-02', freq = '3h')

In [None]:
# 1 day frequency: one entry per calendar day, both end points included
pd.date_range('2019-01-01', '2019-01-05', freq='D')

In [None]:
# Annual on first day of year for 10 years
# ('YS' = year start; the older 'AS' alias is deprecated since pandas 2.2)
pd.date_range(start = '2019-01-01', periods = 10, freq = 'YS')

In [None]:
# add a new column to the dataframe that is a datetime index with a 3 hour interval
# periods=len(df) produces exactly one timestamp per existing row
# (lowercase 'h' is the current alias; uppercase 'H' is deprecated since pandas 2.2)
df['NewDatetime'] = pd.date_range(start = '2019-01-01', periods = len(df), freq = '3h')

In [None]:
print(df.info())
df.head()

In [None]:
df.tail()