# Data Programming in Python | BAIS:6040
# Data Manipulation & Dates in Pandas 

### Adding Columns - Feature Engineering

In [1]:
import pandas as pd
import numpy as np

df = pd.DataFrame(data = np.random.normal(0,1,(100,2))
                  ,columns = ['Num1','Num2'])
df.head()

Unnamed: 0,Num1,Num2
0,-0.042914,1.347421
1,2.155527,1.013976
2,1.873069,-0.508366
3,0.174112,-1.558981
4,0.437243,0.269476


#### Add a Boolean Column

In [2]:
df['Num1Larger'] = df.Num1 > df.Num2

df.head()

Unnamed: 0,Num1,Num2,Num1Larger
0,-0.042914,1.347421,False
1,2.155527,1.013976,True
2,1.873069,-0.508366,True
3,0.174112,-1.558981,True
4,0.437243,0.269476,True


#### Add New Column Using Vectorization

In [3]:
df['Difference'] = abs(df.Num1-df.Num2)

df.head()

Unnamed: 0,Num1,Num2,Num1Larger,Difference
0,-0.042914,1.347421,False,1.390334
1,2.155527,1.013976,True,1.141551
2,1.873069,-0.508366,True,2.381435
3,0.174112,-1.558981,True,1.733093
4,0.437243,0.269476,True,0.167767


#### Add New Column Using List Comprehension

In [4]:
df['Difference2'] = [abs(df.Num1[i]-df.Num2[i]) for i in range(len(df))]

df.head()

Unnamed: 0,Num1,Num2,Num1Larger,Difference,Difference2
0,-0.042914,1.347421,False,1.390334,1.390334
1,2.155527,1.013976,True,1.141551,1.141551
2,1.873069,-0.508366,True,2.381435,2.381435
3,0.174112,-1.558981,True,1.733093,1.733093
4,0.437243,0.269476,True,0.167767,0.167767


#### Add New Conditional as String

https://numpy.org/doc/1.18/reference/generated/numpy.where.html

In [5]:
df['LargerCategory'] = np.where(df.Num1 >= df.Num2, "Num1Bigger", "Num2Bigger")

df.head()

Unnamed: 0,Num1,Num2,Num1Larger,Difference,Difference2,LargerCategory
0,-0.042914,1.347421,False,1.390334,1.390334,Num2Bigger
1,2.155527,1.013976,True,1.141551,1.141551,Num1Bigger
2,1.873069,-0.508366,True,2.381435,2.381435,Num1Bigger
3,0.174112,-1.558981,True,1.733093,1.733093,Num1Bigger
4,0.437243,0.269476,True,0.167767,0.167767,Num1Bigger


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Num1            100 non-null    float64
 1   Num2            100 non-null    float64
 2   Num1Larger      100 non-null    bool   
 3   Difference      100 non-null    float64
 4   Difference2     100 non-null    float64
 5   LargerCategory  100 non-null    object 
dtypes: bool(1), float64(4), object(1)
memory usage: 4.1+ KB


#### Add New Conditional Column as Pandas Category

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Categorical.html

In [7]:
df['LargerCategory'] = pd.Categorical(np.where(df.Num1 >= df.Num2, "Num1Bigger", "Num2Bigger"))

df.head()

Unnamed: 0,Num1,Num2,Num1Larger,Difference,Difference2,LargerCategory
0,-0.042914,1.347421,False,1.390334,1.390334,Num2Bigger
1,2.155527,1.013976,True,1.141551,1.141551,Num1Bigger
2,1.873069,-0.508366,True,2.381435,2.381435,Num1Bigger
3,0.174112,-1.558981,True,1.733093,1.733093,Num1Bigger
4,0.437243,0.269476,True,0.167767,0.167767,Num1Bigger


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Num1            100 non-null    float64 
 1   Num2            100 non-null    float64 
 2   Num1Larger      100 non-null    bool    
 3   Difference      100 non-null    float64 
 4   Difference2     100 non-null    float64 
 5   LargerCategory  100 non-null    category
dtypes: bool(1), category(1), float64(4)
memory usage: 3.5 KB


### Example of Widening the Data into Tidy / Panel Data 

- Oftentimes you have data that is not in a Tidy or Panel Data format.
- This generally happens when using SQL to query data from a relational database that is normalized.

https://en.wikipedia.org/wiki/Tidy_data

In [9]:
import pandas as pd
import os

filepath = os.path.join(os.getcwd(), '../../Data', 'pie_rates.csv')
df = pd.read_csv(filepath)

df.head(10)

Unnamed: 0,Pie,Country,Rate
0,Apple,Mexico,1.5
1,Pecan,Mexico,2.0
2,Pumpkin,Mexico,1.75
3,Apple,Puerto Rico,2.0
4,Pecan,Puerto Rico,2.25
5,Pumpkin,Puerto Rico,2.25
6,Peach,Puerto Rico,2.5
7,Apple,Dominican,1.35
8,Pecan,Dominican,1.5
9,Pumpkin,Dominican,1.85


#### Pandas pivot_table - Similar to spread() in R

- Organizing the data around the type of pie with a separate Rate column for each Country
- Reset the index in-place so that Pie is it's own column and not an index
- Setting the row and column indices to None so they don't display

In [10]:
# organizing the data around the type of pie with a separate Rate column for each Country
df1=pd.pivot_table(df,index='Pie',columns='Country',values='Rate')

# reset the index in-place so that Pie is it's own column and not an index
df1.reset_index(inplace=True)

# setting the row and column indices to None so they don't display
df1.index.name = None
df1.columns.name = None

# notice the null values created
df1

Unnamed: 0,Pie,Dominican,Mexico,Puerto Rico
0,Apple,1.35,1.5,2.0
1,Cherry,3.0,,
2,Peach,1.95,,2.5
3,Pecan,1.5,2.0,2.25
4,Pumpkin,1.85,1.75,2.25


### Example of Narrowing the Data 
#### Pandas Melt - Similar to gather() in R

- Gather the data back to the original so that Pie and Country are columns
- Drop the rows with nulls (in-place) to get back to the original

In [11]:
# gathering the data back to the original so that Pie and Country are columns
df2=pd.melt(df1, id_vars='Pie', var_name ='Country', value_name='Rate')

# notice the null values are still present
df2.head(10)

Unnamed: 0,Pie,Country,Rate
0,Apple,Dominican,1.35
1,Cherry,Dominican,3.0
2,Peach,Dominican,1.95
3,Pecan,Dominican,1.5
4,Pumpkin,Dominican,1.85
5,Apple,Mexico,1.5
6,Cherry,Mexico,
7,Peach,Mexico,
8,Pecan,Mexico,2.0
9,Pumpkin,Mexico,1.75


In [12]:
# drop the rows with nulls (in-place) to get back to the original
df2.dropna(inplace=True)

df2.head(10)

Unnamed: 0,Pie,Country,Rate
0,Apple,Dominican,1.35
1,Cherry,Dominican,3.0
2,Peach,Dominican,1.95
3,Pecan,Dominican,1.5
4,Pumpkin,Dominican,1.85
5,Apple,Mexico,1.5
8,Pecan,Mexico,2.0
9,Pumpkin,Mexico,1.75
10,Apple,Puerto Rico,2.0
12,Peach,Puerto Rico,2.5


### New Pie Rates Dataset with a Date Added

In [13]:
import pandas as pd
import numpy as np
import os

filepath = os.path.join(os.getcwd(), '../../Data', 'pie_rates2.csv')
df = pd.read_csv(filepath)

df.index.name = None
df.head()

Unnamed: 0,Date,Pie,Country,Rate
0,01/01/2016,Apple,Mexico,1.5
1,01/01/2016,Pecan,Mexico,2.0
2,01/01/2016,Pumpkin,Mexico,1.75
3,01/01/2016,Apple,Puerto Rico,2.0
4,01/01/2016,Pecan,Puerto Rico,2.25


### Use Aggregations When Widening
* Can use several aggregation functions like mean, median, max, min
* Several options for which columns you want to persist and what you want the index column to be

In [14]:
# find the mean Rate by Date in each Country across all Pies
df1=pd.pivot_table(df,index='Date',columns='Country',values='Rate',aggfunc=np.mean)

# reset the index in-place
df1.reset_index(inplace=True)

# setting the row and column indices to None so they don't display
df1.index.name = None
df1.columns.name = None
 
df1

Unnamed: 0,Date,Dominican,Mexico,Puerto Rico
0,01/01/2016,1.93,1.75,2.25
1,01/01/2017,2.123,1.925,2.475
2,01/01/2018,2.3353,2.1175,2.7225


In [15]:
# find the mean Rate by Date for each Pie across all Countries
df1=pd.pivot_table(df,index='Date',columns='Pie',values='Rate',aggfunc=np.mean)

# reset the index in-place
df1.reset_index(inplace=True)

# setting the row and column indices to None so they don't display
df1.index.name = None
df1.columns.name = None
 
df1

Unnamed: 0,Date,Apple,Cherry,Peach,Pecan,Pumpkin
0,01/01/2016,1.616667,3.0,2.225,1.916667,1.95
1,01/01/2017,1.778333,3.3,2.4475,2.108333,2.145
2,01/01/2018,1.956167,3.63,2.69225,2.319167,2.3595


In [16]:
# find the mean Rate by Pie in each Country, over all Dates
df1=pd.pivot_table(df,index='Pie',columns='Country',values='Rate',aggfunc=np.mean)

# reset the index in-place
df1.reset_index(inplace=True)

# setting the row and column indices to None so they don't display
df1.index.name = None
df1.columns.name = None
 
df1

Unnamed: 0,Pie,Dominican,Mexico,Puerto Rico
0,Apple,1.4895,1.655,2.206667
1,Cherry,3.31,,
2,Peach,2.1515,,2.758333
3,Pecan,1.655,2.206667,2.4825
4,Pumpkin,2.041167,1.930833,2.4825


In [17]:
# find the mean Rate by Country for each Pie, over all Dates
df1=pd.pivot_table(df,index='Country',columns='Pie',values='Rate',aggfunc=np.mean)

# reset the index in-place
#df1.reset_index(inplace=True)

# setting the row and column indices to None so they don't display
df1.index.name = None
df1.columns.name = None
 
df1

Unnamed: 0,Apple,Cherry,Peach,Pecan,Pumpkin
Dominican,1.4895,3.31,2.1515,1.655,2.041167
Mexico,1.655,,,2.206667,1.930833
Puerto Rico,2.206667,,2.758333,2.4825,2.4825


#### Multilevel Indices
* Axis 1 now has two indices; one for Country, and one for Pie
* Note: You can not change the index names to None.  They are Frozen.

In [18]:
# find the mean Rate by Date in each Country for each Pie
df1=pd.pivot_table(df,index='Date',columns=['Country','Pie'],values='Rate',aggfunc=np.mean)

# reset the index in-place
#df1.reset_index(inplace=True)

df1

Country,Dominican,Dominican,Dominican,Dominican,Dominican,Mexico,Mexico,Mexico,Puerto Rico,Puerto Rico,Puerto Rico,Puerto Rico
Pie,Apple,Cherry,Peach,Pecan,Pumpkin,Apple,Pecan,Pumpkin,Apple,Peach,Pecan,Pumpkin
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
01/01/2016,1.35,3.0,1.95,1.5,1.85,1.5,2.0,1.75,2.0,2.5,2.25,2.25
01/01/2017,1.485,3.3,2.145,1.65,2.035,1.65,2.2,1.925,2.2,2.75,2.475,2.475
01/01/2018,1.6335,3.63,2.3595,1.815,2.2385,1.815,2.42,2.1175,2.42,3.025,2.7225,2.7225


### Appending & Concatenation

In [19]:
import pandas as pd

df1 = pd.DataFrame([100,200,300,400]
                    ,index=['a','b','c','d']
                    ,columns=['A'])

df1

Unnamed: 0,A
a,100
b,200
c,300
d,400


In [20]:
df2 = pd.DataFrame([200,150,500]
                    ,index=['f','b','d']
                    ,columns=['B'])

df2

Unnamed: 0,B
f,200
b,150
d,500


#### Append data from df2 to df1 as new rows

- Returns a new dataframe, does not change df1
- Because the two dataframes do not have the same column indices (columns), it creates NaN's

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.append.html

In [21]:
df1.append(df2, sort=False)  # keeps the same row indices

Unnamed: 0,A,B
a,100.0,
b,200.0,
c,300.0,
d,400.0,
f,,200.0
b,,150.0
d,,500.0


In [22]:
df1.append(df2, sort=False, ignore_index=True)  # ignores the existing row indices and creates new default indices

Unnamed: 0,A,B
0,100.0,
1,200.0,
2,300.0,
3,400.0,
4,,200.0
5,,150.0
6,,500.0


#### Concatenate two dataframes along the row axis (axis = 0)

- Same result as using append
- Because the two dataframes do not have the same column indices (columns), it creates NaN's
- axis = 0 is the default

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html

In [23]:
pd.concat((df1,df2), sort=False)  # keeps the same row indices

Unnamed: 0,A,B
a,100.0,
b,200.0,
c,300.0,
d,400.0,
f,,200.0
b,,150.0
d,,500.0


In [24]:
pd.concat((df1,df2), sort=False, ignore_index=True)  # ignores the existing row indices and creates new default indices

Unnamed: 0,A,B
0,100.0,
1,200.0,
2,300.0,
3,400.0,
4,,200.0
5,,150.0
6,,500.0


#### Append & Concatenate data from df2 to df1 as new rows

- Returns a new dataframe, does not change df1
- Because the two dataframes have the <b>column indices (columns)</b>, it does NOT create NaN's

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.append.html

In [25]:
import numpy as np

df1 = pd.DataFrame(data = np.random.normal(10,2,(100,2))
                  ,columns = ['Num1','Num2'])

df2 = pd.DataFrame(data = np.random.normal(0,1,(10,2))
                  ,columns = ['Num1','Num2'])

In [26]:
print(df1.shape)
print(df2.shape)

(100, 2)
(10, 2)


In [27]:
df1.append(df2, sort=False)

Unnamed: 0,Num1,Num2
0,10.425903,10.203859
1,9.670622,8.969642
2,12.874861,7.221134
3,9.151602,8.711921
4,9.433562,11.478293
...,...,...
5,-0.745843,-2.099009
6,-0.540702,-0.479019
7,-0.122360,0.413546
8,0.377258,-0.207006


In [28]:
pd.concat([df1,df2])  # can pass a list of dataframes like here or a tuple like above

Unnamed: 0,Num1,Num2
0,10.425903,10.203859
1,9.670622,8.969642
2,12.874861,7.221134
3,9.151602,8.711921
4,9.433562,11.478293
...,...,...
5,-0.745843,-2.099009
6,-0.540702,-0.479019
7,-0.122360,0.413546
8,0.377258,-0.207006


#### Concatenate two dataframes along the column axis (axis = 1)

- Returns a new dataframe with columns of each dataframe
- If the two dataframes have the same row indices, it does not create NaN's by default
- You can specify the type of join. If outer join, then if all the row indices don't match, it creates NaN's

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html

In [29]:
import numpy as np

df1 = pd.DataFrame(data = np.random.normal(10,2,(10,2))
                  ,columns = ['Num1','Num2'])

df2 = pd.DataFrame(data = np.random.normal(0,1,(10,2))
                  ,columns = ['Num3','Num4'])

In [30]:
print(df1.shape)
print(df2.shape)

(10, 2)
(10, 2)


In [31]:
df1

Unnamed: 0,Num1,Num2
0,9.18454,12.457976
1,10.479741,9.755805
2,9.701592,10.059214
3,8.11688,13.061606
4,8.951068,9.269659
5,11.635332,9.547874
6,9.223427,7.540923
7,12.256096,13.452048
8,7.924536,9.832012
9,11.396307,12.563873


In [32]:
df2

Unnamed: 0,Num3,Num4
0,0.255966,1.805137
1,-1.47187,-1.299601
2,0.09099,-0.381069
3,-0.252237,0.497302
4,-0.699436,-2.3652
5,-0.782921,1.29643
6,-0.610778,-0.372696
7,0.30043,0.576173
8,-0.498935,-1.11445
9,0.475249,0.359991


In [33]:
pd.concat([df1,df2], sort=False, axis=1, join='inner')  # all the row indices match

Unnamed: 0,Num1,Num2,Num3,Num4
0,9.18454,12.457976,0.255966,1.805137
1,10.479741,9.755805,-1.47187,-1.299601
2,9.701592,10.059214,0.09099,-0.381069
3,8.11688,13.061606,-0.252237,0.497302
4,8.951068,9.269659,-0.699436,-2.3652
5,11.635332,9.547874,-0.782921,1.29643
6,9.223427,7.540923,-0.610778,-0.372696
7,12.256096,13.452048,0.30043,0.576173
8,7.924536,9.832012,-0.498935,-1.11445
9,11.396307,12.563873,0.475249,0.359991


In [34]:
import numpy as np

df1 = pd.DataFrame(data = np.random.normal(10,2,(10,2))
                  ,columns = ['Num1','Num2'])

df2 = pd.DataFrame(data = np.random.normal(0,1,(5,2))
                  ,columns = ['Num3','Num4'])

In [35]:
pd.concat([df1,df2], sort=False, axis=1)  # not all row indices match and join = 'outer' by default

Unnamed: 0,Num1,Num2,Num3,Num4
0,11.334878,8.141522,-0.498105,-1.167849
1,5.741389,9.142745,-0.936555,-0.160456
2,8.819448,9.36811,0.626652,0.522584
3,8.303869,12.553809,-0.577403,-0.965122
4,9.273375,9.350454,0.048255,0.790734
5,8.191908,12.303333,,
6,12.050925,7.009727,,
7,11.233336,8.452198,,
8,11.823449,9.743658,,
9,12.744256,7.014113,,


In [36]:
pd.concat([df1,df2], sort=False, axis=1, join='inner')  # not all row indices match, but forcing inner join

Unnamed: 0,Num1,Num2,Num3,Num4
0,11.334878,8.141522,-0.498105,-1.167849
1,5.741389,9.142745,-0.936555,-0.160456
2,8.819448,9.36811,0.626652,0.522584
3,8.303869,12.553809,-0.577403,-0.965122
4,9.273375,9.350454,0.048255,0.790734


### Joining

- Join two dataframes similiar to joining two tables in SQL
- Can specify the columns to 'join on' or default and use row indices
- Joins can be left, inner, or outer : left is the default

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.join.html

In [37]:
import numpy as np
import pandas as pd

df1 = pd.DataFrame(data = np.random.normal(10,2,(10,2))
                  ,columns = ['Num1','Num2'])

df2 = pd.DataFrame(data = np.random.normal(0,1,(5,2))
                  ,columns = ['Num3','Num4'])

In [38]:
df1.join(df2)  # the rows from df1 are the base and left join to df2 creates NaN's where row indices don't match

Unnamed: 0,Num1,Num2,Num3,Num4
0,10.888108,7.691977,-0.791807,0.415746
1,8.968813,8.896779,-0.467511,-2.486387
2,8.578748,7.648091,-0.13727,-0.625038
3,8.8866,9.061923,-0.157637,1.366854
4,14.410276,6.835795,-0.65692,-0.286145
5,6.575664,9.790227,,
6,13.23694,9.950196,,
7,9.825288,8.211274,,
8,12.114671,10.896269,,
9,12.659,12.146578,,


In [39]:
df2.join(df1)  # the rows from df2 are the base and left join to df1

Unnamed: 0,Num3,Num4,Num1,Num2
0,-0.791807,0.415746,10.888108,7.691977
1,-0.467511,-2.486387,8.968813,8.896779
2,-0.13727,-0.625038,8.578748,7.648091
3,-0.157637,1.366854,8.8866,9.061923
4,-0.65692,-0.286145,14.410276,6.835795


In [40]:
df1.join(df2, how='inner')  # only the rows with matching indices are returned

Unnamed: 0,Num1,Num2,Num3,Num4
0,10.888108,7.691977,-0.791807,0.415746
1,8.968813,8.896779,-0.467511,-2.486387
2,8.578748,7.648091,-0.13727,-0.625038
3,8.8866,9.061923,-0.157637,1.366854
4,14.410276,6.835795,-0.65692,-0.286145


In [41]:
df1.join(df2, how='outer')  # all rows from both dataframes returned with matching indices consolidated

Unnamed: 0,Num1,Num2,Num3,Num4
0,10.888108,7.691977,-0.791807,0.415746
1,8.968813,8.896779,-0.467511,-2.486387
2,8.578748,7.648091,-0.13727,-0.625038
3,8.8866,9.061923,-0.157637,1.366854
4,14.410276,6.835795,-0.65692,-0.286145
5,6.575664,9.790227,,
6,13.23694,9.950196,,
7,9.825288,8.211274,,
8,12.114671,10.896269,,
9,12.659,12.146578,,


### Merging

- Typically takes place on a column that is shared between dataframes
- Can, however, be used in place of Join with columns that do not match
- The 'how' parameter works the same as in Join

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html

In [42]:
import numpy as np

df1 = pd.DataFrame(data = np.random.normal(10,2,(10,2))
                  ,columns = ['Num1','Num2'])

df2 = pd.DataFrame(data = np.random.normal(0,1,(5,2))
                  ,columns = ['Num3','Num4'])

num5 = pd.Series(np.arange(0, 10, 1))

df1['Num5'] = num5
df2['Num5'] = num5

In [43]:
df1

Unnamed: 0,Num1,Num2,Num5
0,6.554148,14.928456,0
1,9.756356,12.036015,1
2,11.046628,8.253549,2
3,10.615844,10.517485,3
4,8.589998,10.127121,4
5,10.959809,9.770559,5
6,11.478506,10.364682,6
7,13.090275,10.400495,7
8,10.938616,9.441373,8
9,9.048709,8.863683,9


In [44]:
df2

Unnamed: 0,Num3,Num4,Num5
0,-0.479412,-0.254541,0
1,0.219985,0.898743,1
2,0.973882,0.512385,2
3,-0.400534,0.839484,3
4,-0.471814,0.905861,4


In [45]:
pd.merge(df1,df2)  # matches on column and default 'how' is inner on row indices

Unnamed: 0,Num1,Num2,Num5,Num3,Num4
0,6.554148,14.928456,0,-0.479412,-0.254541
1,9.756356,12.036015,1,0.219985,0.898743
2,11.046628,8.253549,2,0.973882,0.512385
3,10.615844,10.517485,3,-0.400534,0.839484
4,8.589998,10.127121,4,-0.471814,0.905861


In [46]:
pd.merge(df1,df2, how='left')  # matches on column and left join 'how' on row indices creates NaN's

Unnamed: 0,Num1,Num2,Num5,Num3,Num4
0,6.554148,14.928456,0,-0.479412,-0.254541
1,9.756356,12.036015,1,0.219985,0.898743
2,11.046628,8.253549,2,0.973882,0.512385
3,10.615844,10.517485,3,-0.400534,0.839484
4,8.589998,10.127121,4,-0.471814,0.905861
5,10.959809,9.770559,5,,
6,11.478506,10.364682,6,,
7,13.090275,10.400495,7,,
8,10.938616,9.441373,8,,
9,9.048709,8.863683,9,,


In [47]:
pd.merge(df1,df2, how='outer')  # matches on column and outer join 'how' includes all row indices  & creates NaN's

Unnamed: 0,Num1,Num2,Num5,Num3,Num4
0,6.554148,14.928456,0,-0.479412,-0.254541
1,9.756356,12.036015,1,0.219985,0.898743
2,11.046628,8.253549,2,0.973882,0.512385
3,10.615844,10.517485,3,-0.400534,0.839484
4,8.589998,10.127121,4,-0.471814,0.905861
5,10.959809,9.770559,5,,
6,11.478506,10.364682,6,,
7,13.090275,10.400495,7,,
8,10.938616,9.441373,8,,
9,9.048709,8.863683,9,,


In [48]:
# order of the dataframes dictates order of columns in the output
pd.merge(df2,df1, how='outer')  # matches on column and outer join 'how' includes all row indices  & creates NaN's

Unnamed: 0,Num3,Num4,Num5,Num1,Num2
0,-0.479412,-0.254541,0,6.554148,14.928456
1,0.219985,0.898743,1,9.756356,12.036015
2,0.973882,0.512385,2,11.046628,8.253549
3,-0.400534,0.839484,3,10.615844,10.517485
4,-0.471814,0.905861,4,8.589998,10.127121
5,,,5,10.959809,9.770559
6,,,6,11.478506,10.364682
7,,,7,13.090275,10.400495
8,,,8,10.938616,9.441373
9,,,9,9.048709,8.863683


### Working with Dates in Pandas
* index_col parameter sets the index to the column reference
* parse_dates parameter indicates which column(s) to try and create date objects from
* squeeze parameter indicates whether to represent as a single series if possible

In [49]:
import pandas as pd
import os

filepath = os.path.join(os.getcwd(), '../../Data', 'daily-minimum-temperatures.csv')
df = pd.read_csv(filepath)

df.head()

Unnamed: 0,Date,Temp
0,1981-01-01,20.7
1,1981-01-02,17.9
2,1981-01-03,18.8
3,1981-01-04,14.6
4,1981-01-05,15.8


In [50]:
# notice the data types - Date column is just an Object
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3650 entries, 0 to 3649
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    3650 non-null   object 
 1   Temp    3650 non-null   float64
dtypes: float64(1), object(1)
memory usage: 57.2+ KB


In [51]:
import pandas as pd
import os

filepath = os.path.join(os.getcwd(), '../../Data', 'daily-minimum-temperatures.csv')

# using the parse_dates argument and indicating what column to parse
df = pd.read_csv(filepath, parse_dates=['Date'])

df.head()

Unnamed: 0,Date,Temp
0,1981-01-01,20.7
1,1981-01-02,17.9
2,1981-01-03,18.8
3,1981-01-04,14.6
4,1981-01-05,15.8


In [52]:
# notice the data types - Date column is now a datetime object
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3650 entries, 0 to 3649
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    3650 non-null   datetime64[ns]
 1   Temp    3650 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 57.2 KB


#### Setting Additional Parameters on the Load

- Setting the index_col parameter to the column that should be in the index. In this case Date
- Setting the parse_dates to a boolean instead of a column indicates the operation should apply to the index
- Setting the squeeze parameter to True squeezes the one column dataframe down to type Series

In [53]:
import pandas as pd
import os

filepath = os.path.join(os.getcwd(), '../../Data', 'daily-minimum-temperatures.csv')

# setting the index_col parameter to the column that should be in the index. In this case Date
# setting the parse_dates to a boolean instead of a column indicates the operation should apply to the index
df = pd.read_csv(filepath,header=0, index_col=0, parse_dates=True)

df.head()

Unnamed: 0_level_0,Temp
Date,Unnamed: 1_level_1
1981-01-01,20.7
1981-01-02,17.9
1981-01-03,18.8
1981-01-04,14.6
1981-01-05,15.8


In [54]:
# notice the data types - the index is a datetime object
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3650 entries, 1981-01-01 to 1990-12-31
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Temp    3650 non-null   float64
dtypes: float64(1)
memory usage: 57.0 KB


In [55]:
import pandas as pd
import os

filepath = os.path.join(os.getcwd(), '../../Data', 'daily-minimum-temperatures.csv')

# setting the squeeze parameter to True squeezes the one column dataframe down to type Series
df = pd.read_csv(filepath,header=0, index_col=0, parse_dates=True, squeeze=True)

df.head()

Date
1981-01-01    20.7
1981-01-02    17.9
1981-01-03    18.8
1981-01-04    14.6
1981-01-05    15.8
Name: Temp, dtype: float64

In [56]:
# notice what class df is
print(type(df))

# notice the dtype of the index of the Series
df.index.values

<class 'pandas.core.series.Series'>


array(['1981-01-01T00:00:00.000000000', '1981-01-02T00:00:00.000000000',
       '1981-01-03T00:00:00.000000000', ...,
       '1990-12-29T00:00:00.000000000', '1990-12-30T00:00:00.000000000',
       '1990-12-31T00:00:00.000000000'], dtype='datetime64[ns]')

### New Dataset with Timestamp values

In [57]:
import pandas as pd
import os

filepath = os.path.join(os.getcwd(), '../../Data', 'sampleTs.csv')

# parsing the TimeStamp column as a date
df = pd.read_csv(filepath, parse_dates=['TimeStamp'])

df.head()

Unnamed: 0,TimeStamp,Num1,Num2,Num3
0,2017-09-14 19:44:00,-0.472482,1.759722,-0.198001
1,2017-09-14 19:45:00,-0.044162,1.174639,0.075704
2,2017-09-14 20:21:00,0.622071,1.223952,-2.342333
3,2017-09-14 21:08:00,-1.007011,-0.281759,0.363351
4,2017-09-14 22:04:00,0.028455,-1.027014,1.625157


In [58]:
# notice the dtype of the TimeStamp column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   TimeStamp  200 non-null    datetime64[ns]
 1   Num1       200 non-null    float64       
 2   Num2       200 non-null    float64       
 3   Num3       200 non-null    float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 6.4 KB


#### Each datetime object has the following properties
* month
* year
* hour
* day
* minute
* second

In [59]:
print("Month")
print(df.TimeStamp.dt.month.head(2))
print("Year")
print(df.TimeStamp.dt.year.head(2))
print("Hour")
print(df.TimeStamp.dt.hour.head(2))
print("Day")
print(df.TimeStamp.dt.day.head(2))
print("Minute")
print(df.TimeStamp.dt.minute.head(2))
print("Second")
print(df.TimeStamp.dt.second.head(2))

Month
0    9
1    9
Name: TimeStamp, dtype: int64
Year
0    2017
1    2017
Name: TimeStamp, dtype: int64
Hour
0    19
1    19
Name: TimeStamp, dtype: int64
Day
0    14
1    14
Name: TimeStamp, dtype: int64
Minute
0    44
1    45
Name: TimeStamp, dtype: int64
Second
0    0
1    0
Name: TimeStamp, dtype: int64


#### Can also find the minimum and maximum values

In [60]:
print('Maximum Month: ',df.TimeStamp.dt.month.max())

print("Minimum Timestamp: ",df.TimeStamp.min())

Maximum Month:  9
Minimum Timestamp:  2017-09-14 19:44:00


In [61]:
df.TimeStamp.dt.date.max()

datetime.date(2017, 9, 25)

#### Feature Engineering
* Add Month, Day, Year, Hour, Weekday (as integer), and BusDay (as boolean) Features
* Using List Comprehension

In [62]:
import numpy as np

df['Month'] = [df.TimeStamp[i].month for i in range(len(df.TimeStamp))]
df['Day'] = [df.TimeStamp[i].day for i in range(len(df.TimeStamp))]
df['Year'] = [df.TimeStamp[i].year for i in range(len(df.TimeStamp))]
df['Hour'] = [df.TimeStamp[i].hour for i in range(len(df.TimeStamp))]
df['Weekday'] = [df.TimeStamp.dt.date[i].weekday() for i in range(len(df.TimeStamp.dt.date))]
df['BusDay'] = np.is_busday(df.TimeStamp.dt.date)


In [63]:
df.head()

Unnamed: 0,TimeStamp,Num1,Num2,Num3,Month,Day,Year,Hour,Weekday,BusDay
0,2017-09-14 19:44:00,-0.472482,1.759722,-0.198001,9,14,2017,19,3,True
1,2017-09-14 19:45:00,-0.044162,1.174639,0.075704,9,14,2017,19,3,True
2,2017-09-14 20:21:00,0.622071,1.223952,-2.342333,9,14,2017,20,3,True
3,2017-09-14 21:08:00,-1.007011,-0.281759,0.363351,9,14,2017,21,3,True
4,2017-09-14 22:04:00,0.028455,-1.027014,1.625157,9,14,2017,22,3,True


In [64]:
%%time
df['Month'] = [df.TimeStamp[i].month for i in range(len(df.TimeStamp))]
df['Day'] = [df.TimeStamp[i].day for i in range(len(df.TimeStamp))]
df['Year'] = [df.TimeStamp[i].year for i in range(len(df.TimeStamp))]
df['Hour'] = [df.TimeStamp[i].hour for i in range(len(df.TimeStamp))]
df['Weekday'] = [df.TimeStamp.dt.date[i].weekday() for i in range(len(df.TimeStamp.dt.date))]

CPU times: user 38.4 ms, sys: 2.5 ms, total: 40.9 ms
Wall time: 40 ms


#### Using vectorization

In [65]:
%%time
df['Month2'] = df.TimeStamp.dt.month
df['Day2'] = df.TimeStamp.dt.day
df['Year2'] = df.TimeStamp.dt.year
df['Hour2'] = df.TimeStamp.dt.hour
df['Weekday2'] = df.TimeStamp.dt.weekday

CPU times: user 2.06 ms, sys: 29 µs, total: 2.09 ms
Wall time: 1.96 ms


In [66]:
df.head()

Unnamed: 0,TimeStamp,Num1,Num2,Num3,Month,Day,Year,Hour,Weekday,BusDay,Month2,Day2,Year2,Hour2,Weekday2
0,2017-09-14 19:44:00,-0.472482,1.759722,-0.198001,9,14,2017,19,3,True,9,14,2017,19,3
1,2017-09-14 19:45:00,-0.044162,1.174639,0.075704,9,14,2017,19,3,True,9,14,2017,19,3
2,2017-09-14 20:21:00,0.622071,1.223952,-2.342333,9,14,2017,20,3,True,9,14,2017,20,3
3,2017-09-14 21:08:00,-1.007011,-0.281759,0.363351,9,14,2017,21,3,True,9,14,2017,21,3
4,2017-09-14 22:04:00,0.028455,-1.027014,1.625157,9,14,2017,22,3,True,9,14,2017,22,3


#### Add WeekdayText Feature as a Categorical
* Using List Comprehension

In [67]:
%%time

# weekdays as a tuple
# Monday is day 0
days = ("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday")

df['WeekdayText'] = pd.Categorical([days[df.TimeStamp.dt.date[i].weekday()] for i in range(len(df.TimeStamp.dt.date))])

CPU times: user 29.7 ms, sys: 1.33 ms, total: 31 ms
Wall time: 30 ms


#### Using day_name() & month_name()

In [68]:
%%time
df['WeekdayText2'] = pd.Categorical(df.TimeStamp.dt.day_name())

CPU times: user 866 µs, sys: 0 ns, total: 866 µs
Wall time: 810 µs


In [69]:
df['MonthText'] = df.TimeStamp.dt.month_name()

In [70]:
df.head()

Unnamed: 0,TimeStamp,Num1,Num2,Num3,Month,Day,Year,Hour,Weekday,BusDay,Month2,Day2,Year2,Hour2,Weekday2,WeekdayText,WeekdayText2,MonthText
0,2017-09-14 19:44:00,-0.472482,1.759722,-0.198001,9,14,2017,19,3,True,9,14,2017,19,3,Thursday,Thursday,September
1,2017-09-14 19:45:00,-0.044162,1.174639,0.075704,9,14,2017,19,3,True,9,14,2017,19,3,Thursday,Thursday,September
2,2017-09-14 20:21:00,0.622071,1.223952,-2.342333,9,14,2017,20,3,True,9,14,2017,20,3,Thursday,Thursday,September
3,2017-09-14 21:08:00,-1.007011,-0.281759,0.363351,9,14,2017,21,3,True,9,14,2017,21,3,Thursday,Thursday,September
4,2017-09-14 22:04:00,0.028455,-1.027014,1.625157,9,14,2017,22,3,True,9,14,2017,22,3,Thursday,Thursday,September


#### Add Qtr and QtrText Features

In [71]:
qtrs = ("First","Second","Third","Fourth")
df['Qtr'] = pd.Categorical(df.TimeStamp.dt.quarter)
df['QtrText'] = pd.Categorical([qtrs[df.Qtr[i]-1] for i in range(len(df.TimeStamp.dt.date))])

In [72]:
# notice the dtype of WeekdayText compared to MonthText
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   TimeStamp     200 non-null    datetime64[ns]
 1   Num1          200 non-null    float64       
 2   Num2          200 non-null    float64       
 3   Num3          200 non-null    float64       
 4   Month         200 non-null    int64         
 5   Day           200 non-null    int64         
 6   Year          200 non-null    int64         
 7   Hour          200 non-null    int64         
 8   Weekday       200 non-null    int64         
 9   BusDay        200 non-null    bool          
 10  Month2        200 non-null    int64         
 11  Day2          200 non-null    int64         
 12  Year2         200 non-null    int64         
 13  Hour2         200 non-null    int64         
 14  Weekday2      200 non-null    int64         
 15  WeekdayText   200 non-null    category  

Unnamed: 0,TimeStamp,Num1,Num2,Num3,Month,Day,Year,Hour,Weekday,BusDay,Month2,Day2,Year2,Hour2,Weekday2,WeekdayText,WeekdayText2,MonthText,Qtr,QtrText
0,2017-09-14 19:44:00,-0.472482,1.759722,-0.198001,9,14,2017,19,3,True,9,14,2017,19,3,Thursday,Thursday,September,3,Third
1,2017-09-14 19:45:00,-0.044162,1.174639,0.075704,9,14,2017,19,3,True,9,14,2017,19,3,Thursday,Thursday,September,3,Third
2,2017-09-14 20:21:00,0.622071,1.223952,-2.342333,9,14,2017,20,3,True,9,14,2017,20,3,Thursday,Thursday,September,3,Third
3,2017-09-14 21:08:00,-1.007011,-0.281759,0.363351,9,14,2017,21,3,True,9,14,2017,21,3,Thursday,Thursday,September,3,Third
4,2017-09-14 22:04:00,0.028455,-1.027014,1.625157,9,14,2017,22,3,True,9,14,2017,22,3,Thursday,Thursday,September,3,Third


#### Working with Time Zones of a datetime
* First step is to localize - which means defining what time zone the original data is in
* A timezone's offset refers to how many hours the timezone is from Coordinated Universal Time (UTC)
* A naive datetime object contains no timezone information. 
* Check tzinfo of a datetime object. tzinfo will be set to None if the object is naive.

In [73]:
import pandas as pd
import os

filepath = os.path.join(os.getcwd(), '../../Data', 'sampleTs.csv')

# parsing the TimeStamp column as a date
df = pd.read_csv(filepath, parse_dates=['TimeStamp'])

In [74]:
# notice the datetime is naive
print(df.TimeStamp[0].tzinfo)

None


In [75]:
# localizing to US Central Time Zone
df['TimeStamp'] = df.TimeStamp.dt.tz_localize('US/Central')

# now the datetime is aware
df.TimeStamp[0].tzinfo

<DstTzInfo 'US/Central' CDT-1 day, 19:00:00 DST>

In [76]:
# notice the attributes of TimeStamp
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype                     
---  ------     --------------  -----                     
 0   TimeStamp  200 non-null    datetime64[ns, US/Central]
 1   Num1       200 non-null    float64                   
 2   Num2       200 non-null    float64                   
 3   Num3       200 non-null    float64                   
dtypes: datetime64[ns, US/Central](1), float64(3)
memory usage: 6.4 KB


In [77]:
# create a new datetime column that converts the original timestamp to UTC
df['TimeStamp_UTC'] = df['TimeStamp'].dt.tz_convert('UTC')

In [78]:
df.head(2)

Unnamed: 0,TimeStamp,Num1,Num2,Num3,TimeStamp_UTC
0,2017-09-14 19:44:00-05:00,-0.472482,1.759722,-0.198001,2017-09-15 00:44:00+00:00
1,2017-09-14 19:45:00-05:00,-0.044162,1.174639,0.075704,2017-09-15 00:45:00+00:00


#### Create a datetime index using date_range

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.date_range.html

In [79]:
# 3 hour frequency
pd.date_range(start = '2019-01-01', end = '2019-01-02', freq = '3H')

DatetimeIndex(['2019-01-01 00:00:00', '2019-01-01 03:00:00',
               '2019-01-01 06:00:00', '2019-01-01 09:00:00',
               '2019-01-01 12:00:00', '2019-01-01 15:00:00',
               '2019-01-01 18:00:00', '2019-01-01 21:00:00',
               '2019-01-02 00:00:00'],
              dtype='datetime64[ns]', freq='3H')

In [80]:
# 1 day frequency
pd.date_range(start = '2019-01-01', end = '2019-01-05', freq = 'D')

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05'],
              dtype='datetime64[ns]', freq='D')

In [81]:
# Annual on first day of year for 10 years
pd.date_range(start = '2019-01-01', periods = 10, freq = 'AS')

DatetimeIndex(['2019-01-01', '2020-01-01', '2021-01-01', '2022-01-01',
               '2023-01-01', '2024-01-01', '2025-01-01', '2026-01-01',
               '2027-01-01', '2028-01-01'],
              dtype='datetime64[ns]', freq='AS-JAN')

In [82]:
# add a new column to the dataframe that is a datetime index with a 3 hour interval
df['NewDatetime'] = pd.date_range(start = '2019-01-01', periods = len(df), freq = '3H')

In [83]:
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype                     
---  ------         --------------  -----                     
 0   TimeStamp      200 non-null    datetime64[ns, US/Central]
 1   Num1           200 non-null    float64                   
 2   Num2           200 non-null    float64                   
 3   Num3           200 non-null    float64                   
 4   TimeStamp_UTC  200 non-null    datetime64[ns, UTC]       
 5   NewDatetime    200 non-null    datetime64[ns]            
dtypes: datetime64[ns, US/Central](1), datetime64[ns, UTC](1), datetime64[ns](1), float64(3)
memory usage: 9.5 KB
None


Unnamed: 0,TimeStamp,Num1,Num2,Num3,TimeStamp_UTC,NewDatetime
0,2017-09-14 19:44:00-05:00,-0.472482,1.759722,-0.198001,2017-09-15 00:44:00+00:00,2019-01-01 00:00:00
1,2017-09-14 19:45:00-05:00,-0.044162,1.174639,0.075704,2017-09-15 00:45:00+00:00,2019-01-01 03:00:00
2,2017-09-14 20:21:00-05:00,0.622071,1.223952,-2.342333,2017-09-15 01:21:00+00:00,2019-01-01 06:00:00
3,2017-09-14 21:08:00-05:00,-1.007011,-0.281759,0.363351,2017-09-15 02:08:00+00:00,2019-01-01 09:00:00
4,2017-09-14 22:04:00-05:00,0.028455,-1.027014,1.625157,2017-09-15 03:04:00+00:00,2019-01-01 12:00:00


In [84]:
df.tail()

Unnamed: 0,TimeStamp,Num1,Num2,Num3,TimeStamp_UTC,NewDatetime
195,2017-09-25 13:22:00-05:00,1.033363,-0.209361,0.058819,2017-09-25 18:22:00+00:00,2019-01-25 09:00:00
196,2017-09-25 14:33:00-05:00,1.335089,-0.214365,0.512149,2017-09-25 19:33:00+00:00,2019-01-25 12:00:00
197,2017-09-25 17:36:00-05:00,-0.408521,-1.867294,1.378888,2017-09-25 22:36:00+00:00,2019-01-25 15:00:00
198,2017-09-25 17:37:00-05:00,0.11059,0.295785,0.665763,2017-09-25 22:37:00+00:00,2019-01-25 18:00:00
199,2017-09-25 17:39:00-05:00,0.683409,0.370324,-0.238822,2017-09-25 22:39:00+00:00,2019-01-25 21:00:00
