# Data Programming in Python | BAIS:6040
# Data Manipulation & Dates in Pandas 

### Adding Columns - Feature Engineering

In [1]:
import pandas as pd
import numpy as np

# 100 rows x 2 columns drawn from a normal distribution with mean 0 and std 1
samples = np.random.normal(loc=0, scale=1, size=(100, 2))
df = pd.DataFrame(samples, columns=['Num1', 'Num2'])
df.head()

Unnamed: 0,Num1,Num2
0,-0.551262,-0.930067
1,2.382662,-1.509175
2,0.801054,0.062827
3,-1.82562,-0.446716
4,1.543718,0.149124


#### Add a Boolean Column

In [2]:
# element-wise comparison: True where Num1 is strictly greater than Num2
df['Num1Larger'] = df['Num1'].gt(df['Num2'])

df.head()

Unnamed: 0,Num1,Num2,Num1Larger
0,-0.551262,-0.930067,True
1,2.382662,-1.509175,True
2,0.801054,0.062827,True
3,-1.82562,-0.446716,False
4,1.543718,0.149124,True


#### Add New Column Using Vectorization

In [3]:
# absolute gap between the two columns, computed on whole columns at once
df['Difference'] = (df['Num1'] - df['Num2']).abs()

df.head()

Unnamed: 0,Num1,Num2,Num1Larger,Difference
0,-0.551262,-0.930067,True,0.378806
1,2.382662,-1.509175,True,3.891836
2,0.801054,0.062827,True,0.738227
3,-1.82562,-0.446716,False,1.378904
4,1.543718,0.149124,True,1.394594


#### Add New Column Using List Comprehension

In [4]:
# same absolute gap, built one row at a time in a Python-level loop
df['Difference2'] = [abs(first - second) for first, second in zip(df.Num1, df.Num2)]

df.head()

Unnamed: 0,Num1,Num2,Num1Larger,Difference,Difference2
0,-0.551262,-0.930067,True,0.378806,0.378806
1,2.382662,-1.509175,True,3.891836,3.891836
2,0.801054,0.062827,True,0.738227,0.738227
3,-1.82562,-0.446716,False,1.378904,1.378904
4,1.543718,0.149124,True,1.394594,1.394594


#### Add New Conditional as String

https://numpy.org/doc/1.18/reference/generated/numpy.where.html

In [5]:
# label each row by which column is larger; ties (>=) are labelled "Num1Bigger"
num1_at_least_num2 = df['Num1'] >= df['Num2']
df['LargerCategory'] = np.where(num1_at_least_num2, "Num1Bigger", "Num2Bigger")

df.head()

Unnamed: 0,Num1,Num2,Num1Larger,Difference,Difference2,LargerCategory
0,-0.551262,-0.930067,True,0.378806,0.378806,Num1Bigger
1,2.382662,-1.509175,True,3.891836,3.891836,Num1Bigger
2,0.801054,0.062827,True,0.738227,0.738227,Num1Bigger
3,-1.82562,-0.446716,False,1.378904,1.378904,Num2Bigger
4,1.543718,0.149124,True,1.394594,1.394594,Num1Bigger


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Num1            100 non-null    float64
 1   Num2            100 non-null    float64
 2   Num1Larger      100 non-null    bool   
 3   Difference      100 non-null    float64
 4   Difference2     100 non-null    float64
 5   LargerCategory  100 non-null    object 
dtypes: bool(1), float64(4), object(1)
memory usage: 4.1+ KB


#### Add New Conditional Column as Pandas Category

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Categorical.html

In [7]:
# build the string labels first, then wrap them in pd.Categorical so the column gets the category dtype
labels = np.where(df['Num1'] >= df['Num2'], "Num1Bigger", "Num2Bigger")
df['LargerCategory'] = pd.Categorical(labels)

df.head()

Unnamed: 0,Num1,Num2,Num1Larger,Difference,Difference2,LargerCategory
0,-0.551262,-0.930067,True,0.378806,0.378806,Num1Bigger
1,2.382662,-1.509175,True,3.891836,3.891836,Num1Bigger
2,0.801054,0.062827,True,0.738227,0.738227,Num1Bigger
3,-1.82562,-0.446716,False,1.378904,1.378904,Num2Bigger
4,1.543718,0.149124,True,1.394594,1.394594,Num1Bigger


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Num1            100 non-null    float64 
 1   Num2            100 non-null    float64 
 2   Num1Larger      100 non-null    bool    
 3   Difference      100 non-null    float64 
 4   Difference2     100 non-null    float64 
 5   LargerCategory  100 non-null    category
dtypes: bool(1), category(1), float64(4)
memory usage: 3.5 KB


### Example of Widening the Data into Tidy / Panel Data 

- Oftentimes you have data that is not in a Tidy or Panel Data format.
- This generally happens when using SQL to query data from a relational database that is normalized.

https://en.wikipedia.org/wiki/Tidy_data

In [9]:
import pandas as pd
import os

# the CSV is looked up in a 'Data' folder two levels above the current working directory
data_dir = os.path.join(os.getcwd(), '../../Data')
filepath = os.path.join(data_dir, 'pie_rates.csv')
df = pd.read_csv(filepath)

df.head(10)

Unnamed: 0,Pie,Country,Rate
0,Apple,Mexico,1.5
1,Pecan,Mexico,2.0
2,Pumpkin,Mexico,1.75
3,Apple,Puerto Rico,2.0
4,Pecan,Puerto Rico,2.25
5,Pumpkin,Puerto Rico,2.25
6,Peach,Puerto Rico,2.5
7,Apple,Dominican,1.35
8,Pecan,Dominican,1.5
9,Pumpkin,Dominican,1.85


#### Pandas pivot_table - Similar to spread() in R

- Organizing the data around the type of pie with a separate Rate column for each Country
- Reset the index in-place so that Pie is its own column and not an index
- Setting the row and column indices to None so they don't display

In [10]:
# one row per Pie with a separate Rate column for each Country
df1 = df.pivot_table(index='Pie', columns='Country', values='Rate')

# move Pie out of the index and back into an ordinary column (done in-place)
df1.reset_index(inplace=True)

# clear the row and column axis names so they don't display
df1.index.name = None
df1.columns.name = None

# notice the null values created for Pie/Country pairs that have no Rate
df1

Unnamed: 0,Pie,Dominican,Mexico,Puerto Rico
0,Apple,1.35,1.5,2.0
1,Cherry,3.0,,
2,Peach,1.95,,2.5
3,Pecan,1.5,2.0,2.25
4,Pumpkin,1.85,1.75,2.25


### Example of Narrowing the Data 
#### Pandas Melt - Similar to gather() in R

- Gather the data back to the original so that Pie and Country are columns
- Drop the rows with nulls (in-place) to get back to the original

In [11]:
# fold the per-Country columns back into a Country column and a Rate column, keeping Pie as the identifier
df2 = df1.melt(id_vars='Pie', var_name='Country', value_name='Rate')

# notice the null values are still present
df2.head(10)

Unnamed: 0,Pie,Country,Rate
0,Apple,Dominican,1.35
1,Cherry,Dominican,3.0
2,Peach,Dominican,1.95
3,Pecan,Dominican,1.5
4,Pumpkin,Dominican,1.85
5,Apple,Mexico,1.5
6,Cherry,Mexico,
7,Peach,Mexico,
8,Pecan,Mexico,2.0
9,Pumpkin,Mexico,1.75


In [12]:
# remove every row that contains a null, modifying df2 in-place, to get back to the original
df2.dropna(axis=0, how='any', inplace=True)

df2.head(10)

Unnamed: 0,Pie,Country,Rate
0,Apple,Dominican,1.35
1,Cherry,Dominican,3.0
2,Peach,Dominican,1.95
3,Pecan,Dominican,1.5
4,Pumpkin,Dominican,1.85
5,Apple,Mexico,1.5
8,Pecan,Mexico,2.0
9,Pumpkin,Mexico,1.75
10,Apple,Puerto Rico,2.0
12,Peach,Puerto Rico,2.5


### New Pie Rates Dataset with a Date Added

In [13]:
import pandas as pd
import numpy as np
import os

# the CSV is looked up in a 'Data' folder two levels above the current working directory
data_dir = os.path.join(os.getcwd(), '../../Data')
filepath = os.path.join(data_dir, 'pie_rates2.csv')
df = pd.read_csv(filepath)

# clear the row-index name so it does not display
df.index.name = None
df.head()

Unnamed: 0,Date,Pie,Country,Rate
0,01/01/2016,Apple,Mexico,1.5
1,01/01/2016,Pecan,Mexico,2.0
2,01/01/2016,Pumpkin,Mexico,1.75
3,01/01/2016,Apple,Puerto Rico,2.0
4,01/01/2016,Pecan,Puerto Rico,2.25


### Use Aggregations When Widening
* Can use several aggregation functions like mean, median, max, min
* Several options for which columns you want to persist and what you want the index column to be

In [14]:
# find the mean Rate by Date in each Country across all Pies
# ('mean' selects pandas' built-in aggregation; passing the np.mean callable is deprecated in recent pandas)
df1 = pd.pivot_table(df, index='Date', columns='Country', values='Rate', aggfunc='mean')

# reset the index in-place so Date becomes an ordinary column
df1.reset_index(inplace=True)

# setting the row and column indices to None so they don't display
df1.index.name = None
df1.columns.name = None

df1

Unnamed: 0,Date,Dominican,Mexico,Puerto Rico
0,01/01/2016,1.93,1.75,2.25
1,01/01/2017,2.123,1.925,2.475
2,01/01/2018,2.3353,2.1175,2.7225


In [15]:
# find the mean Rate by Date for each Pie across all Countries
# ('mean' selects pandas' built-in aggregation; passing the np.mean callable is deprecated in recent pandas)
df1 = pd.pivot_table(df, index='Date', columns='Pie', values='Rate', aggfunc='mean')

# reset the index in-place so Date becomes an ordinary column
df1.reset_index(inplace=True)

# setting the row and column indices to None so they don't display
df1.index.name = None
df1.columns.name = None

df1

Unnamed: 0,Date,Apple,Cherry,Peach,Pecan,Pumpkin
0,01/01/2016,1.616667,3.0,2.225,1.916667,1.95
1,01/01/2017,1.778333,3.3,2.4475,2.108333,2.145
2,01/01/2018,1.956167,3.63,2.69225,2.319167,2.3595


In [16]:
# find the mean Rate by Pie in each Country, over all Dates
# ('mean' selects pandas' built-in aggregation; passing the np.mean callable is deprecated in recent pandas)
df1 = pd.pivot_table(df, index='Pie', columns='Country', values='Rate', aggfunc='mean')

# reset the index in-place so Pie becomes an ordinary column
df1.reset_index(inplace=True)

# setting the row and column indices to None so they don't display
df1.index.name = None
df1.columns.name = None

df1

Unnamed: 0,Pie,Dominican,Mexico,Puerto Rico
0,Apple,1.4895,1.655,2.206667
1,Cherry,3.31,,
2,Peach,2.1515,,2.758333
3,Pecan,1.655,2.206667,2.4825
4,Pumpkin,2.041167,1.930833,2.4825


In [17]:
# find the mean Rate by Country for each Pie, over all Dates
# ('mean' selects pandas' built-in aggregation; passing the np.mean callable is deprecated in recent pandas)
df1 = pd.pivot_table(df, index='Country', columns='Pie', values='Rate', aggfunc='mean')

# Country is intentionally kept as the row index here, so there is no reset_index() call

# setting the row and column indices to None so they don't display
df1.index.name = None
df1.columns.name = None

df1

Unnamed: 0,Apple,Cherry,Peach,Pecan,Pumpkin
Dominican,1.4895,3.31,2.1515,1.655,2.041167
Mexico,1.655,,,2.206667,1.930833
Puerto Rico,2.206667,,2.758333,2.4825,2.4825


#### Multilevel Indices
* Axis 1 now has two indices; one for Country, and one for Pie
* Note: You can not change the index names to None.  They are Frozen.

In [18]:
# find the mean Rate by Date in each Country for each Pie
# listing two keys in `columns` gives the result two-level (Country, Pie) column labels
# ('mean' selects pandas' built-in aggregation; passing the np.mean callable is deprecated in recent pandas)
df1 = pd.pivot_table(df, index='Date', columns=['Country', 'Pie'], values='Rate', aggfunc='mean')

# Date is intentionally kept as the row index here, so there is no reset_index() call

df1

Country,Dominican,Dominican,Dominican,Dominican,Dominican,Mexico,Mexico,Mexico,Puerto Rico,Puerto Rico,Puerto Rico,Puerto Rico
Pie,Apple,Cherry,Peach,Pecan,Pumpkin,Apple,Pecan,Pumpkin,Apple,Peach,Pecan,Pumpkin
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
01/01/2016,1.35,3.0,1.95,1.5,1.85,1.5,2.0,1.75,2.0,2.5,2.25,2.25
01/01/2017,1.485,3.3,2.145,1.65,2.035,1.65,2.2,1.925,2.2,2.75,2.475,2.475
01/01/2018,1.6335,3.63,2.3595,1.815,2.2385,1.815,2.42,2.1175,2.42,3.025,2.7225,2.7225


### Appending & Concatenation

In [19]:
import pandas as pd

# four-row frame with a single column 'A' and string row labels a-d
df1 = pd.DataFrame({'A': [100, 200, 300, 400]}, index=['a', 'b', 'c', 'd'])

df1

Unnamed: 0,A
a,100
b,200
c,300
d,400


In [20]:
# three-row frame with a single column 'B'; row labels b and d also occur in df1, f does not
df2 = pd.DataFrame({'B': [200, 150, 500]}, index=['f', 'b', 'd'])

df2

Unnamed: 0,B
f,200
b,150
d,500


#### Append data from df2 to df1 as new rows

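- Returns a new dataframe, does not change df1
- Note: `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0; on current pandas use `pd.concat` (covered in the next section) instead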
- Because the two dataframes do not have the same column indices (columns), it creates NaN's

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.append.html

In [21]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows of df2 under df1 the same way
# and, like append, returns a new dataframe without changing df1
pd.concat([df1, df2], sort=False)  # keeps the same row indices

Unnamed: 0,A,B
a,100.0,
b,200.0,
c,300.0,
d,400.0,
f,,200.0
b,,150.0
d,,500.0


In [22]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows of df2 under df1 the same way
pd.concat([df1, df2], sort=False, ignore_index=True)  # ignores the existing row indices and creates new default indices

Unnamed: 0,A,B
0,100.0,
1,200.0,
2,300.0,
3,400.0,
4,,200.0
5,,150.0
6,,500.0


#### Concatenate two dataframes along the row axis (axis = 0)

- Same result as using append
- Because the two dataframes do not have the same column indices (columns), it creates NaN's
- axis = 0 is the default

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html

In [23]:
# stack df2 under df1 (axis=0 is the row axis); the frames are passed as a tuple here
pd.concat((df1, df2), axis=0, sort=False)  # keeps the same row indices

Unnamed: 0,A,B
a,100.0,
b,200.0,
c,300.0,
d,400.0,
f,,200.0
b,,150.0
d,,500.0


In [24]:
# stack df2 under df1 (axis=0 is the row axis) and renumber the rows
pd.concat((df1, df2), axis=0, ignore_index=True, sort=False)  # ignores the existing row indices and creates new default indices

Unnamed: 0,A,B
0,100.0,
1,200.0,
2,300.0,
3,400.0,
4,,200.0
5,,150.0
6,,500.0


#### Append & Concatenate data from df2 to df1 as new rows

- Returns a new dataframe, does not change df1
- Because the two dataframes have the same <b>column indices (columns)</b>, it does NOT create NaN's

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.append.html

In [25]:
import numpy as np

# two frames that share the same column names: 100 rows around mean 10 (std 2) and 10 rows around mean 0 (std 1)
shared_columns = ['Num1', 'Num2']

df1 = pd.DataFrame(np.random.normal(loc=10, scale=2, size=(100, 2)), columns=shared_columns)

df2 = pd.DataFrame(np.random.normal(loc=0, scale=1, size=(10, 2)), columns=shared_columns)

In [26]:
print(df1.shape)
print(df2.shape)

(100, 2)
(10, 2)


In [27]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows of df2 under df1 the same way
pd.concat([df1, df2], sort=False)

Unnamed: 0,Num1,Num2
0,6.409921,8.627254
1,8.156715,11.759708
2,9.788690,11.761900
3,7.251868,7.271789
4,9.466621,11.146228
...,...,...
5,-0.736178,1.774169
6,-0.578804,-0.603419
7,-0.008932,-1.728193
8,0.498987,2.286262


In [28]:
# row-wise concatenation (axis=0) of the two frames
pd.concat([df1, df2], axis=0)  # can pass a list of dataframes like here or a tuple like above

Unnamed: 0,Num1,Num2
0,6.409921,8.627254
1,8.156715,11.759708
2,9.788690,11.761900
3,7.251868,7.271789
4,9.466621,11.146228
...,...,...
5,-0.736178,1.774169
6,-0.578804,-0.603419
7,-0.008932,-1.728193
8,0.498987,2.286262


#### Concatenate two dataframes along the column axis (axis = 1)

- Returns a new dataframe with columns of each dataframe
- If the two dataframes have the same row indices, it does not create NaN's by default
- You can specify the type of join. If outer join, then if all the row indices don't match, it creates NaN's

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html

In [29]:
import numpy as np

# two 10-row frames with different column names, so they can be placed side by side
df1 = pd.DataFrame(np.random.normal(loc=10, scale=2, size=(10, 2)), columns=['Num1', 'Num2'])

df2 = pd.DataFrame(np.random.normal(loc=0, scale=1, size=(10, 2)), columns=['Num3', 'Num4'])

In [30]:
print(df1.shape)
print(df2.shape)

(10, 2)
(10, 2)


In [31]:
df1

Unnamed: 0,Num1,Num2
0,9.300144,9.754237
1,8.39233,8.993357
2,12.049503,8.42056
3,8.471564,10.908269
4,10.574447,7.976566
5,9.934802,7.34872
6,9.871109,10.550207
7,9.796792,11.555859
8,9.46123,9.981412
9,10.454466,9.740317


In [32]:
df2

Unnamed: 0,Num3,Num4
0,-1.133766,-1.344961
1,2.175319,0.468652
2,2.036633,-0.252069
3,1.568468,0.635171
4,-0.368016,-2.251043
5,-1.252108,1.301519
6,-0.604888,-1.037483
7,-0.978898,-0.773526
8,-1.215259,-0.96268
9,2.137624,0.189071


In [33]:
# place the columns of df2 next to those of df1, keeping only row labels present in both
pd.concat([df1, df2], axis=1, join='inner', sort=False)  # all the row indices match

Unnamed: 0,Num1,Num2,Num3,Num4
0,9.300144,9.754237,-1.133766,-1.344961
1,8.39233,8.993357,2.175319,0.468652
2,12.049503,8.42056,2.036633,-0.252069
3,8.471564,10.908269,1.568468,0.635171
4,10.574447,7.976566,-0.368016,-2.251043
5,9.934802,7.34872,-1.252108,1.301519
6,9.871109,10.550207,-0.604888,-1.037483
7,9.796792,11.555859,-0.978898,-0.773526
8,9.46123,9.981412,-1.215259,-0.96268
9,10.454466,9.740317,2.137624,0.189071


In [34]:
import numpy as np

# df1 has 10 rows and df2 only 5, so only row labels 0-4 exist in both frames
df1 = pd.DataFrame(np.random.normal(loc=10, scale=2, size=(10, 2)), columns=['Num1', 'Num2'])

df2 = pd.DataFrame(np.random.normal(loc=0, scale=1, size=(5, 2)), columns=['Num3', 'Num4'])

In [35]:
# side-by-side concatenation that keeps every row label from either frame
pd.concat([df1, df2], axis=1, join='outer', sort=False)  # not all row indices match and join = 'outer' by default

Unnamed: 0,Num1,Num2,Num3,Num4
0,11.572077,12.921245,-1.750183,1.313711
1,11.698279,9.424565,1.485156,0.539688
2,10.16536,8.193461,0.516237,-1.148075
3,10.967894,11.332762,-1.152471,1.083415
4,10.399021,13.080517,0.779912,-0.259362
5,6.422596,9.410414,,
6,9.483415,13.586421,,
7,11.351656,9.048714,,
8,10.679515,9.279081,,
9,8.044676,8.319145,,


In [36]:
# side-by-side concatenation restricted to the row labels both frames share
pd.concat([df1, df2], axis=1, join='inner', sort=False)  # not all row indices match, but forcing inner join

Unnamed: 0,Num1,Num2,Num3,Num4
0,11.572077,12.921245,-1.750183,1.313711
1,11.698279,9.424565,1.485156,0.539688
2,10.16536,8.193461,0.516237,-1.148075
3,10.967894,11.332762,-1.152471,1.083415
4,10.399021,13.080517,0.779912,-0.259362


### Joining

- Join two dataframes similar to joining two tables in SQL
- Can specify the columns to 'join on' or default and use row indices
- Joins can be left, inner, or outer : left is the default

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.join.html

In [37]:
import numpy as np
import pandas as pd

# df1 has 10 rows and df2 only 5, so only row labels 0-4 exist in both frames
df1 = pd.DataFrame(np.random.normal(loc=10, scale=2, size=(10, 2)), columns=['Num1', 'Num2'])

df2 = pd.DataFrame(np.random.normal(loc=0, scale=1, size=(5, 2)), columns=['Num3', 'Num4'])

In [38]:
# how='left' is the default for join, spelled out here
df1.join(df2, how='left')  # the rows from df1 are the base and left join to df2 creates NaN's where row indices don't match

Unnamed: 0,Num1,Num2,Num3,Num4
0,4.3205,10.493474,0.497118,0.171377
1,10.33751,10.020628,-0.989456,-0.84728
2,10.142128,11.032059,-1.559934,-0.975094
3,11.246046,7.564296,-0.733232,0.485629
4,9.232593,9.956726,1.641954,-0.14446
5,11.50307,8.91398,,
6,12.460064,12.854896,,
7,9.937668,9.315852,,
8,5.405543,5.724441,,
9,12.927984,12.620737,,


In [39]:
# how='left' is the default for join, spelled out here
df2.join(df1, how='left')  # the rows from df2 are the base and left join to df1

Unnamed: 0,Num3,Num4,Num1,Num2
0,0.497118,0.171377,4.3205,10.493474
1,-0.989456,-0.84728,10.33751,10.020628
2,-1.559934,-0.975094,10.142128,11.032059
3,-0.733232,0.485629,11.246046,7.564296
4,1.641954,-0.14446,9.232593,9.956726


In [40]:
# join on the row index, keeping the intersection of the two indices
df1.join(other=df2, how='inner')  # only the rows with matching indices are returned

Unnamed: 0,Num1,Num2,Num3,Num4
0,4.3205,10.493474,0.497118,0.171377
1,10.33751,10.020628,-0.989456,-0.84728
2,10.142128,11.032059,-1.559934,-0.975094
3,11.246046,7.564296,-0.733232,0.485629
4,9.232593,9.956726,1.641954,-0.14446


In [41]:
# join on the row index, keeping the union of the two indices
df1.join(other=df2, how='outer')  # all rows from both dataframes returned with matching indices consolidated

Unnamed: 0,Num1,Num2,Num3,Num4
0,4.3205,10.493474,0.497118,0.171377
1,10.33751,10.020628,-0.989456,-0.84728
2,10.142128,11.032059,-1.559934,-0.975094
3,11.246046,7.564296,-0.733232,0.485629
4,9.232593,9.956726,1.641954,-0.14446
5,11.50307,8.91398,,
6,12.460064,12.854896,,
7,9.937668,9.315852,,
8,5.405543,5.724441,,
9,12.927984,12.620737,,


### Merging

- Typically takes place on a column that is shared between dataframes
- Can, however, be used in place of Join with columns that do not match
- The 'how' parameter works the same as in Join

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html

In [42]:
import numpy as np

# df1 has 10 rows and df2 only 5
df1 = pd.DataFrame(np.random.normal(loc=10, scale=2, size=(10, 2)), columns=['Num1', 'Num2'])

df2 = pd.DataFrame(np.random.normal(loc=0, scale=1, size=(5, 2)), columns=['Num3', 'Num4'])

# shared key values 0..9; column assignment aligns on the row index, so the 5-row df2 only receives 0..4
num5 = pd.Series(np.arange(10))

df1['Num5'] = num5
df2['Num5'] = num5

In [43]:
df1

Unnamed: 0,Num1,Num2,Num5
0,11.658342,9.27865,0
1,10.393543,10.795164,1
2,8.557346,15.367598,2
3,8.951174,9.1052,3
4,10.247929,10.868042,4
5,4.391504,12.033087,5
6,12.40059,9.941493,6
7,11.508577,12.943454,7
8,8.200311,7.928159,8
9,12.738399,11.936196,9


In [44]:
df2

Unnamed: 0,Num3,Num4,Num5
0,-0.282933,-0.321274,0
1,-0.107121,-0.28515,1
2,-0.085814,0.388719,2
3,-1.254169,0.967458,3
4,0.275841,0.552301,4


In [45]:
# merge matches rows on the VALUES of the column both frames share (Num5), not on the row indices
pd.merge(df1,df2)  # default how='inner' keeps only the Num5 values present in both dataframes

Unnamed: 0,Num1,Num2,Num5,Num3,Num4
0,11.658342,9.27865,0,-0.282933,-0.321274
1,10.393543,10.795164,1,-0.107121,-0.28515
2,8.557346,15.367598,2,-0.085814,0.388719
3,8.951174,9.1052,3,-1.254169,0.967458
4,10.247929,10.868042,4,0.275841,0.552301


In [46]:
# merge matches rows on the VALUES of the shared column Num5, not on the row indices
pd.merge(df1,df2, how='left')  # keeps every df1 row; Num3/Num4 are NaN where df2 has no matching Num5 value

Unnamed: 0,Num1,Num2,Num5,Num3,Num4
0,11.658342,9.27865,0,-0.282933,-0.321274
1,10.393543,10.795164,1,-0.107121,-0.28515
2,8.557346,15.367598,2,-0.085814,0.388719
3,8.951174,9.1052,3,-1.254169,0.967458
4,10.247929,10.868042,4,0.275841,0.552301
5,4.391504,12.033087,5,,
6,12.40059,9.941493,6,,
7,11.508577,12.943454,7,,
8,8.200311,7.928159,8,,
9,12.738399,11.936196,9,,


In [47]:
# merge matches rows on the VALUES of the shared column Num5, not on the row indices
pd.merge(df1,df2, how='outer')  # keeps the Num5 values from both dataframes; columns from the side without a match are NaN

Unnamed: 0,Num1,Num2,Num5,Num3,Num4
0,11.658342,9.27865,0,-0.282933,-0.321274
1,10.393543,10.795164,1,-0.107121,-0.28515
2,8.557346,15.367598,2,-0.085814,0.388719
3,8.951174,9.1052,3,-1.254169,0.967458
4,10.247929,10.868042,4,0.275841,0.552301
5,4.391504,12.033087,5,,
6,12.40059,9.941493,6,,
7,11.508577,12.943454,7,,
8,8.200311,7.928159,8,,
9,12.738399,11.936196,9,,


In [48]:
# order of the dataframes dictates order of columns in the output
# (rows are still matched on the VALUES of the shared column Num5, not on the row indices)
pd.merge(df2,df1, how='outer')  # keeps the Num5 values from both dataframes; columns from the side without a match are NaN

Unnamed: 0,Num3,Num4,Num5,Num1,Num2
0,-0.282933,-0.321274,0,11.658342,9.27865
1,-0.107121,-0.28515,1,10.393543,10.795164
2,-0.085814,0.388719,2,8.557346,15.367598
3,-1.254169,0.967458,3,8.951174,9.1052
4,0.275841,0.552301,4,10.247929,10.868042
5,,,5,4.391504,12.033087
6,,,6,12.40059,9.941493
7,,,7,11.508577,12.943454
8,,,8,8.200311,7.928159
9,,,9,12.738399,11.936196


### Working with Dates in Pandas
* index_col parameter sets the index to the column reference
* parse_dates parameter indicates which column(s) to try and create date objects from
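* squeeze parameter indicates whether to represent as a single series if possible (deprecated in pandas 1.4 and removed in pandas 2.0; call `.squeeze("columns")` on the loaded dataframe instead)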

In [49]:
import pandas as pd
import os

# the CSV is looked up in a 'Data' folder two levels above the current working directory
data_dir = os.path.join(os.getcwd(), '../../Data')
filepath = os.path.join(data_dir, 'daily-minimum-temperatures.csv')
df = pd.read_csv(filepath)

df.head()

Unnamed: 0,Date,Temp
0,1981-01-01,20.7
1,1981-01-02,17.9
2,1981-01-03,18.8
3,1981-01-04,14.6
4,1981-01-05,15.8


In [50]:
# notice the data types - Date column is just an Object
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3650 entries, 0 to 3649
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    3650 non-null   object 
 1   Temp    3650 non-null   float64
dtypes: float64(1), object(1)
memory usage: 57.2+ KB


In [51]:
import pandas as pd
import os

# the CSV is looked up in a 'Data' folder two levels above the current working directory
data_dir = os.path.join(os.getcwd(), '../../Data')
filepath = os.path.join(data_dir, 'daily-minimum-temperatures.csv')

# parse_dates names the column(s) that read_csv should try to convert to datetimes
date_columns = ['Date']
df = pd.read_csv(filepath, parse_dates=date_columns)

df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/home/jerjacob/ClassWork/lecture_notebooks/data/daily-minimum-temperatures.csv'

In [None]:
# notice the data types - Date column is now a datetime object
df.info()

#### Setting Additional Parameters on the Load

- Setting the index_col parameter to the column that should be in the index. In this case Date
- Setting the parse_dates to a boolean instead of a column indicates the operation should apply to the index
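- Setting the squeeze parameter to True squeezes the one column dataframe down to type Series (the parameter was deprecated in pandas 1.4 and removed in pandas 2.0; calling `.squeeze("columns")` on the loaded dataframe gives the same result)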

In [None]:
import pandas as pd
import os

# the CSV is looked up in a 'data' folder under the current working directory
data_dir = os.path.join(os.getcwd(), 'data')
filepath = os.path.join(data_dir, 'daily-minimum-temperatures.csv')

# index_col=0 puts the first column of the file into the index
# parse_dates=True (a boolean rather than a column list) asks for the index itself to be parsed as dates
df = pd.read_csv(filepath, header=0, parse_dates=True, index_col=0)

df.head()

In [None]:
# notice the data types - the index is a datetime object
df.info()

In [None]:
import pandas as pd
import os

filepath = os.path.join(os.getcwd(), 'data', 'daily-minimum-temperatures.csv')

# read_csv's squeeze=True keyword was deprecated in pandas 1.4 and removed in pandas 2.0;
# calling .squeeze("columns") on the loaded one-column dataframe squeezes it down to a Series the same way
df = pd.read_csv(filepath, header=0, index_col=0, parse_dates=True).squeeze("columns")

df.head()

In [None]:
# notice what class df is - after the squeezed load it is expected to be a pandas Series, not a DataFrame
print(type(df))

# notice the dtype of the index of the Series (shown in the repr of the underlying array)
df.index.values

### New Dataset with Timestamp values

In [None]:
import pandas as pd
import os

# the CSV is looked up in a 'data' folder under the current working directory
data_dir = os.path.join(os.getcwd(), 'data')
filepath = os.path.join(data_dir, 'sampleTs.csv')

# ask read_csv to parse the TimeStamp column as datetimes
timestamp_columns = ['TimeStamp']
df = pd.read_csv(filepath, parse_dates=timestamp_columns)

df.head()

In [None]:
# notice the dtype of the TimeStamp column
df.info()

#### Each datetime object has the following properties
* month
* year
* hour
* day
* minute
* second

In [None]:
# print a heading followed by the first two values of each datetime component exposed by the .dt accessor
for heading, component in (
    ("Month", "month"),
    ("Year", "year"),
    ("Hour", "hour"),
    ("Day", "day"),
    ("Minute", "minute"),
    ("Second", "second"),
):
    print(heading)
    print(getattr(df.TimeStamp.dt, component).head(2))

#### Can also find the minimum and maximum values

In [None]:
# largest month number that occurs in the TimeStamp column
largest_month = df['TimeStamp'].dt.month.max()
print('Maximum Month: ', largest_month)

# earliest timestamp in the column
earliest_timestamp = df['TimeStamp'].min()
print("Minimum Timestamp: ", earliest_timestamp)

In [None]:
df.TimeStamp.dt.date.max()

#### Feature Engineering
* Add Month, Day, Year, Hour, Weekday (as integer), and BusDay (as boolean) Features
* Using List Comprehension

In [None]:
import numpy as np

# List-comprehension versions of the date features.
# The values are iterated directly instead of being looked up by label for every i; this also means
# .dt.date (which builds a whole column of date objects) is evaluated once rather than once per row.
df['Month'] = [stamp.month for stamp in df.TimeStamp]
df['Day'] = [stamp.day for stamp in df.TimeStamp]
df['Year'] = [stamp.year for stamp in df.TimeStamp]
df['Hour'] = [stamp.hour for stamp in df.TimeStamp]
# date.weekday() returns the weekday as an integer
df['Weekday'] = [day.weekday() for day in df.TimeStamp.dt.date]
# boolean flag from numpy's business-day check on the calendar dates
df['BusDay'] = np.is_busday(df.TimeStamp.dt.date)


In [None]:
df.head()

In [None]:
%%time
# Timed list-comprehension versions of the date features.
# The values are iterated directly instead of being looked up by label for every i; this also means
# .dt.date (which builds a whole column of date objects) is evaluated once rather than once per row,
# so the timing reflects the Python-level loop itself.
df['Month'] = [stamp.month for stamp in df.TimeStamp]
df['Day'] = [stamp.day for stamp in df.TimeStamp]
df['Year'] = [stamp.year for stamp in df.TimeStamp]
df['Hour'] = [stamp.hour for stamp in df.TimeStamp]
df['Weekday'] = [day.weekday() for day in df.TimeStamp.dt.date]

#### Using vectorization

In [None]:
%%time
# the same date features taken from the .dt accessor, a whole column at a time
df['Month2'] = df['TimeStamp'].dt.month
df['Day2'] = df['TimeStamp'].dt.day
df['Year2'] = df['TimeStamp'].dt.year
df['Hour2'] = df['TimeStamp'].dt.hour
df['Weekday2'] = df['TimeStamp'].dt.weekday

In [None]:
df.head()

#### Add WeekdayText Feature as a Categorical
* Using List Comprehension

In [None]:
%%time

# weekday names as a tuple, in the order used by date.weekday()
# Monday is day 0
days = ("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday")

# .dt.date is evaluated once and iterated directly, rather than being re-evaluated and
# looked up by label for every row
df['WeekdayText'] = pd.Categorical([days[day.weekday()] for day in df.TimeStamp.dt.date])

#### Using day_name() & month_name()

In [None]:
%%time
# day_name() gives the weekday name of every timestamp; pd.Categorical stores the names with the category dtype
weekday_names = df['TimeStamp'].dt.day_name()
df['WeekdayText2'] = pd.Categorical(weekday_names)

In [None]:
# month_name() gives the month name of every timestamp; stored as plain strings (no pd.Categorical wrapper)
df['MonthText'] = df['TimeStamp'].dt.month_name()

In [None]:
df.head()

#### Add Qtr and QtrText Features

In [None]:
# quarter names, indexed by quarter number minus one
qtrs = ("First","Second","Third","Fourth")
# quarter number of every timestamp, stored as a categorical
df['Qtr'] = pd.Categorical(df.TimeStamp.dt.quarter)
# iterate over the quarter numbers directly instead of looking each one up by label,
# with the loop length taken from Qtr itself rather than from the unrelated .dt.date column
df['QtrText'] = pd.Categorical([qtrs[quarter - 1] for quarter in df.Qtr])

In [None]:
# notice the dtype of WeekdayText compared to MonthText
# df.info() prints its report itself and returns None, so wrapping it in print() would add a stray "None" line
df.info()
df.head()

#### Working with Time Zones of a datetime
* First step is to localize - which means defining what time zone the original data is in
* A timezone's offset refers to how many hours the timezone is from Coordinated Universal Time (UTC)
* A naive datetime object contains no timezone information. 
* Check tzinfo of a datetime object. tzinfo will be set to None if the object is naive.

In [None]:
import pandas as pd
import os

# the CSV is looked up in a 'data' folder under the current working directory
data_dir = os.path.join(os.getcwd(), 'data')
filepath = os.path.join(data_dir, 'sampleTs.csv')

# ask read_csv to parse the TimeStamp column as datetimes
timestamp_columns = ['TimeStamp']
df = pd.read_csv(filepath, parse_dates=timestamp_columns)

In [None]:
# tzinfo of the timestamp in the row labelled 0: None means the datetime is naive
first_stamp = df.loc[0, 'TimeStamp']
print(first_stamp.tzinfo)

In [None]:
# declare that the naive timestamps are in the US Central time zone
central_stamps = df['TimeStamp'].dt.tz_localize('US/Central')
df['TimeStamp'] = central_stamps

# tzinfo is no longer None, so the datetime is now aware
df.loc[0, 'TimeStamp'].tzinfo

In [None]:
# notice the attributes of TimeStamp
df.info()

In [None]:
# new column holding the same timestamps converted from their current zone to UTC
df['TimeStamp_UTC'] = df.TimeStamp.dt.tz_convert('UTC')

In [None]:
df.head(2)

#### Create a datetime index using date_range

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.date_range.html

In [None]:
# 3 hour frequency
# (lowercase 'h' is the current hourly alias; the uppercase 'H' spelling is deprecated since pandas 2.2)
pd.date_range(start = '2019-01-01', end = '2019-01-02', freq = '3h')

In [None]:
# one timestamp per calendar day, with both end points included
pd.date_range('2019-01-01', '2019-01-05', freq='D')

In [None]:
# Annual on first day of year for 10 years
# ('YS' is the current year-start alias; the older 'AS' spelling is deprecated since pandas 2.2)
pd.date_range(start = '2019-01-01', periods = 10, freq = 'YS')

In [None]:
# add a new column to the dataframe that is a datetime index with a 3 hour interval
# periods=len(df) produces exactly one timestamp per row
# (lowercase 'h' is the current hourly alias; the uppercase 'H' spelling is deprecated since pandas 2.2)
df['NewDatetime'] = pd.date_range(start = '2019-01-01', periods = len(df), freq = '3h')

In [None]:
# df.info() prints its report itself and returns None, so wrapping it in print() would add a stray "None" line
df.info()
df.head()

In [None]:
df.tail()