# Final Capstone

## Notebook 2: Feature Engineering

In [1]:
import time
# Start the wall-clock timer before numpy/pandas are imported, so their import
# time is included; the last cell subtracts this value from a second
# perf_counter() reading to report the notebook's total run time.
start_time = time.perf_counter()
import numpy as np
import pandas as pd
# Render floats with two decimal places wherever pandas displays them.
pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
%%time
# Load the three parquet tables exported by the first notebook.
# NOTE(review): base_path is an absolute, machine-specific directory; it has to
# be adjusted before this notebook can run anywhere else.
base_path = 'C:/Users/jnpol/Documents/DS/Data Science/UL/'
all_ratings, quindex, net = (
    pd.read_parquet(base_path + file_name)
    for file_name in ('all_ratings.parquet', 'quindex.parquet', 'net1.parquet')
)

# Print a schema/memory summary for each frame, separated by blank lines.
for frame in (all_ratings, quindex):
    frame.info()
    print()
net.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97713135 entries, 0 to 100480506
Data columns (total 1 columns):
 #   Column  Dtype
---  ------  -----
 0   rating  int8 
dtypes: int8(1)
memory usage: 838.7 MB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408395 entries, 0 to 1408394
Data columns (total 1 columns):
 #   Column   Non-Null Count    Dtype
---  ------   --------------    -----
 0   quindex  1408395 non-null  int64
dtypes: int64(1)
memory usage: 10.7 MB

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97713135 entries, 0 to 100480506
Data columns (total 5 columns):
 #   Column     Dtype  
---  ------     -----  
 0   mov_id     int16  
 1   cust_id    int32  
 2   rating     float64
 3   day_rated  int16  
 4   mov_year   int16  
dtypes: float64(1), int16(3), int32(1)
memory usage: 2.4 GB
Wall time: 3.47 s


## Additional Features
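The count features below are computed on all rows. The rating-based features (the averages and offsets) can be calculated on the training set only: they cannot be computed directly for the quiz set, since its ratings are assumed to be unknown. However, they can be estimated in a variety of ways; this notebook estimates them by forward-filling from neighbouring training rows.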

In [3]:
%%time
# Index labels of the quiz rows; later cells use this list to drop those rows.
quilist = quindex['quindex'].tolist()

# Each entry describes one count feature:
#   (new column, group-by keys, column whose entries are counted, stored dtype)
count_specs = [
    # number of times the movie was rated
    ('mov_count', ['mov_id'], 'mov_id', np.int32),
    # number of movies rated by the customer
    ('rated_bycust', ['cust_id'], 'cust_id', np.int16),
    # number of ratings submitted on that day
    ('rate_each_day', ['day_rated'], 'mov_id', np.int32),
    # number of ratings the movie received on that day
    ('mov_day_count', ['day_rated', 'mov_id'], 'mov_id', np.uint16),
    # number of ratings the customer submitted on that day
    ('cust_day_count', ['cust_id', 'day_rated'], 'mov_id', np.int16),
]

# transform('count') broadcasts the group size back onto every row of the
# group, so each new column has one value per row of net.
for new_col, group_keys, counted_col, out_dtype in count_specs:
    group_sizes = net.groupby(group_keys)[counted_col].transform('count')
    net[new_col] = group_sizes.astype(out_dtype)

Wall time: 40.7 s


In [4]:
%%time
# Rating-based aggregates use training rows only: the rows whose index labels
# are listed in quilist are removed before grouping, so after the index-aligned
# assignment every new column is NaN on those rows.
# The training subset is built once, from the key columns and the rating only,
# instead of re-copying the whole frame with net.drop(quilist) per feature.
train = net[['mov_id', 'cust_id', 'mov_year', 'day_rated', 'rating']].drop(quilist)

# add column indicating average rating per movie
mov_avg_train = train.groupby(
    ['mov_id'])['rating'].transform('mean').astype(np.float32)
net['mov_avg_rating'] = mov_avg_train

# add column indicating average rating per cust
net['cust_avg_rating'] = train.groupby(
    ['cust_id'])['rating'].transform('mean').astype(np.float32)

# add column indicating average rating per movie, release year and day
net['mov_day_avg_rl'] = train.groupby(
    ['mov_id', 'mov_year', 'day_rated'])['rating'].transform('mean').astype(np.float32)

# add column indicating average rating per movie per day
net['mov_day_avg'] = train.groupby(
    ['mov_id', 'day_rated'])['rating'].transform('mean').astype(np.float32)

# offset between each cust rating and the movie's average rating; held as a
# temporary Series, which removes the add-then-drop of a helper column (the
# old drop call passed axis positionally, which pandas 2.0 no longer accepts)
cust_rating_offset = (train['rating'] - mov_avg_train).astype(np.float32)

# add column indicating the customer's mean offset from the movie averages
net['cust_avg_offset'] = cust_rating_offset.groupby(
    train['cust_id']).transform('mean').astype(np.float32)

# release the temporaries; they are not needed by any later cell
del train, mov_avg_train, cust_rating_offset

Wall time: 2min 38s


In [5]:
%%time
# Training rows only (index labels in quilist removed), restricted to the
# columns needed here. Built once instead of copying the whole frame with
# net.drop(quilist) for each feature. Index-aligned assignment leaves the new
# columns NaN on the removed rows.
train = net[['cust_id', 'day_rated', 'mov_year', 'rating']].drop(quilist)

# add column indicating daily average rating by the cust
net['cust_day_avg'] = train.groupby(
    ['cust_id', 'day_rated'])['rating'].transform('mean').astype(np.float32)

# add column indicating average rating per release year
net['avg_rate_mov_yr'] = train.groupby(
    ['mov_year'])['rating'].transform('mean').astype(np.float32)

# add column indicating average rating per customer per release year
net['avg_rate_cst_yr'] = train.groupby(
    ['cust_id', 'mov_year'])['rating'].transform('mean').astype(np.float32)

# release the temporary subset; no later cell needs it
del train

Wall time: 2min 2s


In [6]:
%%time
# Inspect the engineered frame: schema and memory use, the first five rows,
# and the number of missing values in every column.
net.info()
display(net.head(n=5))
net.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97713135 entries, 0 to 100480506
Data columns (total 18 columns):
 #   Column           Dtype  
---  ------           -----  
 0   mov_id           int16  
 1   cust_id          int32  
 2   rating           float64
 3   day_rated        int16  
 4   mov_year         int16  
 5   mov_count        int32  
 6   rated_bycust     int16  
 7   rate_each_day    int32  
 8   mov_day_count    uint16 
 9   cust_day_count   int16  
 10  mov_avg_rating   float32
 11  cust_avg_rating  float32
 12  mov_day_avg_rl   float32
 13  mov_day_avg      float32
 14  cust_avg_offset  float32
 15  cust_day_avg     float32
 16  avg_rate_mov_yr  float32
 17  avg_rate_cst_yr  float32
dtypes: float32(8), float64(1), int16(5), int32(3), uint16(1)
memory usage: 9.1 GB


Unnamed: 0,mov_id,cust_id,rating,day_rated,mov_year,mov_count,rated_bycust,rate_each_day,mov_day_count,cust_day_count,mov_avg_rating,cust_avg_rating,mov_day_avg_rl,mov_day_avg,cust_avg_offset,cust_day_avg,avg_rate_mov_yr,avg_rate_cst_yr
0,1,1488844,3.0,2125,2003,530,2205,198666,2,4,3.72,3.26,4.0,4.0,-0.26,3.25,3.51,3.22
1,1,822109,5.0,2009,2003,530,147,117315,2,11,3.72,3.99,5.0,5.0,0.41,4.36,3.51,4.0
2,1,885013,4.0,2168,2003,530,365,166192,3,3,3.72,3.84,4.0,4.0,0.07,4.0,3.51,3.47
3,1,30878,,2236,2003,530,1289,58338,1,7,,,,,,,,
4,1,823519,3.0,1636,2003,530,647,118618,1,34,3.72,3.9,3.0,3.0,0.32,3.91,3.51,3.96


Wall time: 2.82 s


mov_id                   0
cust_id                  0
rating             1408395
day_rated                0
mov_year                 0
mov_count                0
rated_bycust             0
rate_each_day            0
mov_day_count            0
cust_day_count           0
mov_avg_rating     1408395
cust_avg_rating    1408395
mov_day_avg_rl     1408395
mov_day_avg        1408395
cust_avg_offset    1408395
cust_day_avg       1408395
avg_rate_mov_yr    1408395
avg_rate_cst_yr    1408395
dtype: int64

In [7]:
%%time
# Estimate the rating-based features on the rows where they are NaN. For each
# (sort keys, target column) pair: sort so that rows sharing the leading keys
# are adjacent, with NaN values of the target placed last among them
# (sort_values defaults to na_position='last'), then forward-fill the target.
# NOTE(review): the forward fill runs over the whole column and is not limited
# to a key group, so a NaN whose group has no known value takes the value of
# the preceding group in sort order - confirm this is the intended estimate.
ffill_plan = [
    (['cust_id', 'mov_year', 'avg_rate_cst_yr'], 'avg_rate_cst_yr'),
    (['cust_id', 'cust_avg_rating'], 'cust_avg_rating'),
    (['cust_id', 'cust_avg_offset'], 'cust_avg_offset'),
    (['cust_id', 'day_rated', 'cust_day_avg'], 'cust_day_avg'),
    (['mov_id', 'mov_year', 'day_rated', 'mov_day_avg_rl'], 'mov_day_avg_rl'),
]
for sort_keys, target in ffill_plan:
    net.sort_values(by=sort_keys, inplace=True)
    # Assign the filled column back explicitly: fillna(method='ffill') is
    # deprecated in favour of ffill(), and an inplace fill reached through
    # attribute access is a chained update that leaves net unchanged under
    # pandas copy-on-write.
    net[target] = net[target].ffill()

Wall time: 3min 5s


In [8]:
%%time
# Same estimate as for the customer features, applied to the movie and
# release-year features: sort so rows sharing the leading keys are adjacent
# with NaN values of the target last (sort_values default
# na_position='last'), then forward-fill the target column.
# NOTE(review): the fill is not limited to a key group, so a NaN whose group
# has no known value takes the value of the preceding group in sort order.
ffill_plan = [
    (['mov_id', 'day_rated', 'mov_day_avg'], 'mov_day_avg'),
    (['mov_id', 'mov_avg_rating'], 'mov_avg_rating'),
    (['mov_year', 'avg_rate_mov_yr'], 'avg_rate_mov_yr'),
]
for sort_keys, target in ffill_plan:
    net.sort_values(by=sort_keys, inplace=True)
    # Explicit assignment replaces the deprecated fillna(method='ffill') and
    # the chained inplace fill, which leaves net unchanged under pandas
    # copy-on-write.
    net[target] = net[target].ffill()

Wall time: 42.4 s


In [9]:
%%time
# Summarise the frame after the forward fills (rows are still in the order
# left by the last sort): schema, first rows, and NaN count per column.
net.info()
display(net.head(), net.isna().sum())

# Put the rows back in index-label order and show the first rows again.
net.sort_index(inplace=True)
net.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97713135 entries, 43515483 to 100349765
Data columns (total 18 columns):
 #   Column           Dtype  
---  ------           -----  
 0   mov_id           int16  
 1   cust_id          int32  
 2   rating           float64
 3   day_rated        int16  
 4   mov_year         int16  
 5   mov_count        int32  
 6   rated_bycust     int16  
 7   rate_each_day    int32  
 8   mov_day_count    uint16 
 9   cust_day_count   int16  
 10  mov_avg_rating   float32
 11  cust_avg_rating  float32
 12  mov_day_avg_rl   float32
 13  mov_day_avg      float32
 14  cust_avg_offset  float32
 15  cust_day_avg     float32
 16  avg_rate_mov_yr  float32
 17  avg_rate_cst_yr  float32
dtypes: float32(8), float64(1), int16(5), int32(3), uint16(1)
memory usage: 6.6 GB


Unnamed: 0,mov_id,cust_id,rating,day_rated,mov_year,mov_count,rated_bycust,rate_each_day,mov_day_count,cust_day_count,mov_avg_rating,cust_avg_rating,mov_day_avg_rl,mov_day_avg,cust_avg_offset,cust_day_avg,avg_rate_mov_yr,avg_rate_cst_yr
43515483,7654,1312412,3.0,116,1896,146,1967,2363,1,268,3.65,3.46,3.0,3.0,-0.06,3.21,3.65,3.0
43515504,7654,947104,3.0,254,1896,146,1612,1845,1,79,3.65,3.22,3.0,3.0,-0.31,3.61,3.65,3.0
43515485,7654,1830265,5.0,338,1896,146,1636,2624,1,242,3.65,3.42,5.0,5.0,-0.13,3.43,3.65,5.0
43515426,7654,1328708,5.0,460,1896,146,817,5112,1,215,3.65,3.34,5.0,5.0,-0.24,3.4,3.65,5.0
43515455,7654,2449001,4.0,503,1896,146,1409,3571,1,5,3.65,3.35,4.0,4.0,-0.27,4.0,3.65,4.0


mov_id                   0
cust_id                  0
rating             1408395
day_rated                0
mov_year                 0
mov_count                0
rated_bycust             0
rate_each_day            0
mov_day_count            0
cust_day_count           0
mov_avg_rating           0
cust_avg_rating          0
mov_day_avg_rl           0
mov_day_avg              0
cust_avg_offset          0
cust_day_avg             0
avg_rate_mov_yr          0
avg_rate_cst_yr          0
dtype: int64

Wall time: 17.3 s


Unnamed: 0,mov_id,cust_id,rating,day_rated,mov_year,mov_count,rated_bycust,rate_each_day,mov_day_count,cust_day_count,mov_avg_rating,cust_avg_rating,mov_day_avg_rl,mov_day_avg,cust_avg_offset,cust_day_avg,avg_rate_mov_yr,avg_rate_cst_yr
0,1,1488844,3.0,2125,2003,530,2205,198666,2,4,3.72,3.26,4.0,4.0,-0.26,3.25,3.51,3.22
1,1,822109,5.0,2009,2003,530,147,117315,2,11,3.72,3.99,5.0,5.0,0.41,4.36,3.51,4.0
2,1,885013,4.0,2168,2003,530,365,166192,3,3,3.72,3.84,4.0,4.0,0.07,4.0,3.51,3.47
3,1,30878,,2236,2003,530,1289,58338,1,7,3.72,3.63,3.0,3.0,0.08,3.0,3.51,3.43
4,1,823519,3.0,1636,2003,530,647,118618,1,34,3.72,3.9,3.0,3.0,0.32,3.91,3.51,3.96


In [10]:
%%time
# Seed for the row shuffle, so the shuffled order is reproducible.
shuffle_seed = 171

# Attach the rating column of all_ratings as a new column; the assignment
# aligns the values on index labels.
net['all_ratings'] = all_ratings['rating']

# Shuffle every row (frac=1 samples the full frame without replacement).
net = net.sample(frac=1, random_state=shuffle_seed)

net.info()
net.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97713135 entries, 61342875 to 6598294
Data columns (total 19 columns):
 #   Column           Dtype  
---  ------           -----  
 0   mov_id           int16  
 1   cust_id          int32  
 2   rating           float64
 3   day_rated        int16  
 4   mov_year         int16  
 5   mov_count        int32  
 6   rated_bycust     int16  
 7   rate_each_day    int32  
 8   mov_day_count    uint16 
 9   cust_day_count   int16  
 10  mov_avg_rating   float32
 11  cust_avg_rating  float32
 12  mov_day_avg_rl   float32
 13  mov_day_avg      float32
 14  cust_avg_offset  float32
 15  cust_day_avg     float32
 16  avg_rate_mov_yr  float32
 17  avg_rate_cst_yr  float32
 18  all_ratings      int8   
dtypes: float32(8), float64(1), int16(5), int32(3), int8(1), uint16(1)
memory usage: 6.6 GB
Wall time: 46.9 s


Unnamed: 0,mov_id,cust_id,rating,day_rated,mov_year,mov_count,rated_bycust,rate_each_day,mov_day_count,cust_day_count,mov_avg_rating,cust_avg_rating,mov_day_avg_rl,mov_day_avg,cust_avg_offset,cust_day_avg,avg_rate_mov_yr,avg_rate_cst_yr,all_ratings
61342875,11182,549302,4.0,1696,2003,103062,459,70730,296,106,3.55,3.79,3.53,3.53,0.19,3.97,3.51,3.75,4
12081331,2342,1077245,3.0,1976,2004,87480,139,98810,126,3,3.87,4.27,3.72,3.72,0.47,3.67,3.52,3.94,3
99011741,17474,2463816,4.0,1567,1964,32089,1577,83753,41,7,4.09,3.08,4.2,4.2,-0.48,3.71,3.92,4.07,4
73636568,13372,2204684,3.0,1225,2002,350,829,15570,1,5,2.69,3.26,3.0,3.0,-0.28,3.4,3.5,3.32,3
66797623,12191,2032601,3.0,1679,1996,12782,1219,97725,34,6,2.97,3.48,2.65,2.65,0.04,3.67,3.55,3.34,3


In [11]:
%%time
# Write the engineered frame to parquet. The name is a relative path, so the
# file lands in the current working directory.
output_file = 'net2.parquet'
net.to_parquet(output_file)

Wall time: 28.7 s


In [12]:
# Report the elapsed wall-clock time since start_time was taken in the first cell.
end_time = time.perf_counter()
prog_ex_time = end_time - start_time
runtime_message = 'This program executes in {} seconds.'
print(runtime_message.format(prog_ex_time))

This program executes in 650.5682451 seconds.


That's 10 minutes and 50.6 seconds.