# Final Capstone: Revisiting the Netflix Prize

## Notebook 4: Dimensionality Reduction (Unsupervised Technique)

There are three specific objectives here:

1. Combine highly correlated variables
2. Reduce overall volume; minimize information loss
3. Incorporate an unsupervised learning technique into the capstone

In [1]:
import numpy as np
import pandas as pd
import umap
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Notebook-wide pandas display settings: show up to 50 columns and render
# floats with four decimal places.
pd.options.display.max_columns = 50
pd.options.display.float_format = '{:.4f}'.format

## PCA

In [2]:
def scale_pca(data, col_name):
    """Standardize ``data`` and project it onto its first principal component.

    The input is scaled with a freshly fitted ``StandardScaler`` and then
    reduced with a single-component, whitened ``PCA`` (``random_state=47``).
    The explained variance ratio of that component is printed as a side effect.

    Parameters
    ----------
    data : array-like
        Feature columns to combine; anything ``StandardScaler.fit_transform``
        accepts (the notebook passes a DataFrame slice).
    col_name : str
        Label given to the single output column.

    Returns
    -------
    pandas.DataFrame
        One column named ``col_name`` with one row per input row. The frame
        carries a default 0..n-1 index, so callers that concatenate it
        column-wise need the same index on the other frame.
    """
    scaled = StandardScaler().fit_transform(data)

    # Fit the decomposition once and reuse the fitted model for both the
    # diagnostic printout and the projected values (previously the same PCA
    # was fitted twice, doubling the cost on large inputs).
    pca = PCA(n_components=1, whiten=True, random_state=47)
    component = pca.fit_transform(scaled)
    print('Explained Variance Ratio:', pca.explained_variance_ratio_)

    return pd.DataFrame(component, columns=[col_name])

In [3]:
%%time
# import data and reset_index to avoid mismatched indices when concatenating
base_path = 'C:/Users/jnpol/Documents/DS/Data Science/UL/'
quiz_features = pd.read_parquet(base_path + 'quiz_features.parquet')
quiz_features.reset_index(drop=True, inplace=True)
quiz_rows = len(quiz_features)
quiz_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408395 entries, 0 to 1408394
Data columns (total 16 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   mov_id           1408395 non-null  int16  
 1   cust_id          1408395 non-null  int32  
 2   day_rated        1408395 non-null  int16  
 3   mov_year         1408395 non-null  int16  
 4   avg_rate_pm_pd   1408395 non-null  float32
 5   avg_rate_pc_pd   1408395 non-null  float32
 6   cust_day_count   1408395 non-null  int16  
 7   cust_days_since  1408395 non-null  int16  
 8   mov_days_since   1408395 non-null  int16  
 9   mov_avg_rating   1408395 non-null  float32
 10  cust_avg_rating  1408395 non-null  float32
 11  mov_day_avg      1408395 non-null  float32
 12  cust_day_avg     1408395 non-null  float32
 13  avg_rate_yr      1408395 non-null  float32
 14  avg_rate_cst_yr  1408395 non-null  float32
 15  bline_approx     1408395 non-null  float32
dtypes: float32(9), int

In [4]:
# Pairwise Pearson correlations between all quiz features; used to pick the
# correlated columns that are combined by PCA.
quiz_features.corr(method='pearson')

Unnamed: 0,mov_id,cust_id,day_rated,mov_year,avg_rate_pm_pd,avg_rate_pc_pd,cust_day_count,cust_days_since,mov_days_since,mov_avg_rating,cust_avg_rating,mov_day_avg,cust_day_avg,avg_rate_yr,avg_rate_cst_yr,bline_approx
mov_id,1.0,0.0001,0.0311,0.0263,-0.1135,0.0202,-0.3419,0.1129,-0.124,-0.0231,-0.0288,-0.0191,-0.0228,-0.0261,-0.0203,-0.0084
cust_id,0.0001,1.0,0.0009,0.0013,0.0006,-0.0012,-0.0017,-0.0021,0.0001,-0.0001,0.0002,0.0006,0.001,-0.001,-0.0005,0.0013
day_rated,0.0311,0.0009,1.0,0.0017,-0.1207,0.2219,-0.0223,0.1082,0.1276,0.0268,0.0313,0.0373,0.009,0.0306,0.0206,0.0032
mov_year,0.0263,0.0013,0.0017,1.0,0.2136,-0.0622,-0.0633,0.0035,-0.4538,-0.1729,-0.0183,-0.1607,-0.0328,-0.881,-0.1144,-0.0454
avg_rate_pm_pd,-0.1135,0.0006,-0.1207,0.2136,1.0,-0.2004,0.1166,-0.1443,0.0843,0.1823,0.049,0.1184,0.0253,-0.24,0.0042,-0.0131
avg_rate_pc_pd,0.0202,-0.0012,0.2219,-0.0622,-0.2004,1.0,0.0158,0.3545,0.0137,-0.0285,-0.1061,-0.0159,-0.0542,0.0748,-0.0559,0.0082
cust_day_count,-0.3419,-0.0017,-0.0223,-0.0633,0.1166,0.0158,1.0,-0.1506,0.1841,0.0309,0.0327,0.0352,0.0203,0.0669,0.0258,0.0105
cust_days_since,0.1129,-0.0021,0.1082,0.0035,-0.1443,0.3545,-0.1506,1.0,-0.1112,-0.0053,-0.17,-0.0174,-0.0624,0.0038,-0.0906,0.0235
mov_days_since,-0.124,0.0001,0.1276,-0.4538,0.0843,0.0137,0.1841,-0.1112,1.0,0.0626,0.0713,0.1201,0.0508,0.4692,0.1114,0.0615
mov_avg_rating,-0.0231,-0.0001,0.0268,-0.1729,0.1823,-0.0285,0.0309,-0.0053,0.0626,1.0,0.0403,0.7026,0.0587,0.1963,0.0478,0.0119


In [5]:
%%time
# perform PCA on the quiz data
pca_quiz = quiz_features[['mov_year', 'cust_days_since', 'mov_days_since',
                          'avg_rate_yr', 'avg_rate_cst_yr']].copy()
pc_quiz = scale_pca(pca_quiz, 'quiz_pc')
del pca_quiz

quiz_features = pd.concat([quiz_features, pc_quiz], axis=1)
quiz_features.to_parquet('quiz_pca.parquet')
del quiz_features, pc_quiz

Explained Variance Ratio: [0.45276017]
Wall time: 1.36 s


In [6]:
%%time
train_features = pd.read_parquet(base_path + 'train_features.parquet')
train_features.reset_index(drop=True, inplace=True)
train_rows = len(train_features)
train_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99072112 entries, 0 to 99072111
Data columns (total 16 columns):
 #   Column           Dtype  
---  ------           -----  
 0   mov_id           int16  
 1   cust_id          int32  
 2   day_rated        int16  
 3   mov_year         int16  
 4   avg_rate_pm_pd   float32
 5   avg_rate_pc_pd   float32
 6   cust_day_count   int16  
 7   cust_days_since  int16  
 8   mov_days_since   int16  
 9   mov_avg_rating   float32
 10  cust_avg_rating  float32
 11  mov_day_avg      float32
 12  cust_day_avg     float32
 13  avg_rate_yr      float32
 14  avg_rate_cst_yr  float32
 15  bline_approx     float32
dtypes: float32(9), int16(6), int32(1)
memory usage: 4.8 GB
Wall time: 3.62 s


In [7]:
%%time
# perform PCA on the training data
pca_train = train_features[['mov_year', 'cust_days_since', 'mov_days_since',
                            'avg_rate_yr', 'avg_rate_cst_yr']].copy()
pc_train = scale_pca(pca_train, 'train_pc')
del pca_train

train_features = pd.concat([train_features, pc_train], axis=1)
train_features.to_parquet('train_pca.parquet')
del train_features, pc_train

Explained Variance Ratio: [0.42491698]
Wall time: 1min 33s


## UMAP

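As with PCA, UMAP was attempted on the quiz set first in order to estimate both processing time and memory consumption before processing the training set. I terminated the quiz run after 3 minutes, before it had finished. Because the training set is roughly 68 times larger than the quiz set (computed below), it would have needed at least 3 minutes × 68.38 ≈ 205 minutes (about 3.4 hours) to process, even assuming UMAP's run time grows only linearly with the number of rows.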

In [9]:
# How many times more rows the training set has than the quiz set; this is the
# factor used to extrapolate the quiz-set run time to the training set.
size_ratio = round(train_rows / quiz_rows, 2)
print('The training set is approximately', size_ratio,
      'times larger than the quiz set.')

The training set is approximately 68.38 times larger than the quiz set.
