# Final Capstone: Revisiting the Netflix Prize

## Notebook 4: Dimensionality Reduction (Unsupervised Technique)

There are three specific objectives here:

1. Combine highly correlated variables
2. Reduce overall volume; minimize information loss
3. Incorporate an unsupervised learning technique into the capstone

In [1]:
import numpy as np
import pandas as pd
import umap
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Notebook-wide display settings: show up to 50 columns and render floats
# with four decimal places.
pd.options.display.max_columns = 50
pd.options.display.float_format = '{:.4f}'.format

## PCA

In [2]:
def scale_pca(data, col_name):
    """Standardize *data* and reduce it to its first principal component.

    The columns are standardized with ``StandardScaler`` and projected onto a
    single whitened principal component. The explained variance ratio of that
    component is printed as a side effect.

    Args:
        data: Numeric 2-D input (e.g. a DataFrame of correlated columns)
            accepted by ``StandardScaler.fit_transform``.
        col_name: Name given to the single column of the result.

    Returns:
        A one-column ``pandas.DataFrame`` named ``col_name``. It is built from
        a plain array, so it carries a fresh 0..n-1 index rather than the
        index of ``data``.
    """
    scaled = StandardScaler().fit_transform(data)

    # Fit once and reuse the fitted estimator for both the projection and the
    # variance report; fitting a second identical PCA only doubled the run
    # time on large inputs.
    pca = PCA(n_components=1, whiten=True, random_state=47)
    component = pca.fit_transform(scaled)
    print('Explained Variance Ratio:', pca.explained_variance_ratio_)

    return pd.DataFrame(component, columns=[col_name])

In [3]:
%%time
# Load the quiz features. The index is replaced with a default 0..n-1 range so
# that the column-wise concat with the PCA output lines up row-for-row.
base_path = 'C:/Users/jnpol/Documents/DS/Data Science/UL/'
quiz_features = pd.read_parquet(base_path + 'quiz_features.parquet')
quiz_rows = quiz_features.shape[0]  # kept for the size comparison further down
quiz_features.reset_index(inplace=True, drop=True)
quiz_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408395 entries, 0 to 1408394
Data columns (total 22 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   mov_id           1408395 non-null  int16  
 1   cust_id          1408395 non-null  int32  
 2   day_rated        1408395 non-null  int16  
 3   mov_year         1408395 non-null  int16  
 4   mov_count        1408395 non-null  int32  
 5   rated_bycust     1408395 non-null  int16  
 6   rate_each_day    1408395 non-null  int32  
 7   mov_day_count    1408395 non-null  uint16 
 8   cust_day_count   1408395 non-null  int16  
 9   cust_days_since  1408395 non-null  int16  
 10  mov_days_since   1408395 non-null  int16  
 11  mov_avg_rating   1408395 non-null  float32
 12  cust_avg_rating  1408395 non-null  float32
 13  mov_day_avg_rl   1408395 non-null  float32
 14  mov_day_avg      1408395 non-null  float32
 15  cust_avg_offset  1408395 non-null  float32
 16  cust_day_avg     1

In [4]:
%%time
# Customer side: collapse the two customer-offset features into one component.
# NOTE(review): scale_pca fits a new scaler and PCA on every call, so the quiz
# components here are fitted independently of the training-set components
# computed later in this notebook -- confirm that is intended.
cpca_quiz = quiz_features[['cust_avg_offset', 'cust_glob_diff']].copy()
cust_pc_quiz = scale_pca(cpca_quiz, 'cust_pc')
del cpca_quiz

# Movie side: copy the three movie-rating features before they are dropped.
mpca_quiz = quiz_features[['mov_day_avg_rl', 'mov_avg_rating',
                           'mov_glob_diff']].copy()

# Remove the source columns that the principal components replace.
# `columns=` is used instead of a positional axis (`..., 1, ...`): the
# positional form is deprecated since pandas 1.3 and raises TypeError in 2.0+.
quiz_features.drop(columns=['cust_avg_offset', 'cust_glob_diff',
                            'mov_day_avg_rl', 'mov_avg_rating',
                            'mov_glob_diff'], inplace=True)

mov_pc_quiz = scale_pca(mpca_quiz, 'mov_pc')
del mpca_quiz

# Append both components as new columns; rows align because quiz_features and
# the scale_pca outputs all carry a default 0..n-1 index.
quiz_features = pd.concat([quiz_features, cust_pc_quiz, mov_pc_quiz], axis=1)
del cust_pc_quiz, mov_pc_quiz

# Written relative to the current working directory (not base_path).
quiz_features.to_parquet('quiz_pca.parquet')
del quiz_features

Explained Variance Ratio: [0.97932469]
Explained Variance Ratio: [0.86940819]
Wall time: 1.73 s


In [5]:
%%time
# Load the training features from the same directory as the quiz set and give
# them a default 0..n-1 index, modified in place, so later concats align by row.
train_features = pd.read_parquet(base_path + 'train_features.parquet')
train_rows = train_features.shape[0]  # kept for the size comparison further down
train_features.reset_index(inplace=True, drop=True)
train_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96304740 entries, 0 to 96304739
Data columns (total 22 columns):
 #   Column           Dtype  
---  ------           -----  
 0   mov_id           int16  
 1   cust_id          int32  
 2   day_rated        int16  
 3   mov_year         int16  
 4   mov_count        int32  
 5   rated_bycust     int16  
 6   rate_each_day    int32  
 7   mov_day_count    uint16 
 8   cust_day_count   int16  
 9   cust_days_since  int16  
 10  mov_days_since   int16  
 11  mov_avg_rating   float32
 12  cust_avg_rating  float32
 13  mov_day_avg_rl   float32
 14  mov_day_avg      float32
 15  cust_avg_offset  float32
 16  cust_day_avg     float32
 17  avg_rate_mov_yr  float32
 18  avg_rate_cst_yr  float32
 19  global_mean      float32
 20  cust_glob_diff   float32
 21  mov_glob_diff    float32
dtypes: float32(11), int16(7), int32(3), uint16(1)
memory usage: 6.5 GB
Wall time: 4.55 s


In [6]:
%%time
# Customer component for the training set, built from the same two columns as
# the quiz set. The column subset is passed straight to scale_pca, so no
# temporary frame is left behind afterwards.
cust_pc_train = scale_pca(
    data=train_features[['cust_avg_offset', 'cust_glob_diff']].copy(),
    col_name='cust_pc',
)

Explained Variance Ratio: [0.94256899]
Wall time: 33.8 s


In [7]:
%%time
# Four candidate inputs for the movie component: all three movie-rating
# features, then each pair obtained by leaving one feature out.
mpca_train1, mpca_train2, mpca_train3, mpca_train4 = (
    train_features[subset].copy()
    for subset in (
        ['mov_day_avg_rl', 'mov_avg_rating', 'mov_glob_diff'],
        ['mov_avg_rating', 'mov_glob_diff'],
        ['mov_day_avg_rl', 'mov_glob_diff'],
        ['mov_day_avg_rl', 'mov_avg_rating'],
    )
)

Wall time: 1.93 s


In [8]:
%%time
# Movie component from all three movie-rating features; the input copy is
# released as soon as the component exists.
mov_pc_train1 = scale_pca(data=mpca_train1, col_name='mov_pc_all')
del mpca_train1

Explained Variance Ratio: [0.86009601]
Wall time: 46.4 s


In [9]:
%%time
# Movie component without mov_day_avg_rl (inputs: mov_avg_rating and
# mov_glob_diff); the input copy is released afterwards.
mov_pc_train2 = scale_pca(data=mpca_train2, col_name='mov_pc_no_rl')
del mpca_train2

Explained Variance Ratio: [0.95526213]
Wall time: 33.6 s


In [10]:
%%time
# Movie component without mov_avg_rating (inputs: mov_day_avg_rl and
# mov_glob_diff); the input copy is released afterwards.
mov_pc_train3 = scale_pca(data=mpca_train3, col_name='mov_pc_no_mar')
del mpca_train3

Explained Variance Ratio: [0.84457002]
Wall time: 35.5 s


In [11]:
%%time
# Movie component without mov_glob_diff (inputs: mov_day_avg_rl and
# mov_avg_rating); the input copy is released afterwards.
mov_pc_train4 = scale_pca(data=mpca_train4, col_name='mov_pc_no_mgd')
del mpca_train4

Explained Variance Ratio: [0.84462326]
Wall time: 33.8 s


In [12]:
%%time
# Remove the source columns that the principal components replace.
# `columns=` is used instead of a positional axis (`..., 1, ...`): the
# positional form is deprecated since pandas 1.3 and raises TypeError in 2.0+.
train_features.drop(columns=['cust_avg_offset', 'cust_glob_diff',
                             'mov_day_avg_rl', 'mov_avg_rating',
                             'mov_glob_diff'], inplace=True)

# The four candidate movie components are saved side by side in their own
# file rather than being attached to train_features.
mov_pcs_train = pd.concat([mov_pc_train1, mov_pc_train2,
                           mov_pc_train3, mov_pc_train4], axis=1)

mov_pcs_train.to_parquet('mov_pcs_train.parquet')
del mov_pcs_train

# Only the customer component is appended to the training features here; rows
# align because both frames carry a default 0..n-1 index.
train_features = pd.concat([train_features, cust_pc_train], axis=1)
del cust_pc_train

# Written relative to the current working directory (not base_path).
train_features.to_parquet('train_pca.parquet')

Wall time: 39.8 s


## UMAP

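As with PCA, UMAP was attempted on the quiz set first, in order to estimate both processing time and memory consumption prior to processing the training set. I terminated the quiz-set run after 3 minutes, before it had finished. Because the training set is roughly 68 times larger (see below), it would have taken a minimum of about 3.5 hours to process (3 minutes × 68.38 ≈ 205 minutes), even if run time scaled only linearly with the number of rows.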

In [9]:
# Report how many times more rows the training set has than the quiz set,
# rounded to two decimals.
print(
    'The training set is approximately',
    round(train_rows / quiz_rows, 2),
    'times larger than the quiz set.',
)

The training set is approximately 68.38 times larger than the quiz set.
