# Cold Start Regression Model
The purpose of this notebook is to learn the latent factors of an given ALS model using track and artist metadata. <br><br>
Inputs:
- The Item Factor matrix ($V$) from an ALS model with the shape $n\times 2$, where $n$ is the number of items included in the model. The 2nd column should contain a list with $r$ elements where $r$ is the rank of the model.
- A meta data matrix ($M$) with the shape $N\times d$ where $N=1,000,000$ which is the total number of songs and $d$ is the number of meta feautres we include in the model
- A hash ID matrix ($H$) with the shape $n\times2$ which allows us to link the given track id with it's hashed id from Spark

Output:
- A new $n\times 2$ matrix ($V'$) where most of the elements are the same as $V$ but some are instead held out and re-learned using a multi-dimensional output regression model. The 2nd column is a re-aggregated into a list of $r$ elements

### Read in Data

In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import dask.dataframe as dd
import dask.array as da
import time

from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

In [2]:
#set up directories
data_dir_v = Path('../../data/itemFactors_r200/') #change this based on which V matrix to take in
data_dir_u = Path('../../data/userFactors_r200/') #change this based on which U matrix to take in
data_dir_h = Path('../../data/items_hash/')
data_dir_val = Path('../../data/valUsers/')
data_dir = '../../data/'

In [3]:
#file names to take in
meta_file = 'items_meta_updated.parquet'

In [4]:
#read in meta data and check shape
meta = pd.read_parquet(os.path.join(data_dir, meta_file))
print("M shape:",meta.shape)
meta.head()

M shape: (1000000, 590)


Unnamed: 0,track_id,duration,artist_familiarity,artist_hotttnesss,year,rock,electronic,alternative rock,pop,united states,...,clarinet,jam,folklore,greek music,experimental pop,techstep,belgium,psychedelic pop,soft rock,space rock
0,TRMMMYQ128F932D901,252.05506,0.649822,0.394032,2003,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TRMMMKD128F425225D,156.55138,0.439604,0.356992,1995,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TRMMMRX128F93187D9,138.97098,0.643681,0.437504,2006,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TRMMMCH128F425532C,145.05751,0.448501,0.372349,2003,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,TRMMMWA128F426B589,514.29832,0.0,0.0,0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
#read in V and check shape
latentItems = pd.concat(
    pd.read_parquet(parquet_file)
    for parquet_file in data_dir_v.glob('*.parquet')
)

print("V shape:",latentItems.shape)
latentItems.head()

V shape: (320733, 2)


Unnamed: 0,id,features
0,-2147302072,"[-0.15491447, 0.085363604, -0.041555922, -0.00..."
1,-2140695272,"[-0.0003432132, 0.0060855947, 0.008764722, -0...."
2,-2140287472,"[0.046909656, 0.014843966, -0.021118047, 0.019..."
3,-2137637672,"[0.0077868393, 0.0074665216, 0.0006630516, -0...."
4,-2137226272,"[0.035141483, 0.04235267, -0.05442439, -0.0261..."


In [6]:
#read in H and check shape
hashed = pd.concat(
    pd.read_parquet(parquet_file)
    for parquet_file in data_dir_h.glob('*.parquet')
)

print("H shape:",hashed.shape)
hashed.head()

H shape: (385371, 2)


Unnamed: 0,track_id,item_hashId
0,TRPDNGZ128F4236B10,-1092796021
1,TRHEQEQ12903CDA12D,1105800579
2,TRKSXVV128E0788A61,-2089708821
3,TRKLMTV128F1470AD1,-1838828421
4,TRKYZYP128E0789089,504742979


### Clean Up

In [7]:
#join V with H
lat_hash = pd.merge(latentItems, hashed, how='inner', left_on='id', right_on='item_hashId')
lat_hash = lat_hash[['id','features','track_id']]

#join this new dataframe with the meta data
full = pd.merge(lat_hash, meta, how='inner', left_on='track_id', right_on='track_id')

full.head(2)

Unnamed: 0,id,features,track_id,duration,artist_familiarity,artist_hotttnesss,year,rock,electronic,alternative rock,...,clarinet,jam,folklore,greek music,experimental pop,techstep,belgium,psychedelic pop,soft rock,space rock
0,-2147302072,"[-0.15491447, 0.085363604, -0.041555922, -0.00...",TRIJIDA12903D01B00,191.32036,0.790638,0.445245,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-2140695272,"[-0.0003432132, 0.0060855947, 0.008764722, -0....",TRLVDGV128F92DE95D,260.67546,0.708442,0.452368,2008,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
#clean up this dataframe
full = full.drop(columns='track_id') #we dont need both hashed and unhashed ids

#get number of nulls and replace (nulls are missing in term table in DB)
print('Share of rows with nulls:',sum(full.rock.isna())/full.shape[0])

full = full.fillna(value=0) 
full.shape

Share of rows with nulls: 0.0019080102134664367


(320753, 591)

### Pre-processing

In [9]:
#split into learning and holdout
full_train = full.sample(frac=0.8, random_state=13)
full_test = full.drop(full_train.index)

full_train.shape, full_test.shape

#save the ids in order for when we need to rebuild V'
train_ids = full_train.id.to_numpy()
test_ids = full_test.id.to_numpy()

In [10]:
#split into Xs and Ys
Y_train = full_train.features
Y_train = pd.DataFrame(Y_train.to_list()) #ensure that each element in the list becomes it's own feature
X_train = full_train.drop(columns=['id','features'])
X_train = X_train.reset_index().drop(columns='index') #need to reset index to align with the Y data


Y_test = full_test.features
Y_test = pd.DataFrame(Y_test.to_list())
X_test = full_test.drop(columns=['id','features'])
X_test = X_test.reset_index().drop(columns='index')

### Modeling

In [11]:
#scale the feautres
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [12]:
#testing Knn
start = time.time()
model_knn = KNeighborsRegressor(n_neighbors=1)
model_knn.fit(X_train_sc, Y_train)
end = time.time()
print('time to fit minutes:',(end-start)/60)

time to fit minutes: 5.565398347377777


In [13]:
#testing ridge
regs = np.logspace(-3,2,6)

rmses = np.zeros(regs.size)
r2s = np.zeros(regs.size)

for i, r in enumerate(regs):

    model = Ridge(alpha=r)
    model.fit(X_train_sc, Y_train)

    rmses[i] = mean_squared_error(Y_test, model.predict(X_test_sc))
    r2s[i] = model.score(X_test_sc, Y_test)
    
rmses, r2s

(array([0.00093524, 0.00093524, 0.00093524, 0.00093524, 0.00093524,
        0.00093524]),
 array([0.00688166, 0.00688166, 0.00688166, 0.00688167, 0.00688172,
        0.00688222]))

In [14]:
#testing lasso
rmses_l = np.ones(regs.size)
r2s_l = np.ones(regs.size)

for i, r in enumerate(regs):

    model = Lasso(alpha=r, max_iter=2000)
    model.fit(X_train_sc, Y_train)

    rmses_l[i] = mean_squared_error(Y_test, model.predict(X_test_sc))
    r2s_l[i] = model.score(X_test_sc, Y_test)
    
rmses_l, r2s_l

'rmses_l = np.ones(regs.size)\nr2s_l = np.ones(regs.size)\n\nfor i, r in enumerate(regs):\n\n    model = Lasso(alpha=r, max_iter=2000)\n    model.fit(X_train_sc, Y_train)\n\n    rmses_l[i] = mean_squared_error(Y_test, model.predict(X_test_sc))\n    r2s_l[i] = model.score(X_test_sc, Y_test)\n    \nrmses_l, r2s_l'

### Creating Updated Latent Factor Matrix

In [15]:
#using kNN -- ridge performed poorly on final recomendation output
start = time.time()
preds = model_knn.predict(X_test_sc)
end = time.time()
print("minutes to run:",(end-start)/60)

minutes to run: 215.50021171967188


In [16]:
#read the hashed ids to the training data
Y_train['id'] = train_ids

#rebuild dataframe with predictions and hashed id
preds_df = pd.DataFrame(data=np.hstack((preds, test_ids.reshape(-1,1))),
            columns=Y_train.columns)
preds_df.id = preds_df.id.astype(int)

In [17]:
# Drop duplicates
preds_df = preds_df.drop_duplicates("id")
Y_train = Y_train.drop_duplicates("id")

In [18]:
#convert separate columns into single column of lists
Y_train_t = Y_train.melt(id_vars='id').groupby('id')['value'].apply(list).reset_index()
Y_train_t.head()

Unnamed: 0,id,value
0,-2147438027,"[-8.256235014414415e-05, -0.000736088142730295..."
1,-2147427751,"[0.03721501678228378, 0.05087555944919586, 0.0..."
2,-2147418984,"[0.0034650685265660286, -0.0009087168728001416..."
3,-2147401029,"[-0.0026286737993359566, -0.003067507874220609..."
4,-2147354705,"[-0.040815819054841995, -0.03625775873661041, ..."


In [19]:
#prepeat process for predictions
preds_df_t = preds_df.melt(id_vars='id').groupby('id')['value'].apply(list).reset_index()
preds_df_t.head()

Unnamed: 0,id,value
0,-2147410333,"[-0.005026502069085836, 3.582580029615201e-05,..."
1,-2147356230,"[-0.07830337435007095, -0.05532076582312584, -..."
2,-2147310285,"[0.02426794171333313, -0.000406861916417256, -..."
3,-2147303676,"[-0.019352559000253677, -0.007124537602066994,..."
4,-2147302072,"[0.0008778794435784221, -0.016984306275844574,..."


In [20]:
#combine truth from training with held out predictions
latentItems_new = Y_train_t.append(preds_df_t)
latentItems_new = latentItems_new.rename(columns={"value": "features"})
latentItems_new = latentItems_new.reset_index(drop=True)

latentItems_new.head()

Unnamed: 0,id,features
0,-2147438027,"[-8.256235014414415e-05, -0.000736088142730295..."
1,-2147427751,"[0.03721501678228378, 0.05087555944919586, 0.0..."
2,-2147418984,"[0.0034650685265660286, -0.0009087168728001416..."
3,-2147401029,"[-0.0026286737993359566, -0.003067507874220609..."
4,-2147354705,"[-0.040815819054841995, -0.03625775873661041, ..."


### Write Out to Parquet

In [21]:
#parquet file is too big. need to write out in chunks
div = np.floor(latentItems_new.shape[0]/10).astype(int)

#update name before running
for i in range(10):
    new_path = '../../data/itemFactors_r200_knnupdate{}_v2.parquet'.format(i)
    
    if i < 9:
        latentItems_new.iloc[div*i:div*(i+1)].to_parquet(new_path)
        
    else:
        latentItems_new.iloc[div*i:].to_parquet(new_path)