# 01 - Recommender System

* "Given the values I _do_ have, predict my missing values"
* This is a kind of imputation / recommender system
* Straightforward matrix decomposition methods (e.g. a truncated SVD) can help here

In [302]:
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [2]:
from jlab import load_test_data

In [110]:
# Training tracks, the test tracks, and the reference values for the test tracks.
X_train = pd.read_csv('MLchallenge2_training.csv')
X_test = load_test_data('test_in.csv')
X_true = pd.read_csv('test_prediction.csv', header=None,
                     names=['x', 'y', 'px', 'py', 'pz'])

# Stack the test rows on top of the training rows (so after the index reset the
# test tracks occupy positions 0..len(X_test)-1) and replace every missing
# value with 0.0.
X = pd.concat([X_test, X_train], axis=0).reset_index(drop=True).fillna(0.0)

In [111]:
X.head()

Unnamed: 0,x,y,z,px,py,pz,x1,y1,z1,px1,...,z23,px23,py23,pz23,x24,y24,z24,px24,py24,pz24
0,0.877,1.322,65.0,-0.244,-0.053,2.414,-10.669,0.33,176.944,-0.254,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.786,-2.483,65.0,0.103,0.432,2.593,7.366,15.502,176.944,0.206,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-13.134,-26.531,65.0,0.064,-0.021,0.953,-7.586,-30.687,176.944,0.027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,18.454,2.805,65.0,-0.019,0.069,1.833,18.043,6.797,176.944,0.013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,15.552,-19.196,65.0,-0.01,-0.011,2.366,15.068,-19.75,176.944,-0.014,...,341.28,-0.014,-0.002,2.351,0.0,0.0,343.405,0.0,0.0,0.0


In [112]:
X_true.head()

Unnamed: 0,x,y,px,py,pz
0,-23.123945,3.142886,-0.235592,0.091612,2.413377
1,19.633486,32.319292,0.314376,0.316425,2.592952
2,-8.308506,-39.299613,-0.020097,-0.051232,0.948906
3,19.918838,10.664617,0.038102,0.04774,1.864014
4,13.649239,-20.616935,-0.015548,0.001471,2.323953


In [97]:
# Truncated SVD of the combined track matrix, keeping k=30 singular triplets.
U, sigma, Vt = svds(X, k=30)
# svds returns the singular values as a 1-D array; expand them into a diagonal
# matrix so the reconstruction can be written as the product U . sigma . Vt.
sigma = np.diag(sigma)

In [98]:
# Low-rank reconstruction of X from the truncated factors, re-labelled with the
# original rows and columns so it can be compared against X directly.
X_pred = pd.DataFrame(U.dot(sigma).dot(Vt), index=X.index, columns=X.columns)
X_pred.head()

Unnamed: 0,x,y,z,px,py,pz,x1,y1,z1,px1,...,z23,px23,py23,pz23,x24,y24,z24,px24,py24,pz24
0,1.139458,1.841577,65.000348,-0.132866,-0.152526,2.278765,-10.308074,0.077691,176.945038,-0.140774,...,-0.001327,-0.020493,-0.114052,0.212795,-0.317946,0.195674,-0.005499,-0.020159,-0.113749,0.212069
1,1.538296,-3.277263,65.000407,-0.142155,0.290338,2.350373,6.721598,15.147325,176.945195,-0.041034,...,-0.00596,-0.140109,0.100994,0.363288,0.135978,0.42487,-0.004066,-0.140975,0.100855,0.362115
2,-13.669357,-25.403695,64.999143,0.235995,0.027828,1.139225,-7.365034,-31.750689,176.941817,0.189261,...,-0.004733,0.290119,0.160645,-0.763249,0.596819,3.866317,0.006622,0.293293,0.166603,-0.759302
3,18.292252,2.048436,64.999907,-0.122082,0.101017,1.869429,18.183965,7.240228,176.943918,-0.092881,...,-0.006987,-0.185933,0.090368,-0.093341,1.670177,0.903375,0.008946,-0.176907,0.092282,-0.100364
4,15.6579,-19.32231,65.000323,-0.012487,0.010699,2.273879,14.897386,-19.569022,176.945024,-0.021853,...,341.263763,-0.022465,0.067983,2.253718,6.67395,-10.27062,343.40009,-0.025013,0.081539,2.256946


## Hooray, we did it

* Now we need to figure out how well it actually did

In [303]:
def get_test_detector_plane(row):
    """Return the detector-plane number that holds the first NaN in ``row``.

    Each plane contributes 6 consecutive values, so the plane number is the
    position of the first NaN floor-divided by 6. An IndexError is raised when
    the row contains no NaN at all.
    """
    nan_positions = np.flatnonzero(np.isnan(row.values))
    first_nan = nan_positions[0]
    return int(first_nan // 6)

In [304]:
def get_vals_at_plane(row, plane):
    """Return the x, y, px, py, pz values of ``row`` at detector plane ``plane``.

    Parameters
    ----------
    row : pandas.Series
        One track, indexed by column name.
    plane : int or float
        Detector-plane number; it is truncated to an integer.

    Returns
    -------
    numpy.ndarray
        The five values in the order x, y, px, py, pz.

    Raises
    ------
    KeyError
        If the columns for the requested plane are not present in ``row``.
    """
    kinematics = ['x', 'y', 'px', 'py', 'pz']
    plane_number = int(plane)
    cols = [name + str(plane_number) for name in kinematics]
    # The first plane's columns are written without a numeric suffix
    # ("x", "y", ...), whereas later planes are "x1", "y1", ... Fall back to
    # the bare names for plane 0 when "x0"-style columns do not exist.
    if plane_number == 0 and not set(cols).issubset(row.index):
        cols = kinematics
    return row[cols].values

In [305]:
def get_vals_at_eval_plane(X_test, X_pred):
    """Pull the predicted kinematics at each test track's evaluation plane.

    The evaluation plane of every row of ``X_test`` is computed with
    ``get_test_detector_plane`` and attached to a copy of ``X_pred`` (aligned
    on the index). For the rows whose labels appear in ``X_test``, the values
    at that plane are then looked up with ``get_vals_at_plane``. The result of
    the row-wise ``apply`` is returned unchanged.
    """
    def _values_for_track(track):
        return get_vals_at_plane(track, track['eval_plane'])

    annotated = X_pred.copy()
    annotated['eval_plane'] = X_test.apply(get_test_detector_plane, axis=1)
    test_rows = annotated.loc[X_test.index.values]
    return test_rows.apply(_values_for_track, axis=1)

In [306]:
# Evaluation plane of every test track: the plane holding the first NaN in its row.
eval_planes = X_test.apply(get_test_detector_plane, axis=1)

In [309]:
# Spot check: x, y, px, py, pz of the test track labelled 15 at detector plane 7.
get_vals_at_plane(X_test.loc[15], 7)

array([24.406, 30.898,  0.139,  0.107,  2.34 ])

## Make a recommender class, a la sklearn

* Should have fit, predict methods

In [311]:
import logging
from jlab import COLS
from sklearn.preprocessing import StandardScaler

class DetectorRecommender(object):
    """Impute the kinematics of each track's first unobserved detector plane.

    The test tracks are stacked on top of the stored training tracks, missing
    entries are filled with column means, every column is standardised, and a
    rank-``k`` truncated SVD reconstruction of the combined matrix supplies the
    predicted values.
    """

    # Number of columns recorded per detector plane: x, y, z, px, py, pz.
    VALUES_PER_PLANE = 6

    def __init__(self, k=20):
        """Create a recommender.

        Parameters
        ----------
        k : int, default 20
            Number of singular values/vectors kept by the truncated SVD.
        """
        self.logger = logging.getLogger(__name__)
        self.k = k
        # NOTE(review): this attribute is not read anywhere in the class.
        self.planes = 27
        # The per-plane quantities that are predicted (z is not among them).
        self.kinematics = ["x", "y", "px", "py", "pz"]
        self.cols = COLS
        self.X_train = pd.DataFrame(columns=self.cols)
        self.X_test = pd.DataFrame(columns=self.cols)
        self.scaler = StandardScaler()

    def fit(self, df):
        """Store a deep copy of the training tracks.

        No decomposition happens here; the SVD is computed inside ``predict``
        on the combined training and test rows.
        """
        self.X_train = df.copy(deep=True)

    def predict(self, df):
        """Predict x, y, px, py, pz at the evaluation plane of every test track.

        Parameters
        ----------
        df : pandas.DataFrame
            Test tracks, with NaN marking the values to be predicted. Each row
            must contain at least one NaN.

        Returns
        -------
        pandas.DataFrame
            One row per test track, in the order of ``df``, with the columns
            listed in ``self.kinematics`` and a fresh 0..N-1 index.
        """
        # Make a copy, index it from 0 to N
        self.logger.debug("Making a copy")
        self.X_test = df.copy(deep=True).reset_index(drop=True)

        # For each track, figure out which detector plane we'll evaluate.
        # This has to happen before the NaNs are filled in below.
        self.logger.debug("Determining evaluation planes")
        eval_planes = self.X_test.apply(self.get_eval_detector_plane, axis=1)

        # Combine with the training set and shuffle it. The test rows come
        # first, so after the index reset they carry the labels 0..N-1; the
        # shuffle reorders rows but keeps those labels attached.
        self.logger.debug("Combining train and test sets for SVD")
        X = (pd.concat([self.X_test, self.X_train], axis=0)
             .reset_index(drop=True)
             .sample(replace=False, frac=1.0))

        # Fill with the mean values of each column
        self.logger.debug("Filling with mean values")
        X = X.fillna(X.mean())

        # Normalize the values
        self.logger.debug("Applying standardscaler")
        X_norm_values = self.scaler.fit_transform(X)
        X_norm = pd.DataFrame(X_norm_values, columns=X.columns, index=X.index)

        # Truncated singular value decomposition and low-rank reconstruction
        self.logger.debug("Making predictions")
        X_pred_norm = self.fit_predict_svds(X_norm)

        # Extract our test tracks by label and restore their original order
        X_pred_norm = X_pred_norm.loc[self.X_test.index, :].sort_index()

        # Un-normalize them
        X_pred_values = self.scaler.inverse_transform(X_pred_norm)
        X_pred = pd.DataFrame(X_pred_values, columns=X_pred_norm.columns,
                              index=X_pred_norm.index)
        self.logger.debug("De-normalized. Extracting pred values.")

        # Extract just the non-z kinematic values for the eval planes
        det_eval_values = self.extract_values_at_eval_planes(X_pred, eval_planes)

        return det_eval_values

    def fit_predict_svds(self, X):
        """Return the rank-``self.k`` SVD reconstruction of ``X``.

        The result is a DataFrame with the same index and columns as ``X``.
        """
        U, sigma, Vt = svds(X, k=self.k)
        # svds yields the singular values as a vector; build the diagonal matrix.
        sigma = np.diag(sigma)
        X_pred = pd.DataFrame(np.dot(np.dot(U, sigma), Vt),
                              columns=X.columns, index=X.index)
        return X_pred

    def extract_values_at_eval_planes(self, pred, planes):
        """Pick, for every row of ``pred``, the kinematics at its evaluation plane.

        Parameters
        ----------
        pred : pandas.DataFrame
            Predicted values for every column of every track.
        planes : pandas.Series
            Evaluation-plane number per track, aligned with ``pred`` by index.

        Returns
        -------
        pandas.DataFrame
            Columns ``self.kinematics``; rows in the order of ``pred`` with a
            default 0..N-1 index.
        """
        X = pred.copy(deep=True)
        X['eval_plane'] = planes
        retvals = X.apply(lambda x: self.get_vals_at_plane(x, x['eval_plane']), axis=1)
        retvals_df = pd.DataFrame(retvals.values.tolist(), columns=self.kinematics)
        return retvals_df

    def get_vals_at_plane(self, row, plane):
        """Return the ``self.kinematics`` values of ``row`` at plane ``plane``.

        ``plane`` may be a float (it is truncated to an integer). A KeyError is
        raised when the columns of the requested plane are missing.
        """
        plane_number = int(plane)
        cols = [name + str(plane_number) for name in self.kinematics]
        # The first plane's columns are written without a numeric suffix
        # ("x", "y", ...), whereas later planes are "x1", "y1", ... Fall back
        # to the bare names for plane 0 when "x0"-style columns do not exist.
        if plane_number == 0 and not set(cols).issubset(row.index):
            cols = list(self.kinematics)
        return row[cols].values

    def get_eval_detector_plane(self, row):
        """Return the plane number that holds the first NaN in ``row``.

        Raises
        ------
        IndexError
            If ``row`` contains no NaN, i.e. there is nothing to predict.
        """
        nan_positions = np.where(np.isnan(row.values))[0]
        if nan_positions.size == 0:
            raise IndexError(
                "row contains no NaN values; cannot determine the evaluation plane")
        # Each plane occupies VALUES_PER_PLANE consecutive columns.
        return int(nan_positions[0] // self.VALUES_PER_PLANE)

In [284]:
# Show DEBUG-and-above log records, each prefixed with timestamp, logger name and level.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(name)-12s - %(levelname)-8s - %(message)s')

In [285]:
# Default configuration: the truncated SVD keeps k=20 singular values.
predictor = DetectorRecommender()

In [286]:
# fit() only stores a copy of the training tracks; the SVD itself runs inside predict().
predictor.fit(X_train)

In [287]:
# One row of predicted x, y, px, py, pz per test track, taken at that track's evaluation plane.
X_pred = predictor.predict(X_test)

2019-10-29 07:24:55,659 __main__     DEBUG    Making a copy
2019-10-29 07:24:55,667 __main__     DEBUG    Determining evaluation planes
2019-10-29 07:24:55,770 __main__     DEBUG    Combining train and test sets for SVD
2019-10-29 07:24:56,855 __main__     DEBUG    Filling with mean values
2019-10-29 07:24:57,086 __main__     DEBUG    Applying standardscaler
2019-10-29 07:24:57,899 __main__     DEBUG    Making predictions
2019-10-29 07:24:59,484 __main__     DEBUG    De-normalized. Extracting pred values.


In [288]:
X_pred.head()

Unnamed: 0,x,y,px,py,pz
0,-10.40273,0.495083,-0.101857,0.040509,2.136227
1,9.098936,15.436974,0.151903,0.137151,2.22586
2,-4.508659,-18.078571,0.009943,0.003992,1.521642
3,10.06768,4.90541,0.003623,0.0205,1.921455
4,12.463507,-17.100075,-0.009114,0.004804,2.296975


In [290]:
# Mean squared error of the predictions against the reference values
# (rows are compared by position; the result is a single number across all five columns).
mean_squared_error(X_true, X_pred)

37.71380513026001

## Tune the one hyperparameter we have

In [294]:
# Sweep the number of retained singular values over k = 5..14 and print the MSE
# for each. Every iteration builds a fresh recommender and re-runs the full SVD
# on the combined train + test matrix. Note that `predictor` and `X_pred` are
# rebound on every pass, so afterwards they hold the last k that completed.
for k in range(5,15):
    predictor = DetectorRecommender(k=k)
    predictor.fit(X_train)
    X_pred = predictor.predict(X_test)
    print(k, mean_squared_error(X_true, X_pred))

2019-10-29 08:40:56,810 __main__     DEBUG    Making a copy
2019-10-29 08:40:56,824 __main__     DEBUG    Determining evaluation planes
2019-10-29 08:40:56,949 __main__     DEBUG    Combining train and test sets for SVD
2019-10-29 08:40:58,033 __main__     DEBUG    Filling with mean values
2019-10-29 08:40:58,278 __main__     DEBUG    Applying standardscaler
2019-10-29 08:40:59,122 __main__     DEBUG    Making predictions
2019-10-29 08:40:59,872 __main__     DEBUG    De-normalized. Extracting pred values.


KeyboardInterrupt: 

* Optimal performance at k=7 (MSE ≈ 20.4, versus ≈ 37.7 with the default k=20)

In [300]:
# Re-run with k=7, the value reported above as giving the best MSE,
# and print the resulting error.
predictor = DetectorRecommender(k=7)
predictor.fit(X_train)
X_pred = predictor.predict(X_test)
print(mean_squared_error(X_true, X_pred))

2019-10-29 08:47:32,386 __main__     DEBUG    Making a copy
2019-10-29 08:47:32,391 __main__     DEBUG    Determining evaluation planes
2019-10-29 08:47:32,498 __main__     DEBUG    Combining train and test sets for SVD
2019-10-29 08:47:33,901 __main__     DEBUG    Filling with mean values
2019-10-29 08:47:34,069 __main__     DEBUG    Applying standardscaler
2019-10-29 08:47:35,066 __main__     DEBUG    Making predictions
2019-10-29 08:47:35,796 __main__     DEBUG    De-normalized. Extracting pred values.


20.43716499928241


## Surprise!

Try out this well-supported recommender package

In [314]:
!pip install scikit-surprise

Collecting scikit-surprise
  Using cached https://files.pythonhosted.org/packages/f5/da/b5700d96495fb4f092be497f02492768a3d96a3f4fa2ae7dea46d4081cfa/scikit-surprise-1.1.0.tar.gz
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Stored in directory: /Users/dannowitz/Library/Caches/pip/wheels/cc/fa/8c/16c93fccce688ae1bde7d979ff102f7bee980d9cfeb8641bcf
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.0


In [298]:
import surprise

ModuleNotFoundError: No module named 'surprise'

In [None]:
# Long format: one (variable, value) row per cell of X. No id_vars are given,
# so the rows are not tagged with the track they came from.
X.melt()

In [169]:
# Name the index so that reset_index() exposes it as a "track_id" column.
# This modifies X in place.
X.index.name = "track_id"

In [170]:
# Preview the long format on the first five tracks: one row per (track_id, variable) pair.
X.head().reset_index().melt(id_vars=['track_id'])

Unnamed: 0,track_id,variable,value
0,0,x,0.877
1,1,x,0.786
2,2,x,-13.134
3,3,x,18.454
4,4,x,15.552
5,0,y,1.322
6,1,y,-2.483
7,2,y,-26.531
8,3,y,2.805
9,4,y,-19.196


In [164]:
# Shuffle all rows without replacement (frac=1.0 keeps every row). No
# random_state is passed, so the order differs from run to run.
X.sample(replace=False, frac=1.0)

Unnamed: 0,x,y,z,px,py,pz,x1,y1,z1,px1,...,z23,px23,py23,pz23,x24,y24,z24,px24,py24,pz24
52641,-17.403500,-21.669100,65.0,0.018850,-0.070221,2.808130,-17.098200,-24.305200,176.944,-0.004557,...,341.280,-0.018001,-0.067943,2.800710,-18.208300,-28.138900,343.405,-0.017034,-0.068414,2.800680
153188,-2.030520,7.950830,65.0,0.140483,0.195651,2.262990,6.209280,16.391700,176.944,0.191073,...,341.280,0.224867,0.054636,2.249030,22.275300,23.367100,343.405,0.224612,0.054472,2.249040
26736,3.743760,-20.476100,65.0,-0.016751,-0.022347,0.948275,0.072519,-22.697500,176.944,-0.041323,...,341.280,-0.020188,0.031900,0.936075,-6.918930,-20.234000,343.405,-0.018757,0.031732,0.936098
113795,-8.214490,12.470600,65.0,-0.127324,-0.017557,2.029990,-15.177600,12.624900,176.944,-0.126454,...,341.280,-0.105276,0.072318,2.020760,-24.585400,17.047300,343.405,-0.105614,0.071741,2.020650
72875,5.645860,-16.347100,65.0,0.062687,-0.018573,0.946145,11.775000,-20.705000,176.944,0.038080,...,341.280,-0.015396,-0.050101,0.934989,12.384100,-31.306800,343.405,-0.014956,-0.049401,0.935021
86023,20.605700,16.160900,65.0,-0.026888,-0.161448,3.392590,19.447100,10.757900,176.944,-0.046720,...,341.280,-0.093216,-0.140715,3.382280,16.019200,2.902030,343.405,-0.093604,-0.139912,3.382290
8168,0.933000,3.174000,65.0,-0.361000,0.165000,2.568000,-13.749000,11.939000,176.944,-0.305000,...,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.000000,0.000000
106263,1.880180,-0.830898,65.0,0.044663,0.153992,2.434700,4.864360,5.857340,176.944,0.084237,...,341.280,0.121784,0.095721,2.427940,12.198400,13.665300,343.405,0.121635,0.095944,2.427920
9991,3.837000,4.565000,65.0,0.101000,0.220000,2.121000,10.987000,14.983000,176.944,0.160000,...,341.280,0.209000,0.084000,2.081000,0.000000,0.000000,343.405,0.000000,0.000000,0.000000
68866,-0.303658,-3.715340,65.0,-0.108675,0.007009,1.478080,-8.248970,-1.828120,176.944,-0.098208,...,341.280,-0.052205,0.090764,1.471280,-16.913500,6.678260,343.405,-0.051851,0.090275,1.471310
