In [4]:
import numpy as np
import pandas as pd

from collections import defaultdict as defd

import math,time
from random import choice
from scipy.spatial.transform import Rotation
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
from scipy import signal
from scipy.ndimage import uniform_filter1d
from scipy.interpolate import interp1d

In [6]:
from sklearn.linear_model import LinearRegression as OLS
from sklearn.decomposition import PCA

# XGBoost Python Wrapper for SKLearn: 
# https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
#import xgboost as xgb 
#from xgboost import XGBRegressor  as XGR
#from xgboost import XGBClassifier as XGC 

In [7]:
#############################################################################
#############################################################################

In [8]:
# Interpolation Model: 
class InterpModel:
    def __init__(self):  
        pass
    
    def fit(self,x_train,y_train): 
        x = np.array(x_train)
        y = np.array(y_train)

        if x.shape == (len(x),1): x = x.reshape([len(x),]) 
        if y.shape == (len(y),1): y = y.reshape([len(y),]) 
              
        self.lin_predict = interp1d(x,y)
        #self.cub_predict = interp1d(x,y,kind='cubic') 
        self.xmin = x.min()
        self.xmax = x.max() 
        
    def predict(self,x_test,kind='linear'):
        x = np.array(x_test) 
        if x.shape == (len(x),1): x = x.reshape([len(x),]) 
        x2 = np.clip(x,self.xmin,self.xmax)
        if kind=='linear': preds = self.lin_predict(x2)
        #if kind=='cubic':  preds = self.cub_predict(x2) 
        return preds 


class QuantileModel:
    def __init__(self):
        self.v2q_model = InterpModel()
        self.q2v_model = InterpModel()
    
    def fit(self,arr): 
        ys = np.sort(np.array(arr))
        xs = np.linspace(0.0,1.0,len(ys))
        self.v2q_model.fit(ys,xs)
        self.q2v_model.fit(xs,ys) 
        
    def v2q(self,value):
        return self.v2q_model.predict(value)
    
    def q2v(self,quant):
        return self.q2v_model.predict(quant) 
    
    def predict(self,value):
        return self.v2q(value)
        
        

In [9]:
#############################################################################
#############################################################################

In [10]:
# Simulation Variables:
N_DIMS  = 6
N_USERS = 1000
N_ITEMS = 500 
MIN_REV = 10
MAX_REV = 60

In [11]:
np.random.uniform(-9.99,+9.99,(2,3)) 

array([[ 9.64581553,  8.50794684, -1.77934813],
       [ 5.15000887, -1.11686232,  5.66528626]])

In [12]:
UserPrefMatrix = np.random.uniform(-9.99,+9.99,(N_USERS,N_DIMS))
UserPrefMatrix = np.around(UserPrefMatrix,7)

UserDF = pd.DataFrame(UserPrefMatrix) 
DimCols = ["D"+str(a) for a in range(1,N_DIMS+1)]
UserDF.columns = DimCols

UserIDs = np.arange(1,N_USERS+1) 
UserIDs = ["U"+(('00000'+str(a))[-4:]) for a in UserIDs]
UserDF['UID'] = UserIDs
old_cols = list(UserDF.columns)
new_cols = [old_cols[-1]]+old_cols[:-1]
UserDF = UserDF[new_cols] 

user2vec = {} 
for r in UserDF.values:
    uid = r[0]
    vec = r[1:] 
    user2vec[uid] = vec

In [13]:
ItemAttrMatrix = np.random.uniform(-9.99,+9.99,(N_ITEMS,N_DIMS))
ItemAttrMatrix = np.around(ItemAttrMatrix,7)

ItemDF = pd.DataFrame(ItemAttrMatrix) 
DimCols = ["D"+str(a) for a in range(1,N_DIMS+1)]
ItemDF.columns = DimCols

ItemIDs = np.arange(1,N_ITEMS+1) 
ItemIDs = ["I"+(('00000'+str(a))[-4:]) for a in ItemIDs]
ItemDF['IID'] = ItemIDs
old_cols = list(ItemDF.columns)
new_cols = [old_cols[-1]]+old_cols[:-1]
ItemDF = ItemDF[new_cols] 

item2vec = {} 
for r in ItemDF.values:
    iid = r[0]
    vec = r[1:] 
    item2vec[iid] = vec

In [14]:
#def VecDist(vec1,vec2):
#    return np.absolute(np.array(vec1)-np.array(vec2)).mean() 

def VecDist(v1,v2):
    return np.abs((v1-v2)**2).sum(1)**0.5

vec1 = user2vec['U0001']
vec2 = item2vec['I0001'] 

vec1 = np.array([vec1])
vec2 = np.array([vec2]) 

VecDist(vec1,vec2)  

array([14.318853013091415], dtype=object)

In [15]:
print(vec1.shape)
print(vec2.shape)

(1, 6)
(1, 6)


In [16]:
#############################################################################
#############################################################################

In [17]:
rows = []
for uid in UserIDs:
    uvec = user2vec[uid]
    uvec = np.array([uvec])
    
    n_revs = np.random.randint(MIN_REV,MAX_REV)
    items = [choice(ItemIDs) for _ in range(n_revs)] 
    items = sorted(set(items)) 
    for iid in items:
        ivec = item2vec[iid]
        ivec = np.array([ivec])
        
        dist = VecDist(uvec,ivec)[0] 
        rows.append([uid,iid,dist])
    
RevDF = pd.DataFrame(rows) 
RevDF.columns = ['UID','IID','DIST'] 

In [18]:
qm = QuantileModel()
qm.fit(RevDF['DIST'].values) 
dper = qm.v2q(RevDF['DIST'].values) 
RevDF['DPER'] = dper
RevDF['AFFIN'] = (1-RevDF['DPER'])**2
RevDF['RATING'] = np.around((RevDF['AFFIN']*9)+1).astype(int) 

In [19]:
#############################################################################
#############################################################################

In [20]:
rand_arr = np.random.uniform(0,1,len(RevDF))*0.7
TF = rand_arr < RevDF['AFFIN'].values 
RevDF['SALE'] = TF.astype(int)

In [21]:
RevDF['SALE'].mean()

0.44572274058640576

In [22]:
RevDF['SALE'].sum() / RevDF['UID'].nunique() 

14.761

In [23]:
#RevDF.head(10)

In [24]:
RevDF.columns

Index(['UID', 'IID', 'DIST', 'DPER', 'AFFIN', 'RATING', 'SALE'], dtype='object')

In [25]:
#cols = ['UID', 'IID','RATING']
#RevDF2 = RevDF[cols].copy() 

In [26]:
#RevDF2.head(10) 

In [27]:
Sales = RevDF[RevDF['SALE']==1].copy() 
Sales = RevDF.copy() 

In [28]:
Sales

Unnamed: 0,UID,IID,DIST,DPER,AFFIN,RATING,SALE
0,U0001,I0005,14.853480,0.188489,0.658550,7,1
1,U0001,I0021,31.931170,0.996678,0.000011,1,0
2,U0001,I0034,22.792512,0.752114,0.061448,2,0
3,U0001,I0049,20.146641,0.558491,0.194930,3,0
4,U0001,I0062,10.504776,0.038682,0.924132,9,1
...,...,...,...,...,...,...,...
33112,U1000,I0468,20.148129,0.558552,0.194877,3,0
33113,U1000,I0475,28.704105,0.974906,0.000630,1,0
33114,U1000,I0482,25.209374,0.882655,0.013770,1,0
33115,U1000,I0483,21.290781,0.646847,0.124717,2,0


In [None]:
# ['UID' ,'IID' ,'SALE']
# ['ENT1','ENT2','SCORE'] 

In [29]:
#############################################################################
#############################################################################
#############################################################################

In [30]:
#############################################################################
#############################################################################

In [49]:
jvm = JVectorModel() 
DF = Sales[['UID' ,'IID' ,'SALE']]
jvm.fit(DF)  

Fitting Begins.
10 CMD: 1.5711105
20 CMD: 1.013381
30 CMD: 0.7147976
40 CMD: 0.5173755
50 CMD: 0.3746104
60 CMD: 0.2729529
70 CMD: 0.2044825
80 CMD: 0.1597609
90 CMD: 0.1299052
100 CMD: 0.1090112
Fitting Complete.


In [46]:
DF

Unnamed: 0,UID,IID,SALE
0,U0001,I0005,1
1,U0001,I0021,0
2,U0001,I0034,0
3,U0001,I0049,0
4,U0001,I0062,1
...,...,...,...
33112,U1000,I0468,0
33113,U1000,I0475,0
33114,U1000,I0482,0
33115,U1000,I0483,0


In [47]:
jvm['U0001']

array([ 0.30208623, -0.12532243,  0.6585508 , -0.14540351,  0.08235703,
        0.1378403 , -0.4722236 ,  1.03716554,  0.58542606,  1.30812333,
        0.0965143 , -0.07121149,  0.19703156, -0.03555775, -0.78360666,
        0.20875799, -0.58193859,  0.08948329,  0.04478507, -0.62416644,
        0.89321894, -0.99301733,  0.08243   , -1.51189381,  0.30488161,
        0.03147894, -0.86905832,  0.73368918, -0.18781901,  0.44960073])

In [4]:
x=1