In [1]:
#!/usr/bin/env python
#Config
from pyspark import SparkConf, SparkContext, HiveContext
#import re
import numpy as np
import pandas as pd
#import datetime
#from pyspark.mllib.regression import LabeledPoint
#from pyspark.mllib.feature import HashingTF
#from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
#import scipy.sparse as sps
#from pyspark.mllib.linalg import Vectors
#import sklearn
import itertools
import datetime
import sys


#from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

try:
    sc.stop()
except NameError:
    pass

conf = SparkConf().set("spark.executor.instances", 32).set("spark.driver.maxResultSize", "16g")
sc = SparkContext()
hc = HiveContext(sc)
sc.setCheckpointDir('checkpoint/')

## SVD модель интернет-логов. Выделение тем. ##
Есть таблица интернет-логов  $m_{ij}$, каждая строка отвечает одному интернет-пользователю, каждый столбец - фрагменту урла. $m_{ij}=1$, если посетитель $i$ посетил фрагмент урла $j$ в рамках рассматриваемой сессии, иначе $m_{ij}=0$.

Выделим темы - группировки урлов и пользователей. Для этого воспользуемся модификацией SVD разложения. 

Зафиксируем количество тем $K$. Для каждого пользователя сформируем вектор длины $K$, отражающий степень близости темы к пользователю. Получаем матрицу $X_{ik}$. Индекс $i$ отвечает пользователю, $k$ отвечает теме. 

Аналогично сформируем вектор  длины $K$ для каждого фрагмента урла. Получаем матрицу $Y_{kj}$. $k$ отвечает теме, $j$ отвечает фрагменту урла.

Далее для каждого пользователя $i$ введем число $a_i$, которое в целом характеризует его интернет-активность.

Аналогично, элементы вектора $b_j$ отражают популярность урла $j$.

Параметр $\mu$ есть усредняющий скаляр по всей выборке.

Далее интерес пользователя  $i$ к фрагменту урла $j$ будем моделировать величиной $s_{ij} = \mu + a_i + b_j + \sum_k X_{ik} \cdot Y_{kj}$.

Для перевода этой величины в вероятность, воспользуемся логистической функцией: 
$\hat{m}_{ij} = 1/(1+e^{-s_{ij}})$. 

Таким образом, построенная величина $\hat{m}_{ij}$ есть  модельная оценка вероятности посещения пользователем $i$ фрагмента урла $j$.

Введем функцию ошибки $L = \sum_{ij} L_{ij} $,
$$L_{ij} = -m_{ij} \cdot \ln(\hat{m}_{ij}) - (1 - m_{ij}) \cdot \ln(1 - \hat{m}_{ij}) + \lambda_{0} \cdot \mu + \lambda_{1}\cdot (a_i^2 + b_j^2) + \lambda_{2} \cdot \sum_k(X_{ik}^2 + Y_{kj}^2)$$.

Для обучения будем использовать стохастический градиентный спуск (его модификацию Adadelta http://int8.io/comparison-of-optimization-techniques-stochastic-gradient-descent-momentum-adagrad-and-adadelta/#AdaDelta_8211_implementation).


In [2]:
start_time = datetime.datetime.now()

# Param selection
query = '''
select id_num,urlfr_num from 
(select id_num,urlfr_num,count(*) over (partition by id_num) as uf_cnt,count(*) over (partition by urlfr_num) as id_cnt from user_kposminin.visits_enum_20160412 
where id_num<40000 ) a
where id_cnt > 10 and uf_cnt > 9
'''

# where uf_cnt > 9 and id_cnt > 500


K = 20
id_cnt = 10000
urlfr_cnt = 3000000
epochs = 50


#lmbd = np.array([0.01, 0.01, 0.01])

#step = np.array([0.001, 0.005, 0.01])

param_grid = []

 #   [[0.00001, 0.0001, 0.0001],[0.01, 0.05, 0.01]],
for base_step in np.exp(np.arange(-4,0,2)):
    for base_lmbd in np.exp(np.arange(-14,-8,2)):
        for var_step in np.array([[0.1,1,0.1],[0.01,1,10]]):
            for var_lmbd in np.array([[0.01,0.1,1],[1,0.1,0.01]]):
                param_grid.append([var_step * base_step,var_lmbd * base_lmbd])

lmbd_corr = np.exp(np.arange(-3,3,6./K))

'''
# Load and parse the data
sampled_data = hc.sql(query) \
            .collect()
print('Sampled data consists of {} rows, {} id and {} uf.'.format(len(sampled_data),len(set([e[0] for e in sampled_data])),len(set([e[1] for e in sampled_data]))))

train,test = [], []
for r in sampled_data:
    if(np.random.rand() < 0.1):
        test.append(r)
    else:
        train.append(r)
'''  
#  
# prediction is m^hat_{ij} = 1/(1+exp(-s_{ij})) where s_{ij} = mu + a[i] + b[j] + sum_k (X[i,k] * Y[k,j])
# error function is L_{ij} = -m_{ij}*ln(m^hat_{ij}) - (1 - m_{ij})*ln(1 - m^hat_{ij}) + 
#                            lmbd[0] * mu + lmbd[1]*(a_i^2 + b_j^2) + lmbd[2]*sum_k(X_{ik}^2 + Y_{kj}^2)
f = open('data/collab_filter_SGD_param_tuning.csv','a+')
f.write('\n\n New calc at {}'.format(datetime.datetime.now()))
stat = []
prev_id = -1
uf_visited= []
neg_sig_share = 2
rho = 0.9
eps = 1e-6
errors = []
err_thres = 0.005

for step,lmbd in  [[[0.0135,0.1353,0.0135],[1e-04,1e-03,1e-02]]]:

    #Init matrices
    mu = np.random.rand(1) - 0.5 -6 # One-element array to be able to update inside a procedure
    a = np.random.rand(id_cnt) - 0.5
    b = np.random.rand(urlfr_cnt) - 0.5
    X = np.random.rand(id_cnt, K) - 0.5
    Y = np.random.rand(K,urlfr_cnt) - 0.5

    def make_step(i, j, v):
        s = mu[0] + a[i] + b[j] + X[i,:].dot(Y[:,j])
        pred = 1./(1+np.exp(-s))
        err = v - pred # r[2] - pred
        grad_mu = -err + lmbd[0] * mu
        grad_ai = -err + lmbd[1] * a[i]
        grad_bj = -err + lmbd[1] * b[j]
        grad_xi = -err * Y[:,j] + lmbd[2] * X[i,:] * lmbd_corr
        grad_yj = -err * X[i,:] + lmbd[2] * Y[:,j] * lmbd_corr
        
        # TODO Implement Adadelta SGD version. 
        #http://int8.io/comparison-of-optimization-techniques-stochastic-gradient-descent-momentum-adagrad-and-adadelta/#AdaDelta_8211_implementation
        # 
     
        mu[0] += - grad_mu[0] * step[0]
        a[i]  += - grad_ai * step[1]
        b[j]  += - grad_bj * step[1]
        X[i,:] = X[i,:] - grad_xi * step[2]
        Y[:,j] = Y[:,j] - grad_yj * step[2]
    
    for epoch in range(epochs):
        for r in train:
            i,j = r[:2]
            make_step(i, j, 1)
            if i == prev_id:
                uf_visited.append(j)
            else:
                # Add negative examples
                ni = np.random.randint(len(train) - len(uf_visited))
                neg, k = 0, 0
                while(neg < int(len(uf_visited) * neg_sig_share)):
                    if(not train[(ni + k) % len(train)][1] in uf_visited):                        
                        make_step(prev_id, train[(ni + k) % len(train)][1],0)
                        neg += 1
                    k +=1 
                prev_id = i
                uf_visited = [j]
        
        #calc err
        err1_p ,err2_p, err1_n, err2_n = 0, 0, 0, 0
        for r in test:
            i,j = r[:2]
            s = mu[0] + a[i] + b[j] + X[i,:].dot(Y[:,j])
            pred = 1/(1+np.exp(-s))
            err1_p += - 1 * np.log(pred) # r[2] - pred
            err2_p += - 1 * np.log(pred) + lmbd[0] * mu[0] + lmbd[1] * (a[i] ** 2 + b[j] ** 2) + lmbd[2] * (X[i,:].dot(X[i,:]*lmbd_corr) + Y[:,j].dot(Y[:,j]*lmbd_corr))
            
            if i == prev_id:
                uf_visited.append(j)
            else:
                # Add negative examples
                ni = np.random.randint(len(test) - len(uf_visited))
                neg, k = 0, 0
                while(neg < int(len(uf_visited) * neg_sig_share)):
                    if(not test[(ni + k) % len(test)][1] in uf_visited): 
                        i1, j1 = prev_id, train[(ni + k) % len(train)][1]
                        s = mu[0] + a[i1] + b[j1] + X[i1,:].dot(Y[:,j1])
                        pred = 1/(1+np.exp(-s))
                        err1_n += - 1 * np.log(1 - pred) # r[2] - pred
                        err2_n += - 1 * np.log(1 - pred) + lmbd[0] * mu[0] + lmbd[1] * (a[i1] ** 2 + b[j1] ** 2) + lmbd[2] * (X[i1,:].dot(X[i1,:]) + Y[:,j1].dot(Y[:,j1]))                        
                        neg += 1
                    k +=1 
                prev_id = i
                uf_visited = [j]
        err1_t = (err1_p + err1_n) / (len(test) * (neg_sig_share + 1))
        err2_t = (err2_p + err2_n) / (len(test) * (neg_sig_share + 1))
        err1_p /= float(len(test))
        err2_p /= float(len(test))
        err1_n /= len(test) * neg_sig_share
        err2_n /= len(test) * neg_sig_share        
        errors.append(err1_t)
        stat=('{} ' * 9).format(step,lmbd,epoch,err1_t, err1_p,err1_n,err2_t, err2_p,err2_n)
        f.write(stat + '\n')
        print(stat)
        if((len(errors)>6) & (sum(errors[-3:])/3 > sum(errors[-6:-3])/3 - err_thres)): # too fast.
            step = [s * 0.7 for s in step]
        if((len(errors)>13) & (abs(sum(errors[-3:])/3 - sum(errors[-13:-3])/10) < 1e-4)): # too slow.
            step = [s / 0.6 for s in step]
        
        
print('Finish. Work time {}.'.format(datetime.datetime.now()- start_time))
f.write('\nFinish at {}. Work time {}.\n\n\n'.format(datetime.datetime.now(),datetime.datetime.now() - start_time))
f.close()

NameError: name 'train' is not defined

In [5]:

K = 500
#id_cnt = 10000
urlfr_cnt = 1701067 # hc.sql('select max(urlfr_num) from user_kposminin.visits_enum_20160412').collect()[0][0] + 1



err = []
err_thres = 0.005
step, lmbd = [0.0135,0.1353,0.0135], [1e-04,1e-03,1e-02]
f = open('data/collab_filter_SGD_calc.csv','a+')
af = open('data/collab_filter_SGD_a_vec.csv','w')
Xf = open('data/collab_filter_SGD_X_matr.csv','w')
full_train_epochs = 2

mu = np.random.rand(1) - 0.5 - 2 # One-element array to be able to update inside a procedure
b = np.random.rand(urlfr_cnt) - 0.5
Y = np.random.rand(K,urlfr_cnt) - 0.5

lmbd_corr = np.exp(np.arange(-3,3,6./K))

query1 = '''
select id_num,urlfr_num from user_kposminin.visits_enum_20160412 
where id_num between #low and #high
'''

batch_size = 1000

for btc in range(180*10**6/batch_size):

    # Load and parse the data
    sampled_data = hc.sql(query1.replace('#low',str(btc*batch_size)).replace('#high',str((btc + 1) * batch_size - 1))) \
            .collect()
    if(len(sampled_data) == 0): break
    str2write = '{}. Num {}. Sampled data consists of {} rows, {} id and {} uf.\n'.format(datetime.datetime.now(),btc, 
                                len(sampled_data),len(set([e[0] for e in sampled_data])), len(set([e[1] for e in sampled_data])))
    print(str2write)
    f.write(str2write)
    id_start = btc*batch_size
    
    train,test = [], []
    for r in sampled_data:
        if(np.random.rand() < 0.1):
            test.append(r)
        else:
            train.append(r)
    id_cnt = batch_size
    stat = []
    prev_id = -1
    uf_visited= []
    neg_sig_share = 2

    #Init matrices
    a = np.random.rand(id_cnt) - 0.5
    X = np.random.rand(id_cnt, K) - 0.5

    def make_step(i, j, v):
        s = mu[0] + a[i] + b[j] + X[i,:].dot(Y[:,j])
        pred = 1./(1+np.exp(-s))
        err = v - pred # r[2] - pred
        grad_mu = -err + lmbd[0] * mu
        grad_ai = -err + lmbd[1] * a[i]
        grad_bj = -err + lmbd[1] * b[j]
        grad_xi = -err * Y[:,j] + lmbd[2] * X[i,:] * lmbd_corr
        grad_yj = -err * X[i,:] + lmbd[2] * Y[:,j] * lmbd_corr
        mu[0] += - grad_mu[0] * step[0]
        a[i]  += - grad_ai * step[1]
        b[j]  += - grad_bj * step[1]
        X[i,:] = X[i,:] - grad_xi * step[2]
        Y[:,j] = Y[:,j] - grad_yj * step[2]
    
    for epoch in range(full_train_epochs):
        for r in train:
            i,j = r[0] - id_start, r[1]
            make_step(i, j, 1)
            if i == prev_id:
                uf_visited.append(j)
            else:
                # Add negative examples
                ni = np.random.randint(len(train) - len(uf_visited))
                neg, k = 0, 0
                while(neg < int(len(uf_visited) * neg_sig_share)):
                    if(not train[(ni + k) % len(train)][1] in uf_visited):                        
                        make_step(prev_id, train[(ni + k) % len(train)][1],0)
                        neg += 1
                    k +=1
                prev_id = i
                uf_visited = [j]
        
        #calc err
        err1_p ,err2_p, err1_n, err2_n = 0, 0, 0, 0
        for r in test:
            i,j = r[0] - id_start, r[1]
            s = mu[0] + a[i] + b[j] + X[i,:].dot(Y[:,j])
            pred = 1/(1+np.exp(-s))
            err1_p += - 1 * np.log(pred) # r[2] - pred
            err2_p += - 1 * np.log(pred) + lmbd[0] * mu[0] + lmbd[1] * (a[i] ** 2 + b[j] ** 2) + lmbd[2] * (X[i,:].dot(X[i,:]*lmbd_corr) + Y[:,j].dot(Y[:,j]*lmbd_corr))
            
            if i == prev_id:
                uf_visited.append(j)
            else:
                # Add negative examples
                ni = np.random.randint(len(test) - len(uf_visited))
                neg, k = 0, 0
                while(neg < int(len(uf_visited) * neg_sig_share)):
                    if(not test[(ni + k) % len(test)][1] in uf_visited): 
                        i1, j1 = prev_id, train[(ni + k) % len(train)][1]
                        s = mu[0] + a[i1] + b[j1] + X[i1,:].dot(Y[:,j1])
                        pred = 1/(1+np.exp(-s))
                        err1_n += - 1 * np.log(1 - pred) # r[2] - pred
                        err2_n += - 1 * np.log(1 - pred) + lmbd[0] * mu[0] + lmbd[1] * (a[i] ** 2 + b[j] ** 2) + lmbd[2] * (X[i,:].dot(X[i,:]*lmbd_corr) + Y[:,j].dot(Y[:,j]*lmbd_corr))
                        neg += 1
                    k +=1 
                prev_id = i
                uf_visited = [j]
        err1_t = (err1_p + err1_n) / (len(test) * (neg_sig_share + 1))
        err2_t = (err2_p + err2_n) / (len(test) * (neg_sig_share + 1))
        err1_p /= float(len(test))
        err2_p /= float(len(test))
        err1_n /= len(test) * neg_sig_share
        err2_n /= len(test) * neg_sig_share        
        err.append(err1_t)
        stat=('{} ' * 9).format(step,lmbd,epoch,err1_t, err1_p,err1_n,err2_t, err2_p,err2_n)
        f.write(stat + '\n')
        print(stat)
        if((len(err)>6) & (sum(err[-3:])/3 > sum(err[-6:-3])/3 - err_thres)): # too fast.
            step = [s * 0.7 for s in step]
        if((len(err)>13) & (abs(sum(err[-3:])/3 - sum(err[-13:-3])/10) < 1e-5)): # too slow.
            step = [s / 0.6 for s in step]
                    
        for r in test:
            i,j = r[0] - id_start, r[1]
            make_step(i, j, 1)
        
        l = 0
        for e in a:
            af.write('{},{}\n'.format(id_start + l,a[l]))
            l+=1
        l = 0
        for e in X:
            Xf.write('{},{}\n'.format(id_start + l,','.join(str(z) for z in X[l])))
            l+=1
l=0            
Yf = open('data/collab_filter_SGD_Y_matr.csv','w')
for e in Y:
    Yf.write('{},{}\n'.format(l,','.join(str(z) for z in Y[l])))
    l+=1
l=0            
bf = open('data/collab_filter_SGD_b_vec.csv','w')
for e in b:
    bf.write('{},{}\n'.format(l,b[l]))
    l+=1

f.write('mu is {}'.format(mu))

print('Finish. Work time {}.'.format(datetime.datetime.now()- start_time))
f.write('\nFinish at {}. Work time {}.\n\n\n'.format(datetime.datetime.now(),datetime.datetime.now() - start_time))
f.close()

MemoryError: 