## SVD модель интернет-логов. Выделение тем. ##
Есть таблица интернет-логов  $m_{ij}$, каждая строка отвечает одному интернет-пользователю, каждый столбец - фрагменту урла. $m_{ij}=1$, если посетитель $i$ посетил фрагмент урла $j$ в рамках рассматриваемой сессии, иначе $m_{ij}=0$.

Выделим темы - группировки урлов и пользователей. Для этого воспользуемся модификацией SVD разложения. 

Зафиксируем количество тем $K$. Для каждого пользователя сформируем вектор длины $K$, отражающий степень близости темы к пользователю (которая может быть отрицательной). Получаем матрицу $X_{ik}$. Индекс $i$ отвечает пользователю, $k$ отвечает теме. 

Аналогично сформируем вектор  длины $K$ для каждого фрагмента урла. Получаем матрицу $Y_{kj}$. $k$ отвечает теме, $j$ отвечает фрагменту урла.

Далее для каждого пользователя $i$ введем число $a_i$, которое в целом характеризует его интернет-активность.

Аналогично, элементы вектора $b_j$ отражают популярность урла $j$.

Параметр $\mu$ есть усредняющий скаляр по всей выборке.

Далее интерес пользователя  $i$ к фрагменту урла $j$ будем моделировать величиной $s_{ij} = \mu + a_i + b_j + \sum_k X_{ik} \cdot Y_{kj}$.

Для перевода этой величины в вероятность, воспользуемся логистической функцией: 
$\hat{m}_{ij} = 1/(1+e^{-s_{ij}})$. 

Таким образом, построенная величина $\hat{m}_{ij}$ есть  модельная оценка вероятности посещения пользователем $i$ фрагмента урла $j$.

Введем функцию ошибки $L = \sum_{ij} L_{ij} $,
$$L_{ij} = -m_{ij} \cdot \ln(\hat{m}_{ij}) - (1 - m_{ij}) \cdot \ln(1 - \hat{m}_{ij}) + \lambda_{0} \cdot \mu + \lambda_{1}\cdot (a_i^2 + b_j^2) + \lambda_{2} \cdot \sum_k(X_{ik}^2 + Y_{kj}^2)$$.

Для обучения будем использовать стохастический градиентный спуск.
Следующий шаг — применить модификацию градиентного спуска Adadelta: http://int8.io/comparison-of-optimization-techniques-stochastic-gradient-descent-momentum-adagrad-and-adadelta/#AdaDelta_8211_implementation.


##### Конфигурация

In [3]:
#!/usr/bin/env python
#Config
from pyspark import SparkConf, SparkContext, HiveContext
#import re
import numpy as np
import pandas as pd
#import datetime
#from pyspark.mllib.regression import LabeledPoint
#from pyspark.mllib.feature import HashingTF
#from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
#import scipy.sparse as sps
#from pyspark.mllib.linalg import Vectors
#import sklearn
import itertools
import datetime
import sys


#from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

# Stop a SparkContext left over from a previous run of this cell; on the very
# first run `sc` does not exist yet, which is fine.
try:
    sc.stop()
except NameError:
    pass

# The configuration must be handed to the context explicitly: previously
# `conf` was built but never used, so both settings were silently ignored.
conf = SparkConf().set("spark.executor.instances", 8).set("spark.driver.maxResultSize", "2g")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)
sc.setCheckpointDir('checkpoint/')

#### Настройка параметров: $\lambda$ и шага

In [None]:
# HiveQL that builds the dictionaries and the dense visits table used below.
# NOTE(review): this string is not executed anywhere in the visible notebook;
# it contains several ';'-separated statements, so it presumably has to be run
# statement by statement - confirm before use.
# Fixed: `create_table` -> `create table`; id_dict now groups by `id`
# (it selected `id` but grouped by `urlfr`).
create_query ='''
create table user_kposminin.urlfr_dict as 
with src as (select * from user_kposminin.visits_ext where ymd = '2016-04-12')

select 
  urlfr, 
  rank() over (order by urlfr) as urlfr_num,
  count(*) as cnt 
from src 
group by urlfr 
having cnt > 500
;

create table user_kposminin.id_dict as 
with src as (select * from user_kposminin.visits_ext where ymd = '2016-04-12')
select 
  id,
  rank() over (order by id) as id_num,
  count(*) as cnt 
from src 
group by id 
having cnt > 10 and cnt < 500
;

create table user_kposminin.visits_enum_dense_20160412 as
with src as (select * from user_kposminin.visits_ext where ymd = '2016-04-12')
select
  s.id, i.id_num,
  s.urlfr, u.urlfr_num,
  s.cnt
from
  src s
  inner join user_kposminin.urlfr_dict u on s.urlfr = u.urlfr
  inner join user_kposminin.id_dict i on s.id = i.id
where 
u.cnt > 1000
and i.cnt > 30


'''

# Reference point for the "Work time" messages printed at the end of a run.
start_time = datetime.datetime.now()

# --- Hyper-parameter selection ---

# Sample: rows with id_num in [0, 20000]. The inner query counts rows per
# id_num (uf_cnt) and per urlfr_num (id_cnt); only pairs where both counts
# exceed 15 are kept.
query = '''
select id_num,urlfr_num from 
(select id_num,urlfr_num,count(*) over (partition by id_num) as uf_cnt,count(*) over (partition by urlfr_num) as id_cnt from user_kposminin.visits_enum_dense_20160412 
where id_num between 0 and 20000 ) a
where id_cnt > 15 and uf_cnt > 15
'''

K = 50              # number of topics
id_cnt = 20001      # rows of X / length of a
urlfr_cnt = 384000  # columns of Y / length of b
epochs = 20

# Every grid entry is [step, lmbd], each a 3-vector indexed like
# (mu, a/b, X/Y). A vector is a base scale times a per-parameter profile.
step_scales = np.exp(np.arange(-4, 0, 2))
lmbd_scales = np.exp(np.arange(-8, 0, 2))
step_profiles = np.array([[0.001, 1, 1], [0.1, 1, 0.1]])
lmbd_profiles = np.array([[0.0001, 1, 1], [0.01, 0.1, 1]])

param_grid = [
    [step_profile * step_scale, lmbd_profile * lmbd_scale]
    for step_scale, lmbd_scale, step_profile, lmbd_profile
    in itertools.product(step_scales, lmbd_scales, step_profiles, lmbd_profiles)
]

# Per-topic multiplier for the X/Y regularisation; all ones here
# (alternative kept from the original: np.exp(np.arange(-3,3,6./K))).
lmbd_corr = np.ones(K)


# Pull the sampled (id_num, urlfr_num) rows to the driver.
sampled_data = hc.sql(query).collect()

# Distinct counts (+1); in the visible code they are only used in the message.
urlfr_cnt1 = 1 + len({row[1] for row in sampled_data})
id_cnt1 = 1 + len({row[0] for row in sampled_data})
print('Sampled data consists of {} rows, {} id and {} uf.'.format(len(sampled_data),id_cnt1,urlfr_cnt1))

# Random split: each row goes to test with probability 0.1, otherwise to train.
train, test = [], []
for sampled_row in sampled_data:
    bucket = test if np.random.rand() < 0.1 else train
    bucket.append(sampled_row)
  
 
# prediction is m^hat_{ij} = 1/(1+exp(-s_{ij})) where s_{ij} = mu + a[i] + b[j] + sum_k (X[i,k] * Y[k,j])
# error function is L_{ij} = -m_{ij}*ln(m^hat_{ij}) - (1 - m_{ij})*ln(1 - m^hat_{ij}) + 
#                            lmbd[0] * mu + lmbd[1]*(a_i^2 + b_j^2) + lmbd[2]*sum_k(X_{ik}^2 + Y_{kj}^2)
# Grid search over (step, lmbd). For every combination the model is trained
# from a fresh random initialisation with plain SGD; after each epoch the test
# log-loss (with and without the regularisation terms) is printed and appended
# to the CSV log.
#
# Changes compared with the previous revision:
#   * the log file is managed by `with`, so it is closed even on interruption;
#   * a negative test example is now checked and scored on the same candidate
#     (the membership check read `test`, the score read `train`);
#   * the error history driving step adaptation is reset per combination;
#   * `make_step` is defined once instead of on every grid iteration;
#   * boolean conditions use `and` instead of bitwise `&`.
prev_id = -1
uf_visited= []
neg_sig_share = 2  # negative examples generated per visited url fragment
rho = 0.9          # currently unused; presumably for the Adadelta TODO below
eps = 1e-6         # currently unused; presumably for the Adadelta TODO below
errors = []
err_thres = 0.005


def _predict(i, j):
    """Return m^hat_{ij} for user row i and url column j (module-level params)."""
    s = mu[0] + a[i] + b[j] + X[i,:].dot(Y[:,j])
    return 1./(1+np.exp(-s))


def _penalty(i, j):
    """Return the regularisation part of L_{ij} for user row i, url column j."""
    return (lmbd[0] * mu[0]
            + lmbd[1] * (a[i] ** 2 + b[j] ** 2)
            + lmbd[2] * (X[i,:].dot(X[i,:]*lmbd_corr) + Y[:,j].dot(Y[:,j]*lmbd_corr)))


def make_step(i, j, v):
    """Apply one SGD update for the pair (i, j) with target v (1 or 0).

    Reads the module-level mu, a, b, X, Y, lmbd, step and lmbd_corr at call
    time and updates mu, a, b, X, Y in place.
    """
    err = v - _predict(i, j)
    grad_mu = -err + lmbd[0] * mu
    grad_ai = -err + lmbd[1] * a[i]
    grad_bj = -err + lmbd[1] * b[j]
    grad_xi = -err * Y[:,j] + lmbd[2] * X[i,:] * lmbd_corr
    grad_yj = -err * X[i,:] + lmbd[2] * Y[:,j] * lmbd_corr

    # TODO Implement Adadelta SGD version.
    #http://int8.io/comparison-of-optimization-techniques-stochastic-gradient-descent-momentum-adagrad-and-adadelta/#AdaDelta_8211_implementation

    mu[0] += - grad_mu[0] * step[0]
    a[i]  += - grad_ai * step[1]
    b[j]  += - grad_bj * step[1]
    X[i,:] = X[i,:] - grad_xi * step[2]
    Y[:,j] = Y[:,j] - grad_yj * step[2]


with open('data/collab_filter_SGD_param_tuning.csv','a+') as f:
    f.write('\n\n New calc at {}'.format(datetime.datetime.now()))

    for step,lmbd in param_grid: # [[[0.0135,0.1353,0.0135],[1e-07,5e-03,5e-03]]]:
        # Each combination gets its own error history for step adaptation.
        errors = []

        #Init matrices
        mu = np.random.rand(1) - 0.5 - 1 # One-element array to be able to update inside a procedure
        a = np.random.rand(id_cnt) - 0.5
        b = np.random.rand(urlfr_cnt) - 0.5
        X = np.random.rand(id_cnt, K) - 0.5
        Y = np.random.rand(K,urlfr_cnt) - 0.5

        for epoch in range(epochs):
            for r in train:
                i,j = r[:2]
                make_step(i, j, 1)
                if i == prev_id:
                    uf_visited.append(j)
                else:
                    # The user changed: add negative examples for the previous
                    # user, taken from a random position in train and skipping
                    # url fragments collected in uf_visited.
                    ni = np.random.randint(len(train) - len(uf_visited))
                    neg, k = 0, 0
                    while neg < int(len(uf_visited) * neg_sig_share):
                        j_neg = train[(ni + k) % len(train)][1]
                        if j_neg not in uf_visited:
                            make_step(prev_id, j_neg, 0)
                            neg += 1
                        k += 1
                    prev_id = i
                    uf_visited = [j]

            #calc err
            # err1_* is the pure log-loss, err2_* adds the regularisation terms;
            # _p sums over positive (test) rows, _n over generated negatives.
            err1_p ,err2_p, err1_n, err2_n = 0, 0, 0, 0
            for r in test:
                i,j = r[:2]
                pred = _predict(i, j)
                err1_p += - 1 * np.log(pred)
                err2_p += - 1 * np.log(pred) + _penalty(i, j)

                if i == prev_id:
                    uf_visited.append(j)
                else:
                    # Add negative examples (same scheme as in training, but
                    # drawn from test and only scored, not trained on).
                    ni = np.random.randint(len(test) - len(uf_visited))
                    neg, k = 0, 0
                    while neg < int(len(uf_visited) * neg_sig_share):
                        j1 = test[(ni + k) % len(test)][1]
                        if j1 not in uf_visited:
                            i1 = prev_id
                            pred = _predict(i1, j1)
                            err1_n += - 1 * np.log(1 - pred)
                            err2_n += - 1 * np.log(1 - pred) + _penalty(i1, j1)
                            neg += 1
                        k += 1
                    prev_id = i
                    uf_visited = [j]

            # Normalise assuming neg_sig_share negatives per test row.
            err1_t = (err1_p + err1_n) / (len(test) * (neg_sig_share + 1))
            err2_t = (err2_p + err2_n) / (len(test) * (neg_sig_share + 1))
            err1_p /= float(len(test))
            err2_p /= float(len(test))
            err1_n /= len(test) * neg_sig_share
            err2_n /= len(test) * neg_sig_share
            errors.append(err1_t)
            stat=('{} ' * 9).format(step,lmbd,epoch,err1_t, err1_p,err1_n,err2_t, err2_p,err2_n)
            f.write(stat + '\n')
            print(stat)

            # Step adaptation on the mean error of the last three epochs.
            if len(errors) > 6 and sum(errors[-3:])/3 > sum(errors[-6:-3])/3 - err_thres: # too fast.
                step = [s * 0.7 for s in step]
            if len(errors) > 13 and abs(sum(errors[-3:])/3 - sum(errors[-13:-3])/10) < 1e-4: # too slow.
                step = [s / 0.6 for s in step]

    print('Finish. Work time {}.'.format(datetime.datetime.now()- start_time))
    f.write('\nFinish at {}. Work time {}.\n\n\n'.format(datetime.datetime.now(),datetime.datetime.now() - start_time))

#### Выбираем уплотненную подвыборку логов: пользователей с длиной трека более 40 и фрагменты урлов с более чем 500 уникальными посетителями в день.
#### Обучаемся на этой выборке.
#### В силу ограничения по памяти обучение осуществляется пачками по 10 тыс. пользователей с небольшим количеством эпох на пачку. Скорость довольно небольшая: за день обрабатывается ~100 пачек.
#### Моделируем 150 тем.


In [18]:
# Full training run over the dense visits table, processed in batches of
# `batch_size` consecutive id_num values. The user-side parameters (a, X) are
# re-initialised for every batch and appended to CSV files once the batch is
# done; the url-side parameters (mu, b, Y) are shared by all batches.
#
# Changes compared with the previous revision:
#   * the a/X output files are managed by `with`, so they are flushed and
#     closed even when the run is interrupted;
#   * the regularised error of a negative example uses that example's own
#     indices (it used the indices of the current positive row);
#   * a negative test example is checked and scored on the same candidate
#     (the membership check read `test`, the score read `train`);
#   * the Y/b checkpoint is keyed on the batch number; it was keyed on `cnt`,
#     which equals full_train_epochs (8) at that point and so never fired;
#   * `//` keeps the range bound an integer on Python 3 as well.
K = 150

full_train_epochs = 8
batch_size = 10000
err_thres = 0.005
step, lmbd = [1e-05,0.015,0.015], [1e-08,3e-04,3e-03]
step = [e/2 for e in step]
# Values kept from the original notes (meaning not stated there):
#2,2745801053577491e-05,	0,022745801053577486,	0,022745801		2,48E-05	2,48E-04	2,48E-03


urlfr_cnt = hc.sql('select max(urlfr_num) from user_kposminin.visits_enum_dense_20160412').collect()[0][0] + 10
err = []
neg_sig_share = 2  # negative examples generated per visited url fragment

# NOTE(review): mu, b and Y are not created in this cell, so they must already
# exist in the session with K rows in Y; the initialisation is left disabled,
# presumably so that an interrupted run can be resumed - confirm.
#mu = np.random.rand(1) - 0.5 - 2 # One-element array to be able to update inside a procedure
#b = np.random.rand(urlfr_cnt) - 0.5
#Y = np.random.rand(K,urlfr_cnt) - 0.5

lmbd_corr = np.ones(K) #np.exp(np.arange(-3,3,6./K))

query1 = '''
select id_num,urlfr_num from user_kposminin.visits_enum_dense_20160412 
where id_num between #low and #high
'''


def _predict(i, j):
    """Return m^hat_{ij} for user row i and url column j (module-level params)."""
    s = mu[0] + a[i] + b[j] + X[i,:].dot(Y[:,j])
    return 1./(1+np.exp(-s))


def _penalty(i, j):
    """Return the regularisation part of L_{ij} for user row i, url column j."""
    return (lmbd[0] * mu[0]
            + lmbd[1] * (a[i] ** 2 + b[j] ** 2)
            + lmbd[2] * (X[i,:].dot(X[i,:]*lmbd_corr) + Y[:,j].dot(Y[:,j]*lmbd_corr)))


def make_step(i, j, v):
    """Apply one SGD update for the pair (i, j) with target v (1 or 0).

    Reads the module-level mu, a, b, X, Y, lmbd, step and lmbd_corr at call
    time and updates mu, a, b, X, Y in place.
    """
    err = v - _predict(i, j)
    grad_mu = -err + lmbd[0] * mu
    grad_ai = -err + lmbd[1] * a[i]
    grad_bj = -err + lmbd[1] * b[j]
    grad_xi = -err * Y[:,j] + lmbd[2] * X[i,:] * lmbd_corr
    grad_yj = -err * X[i,:] + lmbd[2] * Y[:,j] * lmbd_corr
    mu[0] += - grad_mu[0] * step[0]
    a[i]  += - grad_ai * step[1]
    b[j]  += - grad_bj * step[1]
    X[i,:] = X[i,:] - grad_xi * step[2]
    Y[:,j] = Y[:,j] - grad_yj * step[2]


# The run log stays open after an interruption, as before, because the
# "Save results" cell writes to `f`; it is closed at the end of a full run.
f = open('data/collab_filter_SGD_calc.csv','a+')

with open('data/collab_filter_SGD_a_vec.csv','w') as af, \
        open('data/collab_filter_SGD_X_matr.csv','w') as Xf:

    for btc in range(1232, 180*10**6 // batch_size):
        id_start = btc*batch_size

        # Load and parse the data
        train, test = hc.sql(query1.replace('#low',str(id_start)).replace('#high',str(id_start + batch_size - 1))) \
                .randomSplit([0.9,0.1])
        train = train.collect()
        test  =  test.collect()
        if(len(train) == 0): break
        str2write = '{}. Num {}. Sampled train data consists of {} rows, {} id and {} uf.\n'.format(
                                    datetime.datetime.now(),
                                    btc,
                                    len(train),len(set([e[0] for e in train])), len(set([e[1] for e in train])))
        print(str2write)
        f.write(str2write)

        id_cnt = batch_size
        prev_id = -1
        uf_visited= []
        cnt = 0

        #Init matrices (user side only; row index is id_num - id_start)
        a = np.random.rand(id_cnt) - 0.5
        X = np.random.rand(id_cnt, K) - 0.5

        for epoch in range(full_train_epochs):
            for r in train:
                i,j = r[0] - id_start, r[1]
                make_step(i, j, 1)
                if i == prev_id:
                    uf_visited.append(j)
                else:
                    # The user changed: add negative examples for the previous
                    # user, taken from a random position in train and skipping
                    # url fragments collected in uf_visited.
                    ni = np.random.randint(len(train) - len(uf_visited))
                    neg, k = 0, 0
                    while neg < int(len(uf_visited) * neg_sig_share):
                        j_neg = train[(ni + k) % len(train)][1]
                        if j_neg not in uf_visited:
                            make_step(prev_id, j_neg, 0)
                            neg += 1
                        k += 1
                    prev_id = i
                    uf_visited = [j]

            #calc err
            # err1_* is the pure log-loss, err2_* adds the regularisation terms;
            # _p sums over positive (test) rows, _n over generated negatives.
            err1_p ,err2_p, err1_n, err2_n = 0, 0, 0, 0
            for r in test:
                i,j = r[0] - id_start, r[1]
                pred = _predict(i, j)
                err1_p += - 1 * np.log(pred)
                err2_p += - 1 * np.log(pred) + _penalty(i, j)

                if i == prev_id:
                    uf_visited.append(j)
                else:
                    # Add negative examples (drawn from test, only scored).
                    ni = np.random.randint(len(test) - len(uf_visited))
                    neg, k = 0, 0
                    while neg < int(len(uf_visited) * neg_sig_share):
                        j1 = test[(ni + k) % len(test)][1]
                        if j1 not in uf_visited:
                            i1 = prev_id
                            pred = _predict(i1, j1)
                            err1_n += - 1 * np.log(1 - pred)
                            err2_n += - 1 * np.log(1 - pred) + _penalty(i1, j1)
                            neg += 1
                        k += 1
                    prev_id = i
                    uf_visited = [j]

            # Normalise assuming neg_sig_share negatives per test row.
            err1_t = (err1_p + err1_n) / (len(test) * (neg_sig_share + 1))
            err2_t = (err2_p + err2_n) / (len(test) * (neg_sig_share + 1))
            err1_p /= float(len(test))
            err2_p /= float(len(test))
            err1_n /= len(test) * neg_sig_share
            err2_n /= len(test) * neg_sig_share
            err.append(err1_t)
            stat=('{} ' * 9).format(step,lmbd,epoch,err1_t, err1_p,err1_n,err2_t, err2_p,err2_n)
            f.write(stat + '\n')
            print(stat)
            cnt += 1
            # NOTE(review): cnt restarts at 0 for every batch and grows by one
            # per epoch, so with full_train_epochs = 8 this never triggers.
            if cnt > 15 and sum(err[-3:])/3 > sum(err[-6:-3])/3 - err_thres: # too fast.
                step = [s * 0.6 for s in step]
                cnt = 0
            #if((len(err) > 15) & (abs(sum(err[-3:])/3 - sum(err[-13:-3])/10) < 1e-5)): # too slow.
            #    step = [s / 0.6 for s in step]

        # One more pass over the held-out rows before the batch is written out.
        for r in test:
            i,j = r[0] - id_start, r[1]
            make_step(i, j, 1)

        # Append this batch's user parameters: "<id_num>,<a_i>" and
        # "<id_num>,<X_i1>,...,<X_iK>".
        for l, a_val in enumerate(a):
            af.write('{},{}\n'.format(id_start + l, a_val))
        for l, x_row in enumerate(X):
            Xf.write('{},{}\n'.format(id_start + l,','.join(str(z) for z in x_row)))

        # Checkpoint the shared url-side parameters every 100th batch number.
        if btc % 100 == 0:
            with open('data/collab_filter_SGD_Y_matr.csv','w') as Yf:
                for l, y_row in enumerate(Y):
                    Yf.write('{},{}\n'.format(l,','.join(str(z) for z in y_row)))
            with open('data/collab_filter_SGD_b_vec.csv','w') as bf:
                for l, b_val in enumerate(b):
                    bf.write('{},{}\n'.format(l, b_val))
            f.write('\nmu is {}\n'.format(mu))

print('Finish. Work time {}.'.format(datetime.datetime.now() - start_time))
f.write('\nFinish at {}. Work time {}.\n\n\n'.format(datetime.datetime.now(),datetime.datetime.now() - start_time))
f.close()

2016-12-27 18:44:39.554111. Num 1232. Sampled train data consists of 142542 rows, 4416 id and 49612 uf.

[5e-06, 0.0075, 0.0075] [1e-08, 0.0003, 0.003] 0 0.58739426912 1.26380299433 0.249189906514 0.6901225414 1.3664728118 0.351947406202 
[5e-06, 0.0075, 0.0075] [1e-08, 0.0003, 0.003] 1 0.465641131625 0.836960315015 0.279981539931 0.567765295039 0.939167346953 0.382064269082 
[5e-06, 0.0075, 0.0075] [1e-08, 0.0003, 0.003] 2 0.413501393004 0.674962914709 0.282770632152 0.515806304913 0.777324904543 0.385047005098 
[5e-06, 0.0075, 0.0075] [1e-08, 0.0003, 0.003] 3 0.381259426407 0.603427571177 0.270175354022 0.483887718543 0.706058700644 0.372802227492 
[5e-06, 0.0075, 0.0075] [1e-08, 0.0003, 0.003] 4 0.365106809847 0.560707216863 0.267306606339 0.468272037364 0.663811365706 0.370502373194 
[5e-06, 0.0075, 0.0075] [1e-08, 0.0003, 0.003] 5 0.349706487376 0.531336475511 0.258891493308 0.453472428276 0.634971586467 0.362722849181 
[5e-06, 0.0075, 0.0075] [1e-08, 0.0003, 0.003] 6 0.3400850080

KeyboardInterrupt: 

In [122]:
#Save results
# Dump the url-side parameters: one row of Y per line ("<row>,<values...>")
# and one element of b per line ("<index>,<value>"). The files are closed
# explicitly so that the data is flushed to disk.
with open('data/collab_filter_SGD_Y_matr.csv','w') as Yf:
    for l, y_row in enumerate(Y):
        Yf.write('{},{}\n'.format(l,','.join(str(z) for z in y_row)))

with open('data/collab_filter_SGD_b_vec.csv','w') as bf:
    for l, b_val in enumerate(b):
        bf.write('{},{}\n'.format(l, b_val))

# Record mu in the run log. `f` is still open only when training was
# interrupted; after a complete run it is already closed, so the log is
# re-opened for appending (assumes `f` is the training log - confirm).
if f.closed:
    f = open('data/collab_filter_SGD_calc.csv','a')
f.write('mu is {}'.format(mu))
f.close()

#print('Finish. Work time {}.'.format(datetime.datetime.now() - start_time))

#print('Finish. Work time {}.'.format(datetime.datetime.now() - start_time))


In [45]:
# Accumulate the (K+2) x (K+2) matrix sum_u row_u^T row_u over all saved
# users, where row_u = [a_u + mu, 1, X_u1, ..., X_uK]. The two files are read
# in lockstep, one user per line.
#
# Changes: files are closed via `with`; the line read from the a-file no
# longer overwrites the model array `a`; the per-row double loop over
# (K+2)^2 index pairs is replaced by an equivalent np.outer update.
XTdotX = np.zeros([K+2,K+2])

cnt=0
with open('data/collab_filter_SGD_a_vec.csv','r') as af, \
        open('data/collab_filter_SGD_X_matr.csv','r') as Xf:
    for x in Xf:
        a_line = af.readline()
        # Progress report: rows processed so far and the current user id.
        if cnt % 10000 == 0: print(cnt,x.split(',')[0])
        cnt += 1
        row = np.array([float(a_line.strip().split(',')[1])+mu[0],1] + [float(e) for e in x.strip().split(',')[1:]])
        XTdotX += np.outer(row, row)

(0, '12320000')
(10000, '12330000')
(20000, '12340000')
(30000, '12350000')
(40000, '12360000')
(50000, '12370000')
(60000, '12380000')
(70000, '12390000')
(80000, '12400000')
(90000, '12410000')
(100000, '12420000')
(110000, '12430000')
(120000, '12440000')
(130000, '12450000')
(140000, '12460000')
(150000, '12470000')
(160000, '12480000')
(170000, '12490000')
(180000, '12500000')
(190000, '12510000')
(200000, '12520000')
(210000, '12530000')
(220000, '12540000')
(230000, '12550000')
(240000, '12560000')
(250000, '12570000')
(260000, '12580000')
(270000, '12590000')
(280000, '12600000')
(290000, '12610000')
(300000, '12620000')
(310000, '12630000')
(320000, '12640000')
(330000, '12650000')
(340000, '12660000')
(350000, '12670000')
(360000, '12680000')
(370000, '12690000')
(380000, '12700000')
(390000, '12710000')
(400000, '12720000')
(410000, '12730000')
(420000, '12740000')
(430000, '12750000')
(440000, '12760000')
(450000, '12770000')
(460000, '12780000')
(470000, '12790000')
(48000

In [57]:

# Persist XTdotX as CSV: one matrix row per line, values separated by commas.
with open('data/XTdotX.csv','w') as fXT:
    fXT.writelines(','.join(map(str, matrix_row)) + '\n' for matrix_row in XTdotX)
    

In [24]:
print('mu is {}'.format(mu))

# Extend Y with two leading rows so that its rows become [ones, b, Y_1..Y_K]:
# b is inserted as row 0 first, then a row of ones is inserted in front of it.
Yb = np.insert(Y, 0, b, axis=0)
Yb = np.insert(Yb, 0, 1, axis=0)

# Dump Yb as "<row>,<values...>"; `with` closes the file so the data is
# flushed (the handle was previously left open).
with open('data/collab_filter_SGD_Y_matr_200.csv','w') as Ybf:
    for l, yb_row in enumerate(Yb):
        Ybf.write('{},{}\n'.format(l,','.join(str(z) for z in yb_row)))


mu is [-2.13571665]


In [21]:
def cos_dist(Y, i, j):
    """Return the cosine distance between columns i and j of Y.

    Computed as 1 - <y_i, y_j> / sqrt(<y_i, y_i> * <y_j, y_j>).
    """
    col_i = Y[:, i]
    col_j = Y[:, j]
    norm_product = (col_i.dot(col_i) * col_j.dot(col_j)) ** 0.5
    return 1 - col_i.dot(col_j) / norm_product

In [8]:

# Url-fragment dictionary as a pandas DataFrame indexed by urlfr_num.
uf_dict = (
    hc.sql('select * from user_kposminin.urlfr_dict')
    .toPandas()
    .set_index('urlfr_num')
)

In [117]:
# `cnt` value of the first dictionary row whose url fragment is 'filkos.com#'.
uf_dict.loc[uf_dict['urlfr'] == 'filkos.com#', 'cnt'].iloc[0]

7536

In [22]:

def neighbours(urlfr, Yb, uf_dict, n = 20):
    """Return the n url fragments closest to `urlfr` by cosine distance.

    Parameters:
        urlfr: url fragment to look up in the 'urlfr' column of uf_dict.
        Yb: matrix whose columns are compared with cos_dist; column indices
            are looked up in uf_dict's index.
        uf_dict: DataFrame with an 'urlfr' column.
        n: number of results to return.

    Returns:
        A list of [distance, url fragment] pairs in ascending distance order.

    Raises:
        IndexError: if `urlfr` is not present in uf_dict.

    Fixed: the lookup used the global variable `up` instead of the `urlfr`
    argument, so the function ignored its first parameter.
    """
    num = uf_dict[uf_dict['urlfr'] == urlfr].index[0]
    dists1 = sorted([cos_dist(Yb, num, i), i] for i in range(Yb.shape[1]))
    return [[dist, uf_dict.loc[col, 'urlfr']] for dist, col in dists1[:n]]

In [23]:
# Print the n nearest url fragments for a handful of sample fragments.
n = 100
sample_fragments = [
    'raiffeisen.ru#consumerloans',
    'filkos.com#',
    'auto.exist.ru#',
    'vedomosti.ru#business',
    'gdz-putina.me#',
]
separator = '\n' + '-' * 100
for up in sample_fragments:
    # `cnt` of the fragment's dictionary row goes into the header line.
    visitors = uf_dict[uf_dict['urlfr'] == up].cnt.iloc[0]
    header = '\n\nNearest {} neighbours for {} ({} unique visitors per day):\n\n{:<10}\t{:<30}\n'.format(
        n, up, visitors, 'distance', 'urrlfr')
    print(separator + header)
    table_lines = [
        '{:<10.5f}\t{:<30}'.format(dist, fragment.encode('utf-8'))
        for dist, fragment in neighbours(up, Yb, uf_dict, n = n)
    ]
    print('\n'.join(table_lines))


----------------------------------------------------------------------------------------------------

Nearest 100 neighbours for raiffeisen.ru#consumerloans (7536 unique visitors per day):

distance  	urrlfr                        

0.00000   	raiffeisen.ru#consumerloans   
0.04707   	raiffeisen.ru#request         
0.04727   	raiffeisen.ru#product         
0.08550   	raiffeisen.ru#retail          
0.20535   	raiffeisen.ru#                
0.21921   	raiffeisen.ru#calculator      
0.26032   	raiffeisen.ru#mortgageloans   
0.27724   	raiffeisen.ru#deposit         
0.28778   	raiffeisen.ru#deposit_investing
0.30226   	raiffeisen.ru#remote_service  
0.30502   	raiffeisen.ru#cards           
0.30734   	raiffeisen.ru#connect         
0.31301   	trust.ru#                     
0.31347   	rusprofile.ru#                
0.31912   	rusprofile.ru#id              
0.31939   	2gis.ru#krasnodar             
0.31963   	fssprus.ru#iss                
0.31982   	raiffeisen.ru#offices         
0.32016  

In [16]:
# Nearest neighbours of one more url fragment; the returned list of
# [distance, url fragment] pairs is displayed as the cell result.
up = '101.credit#kredity'
# `n` is not set in this cell; it is taken from the session.
neighbours(up, Yb,uf_dict, n = n)

[[0.0, u'101.credit#kredity'],
 [0.18404546470662886, u'101.credit#articles'],
 [0.32669600985135694, u'101.credit#'],
 [0.49131145910777696, u'na-ladoni.su#'],
 [0.49367114173451188, u'zagranguru.ru#'],
 [0.49771437583809452, u'oktmo2014.ru#OKTMO'],
 [0.49900987598214608, u'migrants.ru#forum'],
 [0.50330818673528199, u'allinform.ru#'],
 [0.50379939392092044, u'nalog-nalog.ru#poryadok_uplaty_nalogov_vznosov'],
 [0.50431218319880466, u'm.9111.ru#'],
 [0.50691837807519802, u'rusmap.net#\u0424\u0438\u0440\u043c\u0430'],
 [0.50700616132484455, u'migrants.ru#'],
 [0.50838290053426793, u'lawyer-consult.ru#'],
 [0.51335024778263838, u'cfo.spr.ru#map'],
 [0.51587377325777883, u'cryptopilot.ru#'],
 [0.51605346028100874, u'cfo.spr.ru#'],
 [0.51743035652719582, u'creditnyi.ru#'],
 [0.52036758469937539, u'voronezhphone.ru#'],
 [0.52050604085754726, u'pravorub.ru#'],
 [0.52164616171006895, u'vladimirskaya-rus.ru#'],
 [0.52284053925940721, u'vse-posobia.ru#'],
 [0.52353606424717491, u'migrants.ru#th

2

In [None]:
# NOTE(review): mode 'w' truncates both files as soon as they are opened, so
# running this cell discards any previously saved a / X values - confirm that
# this is intended before executing it.
af = open('data/collab_filter_SGD_a_vec.csv','w')
Xf = open('data/collab_filter_SGD_X_matr.csv','w')

In [2]:
# Load the a values of the users whose id satisfies id % 20000 > 10000.
# Each line has the form "<id>,<a>". The file is read inside `with`, so the
# handle is closed afterwards (it used to be left open).
with open('data/collab_filter_SGD_a_vec.csv','r') as a_file:
    a = [float(rec[1])
         for rec in (line.split(',') for line in a_file)
         if int(rec[0]) % 20000 > 10000]

In [3]:
# Load the X rows of the users whose id satisfies id % 20000 > 10000.
# Each line has the form "<id>,<x_1>,...,<x_K>". The file is read inside
# `with`, so the handle is closed afterwards (it used to be left open).
with open('data/collab_filter_SGD_X_matr.csv','r') as x_file:
    X = [[float(value) for value in rec[1:]]
         for rec in (line.split(',') for line in x_file)
         if int(rec[0]) % 20000 > 10000]

In [5]:
#ra = open('data/collab_filter_SGD_a_vec.csv','r').readlines()


In [6]:
# User-side matrix: one row per loaded user, laid out as
# [a_i + mu, 1, X_i1, ..., X_iK].
Xa = np.array([[bias + mu[0], 1] + factors for bias, factors in zip(a, X)])

In [16]:
# Gram matrix over the columns of Xa: UTdotU[i, j] = sum over users of
# Xa[u, i] * Xa[u, j], i.e. Xa^T Xa with shape (K+2, K+2).
# Fixed: the previous loop computed Xa[i].dot(Xa[j]), which multiplies the
# ROWS of users i and j (only the first K+2 users) instead of the columns.
UTdotU = Xa.T.dot(Xa)

In [90]:
# cPickle exists only on Python 2; fall back to pickle under the same name.
try:
    import cPickle
except ImportError:
    import pickle as cPickle

# Pickle files should be opened in binary mode, and the handle is closed by
# `with` so the dump is flushed to disk (it used to be left open).
with open('UTdotU.pck','wb') as pck_file:
    cPickle.dump(UTdotU, pck_file)

In [97]:
# Reload Y from its CSV dump as a list of rows; every line has the form
# "<row>,<values...>" and the leading row index is dropped. The file is read
# inside `with`, so the handle is closed afterwards (it used to be left open).
# b is not reloaded here: its loader is disabled in the original notebook.
#b = [float(r[1]) for r in [e.strip().split(',') for e in open('data/collab_filter_SGD_b_vec.csv','r').readlines()[:-1]]]
with open('data/collab_filter_SGD_Y_matr.csv','r') as y_file:
    Y = [[float(value) for value in rec[1:]]
         for rec in (line.strip().split(',') for line in y_file)]


In [103]:
# Pad the last row of Y up to the length of row 1: every missing entry is
# filled with the mean of that column over the first 99 rows.
# NOTE(review): presumably the last line of the dump was written incompletely;
# the constant 99 is hard-coded - confirm it matches the data.
last_row = Y[-1]
for col in range(len(last_row), len(Y[1])):
    column_head = [Y[row_idx][col] for row_idx in range(99)]
    last_row.append(sum(column_head) / 99.)
# (disabled in the original) b = b + [sum(b)/len(b)] * (len(Y[0]) - len(b))

# Url-side matrix: one row per element of b, laid out as
# [1, b_i, Y_1i, ..., Y_Ki].
Yb = np.array([[1, b_val] + [y_row[idx] for y_row in Y] for idx, b_val in enumerate(b)])

In [115]:
# r = Yb * UTdotU; r1num holds row `num` of r multiplied with every row of Yb.
# NOTE(review): `num` is not assigned at top level anywhere in the visible
# notebook - it has to exist in the session before this cell is run.
r = Yb.dot(UTdotU)
r1num = r[num].dot(Yb.T)

In [118]:
# (score, index) pairs in ascending score order.
res = sorted((score, idx) for idx, score in enumerate(r1num))

In [19]:
# Shut down the SparkContext created in the configuration cell.
sc.stop()

datetime.date(2016, 10, 23)