In [1]:
import numpy as np
import pandas as pd
import math
from multiprocessing import Pool, cpu_count, Process
from multiprocessing import Queue as MQueue
from multiprocessing import Pipe as MPipe
from multiprocessing.pool import ThreadPool


In [44]:
# X and Y must be passed without NaN values (call dropna() first if the data has any).
# Both are pandas Series, each with its own index.
# Y is the data to be processed.
# X is the existing data.
def euclidean_distance(X, Y):
    """Euclidean distance between X and Y over the index labels they share.

    X and Y are NaN-free Series. Labels of Y that are absent from X are
    ignored.

    Returns a tuple (distance, n) where n is the number of shared labels.
    """
    squared_total = 0
    shared = 0
    for label, y_val in Y.items():
        if label not in X:
            continue  # only labels present in both series contribute
        delta = X[label] - y_val
        squared_total += delta ** 2
        shared += 1
    return math.sqrt(squared_total), shared

def manhattan_distance(X, Y):
    """Manhattan (L1) distance between X and Y over the index labels they share.

    X and Y are NaN-free Series. Labels of Y that are absent from X are
    ignored.

    Returns a tuple (distance, n) where n is the number of shared labels.
    """
    total = 0
    shared = 0
    for label, y_val in Y.items():
        if label not in X:
            continue  # only labels present in both series contribute
        total += abs(X[label] - y_val)
        shared += 1
    return total, shared

def pearson_correlation(X, Y):
    """Pearson correlation between X and Y over the index labels they share.

    X and Y are NaN-free Series. Labels of Y that are absent from X are
    ignored.

    Returns a tuple (correlation, n) where n is the number of shared labels.
    Returns (0, 0) when there are no shared labels or when the denominator
    is zero (one of the two samples has no variance), so callers can treat
    n == 0 as "no usable result".
    """
    sum_xy = 0
    sum_x = 0
    sum_y = 0
    sum_x2 = 0
    sum_y2 = 0
    n = 0
    for index, value in Y.items():
        if index not in X:
            continue
        x_value = X[index]
        sum_xy += x_value * value
        sum_x += x_value
        sum_y += value
        sum_x2 += x_value ** 2
        sum_y2 += value ** 2
        n += 1
    if n == 0:
        return 0, 0
    numerator = sum_xy - (sum_x * sum_y) / n
    # With the sum-of-squares formula, rounding can push a (near) zero
    # variance slightly below zero, and math.sqrt would then raise
    # ValueError. Clamp at 0 so such inputs take the zero-denominator path.
    spread_x = max(sum_x2 - (sum_x ** 2) / n, 0)
    spread_y = max(sum_y2 - (sum_y ** 2) / n, 0)
    denominator = math.sqrt(spread_x) * math.sqrt(spread_y)
    if denominator == 0:
        return 0, 0  # division by zero
    return numerator / denominator, n

def cos_similarity(X, Y):
    """Cosine similarity between X and Y over the index labels they share.

    X and Y are NaN-free Series. Labels of Y that are absent from X are
    ignored.

    Returns a tuple (similarity, n) where n is the number of shared labels.
    Returns (-1, 0) when either vector has zero norm over the shared labels
    (which includes the case of no shared labels).
    """
    dot_product = 0
    norm_x_sq = 0
    norm_y_sq = 0
    shared = 0
    for label, y_val in Y.items():
        if label not in X:
            continue  # only labels present in both series contribute
        x_val = X[label]
        dot_product += x_val * y_val
        norm_x_sq += x_val ** 2
        norm_y_sq += y_val ** 2
        shared += 1
    norm_x = math.sqrt(norm_x_sq)
    norm_y = math.sqrt(norm_y_sq)
    if not (norm_x and norm_y):
        return -1, 0  # division by zero
    return dot_product / (norm_x * norm_y), shared

In [47]:
# X_train is the complete dataset: a DataFrame with its own indexes.
# Y is the data that will be compared: a Series with its own index.
# k is the number of neighbours.
# min_shared_data is the minimum number of shared index labels.
# distance_func is the distance function used as a callback.
# type_sort = 1 sorts ascending, -1 sorts descending.
def knn(X_train, Y, k, min_shared_data, distance_func, result, type_sort = 1):
    """Rank the columns of X_train by their distance to Y and emit the best k.

    Parameters
    ----------
    X_train : DataFrame whose columns are the candidate neighbours.
    Y : Series to compare every column against.
    k : maximum number of neighbours to emit.
    min_shared_data : columns sharing fewer index labels than this with Y
        are skipped.
    distance_func : callback ``f(x, y) -> (value, n)`` where n is the number
        of shared labels. It receives the same NaN-free Y on every call and
        is expected not to modify it.
    result : queue-like object; the ranking is delivered with ``result.put``.
    type_sort : 1 sorts ascending by value, -1 sorts descending.

    Nothing is returned. A list of up to k ``(column, (value, n))`` tuples is
    put on ``result``, ordered by value (direction per type_sort) with ties
    broken by larger n first. Entries with n == 0 are never emitted.
    """
    # Y does not change between columns, so drop its NaNs once, not per column.
    target = Y.dropna()
    neighbours = {}  # column label -> (value, number of shared labels)
    for column in X_train.columns:
        val, n = distance_func(X_train.loc[:, column].dropna(), target)
        # n == 0 is how the distance functions flag "no usable result".
        if n < min_shared_data or n == 0:
            continue
        neighbours[column] = (val, n)
    neighbours_sorted = sorted(
        neighbours.items(),
        key=lambda item: (type_sort * item[1][0], -item[1][1]),
    )
    result.put(neighbours_sorted[0:k])

In [49]:
def knn_threaded(X_train, Y, k, min_shared_data, distance_func, type_sort = 1):
    """Run `knn` over column chunks of X_train in parallel and merge the results.

    Despite the name, the work is done by `multiprocessing.Process` workers,
    not threads. X_train's columns are split into at most `cpu_count()`
    contiguous chunks; each worker runs `knn` on one chunk and puts its local
    top-k on a shared queue. The partial rankings are merged and re-sorted.

    Parameters
    ----------
    X_train : DataFrame whose columns are the candidate neighbours.
    Y : Series to compare every column against.
    k : number of neighbours to keep.
    min_shared_data : minimum number of shared index labels for a candidate.
    distance_func : callback ``f(x, y) -> (value, n)`` forwarded to `knn`.
    type_sort : 1 sorts ascending by value, -1 sorts descending.

    Returns
    -------
    list of up to k ``(column, (value, n))`` tuples. The list is also printed
    after a separator line.
    """
    from queue import Empty  # raised by Queue.get(timeout=...) when nothing arrives

    n_columns = X_train.shape[1]
    total_processes = max(1, min(cpu_count(), n_columns))
    # Ceiling division so every column lands in a chunk; never 0, which
    # would make range() raise when there are fewer columns than CPUs.
    size_data_splitted = max(1, -(-n_columns // total_processes))

    processes = []
    result = MQueue()
    for start in range(0, n_columns, size_data_splitted):
        # iloc slices are already end-exclusive: no "- 1" on the stop,
        # otherwise the last column of every chunk is skipped.
        X_train_splitted = X_train.iloc[:, start:start + size_data_splitted]
        process = Process(target=knn, args=(X_train_splitted, Y, k, min_shared_data, distance_func, result, type_sort))
        processes.append(process)
        process.start()

    # Drain the queue BEFORE joining: a worker that has put data on a
    # multiprocessing queue may not exit until that data is consumed, and
    # Queue.empty() is not a reliable signal. Each worker puts once.
    final_result = []
    pending = len(processes)
    while pending > 0:
        try:
            final_result += result.get(timeout=1)
            pending -= 1
        except Empty:
            if any(process.is_alive() for process in processes):
                continue  # workers still computing; keep waiting
            # Every worker has exited (possibly after an error): collect
            # whatever is still in flight, then stop waiting.
            try:
                while pending > 0:
                    final_result += result.get(timeout=0.1)
                    pending -= 1
            except Empty:
                pass
            break

    for process in processes:
        process.join()

    print('----------------------------------------------------')
    final_result.sort(key=lambda i:(type_sort*i[1][0],-i[1][1]))
    top_neighbours = final_result[0:k]
    print(top_neighbours)
    return top_neighbours

    

In [5]:
def cos_sim_adj(X, Y):
    """Add a 'mean' row to X, in place, holding the mean of each column.

    Y is not used and nothing is returned.
    NOTE(review): this looks like an unfinished adjusted-cosine similarity;
    confirm the intended behaviour before relying on it.
    """
    column_means = X.mean()
    X.loc['mean'] = column_means
    
    

In [6]:
def read_data(path, sep, header, names):
    """Load a delimited text file into a DataFrame.

    All arguments are forwarded to `pd.read_csv`. The python parsing engine
    is always used.
    """
    return pd.read_csv(
        path,
        sep=sep,
        header=header,
        names=names,
        engine='python',
    )

def group_by(data, index_row, index_col, val):
    """Pivot `data` into a table.

    Rows are keyed by the `index_row` column, columns by the `index_col`
    column, and cells are filled from the `val` column via
    `DataFrame.pivot_table`.
    """
    pivot_options = {
        'index': index_row,
        'columns': index_col,
        'values': val,
    }
    return data.pivot_table(**pivot_options)

In [7]:

# Load the raw ratings file ("::"-separated, no header row)
df = read_data(
    path="./datasets/ratings.dat",
    sep="::",
    header=None,
    names=['userId','movieId','rating','timestamp'],
)
# Pivot to a movieId x userId table of ratings
df_formatted = group_by(data=df, index_row='movieId', index_col='userId', val='rating')


In [8]:
# Columns 1 and 2 of the pivoted table, with missing entries dropped
a = df_formatted[1].dropna()
b = df_formatted[2].dropna()

In [9]:
# Index labels present in both `a` and `b`
c = np.intersect1d(a.index,b.index)
c

array([1193, 1207, 1246, 1962, 2028, 2321, 3105])

In [10]:
# Entries of `a` whose index label is listed in `c`
a[a.index.isin(c)]

movieId
1193    5.0
1207    4.0
1246    4.0
1962    4.0
2028    5.0
2321    3.0
3105    5.0
Name: 1, dtype: float64

In [11]:
def sim(data, i, j):
    """Mean-adjusted cosine similarity between rows i and j of `data`.

    `data` must contain a row labelled 'mean' holding the per-column value
    that is subtracted from both rows. Only columns where both rows are
    non-NaN contribute.

    Returns the similarity, or -2 when the denominator is zero (no shared
    columns, or one row has no deviation from the column means).
    """
    row_i = data.loc[i, :].dropna()  # columns with a value in row i
    row_j = data.loc[j, :].dropna()  # columns with a value in row j
    cross_total = 0
    sq_dev_i = 0
    sq_dev_j = 0
    for column, value_j in row_j.items():
        if column not in row_i:
            continue
        column_mean = data.loc['mean', column]
        dev_i = row_i[column] - column_mean
        dev_j = value_j - column_mean
        cross_total += dev_i * dev_j
        sq_dev_i += dev_i ** 2
        sq_dev_j += dev_j ** 2
    norm_product = math.sqrt(sq_dev_i) * math.sqrt(sq_dev_j)
    if norm_product == 0:
        return -2
    return cross_total / norm_product

def get_matrix(df_formatted, i):
    """Build a one-column DataFrame of similarities between row i and other rows.

    i is the .loc label of the row to predict for. A working copy of
    df_formatted gets a 'mean' row (per-column mean) appended, then `sim`
    is evaluated between i and the remaining row labels.

    Returns a DataFrame with a single column labelled i, indexed by the
    compared row labels.
    """
    data = df_formatted.copy()
    data.loc['mean'] = data.mean()
    row_labels = data.index[:-1]  # every label except the appended 'mean'
    # Walk the labels from last to second.
    # NOTE(review): the slice stops before position 0, so the first row label
    # is never compared, whatever i is — confirm this is intended.
    similarities = {j: sim(data, i, j) for j in row_labels[:0:-1]}
    return pd.DataFrame({i: similarities})

# Similarities between the row labelled 1 and the other rows
matrix = get_matrix(df_formatted=df_formatted, i=1)
print(matrix)

KeyboardInterrupt: 

In [53]:
# Nearest neighbours of column 5 by cosine similarity (type_sort=-1: descending)
k_neighbours = 10
min_same_rows = 10
neighbours = knn_threaded(
    X_train=df_formatted,
    Y=df_formatted[5],
    k=k_neighbours,
    min_shared_data=min_same_rows,
    distance_func=cos_similarity,
    type_sort=-1,
)

----------------------------------------------------
[(5, (1.0, 198)), (5440, (0.9922507703328288, 13)), (2964, (0.9906407141264874, 11)), (2118, (0.9897215120566252, 18)), (5613, (0.9889928320202397, 10)), (2279, (0.9887716439072362, 10)), (1036, (0.988542739980661, 11)), (4909, (0.98829693530373, 10)), (2802, (0.9863944255317857, 21)), (4652, (0.985580761651297, 11))]


In [13]:
# Nearest neighbours of column 5 by Manhattan distance (default ascending sort)
k_neighbours = 10
min_same_rows = 10
neighbours = knn_threaded(
    X_train=df_formatted,
    Y=df_formatted[5],
    k=k_neighbours,
    min_shared_data=min_same_rows,
    distance_func=manhattan_distance,
)


----------------------------------------------------
[(5, (0.0, 198)), (2075, (5.0, 12)), (5613, (5.0, 10)), (3862, (6.0, 11)), (358, (6.0, 10)), (1405, (6.0, 10)), (5304, (6.0, 10)), (2464, (6.0, 10)), (1041, (7.0, 14)), (312, (7.0, 12))]
