In [None]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz
!tar xf spark-2.4.8-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.8-bin-hadoop2.7"
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Connecting to archive.ubuntu.com] [1 InRelease 0 B/88.7 kB 0%] [Connected t                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:5 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:8 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:10 http://ppa.launchpad.net/cran/libgit2/u

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import

In [None]:
from pyspark.sql import SparkSession

In [None]:
spark_session = SparkSession.builder.getOrCreate()

# Read Data

In [None]:
transactions = spark_session.read.option("inferSchema","true").csv("drive/MyDrive/All/Data/transactions.csv", header=True, sep="|")

In [None]:
transactions.show(3)

+---------+------+-----+------+-----+
|sessionID|itemID|click|basket|order|
+---------+------+-----+------+-----+
|        0| 21310|    1|     0|    0|
|        1| 73018|    1|     0|    0|
|        2| 19194|    1|     0|    0|
+---------+------+-----+------+-----+
only showing top 3 rows



# Data Preprocessing

### Add interaction level (binary, categorical, and weight based)

In [None]:
from pyspark.sql.functions import when,count

In [None]:
transactions = transactions.withColumn(
    "binary",
    when(transactions["click"] == 1, 1).
    when(transactions["basket"] == 1, 1).
    when(transactions["order"] == 1, 1).
    otherwise(0)
)

In [None]:
transactions = transactions.withColumn(
    "categorical",
    when(transactions["click"] == 1, 1).
    when(transactions["basket"] == 1, 2).
    when(transactions["order"] == 1, 3).
    otherwise(0)
)

In [None]:
transactions = transactions.withColumn(
    "weight",
    (transactions["click"] * 1) + (transactions["basket"] * 2) + (transactions["order"] * 3)
)

In [None]:
transactions.show(10)

+---------+------+-----+------+-----+------+-----------+------+
|sessionID|itemID|click|basket|order|binary|categorical|weight|
+---------+------+-----+------+-----+------+-----------+------+
|        0| 21310|    1|     0|    0|     1|          1|     1|
|        1| 73018|    1|     0|    0|     1|          1|     1|
|        2| 19194|    1|     0|    0|     1|          1|     1|
|        3| 40250|    1|     0|    0|     1|          1|     1|
|        4| 46107|    1|     0|    0|     1|          1|     1|
|        5| 34217|    1|     0|    0|     1|          1|     1|
|        6| 31436|    1|     0|    0|     1|          1|     1|
|        7| 14576|    1|     1|    0|     1|          1|     3|
|        7| 17731|    2|     1|    0|     1|          2|     4|
|        8| 58723|    1|     0|    0|     1|          1|     1|
+---------+------+-----+------+-----+------+-----------+------+
only showing top 10 rows



### Keep only duplicates

In [None]:
transactions_dup = transactions.join(
    transactions.groupBy("sessionID").agg((count("*")>1).cast("int").alias("duplicate")),
    on="sessionID",
    how="inner"
).where("duplicate = 1")

In [None]:
transactions_dup.show()
transactions_dup.count()

+---------+------+-----+------+-----+------+-----------+------+---------+
|sessionID|itemID|click|basket|order|binary|categorical|weight|duplicate|
+---------+------+-----+------+-----+------+-----------+------+---------+
|        7| 14576|    1|     1|    0|     1|          1|     3|        1|
|        7| 17731|    2|     1|    0|     1|          2|     4|        1|
|       12| 30277|    1|     0|    0|     1|          1|     1|        1|
|       12| 29508|    1|     1|    0|     1|          1|     3|        1|
|       12| 75659|    1|     0|    0|     1|          1|     1|        1|
|       13| 55699|    2|     0|    0|     0|          0|     2|        1|
|       13| 62433|    1|     0|    0|     1|          1|     1|        1|
|       13| 72603|    1|     0|    0|     1|          1|     1|        1|
|       14| 11989|    1|     0|    0|     1|          1|     1|        1|
|       14| 11117|    1|     0|    0|     1|          1|     1|        1|
|       20|  3106|    2|     0|    0| 

129501

### Split into 3 datasets

In [None]:
transactions_binary = transactions_dup.select("sessionID","itemID","binary")
transactions_categorical = transactions_dup.select("sessionID","itemID","categorical")
transactions_weight = transactions_dup.select("sessionID","itemID","weight")

In [None]:
transactions_binary.show(3)
transactions_categorical.show(3)
transactions_weight.show(3)

+---------+------+------+
|sessionID|itemID|binary|
+---------+------+------+
|        7| 14576|     1|
|        7| 17731|     1|
|       12| 30277|     1|
+---------+------+------+
only showing top 3 rows

+---------+------+-----------+
|sessionID|itemID|categorical|
+---------+------+-----------+
|        7| 14576|          1|
|        7| 17731|          2|
|       12| 30277|          1|
+---------+------+-----------+
only showing top 3 rows

+---------+------+------+
|sessionID|itemID|weight|
+---------+------+------+
|        7| 14576|     3|
|        7| 17731|     4|
|       12| 30277|     1|
+---------+------+------+
only showing top 3 rows



### Pivot

In [None]:
from pyspark.sql.functions import sum

In [None]:
spark_session.conf.set("spark.sql.pivotMaxValues",25000)

cara pake pivot:
https://stackoverflow.com/questions/46809879/convert-pyspark-groupeddata-object-to-spark-dataframe
1. groupBy = column
2. pivot = row
3. agg = cell -> bisa juga pake .count()

In [None]:
pivot_binary = transactions_binary.limit(100).groupBy("itemID").pivot("sessionID").agg(sum("binary"))

In [None]:
pivot_categorical = transactions_categorical.limit(100).groupBy("itemID").pivot("sessionID").agg(sum("categorical"))

In [None]:
pivot_weight = transactions_weight.limit(100).groupBy("itemID").pivot("sessionID").agg(sum("weight"))

In [None]:
pivot_binary = pivot_binary.fillna(0)

In [None]:
pivot_categorical = pivot_categorical.fillna(0)

In [None]:
pivot_weight = pivot_weight.fillna(0)

In [None]:
pivot_binary.show()

+------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|itemID|  7| 12| 13| 14| 20| 21| 28| 49| 52| 56| 63| 67| 77| 79| 88| 97|104|107|110|141|148|169|173|182|194|204|205|215|218|222|228|
+------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
| 14576|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
| 17731|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
| 30277|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
| 29508|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
| 75659|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0| 

### Dimensionality Reduction

In [None]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix

In [None]:
mat_binary = pivot_binary.drop("itemID").rdd.map(lambda s : Vectors.dense(s))

In [None]:
mat_categorical = pivot_categorical.drop("itemID").rdd.map(lambda s : Vectors.dense(s))

In [None]:
mat_weight = pivot_weight.drop("itemID").rdd.map(lambda s : Vectors.dense(s))

In [None]:
mat_binary = RowMatrix(mat_binary)

In [None]:
mat_categorical = RowMatrix(mat_categorical)

In [None]:
mat_weight = RowMatrix(mat_weight)

### Principal component analysis (PCA)

In [None]:
pca_binary = mat_binary.computePrincipalComponents(5)

In [None]:
pca_categorical = mat_categorical.computePrincipalComponents(5)

In [None]:
pca_weight = mat_weight.computePrincipalComponents(5)

In [None]:
projected_binary = mat_binary.multiply(pca_binary)

In [None]:
projected_categorical = mat_binary.multiply(pca_categorical)

In [None]:
projected_weight = mat_binary.multiply(pca_weight)

In [None]:
print(projected_binary.rows.collect())

[DenseVector([0.0188, -0.024, 0.0204, 0.0244, 0.0433]), DenseVector([0.0188, -0.024, 0.0204, 0.0244, 0.0433]), DenseVector([0.0326, -0.046, 0.0471, 0.0653, 0.1442]), DenseVector([0.0326, -0.046, 0.0471, 0.0653, 0.1442]), DenseVector([0.0326, -0.046, 0.0471, 0.0653, 0.1442]), DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]), DenseVector([0.0188, -0.024, 0.0204, 0.0244, 0.0433]), DenseVector([0.0188, -0.024, 0.0204, 0.0244, 0.0433]), DenseVector([0.0188, -0.024, 0.0204, 0.0244, 0.0433]), DenseVector([0.0188, -0.024, 0.0204, 0.0244, 0.0433]), DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]), DenseVector([0.0083, -0.0099, 0.0075, 0.0085, 0.014]), DenseVector([0.0188, -0.024, 0.0204, 0.0244, 0.0433]), DenseVector([0.0188, -0.024, 0.0204, 0.0244, 0.0433]), DenseVector([0.0188, -0.024, 0.0204, 0.0244, 0.0433]), DenseVector([0.0188, -0.024, 0.0204, 0.0244, 0.0433]), DenseVector([0.0188, -0.024, 0.0204, 0.0244, 0.0433]), DenseVector([0.0188, -0.024, 0.0204, 0.0244, 0.0433]), DenseVector([0.0516, -0.085, 0.1371, 0.

In [None]:
projected_binary.numRows()

96

In [None]:
projected_binary.numCols()

5

In [None]:
print(projected_categorical.rows.collect())

[DenseVector([0.0047, -0.0237, 0.0174, 0.0388, -0.0]), DenseVector([0.0047, -0.0237, 0.0174, 0.0388, -0.0]), DenseVector([0.0043, -0.0213, 0.015, 0.0268, -0.0]), DenseVector([0.0043, -0.0213, 0.015, 0.0268, -0.0]), DenseVector([0.0043, -0.0213, 0.015, 0.0268, -0.0]), DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]), DenseVector([0.0027, -0.0135, 0.0094, 0.0155, -0.0]), DenseVector([0.0027, -0.0135, 0.0094, 0.0155, -0.0]), DenseVector([0.0027, -0.0135, 0.0094, 0.0155, -0.0]), DenseVector([0.0027, -0.0135, 0.0094, 0.0155, -0.0]), DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]), DenseVector([0.0013, -0.0064, 0.0044, 0.0068, -0.0]), DenseVector([0.0027, -0.0135, 0.0094, 0.0155, -0.0]), DenseVector([0.0027, -0.0135, 0.0094, 0.0155, -0.0]), DenseVector([0.0072, -0.038, 0.0305, 0.1544, -0.8321]), DenseVector([0.0072, -0.038, 0.0305, 0.1544, -0.8321]), DenseVector([0.0027, -0.0135, 0.0094, 0.0155, 0.0]), DenseVector([0.0027, -0.0135, 0.0094, 0.0155, 0.0]), DenseVector([0.006, -0.0299, 0.0215, 0.0423, -0.0]), Den

In [None]:
print(projected_weight.rows.collect())

[DenseVector([0.0199, -0.0218, 0.0274, -0.0645, 0.0039]), DenseVector([0.0199, -0.0218, 0.0274, -0.0645, 0.0039]), DenseVector([0.0126, -0.0111, 0.0136, -0.0286, 0.0016]), DenseVector([0.0126, -0.0111, 0.0136, -0.0286, 0.0016]), DenseVector([0.0126, -0.0111, 0.0136, -0.0286, 0.0016]), DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]), DenseVector([0.0097, -0.0081, 0.0098, -0.0202, 0.0011]), DenseVector([0.0097, -0.0081, 0.0098, -0.0202, 0.0011]), DenseVector([0.0047, -0.0038, 0.0045, -0.0092, 0.0005]), DenseVector([0.0047, -0.0038, 0.0045, -0.0092, 0.0005]), DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]), DenseVector([0.0072, -0.0059, 0.0072, -0.0148, 0.0008]), DenseVector([0.0047, -0.0038, 0.0045, -0.0092, 0.0005]), DenseVector([0.0047, -0.0038, 0.0045, -0.0092, 0.0005]), DenseVector([0.0098, -0.0084, 0.0102, -0.0212, 0.0012]), DenseVector([0.0098, -0.0084, 0.0102, -0.0212, 0.0012]), DenseVector([0.0047, -0.0038, 0.0045, -0.0092, 0.0005]), DenseVector([0.0047, -0.0038, 0.0045, -0.0092, 0.0005]), DenseVe

### Convert to pandas dataframe

In [None]:
import pandas as pd

In [None]:
training_binary = projected_binary.rows.map(lambda x: x.toArray().tolist()).collect()

In [None]:
training_categorical = projected_categorical.rows.map(lambda x: x.toArray().tolist()).collect()

In [None]:
training_weight = projected_weight.rows.map(lambda x: x.toArray().tolist()).collect()

In [None]:
data_binary = pd.Series(training_binary, pd.MultiIndex.from_product([[x for x in range(96)], list('A')])).unstack()

In [None]:
data_categorical = pd.Series(training_categorical, pd.MultiIndex.from_product([[x for x in range(96)], list('A')])).unstack()

In [None]:
data_weight = pd.Series(training_weight, pd.MultiIndex.from_product([[x for x in range(96)], list('A')])).unstack()

In [None]:
data_binary = data_binary.rename(columns = {'A': 'Features'}, inplace = False)

In [None]:
data_categorical = data_categorical.rename(columns = {'A': 'Features'}, inplace = False)

In [None]:
data_weight = data_weight.rename(columns = {'A': 'Features'}, inplace = False)

In [None]:
data_binary

Unnamed: 0,Features
0,"[0.0188201878758293, -0.02398818051809845, 0.0..."
1,"[0.0188201878758293, -0.02398818051809845, 0.0..."
2,"[0.03264297151709192, -0.04599159338331027, 0...."
3,"[0.03264297151709192, -0.04599159338331027, 0...."
4,"[0.03264297151709192, -0.04599159338331027, 0...."
...,...
91,"[0.0, 0.0, 0.0, 0.0, 0.0]"
92,"[0.0, 0.0, 0.0, 0.0, 0.0]"
93,"[0.008289514159824802, -0.009850326775023061, ..."
94,"[0.0, 0.0, 0.0, 0.0, 0.0]"


In [None]:
transaction_pd = transactions.limit(100).toPandas()

In [None]:
data_binary["itemID"] = transaction_pd["itemID"]

In [None]:
data_categorical["itemID"] = transaction_pd["itemID"]

In [None]:
data_weight["itemID"] = transaction_pd["itemID"]

In [None]:
data_binary

Unnamed: 0,Features,itemID
0,"[0.0188201878758293, -0.02398818051809845, 0.0...",21310
1,"[0.0188201878758293, -0.02398818051809845, 0.0...",73018
2,"[0.03264297151709192, -0.04599159338331027, 0....",19194
3,"[0.03264297151709192, -0.04599159338331027, 0....",40250
4,"[0.03264297151709192, -0.04599159338331027, 0....",46107
...,...,...
91,"[0.0, 0.0, 0.0, 0.0, 0.0]",36339
92,"[0.0, 0.0, 0.0, 0.0, 0.0]",34873
93,"[0.008289514159824802, -0.009850326775023061, ...",34965
94,"[0.0, 0.0, 0.0, 0.0, 0.0]",38171


# Model

In [None]:
from sklearn.neighbors import NearestNeighbors

In [None]:
nearest_neighbors = 6 

#### Knn, Cosine, Brute, p=2 (euclidean distance)

In [None]:
knn = NearestNeighbors(metric='cosine', algorithm='brute', p=2)

In [None]:
evaluation = pd.read_csv('drive/MyDrive/All/Data/evaluation.csv', sep='|')

In [None]:
evaluation_binary = pd.concat([evaluation, data_binary])

In [None]:
evaluation_categorical = pd.concat([evaluation, data_categorical])

In [None]:
evaluation_weight = pd.concat([evaluation, data_weight])

##### Binary

In [None]:
result_binary = data_binary

In [None]:
knn.fit(training_binary)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [None]:
distances, indices = knn.kneighbors(training_binary, n_neighbors=nearest_neighbors)

In [None]:
distances_binary = pd.DataFrame(distances)

In [None]:
result_binary["d0"] = distances_binary[0]
result_binary["d1"] = distances_binary[1]
result_binary["d2"] = distances_binary[2]
result_binary["d3"] = distances_binary[3]
result_binary["d4"] = distances_binary[4]
result_binary["d5"] = distances_binary[5]

In [None]:
indices_binary = pd.DataFrame(indices)

In [None]:
result_binary["i0"] = indices_binary[0]
result_binary["i1"] = indices_binary[1]
result_binary["i2"] = indices_binary[2]
result_binary["i3"] = indices_binary[3]
result_binary["i4"] = indices_binary[4]
result_binary["i5"] = indices_binary[5]

In [None]:
result_binary

Unnamed: 0,Features,itemID,d0,d1,d2,d3,d4,d5,i0,i1,i2,i3,i4,i5
0,"[0.0188201878758293, -0.02398818051809845, 0.0...",21310,0.0,0.0,0.000000e+00,0.000000,0.000000,0.000000,17,0,14,15,16,12
1,"[0.0188201878758293, -0.02398818051809845, 0.0...",73018,0.0,0.0,0.000000e+00,0.000000,0.000000,0.000000,17,0,14,15,16,12
2,"[0.03264297151709192, -0.04599159338331027, 0....",19194,0.0,0.0,0.000000e+00,0.000000,0.000000,0.000000,89,72,38,37,71,45
3,"[0.03264297151709192, -0.04599159338331027, 0....",40250,0.0,0.0,0.000000e+00,0.000000,0.000000,0.000000,89,72,38,37,71,45
4,"[0.03264297151709192, -0.04599159338331027, 0....",46107,0.0,0.0,0.000000e+00,0.000000,0.000000,0.000000,89,72,38,37,71,45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,"[0.0, 0.0, 0.0, 0.0, 0.0]",36339,1.0,1.0,1.000000e+00,1.000000,1.000000,1.000000,66,67,63,64,65,61
92,"[0.0, 0.0, 0.0, 0.0, 0.0]",34873,1.0,1.0,1.000000e+00,1.000000,1.000000,1.000000,66,67,63,64,65,61
93,"[0.008289514159824802, -0.009850326775023061, ...",34965,0.0,0.0,1.110223e-16,0.006619,0.006619,0.006619,11,90,93,6,7,25
94,"[0.0, 0.0, 0.0, 0.0, 0.0]",38171,1.0,1.0,1.000000e+00,1.000000,1.000000,1.000000,66,67,63,64,65,61


In [None]:
for i, row in result_binary.iterrows():
  recommendation_count = 1
  for j in range(6):
    if recommendation_count == 6:
      break
    
    indices_name = "i" + str(j)
    column_name = "rec_" + str(recommendation_count)
    recommended_item_id = result_binary.at[result_binary.at[i, indices_name], "itemID"]
    if recommended_item_id == result_binary.at[i, "itemID"]:
      continue
    else:
      result_binary.at[i,column_name] = recommended_item_id
      recommendation_count = recommendation_count + 1

In [None]:
result_binary["rec_1"] = result_binary["rec_1"].astype('int64')
result_binary["rec_2"] = result_binary["rec_2"].astype('int64')
result_binary["rec_3"] = result_binary["rec_3"].astype('int64')
result_binary["rec_4"] = result_binary["rec_4"].astype('int64')
result_binary["rec_5"] = result_binary["rec_5"].astype('int64')

In [None]:
result_binary[["itemID","rec_1","rec_2","rec_3","rec_4","rec_5"]]

Unnamed: 0,itemID,rec_1,rec_2,rec_3,rec_4,rec_5
0,21310,62433,29508,75659,55699,2408
1,73018,62433,21310,29508,75659,55699
2,19194,78837,54132,12143,11525,31091
3,40250,78837,54132,12143,11525,31091
4,46107,78837,54132,12143,11525,31091
...,...,...,...,...,...,...
91,36339,10666,62464,1713,69584,63299
92,34873,10666,62464,1713,69584,63299
93,34965,33976,23115,31436,14576,63847
94,38171,10666,62464,1713,69584,63299


Apply to Evaluation

In [None]:
evaluation_binary = evaluation_binary.dropna()

In [None]:
evaluation_binary

Unnamed: 0,itemID,Features
0,21310,"[0.0188201878758293, -0.02398818051809845, 0.0..."
1,73018,"[0.0188201878758293, -0.02398818051809845, 0.0..."
2,19194,"[0.03264297151709192, -0.04599159338331027, 0...."
3,40250,"[0.03264297151709192, -0.04599159338331027, 0...."
4,46107,"[0.03264297151709192, -0.04599159338331027, 0...."
...,...,...
91,36339,"[0.0, 0.0, 0.0, 0.0, 0.0]"
92,34873,"[0.0, 0.0, 0.0, 0.0, 0.0]"
93,34965,"[0.008289514159824802, -0.009850326775023061, ..."
94,38171,"[0.0, 0.0, 0.0, 0.0, 0.0]"


In [None]:
distances, indices = knn.kneighbors(evaluation_binary["Features"].tolist(), n_neighbors=nearest_neighbors)

In [None]:
distances_binary_evaluation = pd.DataFrame(distances)

In [None]:
evaluation_binary["d0"] = distances_binary_evaluation[0]
evaluation_binary["d1"] = distances_binary_evaluation[1]
evaluation_binary["d2"] = distances_binary_evaluation[2]
evaluation_binary["d3"] = distances_binary_evaluation[3]
evaluation_binary["d4"] = distances_binary_evaluation[4]
evaluation_binary["d5"] = distances_binary_evaluation[5]

In [None]:
indices_binary_evaluation = pd.DataFrame(indices)

In [None]:
evaluation_binary["i0"] = indices_binary_evaluation[0]
evaluation_binary["i1"] = indices_binary_evaluation[1]
evaluation_binary["i2"] = indices_binary_evaluation[2]
evaluation_binary["i3"] = indices_binary_evaluation[3]
evaluation_binary["i4"] = indices_binary_evaluation[4]
evaluation_binary["i5"] = indices_binary_evaluation[5]

In [None]:
for i, row in evaluation_binary.iterrows():
  recommendation_count = 1
  for j in range(6):
    if recommendation_count == 6:
      break
    
    indices_name = "i" + str(j)
    column_name = "rec_" + str(recommendation_count)
    recommended_item_id = evaluation_binary.at[evaluation_binary.at[i, indices_name], "itemID"]
    if recommended_item_id == evaluation_binary.at[i, "itemID"]:
      continue
    else:
      evaluation_binary.at[i,column_name] = recommended_item_id
      recommendation_count = recommendation_count + 1

In [None]:
evaluation_binary["rec_1"] = evaluation_binary["rec_1"].astype('int64')
evaluation_binary["rec_2"] = evaluation_binary["rec_2"].astype('int64')
evaluation_binary["rec_3"] = evaluation_binary["rec_3"].astype('int64')
evaluation_binary["rec_4"] = evaluation_binary["rec_4"].astype('int64')
evaluation_binary["rec_5"] = evaluation_binary["rec_5"].astype('int64')

In [None]:
evaluation_binary[["itemID","rec_1","rec_2","rec_3","rec_4","rec_5"]]

Unnamed: 0,itemID,rec_1,rec_2,rec_3,rec_4,rec_5
0,21310,62433,29508,75659,55699,2408
1,73018,62433,21310,29508,75659,55699
2,19194,78837,54132,12143,11525,31091
3,40250,78837,54132,12143,11525,31091
4,46107,78837,54132,12143,11525,31091
...,...,...,...,...,...,...
91,36339,10666,62464,1713,69584,63299
92,34873,10666,62464,1713,69584,63299
93,34965,33976,23115,31436,14576,63847
94,38171,10666,62464,1713,69584,63299


##### Categorical

In [None]:
result_categorical = data_categorical

In [None]:
knn.fit(training_categorical)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [None]:
distances, indices = knn.kneighbors(training_categorical, n_neighbors=nearest_neighbors)

In [None]:
distances_categorical = pd.DataFrame(distances)

In [None]:
result_categorical["d0"] = distances_categorical[0]
result_categorical["d1"] = distances_categorical[1]
result_categorical["d2"] = distances_categorical[2]
result_categorical["d3"] = distances_categorical[3]
result_categorical["d4"] = distances_categorical[4]
result_categorical["d5"] = distances_categorical[5]

In [None]:
indices_categorical = pd.DataFrame(indices)

In [None]:
result_categorical["i0"] = indices_categorical[0]
result_categorical["i1"] = indices_categorical[1]
result_categorical["i2"] = indices_categorical[2]
result_categorical["i3"] = indices_categorical[3]
result_categorical["i4"] = indices_categorical[4]
result_categorical["i5"] = indices_categorical[5]

In [None]:
result_categorical

Unnamed: 0,Features,itemID,d0,d1,d2,d3,d4,d5,i0,i1,i2,i3,i4,i5,recommendation0,recommendation1,recommendation2,recommendation3,recommendation4
0,"[0.004679315969140209, -0.02366515381570171, 0...",21310,0.000000e+00,0.000000e+00,1.110223e-16,1.110223e-16,0.000012,0.000674,25,26,0,1,38,89,63847,3106,73018,12143,78837
1,"[0.004679315969140209, -0.02366515381570171, 0...",73018,0.000000e+00,0.000000e+00,1.110223e-16,1.110223e-16,0.000012,0.000674,25,26,0,1,38,89,63847,3106,21310,12143,78837
2,"[0.004284092604090426, -0.021270099604675824, ...",19194,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,4,72,70,71,2,3,46107,54132,60628,31091,40250
3,"[0.004284092604090426, -0.021270099604675824, ...",40250,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,4,72,70,71,2,3,46107,54132,60628,31091,19194
4,"[0.004284092604090426, -0.021270099604675824, ...",46107,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,4,72,70,71,2,3,54132,60628,31091,19194,40250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,"[0.0, 0.0, 0.0, 0.0, 0.0]",36339,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000,1.000000,66,67,63,64,65,61,10666,62464,1713,69584,63299
92,"[0.0, 0.0, 0.0, 0.0, 0.0]",34873,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000,1.000000,66,67,63,64,65,61,10666,62464,1713,69584,63299
93,"[0.0013168106906130735, -0.006438426205241356,...",34965,0.000000e+00,1.110223e-16,1.110223e-16,6.798517e-04,0.000680,0.000680,90,11,93,7,6,35,23115,33976,14576,31436,2417
94,"[0.0, 0.0, 0.0, 0.0, 0.0]",38171,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000,1.000000,66,67,63,64,65,61,10666,62464,1713,69584,63299


In [None]:
for i, row in result_categorical.iterrows():
  recommendation_count = 1
  for j in range(6):
    if recommendation_count == 6:
      break
    
    indices_name = "i" + str(j)
    column_name = "rec_" + str(recommendation_count)
    recommended_item_id = result_categorical.at[result_categorical.at[i, indices_name], "itemID"]
    if recommended_item_id == result_categorical.at[i, "itemID"]:
      continue
    else:
      result_categorical.at[i,column_name] = recommended_item_id
      recommendation_count = recommendation_count + 1

In [None]:
result_categorical["rec_1"] = result_categorical["rec_1"].astype('int64')
result_categorical["rec_2"] = result_categorical["rec_2"].astype('int64')
result_categorical["rec_3"] = result_categorical["rec_3"].astype('int64')
result_categorical["rec_4"] = result_categorical["rec_4"].astype('int64')
result_categorical["rec_5"] = result_categorical["rec_5"].astype('int64')

In [None]:
result_categorical[["itemID","rec_1","rec_2","rec_3","rec_4","rec_5"]]

Unnamed: 0,itemID,rec_1,rec_2,rec_3,rec_4,rec_5
0,21310,63847,3106,73018,12143,78837
1,73018,63847,3106,21310,12143,78837
2,19194,46107,54132,60628,31091,40250
3,40250,46107,54132,60628,31091,19194
4,46107,54132,60628,31091,19194,40250
...,...,...,...,...,...,...
91,36339,10666,62464,1713,69584,63299
92,34873,10666,62464,1713,69584,63299
93,34965,23115,33976,14576,31436,2417
94,38171,10666,62464,1713,69584,63299


Apply to Evaluation

In [None]:
evaluation_categorical = evaluation_categorical.dropna()

In [None]:
evaluation_categorical

Unnamed: 0,itemID,Features,d0,d1,d2,d3,d4,d5,i0,i1,i2,i3,i4,i5,recommendation0,recommendation1,recommendation2,recommendation3,recommendation4
0,21310,"[0.004679315969140209, -0.02366515381570171, 0...",0.000000e+00,0.000000e+00,1.110223e-16,1.110223e-16,0.000012,0.000674,25.0,26.0,0.0,1.0,38.0,89.0,63847.0,3106.0,73018.0,12143.0,78837.0
1,73018,"[0.004679315969140209, -0.02366515381570171, 0...",0.000000e+00,0.000000e+00,1.110223e-16,1.110223e-16,0.000012,0.000674,25.0,26.0,0.0,1.0,38.0,89.0,63847.0,3106.0,21310.0,12143.0,78837.0
2,19194,"[0.004284092604090426, -0.021270099604675824, ...",0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,4.0,72.0,70.0,71.0,2.0,3.0,46107.0,54132.0,60628.0,31091.0,40250.0
3,40250,"[0.004284092604090426, -0.021270099604675824, ...",0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,4.0,72.0,70.0,71.0,2.0,3.0,46107.0,54132.0,60628.0,31091.0,19194.0
4,46107,"[0.004284092604090426, -0.021270099604675824, ...",0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,4.0,72.0,70.0,71.0,2.0,3.0,54132.0,60628.0,31091.0,19194.0,40250.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,36339,"[0.0, 0.0, 0.0, 0.0, 0.0]",1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000,1.000000,66.0,67.0,63.0,64.0,65.0,61.0,10666.0,62464.0,1713.0,69584.0,63299.0
92,34873,"[0.0, 0.0, 0.0, 0.0, 0.0]",1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000,1.000000,66.0,67.0,63.0,64.0,65.0,61.0,10666.0,62464.0,1713.0,69584.0,63299.0
93,34965,"[0.0013168106906130735, -0.006438426205241356,...",0.000000e+00,1.110223e-16,1.110223e-16,6.798517e-04,0.000680,0.000680,90.0,11.0,93.0,7.0,6.0,35.0,23115.0,33976.0,14576.0,31436.0,2417.0
94,38171,"[0.0, 0.0, 0.0, 0.0, 0.0]",1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000,1.000000,66.0,67.0,63.0,64.0,65.0,61.0,10666.0,62464.0,1713.0,69584.0,63299.0


In [None]:
distances, indices = knn.kneighbors(evaluation_categorical["Features"].tolist(), n_neighbors=nearest_neighbors)

In [None]:
distances_categorical_evaluation = pd.DataFrame(distances)

In [None]:
evaluation_categorical["d0"] = distances_categorical_evaluation[0]
evaluation_categorical["d1"] = distances_categorical_evaluation[1]
evaluation_categorical["d2"] = distances_categorical_evaluation[2]
evaluation_categorical["d3"] = distances_categorical_evaluation[3]
evaluation_categorical["d4"] = distances_categorical_evaluation[4]
evaluation_categorical["d5"] = distances_categorical_evaluation[5]

In [None]:
indices_categorical_evaluation = pd.DataFrame(indices)

In [None]:
evaluation_categorical["i0"] = indices_categorical_evaluation[0]
evaluation_categorical["i1"] = indices_categorical_evaluation[1]
evaluation_categorical["i2"] = indices_categorical_evaluation[2]
evaluation_categorical["i3"] = indices_categorical_evaluation[3]
evaluation_categorical["i4"] = indices_categorical_evaluation[4]
evaluation_categorical["i5"] = indices_categorical_evaluation[5]

In [None]:
for i, row in evaluation_categorical.iterrows():
  recommendation_count = 1
  for j in range(6):
    if recommendation_count == 6:
      break
    
    indices_name = "i" + str(j)
    column_name = "rec_" + str(recommendation_count)
    recommended_item_id = evaluation_categorical.at[evaluation_categorical.at[i, indices_name], "itemID"]
    if recommended_item_id == evaluation_categorical.at[i, "itemID"]:
      continue
    else:
      evaluation_categorical.at[i,column_name] = recommended_item_id
      recommendation_count = recommendation_count + 1

In [None]:
evaluation_categorical["rec_1"] = evaluation_categorical["rec_1"].astype('int64')
evaluation_categorical["rec_2"] = evaluation_categorical["rec_2"].astype('int64')
evaluation_categorical["rec_3"] = evaluation_categorical["rec_3"].astype('int64')
evaluation_categorical["rec_4"] = evaluation_categorical["rec_4"].astype('int64')
evaluation_categorical["rec_5"] = evaluation_categorical["rec_5"].astype('int64')

In [None]:
evaluation_categorical[["itemID","rec_1","rec_2","rec_3","rec_4","rec_5"]]

Unnamed: 0,itemID,rec_1,rec_2,rec_3,rec_4,rec_5
0,21310,63847,3106,73018,12143,78837
1,73018,63847,3106,21310,12143,78837
2,19194,46107,54132,60628,31091,40250
3,40250,46107,54132,60628,31091,19194
4,46107,54132,60628,31091,19194,40250
...,...,...,...,...,...,...
91,36339,10666,62464,1713,69584,63299
92,34873,10666,62464,1713,69584,63299
93,34965,23115,33976,14576,31436,2417
94,38171,10666,62464,1713,69584,63299


##### Weight

In [None]:
result_weight = data_weight

In [None]:
knn.fit(training_weight)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [None]:
distances, indices = knn.kneighbors(training_weight, n_neighbors=nearest_neighbors)

In [None]:
distances_weight = pd.DataFrame(distances)

In [None]:
result_weight["d0"] = distances_weight[0]
result_weight["d1"] = distances_weight[1]
result_weight["d2"] = distances_weight[2]
result_weight["d3"] = distances_weight[3]
result_weight["d4"] = distances_weight[4]
result_weight["d5"] = distances_weight[5]

In [None]:
indices_weight = pd.DataFrame(indices)

In [None]:
result_weight["i0"] = indices_weight[0]
result_weight["i1"] = indices_weight[1]
result_weight["i2"] = indices_weight[2]
result_weight["i3"] = indices_weight[3]
result_weight["i4"] = indices_weight[4]
result_weight["i5"] = indices_weight[5]

In [None]:
result_weight

Unnamed: 0,Features,itemID,d0,d1,d2,d3,d4,d5,i0,i1,i2,i3,i4,i5,recommendation0,recommendation1,recommendation2,recommendation3,recommendation4
0,"[0.019901716346838863, -0.021809848585179446, ...",21310,0.0,0.0,0.000197,1.972064e-04,1.972064e-04,4.631148e-04,0,1,44,43,45,19,73018,5368,25330,46998,11989
1,"[0.019901716346838863, -0.021809848585179446, ...",73018,0.0,0.0,0.000197,1.972064e-04,1.972064e-04,4.631148e-04,0,1,44,43,45,19,21310,5368,25330,46998,11989
2,"[0.012590108183102251, -0.011126922819213007, ...",19194,0.0,0.0,0.000000,1.110223e-16,1.110223e-16,1.110223e-16,3,2,4,72,70,71,40250,46107,54132,60628,31091
3,"[0.012590108183102251, -0.011126922819213007, ...",40250,0.0,0.0,0.000000,1.110223e-16,1.110223e-16,1.110223e-16,3,2,4,72,70,71,19194,46107,54132,60628,31091
4,"[0.012590108183102251, -0.011126922819213007, ...",46107,0.0,0.0,0.000000,1.110223e-16,1.110223e-16,1.110223e-16,3,2,4,72,70,71,40250,19194,54132,60628,31091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,"[0.0, 0.0, 0.0, 0.0, 0.0]",36339,1.0,1.0,1.000000,1.000000e+00,1.000000e+00,1.000000e+00,66,67,63,64,65,61,10666,62464,1713,69584,63299
92,"[0.0, 0.0, 0.0, 0.0, 0.0]",34873,1.0,1.0,1.000000,1.000000e+00,1.000000e+00,1.000000e+00,66,67,63,64,65,61,10666,62464,1713,69584,63299
93,"[0.007201185819946992, -0.005947765701020074, ...",34965,0.0,0.0,0.000015,1.458926e-05,1.458926e-05,1.458926e-05,11,93,6,23,22,7,33976,31436,4892,15581,14576
94,"[0.0, 0.0, 0.0, 0.0, 0.0]",38171,1.0,1.0,1.000000,1.000000e+00,1.000000e+00,1.000000e+00,66,67,63,64,65,61,10666,62464,1713,69584,63299


In [None]:
for i, row in result_weight.iterrows():
  recommendation_count = 1
  for j in range(6):
    if recommendation_count == 6:
      break
    
    indices_name = "i" + str(j)
    column_name = "rec_" + str(recommendation_count)
    recommended_item_id = result_weight.at[result_weight.at[i, indices_name], "itemID"]
    if recommended_item_id == result_weight.at[i, "itemID"]:
      continue
    else:
      result_weight.at[i,column_name] = recommended_item_id
      recommendation_count = recommendation_count + 1

In [None]:
result_weight["rec_1"] = result_weight["rec_1"].astype('int64')
result_weight["rec_2"] = result_weight["rec_2"].astype('int64')
result_weight["rec_3"] = result_weight["rec_3"].astype('int64')
result_weight["rec_4"] = result_weight["rec_4"].astype('int64')
result_weight["rec_5"] = result_weight["rec_5"].astype('int64')

In [None]:
result_weight[["itemID","rec_1","rec_2","rec_3","rec_4","rec_5"]]

Unnamed: 0,itemID,rec_1,rec_2,rec_3,rec_4,rec_5
0,21310,73018,5368,25330,46998,11989
1,73018,21310,5368,25330,46998,11989
2,19194,40250,46107,54132,60628,31091
3,40250,19194,46107,54132,60628,31091
4,46107,40250,19194,54132,60628,31091
...,...,...,...,...,...,...
91,36339,10666,62464,1713,69584,63299
92,34873,10666,62464,1713,69584,63299
93,34965,33976,31436,4892,15581,14576
94,38171,10666,62464,1713,69584,63299


Apply to Evaluation

In [None]:
evaluation_weight = evaluation_weight.dropna()

In [None]:
evaluation_weight

Unnamed: 0,itemID,Features,d0,d1,d2,d3,d4,d5,i0,i1,i2,i3,i4,i5,recommendation0,recommendation1,recommendation2,recommendation3,recommendation4
0,21310,"[0.019901716346838863, -0.021809848585179446, ...",0.0,0.0,0.000197,1.972064e-04,1.972064e-04,4.631148e-04,0.0,1.0,44.0,43.0,45.0,19.0,73018.0,5368.0,25330.0,46998.0,11989.0
1,73018,"[0.019901716346838863, -0.021809848585179446, ...",0.0,0.0,0.000197,1.972064e-04,1.972064e-04,4.631148e-04,0.0,1.0,44.0,43.0,45.0,19.0,21310.0,5368.0,25330.0,46998.0,11989.0
2,19194,"[0.012590108183102251, -0.011126922819213007, ...",0.0,0.0,0.000000,1.110223e-16,1.110223e-16,1.110223e-16,3.0,2.0,4.0,72.0,70.0,71.0,40250.0,46107.0,54132.0,60628.0,31091.0
3,40250,"[0.012590108183102251, -0.011126922819213007, ...",0.0,0.0,0.000000,1.110223e-16,1.110223e-16,1.110223e-16,3.0,2.0,4.0,72.0,70.0,71.0,19194.0,46107.0,54132.0,60628.0,31091.0
4,46107,"[0.012590108183102251, -0.011126922819213007, ...",0.0,0.0,0.000000,1.110223e-16,1.110223e-16,1.110223e-16,3.0,2.0,4.0,72.0,70.0,71.0,40250.0,19194.0,54132.0,60628.0,31091.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,36339,"[0.0, 0.0, 0.0, 0.0, 0.0]",1.0,1.0,1.000000,1.000000e+00,1.000000e+00,1.000000e+00,66.0,67.0,63.0,64.0,65.0,61.0,10666.0,62464.0,1713.0,69584.0,63299.0
92,34873,"[0.0, 0.0, 0.0, 0.0, 0.0]",1.0,1.0,1.000000,1.000000e+00,1.000000e+00,1.000000e+00,66.0,67.0,63.0,64.0,65.0,61.0,10666.0,62464.0,1713.0,69584.0,63299.0
93,34965,"[0.007201185819946992, -0.005947765701020074, ...",0.0,0.0,0.000015,1.458926e-05,1.458926e-05,1.458926e-05,11.0,93.0,6.0,23.0,22.0,7.0,33976.0,31436.0,4892.0,15581.0,14576.0
94,38171,"[0.0, 0.0, 0.0, 0.0, 0.0]",1.0,1.0,1.000000,1.000000e+00,1.000000e+00,1.000000e+00,66.0,67.0,63.0,64.0,65.0,61.0,10666.0,62464.0,1713.0,69584.0,63299.0


In [None]:
distances, indices = knn.kneighbors(evaluation_weight["Features"].tolist(), n_neighbors=nearest_neighbors)

In [None]:
distances_weight_evaluation = pd.DataFrame(distances)

In [None]:
evaluation_weight["d0"] = distances_weight_evaluation[0]
evaluation_weight["d1"] = distances_weight_evaluation[1]
evaluation_weight["d2"] = distances_weight_evaluation[2]
evaluation_weight["d3"] = distances_weight_evaluation[3]
evaluation_weight["d4"] = distances_weight_evaluation[4]
evaluation_weight["d5"] = distances_weight_evaluation[5]

In [None]:
indices_weight_evaluation = pd.DataFrame(indices)

In [None]:
evaluation_weight["i0"] = indices_weight_evaluation[0]
evaluation_weight["i1"] = indices_weight_evaluation[1]
evaluation_weight["i2"] = indices_weight_evaluation[2]
evaluation_weight["i3"] = indices_weight_evaluation[3]
evaluation_weight["i4"] = indices_weight_evaluation[4]
evaluation_weight["i5"] = indices_weight_evaluation[5]

In [None]:
for i, row in evaluation_weight.iterrows():
  recommendation_count = 1
  for j in range(6):
    if recommendation_count == 6:
      break
    
    indices_name = "i" + str(j)
    column_name = "rec_" + str(recommendation_count)
    recommended_item_id = evaluation_weight.at[evaluation_weight.at[i, indices_name], "itemID"]
    if recommended_item_id == evaluation_weight.at[i, "itemID"]:
      continue
    else:
      evaluation_weight.at[i,column_name] = recommended_item_id
      recommendation_count = recommendation_count + 1

In [None]:
evaluation_weight["rec_1"] = evaluation_weight["rec_1"].astype('int64')
evaluation_weight["rec_2"] = evaluation_weight["rec_2"].astype('int64')
evaluation_weight["rec_3"] = evaluation_weight["rec_3"].astype('int64')
evaluation_weight["rec_4"] = evaluation_weight["rec_4"].astype('int64')
evaluation_weight["rec_5"] = evaluation_weight["rec_5"].astype('int64')

In [None]:
evaluation_weight[["itemID","rec_1","rec_2","rec_3","rec_4","rec_5"]]

Unnamed: 0,itemID,rec_1,rec_2,rec_3,rec_4,rec_5
0,21310,73018,5368,25330,46998,11989
1,73018,21310,5368,25330,46998,11989
2,19194,40250,46107,54132,60628,31091
3,40250,19194,46107,54132,60628,31091
4,46107,40250,19194,54132,60628,31091
...,...,...,...,...,...,...
91,36339,10666,62464,1713,69584,63299
92,34873,10666,62464,1713,69584,63299
93,34965,33976,31436,4892,15581,14576
94,38171,10666,62464,1713,69584,63299
