In [None]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz
!tar xf spark-2.4.8-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.8-bin-hadoop2.7"
import findspark
findspark.init()

Hit:1 http://security.ubuntu.com/ubuntu bionic-security InRelease
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:3 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:4 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:5 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Hit:6 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Ign:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:9 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:12 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:13 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import

In [None]:
from pyspark.sql import SparkSession

In [None]:
spark_session = SparkSession.builder.getOrCreate()

# Read Data

In [None]:
transactions = spark_session.read.option("inferSchema","true").csv("drive/MyDrive/All/Data/transactions.csv", header=True, sep="|")

In [None]:
transactions.show(3)

+---------+------+-----+------+-----+
|sessionID|itemID|click|basket|order|
+---------+------+-----+------+-----+
|        0| 21310|    1|     0|    0|
|        1| 73018|    1|     0|    0|
|        2| 19194|    1|     0|    0|
+---------+------+-----+------+-----+
only showing top 3 rows



# Data Preprocessing

### Add interaction level (binary, categorical, and weight based)

In [None]:
from pyspark.sql.functions import when,count

In [None]:
transactions = transactions.withColumn(
    "binary",
    when(transactions["click"] == 1, 1).
    when(transactions["basket"] == 1, 1).
    when(transactions["order"] == 1, 1).
    otherwise(0)
)

In [None]:
transactions = transactions.withColumn(
    "categorical",
    when(transactions["click"] == 1, 1).
    when(transactions["basket"] == 1, 2).
    when(transactions["order"] == 1, 3).
    otherwise(0)
)

In [None]:
transactions = transactions.withColumn(
    "weight",
    (transactions["click"] * 1) + (transactions["basket"] * 2) + (transactions["order"] * 3)
)

In [None]:
transactions.show(10)

+---------+------+-----+------+-----+------+-----------+------+
|sessionID|itemID|click|basket|order|binary|categorical|weight|
+---------+------+-----+------+-----+------+-----------+------+
|        0| 21310|    1|     0|    0|     1|          1|     1|
|        1| 73018|    1|     0|    0|     1|          1|     1|
|        2| 19194|    1|     0|    0|     1|          1|     1|
|        3| 40250|    1|     0|    0|     1|          1|     1|
|        4| 46107|    1|     0|    0|     1|          1|     1|
|        5| 34217|    1|     0|    0|     1|          1|     1|
|        6| 31436|    1|     0|    0|     1|          1|     1|
|        7| 14576|    1|     1|    0|     1|          1|     3|
|        7| 17731|    2|     1|    0|     1|          2|     4|
|        8| 58723|    1|     0|    0|     1|          1|     1|
+---------+------+-----+------+-----+------+-----------+------+
only showing top 10 rows



### Split into 3 datasets

In [None]:
transactions_binary = transactions.select("sessionID","itemID","binary")
transactions_categorical = transactions.select("sessionID","itemID","categorical")
transactions_weight = transactions.select("sessionID","itemID","weight")

In [None]:
transactions_binary.show(3)
transactions_categorical.show(3)
transactions_weight.show(3)

+---------+------+------+
|sessionID|itemID|binary|
+---------+------+------+
|        0| 21310|     1|
|        1| 73018|     1|
|        2| 19194|     1|
+---------+------+------+
only showing top 3 rows

+---------+------+-----------+
|sessionID|itemID|categorical|
+---------+------+-----------+
|        0| 21310|          1|
|        1| 73018|          1|
|        2| 19194|          1|
+---------+------+-----------+
only showing top 3 rows

+---------+------+------+
|sessionID|itemID|weight|
+---------+------+------+
|        0| 21310|     1|
|        1| 73018|     1|
|        2| 19194|     1|
+---------+------+------+
only showing top 3 rows



### Pivot

In [None]:
from pyspark.sql.functions import sum

In [None]:
spark_session.conf.set("spark.sql.pivotMaxValues",25000)

cara pake pivot:
https://stackoverflow.com/questions/46809879/convert-pyspark-groupeddata-object-to-spark-dataframe
1. groupBy = column
2. pivot = row
3. agg = cell -> bisa juga pake .count()

In [None]:
pivot_binary = transactions_binary.limit(100).groupBy("itemID").pivot("sessionID").agg(sum("binary"))

In [None]:
pivot_categorical = transactions_categorical.limit(100).groupBy("itemID").pivot("sessionID").agg(sum("categorical"))

In [None]:
pivot_weight = transactions_weight.limit(100).groupBy("itemID").pivot("sessionID").agg(sum("weight"))

In [None]:
pivot_binary = pivot_binary.fillna(0)

In [None]:
pivot_categorical = pivot_categorical.fillna(0)

In [None]:
pivot_weight = pivot_weight.fillna(0)

In [None]:
pivot_binary.show()

+------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|itemID|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19| 20| 21| 22| 23| 24| 25| 26| 27| 28| 29| 30| 31| 32| 34| 35| 36| 37| 38| 39| 40| 41| 42| 43| 44| 45| 46| 47| 48| 49| 50| 51| 52| 53| 54| 55| 56| 57| 58| 59| 60| 61| 62| 63| 64| 65| 66| 67| 68| 69| 70| 71| 72| 73| 74| 75| 76| 77|
+------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
| 21310|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0| 

### Dimensionality Reduction

In [None]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix

In [None]:
mat_binary = pivot_binary.drop("itemID").rdd.map(lambda s : Vectors.dense(s))

In [None]:
mat_categorical = pivot_categorical.drop("itemID").rdd.map(lambda s : Vectors.dense(s))

In [None]:
mat_weight = pivot_weight.drop("itemID").rdd.map(lambda s : Vectors.dense(s))

In [None]:
mat_binary = RowMatrix(mat_binary)

In [None]:
mat_categorical = RowMatrix(mat_categorical)

In [None]:
mat_weight = RowMatrix(mat_weight)

### Principal component analysis (PCA)

In [None]:
pca_binary = mat_binary.computePrincipalComponents(5)

In [None]:
pca_categorical = mat_categorical.computePrincipalComponents(5)

In [None]:
pca_weight = mat_weight.computePrincipalComponents(5)

In [None]:
projected_binary = mat_binary.multiply(pca_binary)

In [None]:
projected_categorical = mat_binary.multiply(pca_categorical)

In [None]:
projected_weight = mat_binary.multiply(pca_weight)

In [None]:
print(projected_binary.rows.collect())

[DenseVector([0.0099, -0.0102, -0.0, 0.0149, 0.0]), DenseVector([0.0099, -0.0102, -0.0, 0.0149, -0.0]), DenseVector([0.0099, -0.0102, 0.0, 0.0149, 0.0]), DenseVector([0.0099, -0.0102, -0.0, 0.0149, -0.0]), DenseVector([0.0099, -0.0102, -0.0, 0.0149, -0.0]), DenseVector([0.0099, -0.0102, 0.0, 0.0149, 0.0]), DenseVector([0.0099, -0.0102, 0.0, 0.0149, -0.0]), DenseVector([0.0241, -0.0311, -0.0, 0.0634, 0.044]), DenseVector([0.0241, -0.0311, -0.0, 0.0634, 0.044]), DenseVector([0.0099, -0.0102, 0.0, 0.0149, 0.0]), DenseVector([0.0099, -0.0102, -0.0, 0.0149, 0.0]), DenseVector([0.0099, -0.0102, 0.0, 0.0149, 0.0]), DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]), DenseVector([0.0462, -0.0997, 0.6325, -0.7536, -0.0]), DenseVector([0.0462, -0.0997, 0.6325, -0.7536, -0.0]), DenseVector([0.0462, -0.0997, 0.6325, -0.7536, -0.0]), DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]), DenseVector([0.0241, -0.0311, -0.0, 0.0634, 0.3063]), DenseVector([0.0241, -0.0311, -0.0, 0.0634, 0.3063]), DenseVector([0.0241, -0.0311, -

In [None]:
projected_binary.numRows()

98

In [None]:
projected_binary.numCols()

5

In [None]:
print(projected_categorical.rows.collect())

[DenseVector([0.0055, -0.0, 0.0057, 0.0067, 0.0058]), DenseVector([0.0055, -0.0, 0.0057, 0.0067, 0.0058]), DenseVector([0.0055, -0.0, 0.0057, 0.0067, 0.0058]), DenseVector([0.0055, -0.0, 0.0057, 0.0067, 0.0058]), DenseVector([0.0055, -0.0, 0.0057, 0.0067, 0.0058]), DenseVector([0.0055, -0.0, 0.0057, 0.0067, 0.0058]), DenseVector([0.0055, -0.0, 0.0057, 0.0067, 0.0058]), DenseVector([0.0306, -0.0, 0.0349, 0.0488, 0.0938]), DenseVector([0.0306, -0.0, 0.0349, 0.0488, 0.0938]), DenseVector([0.0055, -0.0, 0.0057, 0.0067, 0.0058]), DenseVector([0.0055, -0.0, 0.0057, 0.0067, 0.0058]), DenseVector([0.0055, -0.0, 0.0057, 0.0067, 0.0058]), DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]), DenseVector([0.0213, -0.0, 0.0228, 0.0285, 0.0292]), DenseVector([0.0213, -0.0, 0.0228, 0.0285, 0.0292]), DenseVector([0.0213, -0.0, 0.0228, 0.0285, 0.0292]), DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]), DenseVector([0.0123, -0.0, 0.013, 0.0157, 0.0145]), DenseVector([0.0123, -0.0, 0.013, 0.0157, 0.0145]), DenseVector([0.0123,

In [None]:
print(projected_weight.rows.collect())

[DenseVector([0.0, -0.0016, 0.0032, -0.0034, 0.0044]), DenseVector([-0.0, -0.0016, 0.0032, -0.0034, 0.0044]), DenseVector([0.0, -0.0016, 0.0032, -0.0034, 0.0044]), DenseVector([0.0, -0.0016, 0.0032, -0.0034, 0.0044]), DenseVector([-0.0, -0.0016, 0.0032, -0.0034, 0.0044]), DenseVector([0.0, -0.0016, 0.0032, -0.0034, 0.0044]), DenseVector([-0.0, -0.0016, 0.0032, -0.0034, 0.0044]), DenseVector([0.0, -0.0184, 0.0948, -0.3078, -0.9436]), DenseVector([0.0, -0.0184, 0.0948, -0.3078, -0.9436]), DenseVector([-0.0, -0.0016, 0.0032, -0.0034, 0.0044]), DenseVector([0.0, -0.0016, 0.0032, -0.0034, 0.0044]), DenseVector([-0.0, -0.0016, 0.0032, -0.0034, 0.0044]), DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]), DenseVector([-0.0, -0.0096, 0.0237, -0.0279, 0.0389]), DenseVector([-0.0, -0.0096, 0.0237, -0.0279, 0.0389]), DenseVector([-0.0, -0.0096, 0.0237, -0.0279, 0.0389]), DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]), DenseVector([0.0, -0.007, 0.0154, -0.017, 0.0226]), DenseVector([0.0, -0.007, 0.0154, -0.017, 0.022

### Convert to pandas dataframe

In [None]:
import pandas as pd

In [None]:
training_binary = projected_binary.rows.map(lambda x: x.toArray().tolist()).collect()

In [None]:
training_categorical = projected_categorical.rows.map(lambda x: x.toArray().tolist()).collect()

In [None]:
training_weight = projected_weight.rows.map(lambda x: x.toArray().tolist()).collect()

In [None]:
data_binary = pd.Series(training_binary, pd.MultiIndex.from_product([[x for x in range(98)], list('A')])).unstack()

In [None]:
data_categorical = pd.Series(training_categorical, pd.MultiIndex.from_product([[x for x in range(98)], list('A')])).unstack()

In [None]:
data_weight = pd.Series(training_weight, pd.MultiIndex.from_product([[x for x in range(98)], list('A')])).unstack()

In [None]:
data_binary = data_binary.rename(columns = {'A': 'Features'}, inplace = False)

In [None]:
data_categorical = data_categorical.rename(columns = {'A': 'Features'}, inplace = False)

In [None]:
data_weight = data_weight.rename(columns = {'A': 'Features'}, inplace = False)

In [None]:
data_binary

Unnamed: 0,Features
0,"[0.009906425417814904, -0.010160218690672448, ..."
1,"[0.009906425417815501, -0.010160218690672684, ..."
2,"[0.009906425417815029, -0.01016021869067163, 1..."
3,"[0.009906425417814786, -0.010160218690672851, ..."
4,"[0.009906425417814784, -0.010160218690672268, ..."
...,...
93,"[0.009906425417814778, -0.010160218690672447, ..."
94,"[0.009906425417814778, -0.010160218690672447, ..."
95,"[0.009906425417814778, -0.010160218690672448, ..."
96,"[0.009906425417814778, -0.010160218690672443, ..."


In [None]:
transaction_pd = transactions.limit(100).toPandas()

In [None]:
data_binary["itemID"] = transaction_pd["itemID"]

In [None]:
data_categorical["itemID"] = transaction_pd["itemID"]

In [None]:
data_weight["itemID"] = transaction_pd["itemID"]

In [None]:
data_binary

Unnamed: 0,Features,itemID
0,"[0.009906425417814904, -0.010160218690672448, ...",21310
1,"[0.009906425417815501, -0.010160218690672684, ...",73018
2,"[0.009906425417815029, -0.01016021869067163, 1...",19194
3,"[0.009906425417814786, -0.010160218690672851, ...",40250
4,"[0.009906425417814784, -0.010160218690672268, ...",46107
...,...,...
93,"[0.009906425417814778, -0.010160218690672447, ...",34965
94,"[0.009906425417814778, -0.010160218690672447, ...",38171
95,"[0.009906425417814778, -0.010160218690672448, ...",33208
96,"[0.009906425417814778, -0.010160218690672443, ...",40673


# Model

In [None]:
from sklearn.neighbors import NearestNeighbors

In [None]:
nearest_neighbors = 6 

#### Knn, Cosine, Brute, p=2 (euclidean distance)

In [None]:
knn = NearestNeighbors(metric='cosine', algorithm='brute', p=2)

In [None]:
evaluation = pd.read_csv('drive/MyDrive/All/Data/evaluation.csv', sep='|')

In [None]:
evaluation_binary = pd.concat([evaluation, data_binary])

In [None]:
evaluation_categorical = pd.concat([evaluation, data_categorical])

In [None]:
evaluation_weight = pd.concat([evaluation, data_weight])

##### Binary

In [None]:
result_binary = data_binary

In [None]:
knn.fit(training_binary)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [None]:
distances, indices = knn.kneighbors(training_binary, n_neighbors=nearest_neighbors)

In [None]:
distances_binary = pd.DataFrame(distances)

In [None]:
result_binary["d0"] = distances_binary[0]
result_binary["d1"] = distances_binary[1]
result_binary["d2"] = distances_binary[2]
result_binary["d3"] = distances_binary[3]
result_binary["d4"] = distances_binary[4]
result_binary["d5"] = distances_binary[5]

In [None]:
indices_binary = pd.DataFrame(indices)

In [None]:
result_binary["i0"] = indices_binary[0]
result_binary["i1"] = indices_binary[1]
result_binary["i2"] = indices_binary[2]
result_binary["i3"] = indices_binary[3]
result_binary["i4"] = indices_binary[4]
result_binary["i5"] = indices_binary[5]

In [None]:
pd.set_option("display.max_rows",None)
result_binary

Unnamed: 0,Features,itemID,d0,d1,d2,d3,d4,d5,i0,i1,i2,i3,i4,i5,rec_1,rec_2,rec_3,rec_4,rec_5
0,"[0.009906425417814904, -0.010160218690672448, ...",21310,0.0,0.0,0.0,0.0,0.0,0.0,34,72,38,71,35,66,23197,54132,12143,31091,2417.0
1,"[0.009906425417815501, -0.010160218690672684, ...",73018,0.0,0.0,0.0,0.0,0.0,0.0,55,59,54,52,56,60,59843,66314,65634,72543,71896.0
2,"[0.009906425417815029, -0.01016021869067163, 1...",19194,0.0,0.0,0.0,0.0,0.0,0.0,34,72,67,71,35,39,23197,54132,62464,31091,2417.0
3,"[0.009906425417814786, -0.010160218690672851, ...",40250,0.0,0.0,0.0,0.0,0.0,0.0,55,59,54,51,56,60,59843,66314,65634,30953,71896.0
4,"[0.009906425417814784, -0.010160218690672268, ...",46107,0.0,0.0,0.0,0.0,0.0,0.0,55,59,54,51,56,60,59843,66314,65634,30953,71896.0
5,"[0.009906425417814784, -0.010160218690672336, ...",34217,0.0,0.0,0.0,0.0,0.0,0.0,55,59,54,52,56,60,59843,66314,65634,72543,71896.0
6,"[0.009906425417814788, -0.010160218690672438, ...",31436,0.0,0.0,0.0,0.0,0.0,0.0,55,59,54,51,56,60,59843,66314,65634,30953,71896.0
7,"[0.024113344539166547, -0.03112402558836884, -...",14576,2.220446e-16,2.220446e-16,0.009285868,0.009286,0.082039,0.082039,7,8,19,20,37,36,17731,11989,11117,11525,57800.0
8,"[0.024113344539166547, -0.03112402558836884, -...",17731,2.220446e-16,2.220446e-16,0.009285868,0.009286,0.082039,0.082039,7,8,19,20,37,36,14576,11989,11117,11525,57800.0
9,"[0.009906425417814788, -0.010160218690672433, ...",58723,0.0,0.0,0.0,0.0,0.0,0.0,71,39,35,38,34,72,31091,42285,2417,12143,23197.0


In [None]:
for i, row in result_binary.iterrows():
  recommendation_count = 1
  for j in range(6):
    if recommendation_count == 6:
      break
    
    indices_name = "i" + str(j)
    column_name = "rec_" + str(recommendation_count)
    recommended_item_id = result_binary.at[result_binary.at[i, indices_name], "itemID"]
    if recommended_item_id == result_binary.at[i, "itemID"]:
      continue
    else:
      result_binary.at[i,column_name] = recommended_item_id
      recommendation_count = recommendation_count + 1

In [None]:
result_binary["rec_1"] = result_binary["rec_1"].astype('int64')
result_binary["rec_2"] = result_binary["rec_2"].astype('int64')
result_binary["rec_3"] = result_binary["rec_3"].astype('int64')
result_binary["rec_4"] = result_binary["rec_4"].astype('int64')
result_binary["rec_5"] = result_binary["rec_5"].astype('int64')

ValueError: ignored

In [None]:
result_binary[["itemID","rec_1","rec_2","rec_3","rec_4","rec_5"]]

Unnamed: 0,itemID,rec_1,rec_2,rec_3,rec_4,rec_5
0,21310,23197,54132,12143,31091,2417.0
1,73018,59843,66314,65634,72543,71896.0
2,19194,23197,54132,62464,31091,2417.0
3,40250,59843,66314,65634,30953,71896.0
4,46107,59843,66314,65634,30953,71896.0
5,34217,59843,66314,65634,72543,71896.0
6,31436,59843,66314,65634,30953,71896.0
7,14576,17731,11989,11117,11525,57800.0
8,17731,14576,11989,11117,11525,57800.0
9,58723,31091,42285,2417,12143,23197.0


Apply to Evaluation

In [None]:
evaluation_binary = evaluation_binary.dropna()

In [None]:
evaluation_binary

In [None]:
distances, indices = knn.kneighbors(evaluation_binary["Features"].tolist(), n_neighbors=nearest_neighbors)

In [None]:
distances_binary_evaluation = pd.DataFrame(distances)

In [None]:
evaluation_binary["d0"] = distances_binary_evaluation[0]
evaluation_binary["d1"] = distances_binary_evaluation[1]
evaluation_binary["d2"] = distances_binary_evaluation[2]
evaluation_binary["d3"] = distances_binary_evaluation[3]
evaluation_binary["d4"] = distances_binary_evaluation[4]
evaluation_binary["d5"] = distances_binary_evaluation[5]

In [None]:
indices_binary_evaluation = pd.DataFrame(indices)

In [None]:
evaluation_binary["i0"] = indices_binary_evaluation[0]
evaluation_binary["i1"] = indices_binary_evaluation[1]
evaluation_binary["i2"] = indices_binary_evaluation[2]
evaluation_binary["i3"] = indices_binary_evaluation[3]
evaluation_binary["i4"] = indices_binary_evaluation[4]
evaluation_binary["i5"] = indices_binary_evaluation[5]

In [None]:
for i, row in evaluation_binary.iterrows():
  recommendation_count = 1
  for j in range(6):
    if recommendation_count == 6:
      break
    
    indices_name = "i" + str(j)
    column_name = "rec_" + str(recommendation_count)
    recommended_item_id = evaluation_binary.at[evaluation_binary.at[i, indices_name], "itemID"]
    if recommended_item_id == evaluation_binary.at[i, "itemID"]:
      continue
    else:
      evaluation_binary.at[i,column_name] = recommended_item_id
      recommendation_count = recommendation_count + 1

In [None]:
evaluation_binary["rec_1"] = evaluation_binary["rec_1"].astype('int64')
evaluation_binary["rec_2"] = evaluation_binary["rec_2"].astype('int64')
evaluation_binary["rec_3"] = evaluation_binary["rec_3"].astype('int64')
evaluation_binary["rec_4"] = evaluation_binary["rec_4"].astype('int64')
evaluation_binary["rec_5"] = evaluation_binary["rec_5"].astype('int64')

In [None]:
evaluation_binary[["itemID","rec_1","rec_2","rec_3","rec_4","rec_5"]]

##### Categorical

In [None]:
result_categorical = data_categorical

In [None]:
knn.fit(training_categorical)

In [None]:
distances, indices = knn.kneighbors(training_categorical, n_neighbors=nearest_neighbors)

In [None]:
distances_categorical = pd.DataFrame(distances)

In [None]:
result_categorical["d0"] = distances_categorical[0]
result_categorical["d1"] = distances_categorical[1]
result_categorical["d2"] = distances_categorical[2]
result_categorical["d3"] = distances_categorical[3]
result_categorical["d4"] = distances_categorical[4]
result_categorical["d5"] = distances_categorical[5]

In [None]:
indices_categorical = pd.DataFrame(indices)

In [None]:
result_categorical["i0"] = indices_categorical[0]
result_categorical["i1"] = indices_categorical[1]
result_categorical["i2"] = indices_categorical[2]
result_categorical["i3"] = indices_categorical[3]
result_categorical["i4"] = indices_categorical[4]
result_categorical["i5"] = indices_categorical[5]

In [None]:
result_categorical

In [None]:
for i, row in result_categorical.iterrows():
  recommendation_count = 1
  for j in range(6):
    if recommendation_count == 6:
      break
    
    indices_name = "i" + str(j)
    column_name = "rec_" + str(recommendation_count)
    recommended_item_id = result_categorical.at[result_categorical.at[i, indices_name], "itemID"]
    if recommended_item_id == result_categorical.at[i, "itemID"]:
      continue
    else:
      result_categorical.at[i,column_name] = recommended_item_id
      recommendation_count = recommendation_count + 1

In [None]:
result_categorical["rec_1"] = result_categorical["rec_1"].astype('int64')
result_categorical["rec_2"] = result_categorical["rec_2"].astype('int64')
result_categorical["rec_3"] = result_categorical["rec_3"].astype('int64')
result_categorical["rec_4"] = result_categorical["rec_4"].astype('int64')
result_categorical["rec_5"] = result_categorical["rec_5"].astype('int64')

In [None]:
result_categorical[["itemID","rec_1","rec_2","rec_3","rec_4","rec_5"]]

Apply to Evaluation

In [None]:
evaluation_categorical = evaluation_categorical.dropna()

In [None]:
evaluation_categorical

In [None]:
distances, indices = knn.kneighbors(evaluation_categorical["Features"].tolist(), n_neighbors=nearest_neighbors)

In [None]:
distances_categorical_evaluation = pd.DataFrame(distances)

In [None]:
evaluation_categorical["d0"] = distances_categorical_evaluation[0]
evaluation_categorical["d1"] = distances_categorical_evaluation[1]
evaluation_categorical["d2"] = distances_categorical_evaluation[2]
evaluation_categorical["d3"] = distances_categorical_evaluation[3]
evaluation_categorical["d4"] = distances_categorical_evaluation[4]
evaluation_categorical["d5"] = distances_categorical_evaluation[5]

In [None]:
indices_categorical_evaluation = pd.DataFrame(indices)

In [None]:
evaluation_categorical["i0"] = indices_categorical_evaluation[0]
evaluation_categorical["i1"] = indices_categorical_evaluation[1]
evaluation_categorical["i2"] = indices_categorical_evaluation[2]
evaluation_categorical["i3"] = indices_categorical_evaluation[3]
evaluation_categorical["i4"] = indices_categorical_evaluation[4]
evaluation_categorical["i5"] = indices_categorical_evaluation[5]

In [None]:
for i, row in evaluation_categorical.iterrows():
  recommendation_count = 1
  for j in range(6):
    if recommendation_count == 6:
      break
    
    indices_name = "i" + str(j)
    column_name = "rec_" + str(recommendation_count)
    recommended_item_id = evaluation_categorical.at[evaluation_categorical.at[i, indices_name], "itemID"]
    if recommended_item_id == evaluation_categorical.at[i, "itemID"]:
      continue
    else:
      evaluation_categorical.at[i,column_name] = recommended_item_id
      recommendation_count = recommendation_count + 1

In [None]:
evaluation_categorical["rec_1"] = evaluation_categorical["rec_1"].astype('int64')
evaluation_categorical["rec_2"] = evaluation_categorical["rec_2"].astype('int64')
evaluation_categorical["rec_3"] = evaluation_categorical["rec_3"].astype('int64')
evaluation_categorical["rec_4"] = evaluation_categorical["rec_4"].astype('int64')
evaluation_categorical["rec_5"] = evaluation_categorical["rec_5"].astype('int64')

In [None]:
evaluation_categorical[["itemID","rec_1","rec_2","rec_3","rec_4","rec_5"]]

##### Weight

In [None]:
result_weight = data_weight

In [None]:
knn.fit(training_weight)

In [None]:
distances, indices = knn.kneighbors(training_weight, n_neighbors=nearest_neighbors)

In [None]:
distances_weight = pd.DataFrame(distances)

In [None]:
result_weight["d0"] = distances_weight[0]
result_weight["d1"] = distances_weight[1]
result_weight["d2"] = distances_weight[2]
result_weight["d3"] = distances_weight[3]
result_weight["d4"] = distances_weight[4]
result_weight["d5"] = distances_weight[5]

In [None]:
indices_weight = pd.DataFrame(indices)

In [None]:
result_weight["i0"] = indices_weight[0]
result_weight["i1"] = indices_weight[1]
result_weight["i2"] = indices_weight[2]
result_weight["i3"] = indices_weight[3]
result_weight["i4"] = indices_weight[4]
result_weight["i5"] = indices_weight[5]

In [None]:
result_weight

In [None]:
for i, row in result_weight.iterrows():
  recommendation_count = 1
  for j in range(6):
    if recommendation_count == 6:
      break
    
    indices_name = "i" + str(j)
    column_name = "rec_" + str(recommendation_count)
    recommended_item_id = result_weight.at[result_weight.at[i, indices_name], "itemID"]
    if recommended_item_id == result_weight.at[i, "itemID"]:
      continue
    else:
      result_weight.at[i,column_name] = recommended_item_id
      recommendation_count = recommendation_count + 1

In [None]:
result_weight["rec_1"] = result_weight["rec_1"].astype('int64')
result_weight["rec_2"] = result_weight["rec_2"].astype('int64')
result_weight["rec_3"] = result_weight["rec_3"].astype('int64')
result_weight["rec_4"] = result_weight["rec_4"].astype('int64')
result_weight["rec_5"] = result_weight["rec_5"].astype('int64')

In [None]:
result_weight[["itemID","rec_1","rec_2","rec_3","rec_4","rec_5"]]

Apply to Evaluation

In [None]:
evaluation_weight = evaluation_weight.dropna()

In [None]:
evaluation_weight

In [None]:
distances, indices = knn.kneighbors(evaluation_weight["Features"].tolist(), n_neighbors=nearest_neighbors)

In [None]:
distances_weight_evaluation = pd.DataFrame(distances)

In [None]:
evaluation_weight["d0"] = distances_weight_evaluation[0]
evaluation_weight["d1"] = distances_weight_evaluation[1]
evaluation_weight["d2"] = distances_weight_evaluation[2]
evaluation_weight["d3"] = distances_weight_evaluation[3]
evaluation_weight["d4"] = distances_weight_evaluation[4]
evaluation_weight["d5"] = distances_weight_evaluation[5]

In [None]:
indices_weight_evaluation = pd.DataFrame(indices)

In [None]:
evaluation_weight["i0"] = indices_weight_evaluation[0]
evaluation_weight["i1"] = indices_weight_evaluation[1]
evaluation_weight["i2"] = indices_weight_evaluation[2]
evaluation_weight["i3"] = indices_weight_evaluation[3]
evaluation_weight["i4"] = indices_weight_evaluation[4]
evaluation_weight["i5"] = indices_weight_evaluation[5]

In [None]:
for i, row in evaluation_weight.iterrows():
  recommendation_count = 1
  for j in range(6):
    if recommendation_count == 6:
      break
    
    indices_name = "i" + str(j)
    column_name = "rec_" + str(recommendation_count)
    recommended_item_id = evaluation_weight.at[evaluation_weight.at[i, indices_name], "itemID"]
    if recommended_item_id == evaluation_weight.at[i, "itemID"]:
      continue
    else:
      evaluation_weight.at[i,column_name] = recommended_item_id
      recommendation_count = recommendation_count + 1

In [None]:
evaluation_weight["rec_1"] = evaluation_weight["rec_1"].astype('int64')
evaluation_weight["rec_2"] = evaluation_weight["rec_2"].astype('int64')
evaluation_weight["rec_3"] = evaluation_weight["rec_3"].astype('int64')
evaluation_weight["rec_4"] = evaluation_weight["rec_4"].astype('int64')
evaluation_weight["rec_5"] = evaluation_weight["rec_5"].astype('int64')

In [None]:
evaluation_weight[["itemID","rec_1","rec_2","rec_3","rec_4","rec_5"]]