In [1]:
import os 
import warnings
import time
import numpy as np
import scipy.stats
import sys
import sklearn
import sklearn.datasets

from pyspark.sql import SparkSession
warnings.filterwarnings('ignore')
import pandas as pd

# launch this cell if you have issues on windows with py4j (think about updating your PATH)

os.environ['PYSPARK_DRIVER_PYTHON_OPTS']= "notebook"
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['PYSPARK_PYTHON'] = sys.executable

# starts a spark session from notebook

os.environ['PYSPARK_SUBMIT_ARGS'] ="--conf spark.driver.memory=4g  pyspark-shell"


spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("feature_selection") \
    .getOrCreate()

sc=spark.sparkContext

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/27 10:55:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/27 10:55:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
train_sessions_engineered = spark.read.load('../Data/session_engineered_features.csv', 
                          format='com.databricks.spark.csv', 
                          header='true', 
                          inferSchema='true').cache()
columns_df = train_sessions_engineered.columns

train_purchases = spark.read.load('../Data/train_purchases.csv', 
                          format='com.databricks.spark.csv', 
                          header='true', 
                          inferSchema='true')


22/05/27 10:55:29 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

#### Data Normalization ([0,...,1])

In [3]:
summary = train_sessions_engineered.describe().toPandas().set_index('summary').transpose()
#print(summary)
columns_to_normalize = ['session_time','most_time_spent_on_item','number_o_visit','mean_time','number_o_revisited_items','std_time']+[str(i) for i in range(1,25)]+['_45']
dic_val = {}
for elem in columns_to_normalize:
    index = np.where(np.array(train_sessions_engineered.columns) == elem)[0][0]
    row = summary.iloc[index]
    dic_val[index] = (float(row[3]),float(row[4]))
print(dic_val)

                                                                                

{1: (0.0, 86006.0), 7: (0.0, 85551.0), 13: (-1.0, 97.0), 10: (0.0, 85551.0), 14: (0.0, 44.0), 11: (0.0, 42422.5), 19: (0.0, 9.0), 20: (0.0, 1.0), 21: (0.0, 1.0), 22: (0.0, 1.0), 23: (0.0, 1.0), 24: (0.0, 1.0), 25: (0.0, 1.0), 26: (0.0, 1.0), 27: (0.0, 1.0), 28: (0.0, 1.0), 29: (0.0, 1.0), 30: (0.0, 1.0), 31: (0.0, 1.0), 32: (0.0, 1.0), 33: (0.0, 1.0), 34: (0.0, 1.0), 35: (0.0, 1.0), 36: (0.0, 1.0), 37: (0.0, 1.0), 38: (0.0, 1.0), 39: (0.0, 1.0), 40: (0.0, 1.0), 41: (0.0, 1.0), 42: (0.0, 1.0), 44: (0.0, 1.0)}


In [4]:
broadcast_clusters = sc.broadcast(dic_val)
def normalize_column(row, br):
    dic = br.value
    dickeys = dic.keys()
    temp = []
    for i in range(45):
        if i in dic:
            elem = dic.get(i)
            temp.append((row[i]-elem[0])/elem[1])
        else:
            temp.append(row[i])
    return tuple(temp)
print(normalize_column(train_sessions_engineered.take(1)[0], broadcast_clusters))
train_sessions_engineered = train_sessions_engineered.rdd.map(lambda x: normalize_column(x,broadcast_clusters)).toDF(columns_df)

(28560.0, 0.003162570053252099, 3.0, 2.0, 8.0, 0.0, 5.0, 0.0008532921882853503, 28.0, 25.0, 0.0005552243690897827, 0.00047021861685854537, -1.0, 0.0, 0.0, 5.0, 23.0, 26.0, 5.0, 0.4444444444444444, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1666666716337204, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8333333134651184, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)


                                                                                

#### Keep only useful columns

In [5]:
mrmr = [0, 1, 2, 5, 11, 15, 16, 17, 19, 23, 26, 27, 28, 29, 33, 34, 36, 37, 39, 40, 42]
columns_to_keep = [train_sessions_engineered.columns[ind] for ind in mrmr] 
print(columns_to_keep)

for col in train_sessions_engineered.columns:
    if not col in columns_to_keep:
        train_sessions_engineered = train_sessions_engineered.drop(col)

['session_id', 'session_time', 'season', 'year', 'std_time', 'most_frequently_bought_for_most_revisited', 'first_item_visited', 'last_item_visited', '1', '5', '8', '9', '10', '11', '15', '16', '18', '19', '21', '22', '24']


In [6]:
train_df = train_sessions_engineered.join(train_purchases,train_sessions_engineered.session_id == train_purchases.session_id,"inner" ).cache()
for col in ['session_id','date']:
    train_df = train_df.drop(col)

In [7]:
train_df.take(1)

                                                                                

[Row(session_time=0.0045810757389019375, season=1.0, year=1.0, std_time=0.0010303404470445186, most_frequently_bought_for_most_revisited=5.0, first_item_visited=23.0, last_item_visited=23.0, 1=0.2222222222222222, 5=0.0, 8=0.0, 9=0.0, 10=0.0, 11=0.0, 15=0.0, 16=0.0, 18=0.0, 19=0.0, 21=0.0, 22=0.0, 24=0.0, item_id=1640)]

In [None]:
[0,1,2,4,10,14,15,16,18] ->

#### One Hot Encoding

In [8]:
dic_ohe = {0:4,1:4,4:50,5:50,6:50}
broadcast_d = sc.broadcast(dic_ohe)
# one hot encode all the rows
def ohe(x,bd):
    temp = []
    dic = bd.value
    for i in range(0,len(x)):
        if i in dic:
            new_r = [0]*dic[i]
            if x[i] != -1:
                new_r[int(x[i])] = 1
            temp += new_r
        else:
            temp.append(x[i])
    return tuple(temp)

temp = train_df.rdd.map(lambda x : ohe(x,broadcast_d)).cache()

In [9]:
len(temp.take(1)[0])

22/05/27 10:59:50 WARN BlockManager: Task 26 already completed, not releasing lock for rdd_74_0
                                                                                

174

#### Create training df

In [10]:
training_data, testing_data = temp.randomSplit([0.9, 0.1])
sampled_training_data, _ = training_data.randomSplit([0.2,0.8])

In [11]:
#training_data.take(1)

# KNN

In [20]:
def pred_knn(data,input_row):
    distance_set = data.map(lambda x : (np.sum(np.abs(np.subtract(x[:-1], input_row))),x[-1])).filter(lambda x : x[0] < 20)
    sorted_set = distance_set.sortBy(lambda x: x[0], ascending=False)
    return(sorted_set.take(100))

true_label = testing_data.take(1)[0][-1]

pred_test = pred_knn(sampled_training_data,testing_data.take(1)[0][:-1])

22/05/27 11:02:46 WARN BlockManager: Task 1234 already completed, not releasing lock for rdd_74_0
                                                                                

In [21]:
def score(top_100,label):
    score = 0
    for count, elem in enumerate(top_100):
        if elem[1] == label:
            score = (100-count)/100
            return score
    return score

In [22]:
score(pred_test,true_label)

0

In [23]:
total_score = []
tests = testing_data.take(10)
for row in tests:
    total_score.append(score(pred_knn(sampled_training_data,row[:-1]),row[-1]))

                                                                                

In [24]:
np.mean(total_score)

0.0

In [37]:
pred = np.array([10785., 15542., 27202., 12518., 24465., 15844., 15593., 19372.,
       21675., 17715.,   485., 10111., 24887., 14379.,  1641.,  5272.,
       10304.,   409., 10382.,  8807., 11249.,  3679., 17164., 10724.,
        5724., 21040.,  3334., 17585.,  9538.,  5570.,  1592., 12872.,
       20008., 14652.,  4990., 22736., 11313., 21160.,  4747., 14539.,
       22840., 23293., 10945., 15121., 23657.,  5369.,  4899.,  8434.,
       20724.,  4296., 20604., 17864., 25632., 16029., 12912., 19320.,
       10278., 26258.,  1513., 17005., 19757.,  3720.,  9344., 16441.,
       21001., 24888., 22811., 27086., 27989., 14988.,  5282., 13746.,
       13537., 10295.,  6157., 21819., 18258., 22331.,  3966.,  8017.,
        6762., 20258., 27937., 25308., 27403., 25081., 15188., 14160.,
       14451., 26373., 20083.,  1960., 13955., 16368., 17461.,  3319.,
        3184., 27819., 10976., 10605.])
rank = np.arange(100,0,-1)
id_ = [1238]*100

In [58]:
p = [list(id_),list(pred),list(rank)]
inp = list(map(list, zip(*p)))
print(inp)#spark.createDataFrame(inp,]).show()

[[1238, 10785.0, 100], [1238, 15542.0, 99], [1238, 27202.0, 98], [1238, 12518.0, 97], [1238, 24465.0, 96], [1238, 15844.0, 95], [1238, 15593.0, 94], [1238, 19372.0, 93], [1238, 21675.0, 92], [1238, 17715.0, 91], [1238, 485.0, 90], [1238, 10111.0, 89], [1238, 24887.0, 88], [1238, 14379.0, 87], [1238, 1641.0, 86], [1238, 5272.0, 85], [1238, 10304.0, 84], [1238, 409.0, 83], [1238, 10382.0, 82], [1238, 8807.0, 81], [1238, 11249.0, 80], [1238, 3679.0, 79], [1238, 17164.0, 78], [1238, 10724.0, 77], [1238, 5724.0, 76], [1238, 21040.0, 75], [1238, 3334.0, 74], [1238, 17585.0, 73], [1238, 9538.0, 72], [1238, 5570.0, 71], [1238, 1592.0, 70], [1238, 12872.0, 69], [1238, 20008.0, 68], [1238, 14652.0, 67], [1238, 4990.0, 66], [1238, 22736.0, 65], [1238, 11313.0, 64], [1238, 21160.0, 63], [1238, 4747.0, 62], [1238, 14539.0, 61], [1238, 22840.0, 60], [1238, 23293.0, 59], [1238, 10945.0, 58], [1238, 15121.0, 57], [1238, 23657.0, 56], [1238, 5369.0, 55], [1238, 4899.0, 54], [1238, 8434.0, 53], [1238, 2

In [51]:
list(map(list, zip(*p)))

[[1238, 10785.0, 100],
 [1238, 15542.0, 99],
 [1238, 27202.0, 98],
 [1238, 12518.0, 97],
 [1238, 24465.0, 96],
 [1238, 15844.0, 95],
 [1238, 15593.0, 94],
 [1238, 19372.0, 93],
 [1238, 21675.0, 92],
 [1238, 17715.0, 91],
 [1238, 485.0, 90],
 [1238, 10111.0, 89],
 [1238, 24887.0, 88],
 [1238, 14379.0, 87],
 [1238, 1641.0, 86],
 [1238, 5272.0, 85],
 [1238, 10304.0, 84],
 [1238, 409.0, 83],
 [1238, 10382.0, 82],
 [1238, 8807.0, 81],
 [1238, 11249.0, 80],
 [1238, 3679.0, 79],
 [1238, 17164.0, 78],
 [1238, 10724.0, 77],
 [1238, 5724.0, 76],
 [1238, 21040.0, 75],
 [1238, 3334.0, 74],
 [1238, 17585.0, 73],
 [1238, 9538.0, 72],
 [1238, 5570.0, 71],
 [1238, 1592.0, 70],
 [1238, 12872.0, 69],
 [1238, 20008.0, 68],
 [1238, 14652.0, 67],
 [1238, 4990.0, 66],
 [1238, 22736.0, 65],
 [1238, 11313.0, 64],
 [1238, 21160.0, 63],
 [1238, 4747.0, 62],
 [1238, 14539.0, 61],
 [1238, 22840.0, 60],
 [1238, 23293.0, 59],
 [1238, 10945.0, 58],
 [1238, 15121.0, 57],
 [1238, 23657.0, 56],
 [1238, 5369.0, 55],
 [1