In [34]:
import os 
import warnings
import time
import numpy as np
import scipy.stats
import sys
import sklearn
import sklearn.datasets

from pyspark.sql import SparkSession
warnings.filterwarnings('ignore')
import pandas as pd

# launch this cell if you have issues on windows with py4j (think about updating your PATH)

os.environ['PYSPARK_DRIVER_PYTHON_OPTS']= "notebook"
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['PYSPARK_PYTHON'] = sys.executable

# starts a spark session from notebook

os.environ['PYSPARK_SUBMIT_ARGS'] ="--conf spark.driver.memory=4g  pyspark-shell"


spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("feature_selection") \
    .getOrCreate()

sc=spark.sparkContext

In [78]:
#train_sessions_engineered = spark.read.csv('../Data/session_engineered_features.csv',header=False,
#                                          inferSchema=True)
train_sessions_engineered = spark.read.load('../Data/session_engineered_features.csv', 
                          format='com.databricks.spark.csv', 
                          header='true', 
                          inferSchema='true').cache()


train_sessions_engineered = spark.read.load('../Data/TEMPtrain_sessions.csv', 
                          format='com.databricks.spark.csv', 
                          header='true', 
                          inferSchema='true')

train_purchases = spark.read.load('../Data/train_purchases.csv', 
                          format='com.databricks.spark.csv', 
                          header='true', 
                          inferSchema='true')

22/05/28 08:59:07 WARN CacheManager: Asked to cache already cached data.        
                                                                                

In [79]:
train_purchases.printSchema()

root
 |-- session_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- date: string (nullable = true)



In [80]:
mrmr = [0,1, 2, 5, 11, 15, 16, 17, 19, 23, 26, 27, 28, 29, 33, 34, 36, 37, 39, 40, 42]
columns_to_keep = [train_sessions_engineered.columns[ind] for ind in mrmr] 
print(columns_to_keep)

for col in train_sessions_engineered.columns:
    if not col in columns_to_keep:
        train_sessions_engineered = train_sessions_engineered.drop(col)

['session_id', 'session_time', 'season', 'year', 'std_time', 'most_frequently_bought_for_most_revisited', 'first_item_visited', 'last_item_visited', '1', '5', '8', '9', '10', '11', '15', '16', '18', '19', '21', '22', '24']


In [81]:
# BINING 
from pyspark.ml.feature import Bucketizer
to_bin = ['session_time','std_time']#,'1', '5', '8', '9', '10', '11', '15', '16', '18', '19', '21', '22', '24']
for col in to_bin:
    print(col)
    column_values = train_sessions_engineered.select(col).collect()
    splits_list = [0]
    for i in range(10):
        p = 10 * (i+1)
        splits_list.append(np.percentile(column_values, p))
    splits_list.append(float('Inf'))
    bucketizer = Bucketizer(splits=list(np.unique(splits_list)),inputCol=col, outputCol=col+"_binned")
    train_sessions_engineered = bucketizer.setHandleInvalid("keep").transform(train_sessions_engineered)
    train_sessions_engineered = train_sessions_engineered.drop(col)
    train_sessions_engineered = train_sessions_engineered.withColumnRenamed(col+"_binned",col)

session_time


                                                                                

std_time


                                                                                

In [82]:
# join the dataframes on the session ids and drop useless columns
train_df = train_sessions_engineered.join(train_purchases,train_sessions_engineered.session_id == train_purchases.session_id,"inner" ).cache()
for col in ['session_id','date']:
    train_df = train_df.drop(col)

In [83]:
train_df.take(1) 

                                                                                

[Row(season=1.0, year=1.0, most_frequently_bought_for_most_revisited=39.0, first_item_visited=22.0, last_item_visited=5.0, 1=2.0, 5=0.0, 8=0.0, 9=0.0, 10=0.0, 11=0.0, 15=0.0, 16=0.0, 18=0.0, 19=0.0, 21=0.0, 22=0.0, 24=0.0, session_time=3.0, std_time=2.0, item_id=1640)]

In [84]:
train_df.write.option("header",True).csv('/PROJ/Data/data_processed_for_modelV2.csv')

                                                                                

# NAIVE BAYES

In [85]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.util import MLUtils

train_df = spark.read.load('/PROJ/Data/data_processed_for_modelV2.csv', 
                          format='com.databricks.spark.csv', 
                          header='true', 
                          inferSchema='true').cache()

def if_p(row):
    row_bis = []
    for count,elem in enumerate(row):
        if elem == -1:
            row_bis.append(0)
        else:
            row_bis.append(elem)
    return row_bis

train_df = train_df.rdd.map(if_p)

def labelData(data):
    return data.map(lambda x: LabeledPoint(int(x[-1]), x[:-1]))

training_data, testing_data = labelData(train_df).randomSplit([0.9, 0.1])

model = NaiveBayes.train(training_data)

22/05/28 09:01:28 WARN CacheManager: Asked to cache already cached data.        
                                                                                

In [86]:
import numpy
from pyspark.mllib.linalg import _convert_to_vector
from pyspark import RDD

# define custom predict function 
def custom_predict(model_bayes, x):
    labels_ = model_bayes.labels
    pi = model_bayes.pi
    theta = model_bayes.theta
    if isinstance(x, RDD):
        return x.map(lambda v: custom_predict(model_bayes,v))
    x = _convert_to_vector(x)
    x = pi + x.dot(theta.transpose())
    indices = np.argsort(x)[-100:]
    top100 = labels_[indices]
    return top100 #labels_[numpy.argmax(pi + x.dot(theta.transpose()))]


In [87]:
testing_data.map(lambda p: (model.predict(p.features), p.label)).take(20)

[(10390.0, 2173.0),
 (14959.0, 4799.0),
 (13540.0, 9846.0),
 (15290.0, 11785.0),
 (12990.0, 21886.0),
 (13872.0, 14592.0),
 (8060.0, 11376.0),
 (19511.0, 18948.0),
 (8060.0, 26853.0),
 (27555.0, 115.0),
 (8060.0, 14529.0),
 (1333.0, 6943.0),
 (27613.0, 16064.0),
 (8060.0, 27225.0),
 (19882.0, 1374.0),
 (19882.0, 19968.0),
 (18981.0, 14336.0),
 (20236.0, 742.0),
 (27555.0, 9992.0),
 (8060.0, 20689.0)]

In [88]:
model.predict(testing_data.take(4)[3].features)

15290.0

# TOP 100 with bayes pred

In [89]:
top100 = custom_predict(model,testing_data.take(4)[3].features)
print(top100)

[10983. 19117. 17190. 22747.    45. 26785.  4015. 18239. 17297. 22669.
  3856. 13922. 22607.  7958. 20374. 25109.  3406. 27892.  5615.  2690.
 25213.  7640.  6899. 14733. 13596.  4028. 27362. 20770.   769. 11491.
 23216. 17431. 17339.  1644.  8755. 25794. 17221. 23088.  5683. 16700.
 12355. 14550. 18046. 13296.  3680.  9799. 18610. 14977. 14392.   972.
  8231.  9184.  7792.  3859. 15501.  3747. 18962. 25091. 22311. 11053.
  5657. 11162. 17376. 23286.  5481. 13558. 11209. 23956.  1572. 27225.
 23905. 19170. 13994.  9427. 15812.  8142. 13924. 21375. 27205. 22992.
  3106.  2814. 19521. 12665.  5367. 11237.  4375.  1752. 12556. 10385.
  6749.  8060. 27613. 20906. 14830.  8592.  5340. 15104. 25723. 15290.]


In [90]:
results_top100 = testing_data.map(lambda p: (p.label in custom_predict(model,p.features), p.label))

In [91]:
results_top100.take(10)

[(False, 2173.0),
 (True, 4799.0),
 (False, 9846.0),
 (False, 11785.0),
 (False, 21886.0),
 (False, 14592.0),
 (False, 11376.0),
 (False, 18948.0),
 (False, 26853.0),
 (True, 115.0)]

In [92]:
results_top100.map(lambda x : (x[0],(1))).reduceByKey(lambda x,y : np.add(x,y)).take(2)

                                                                                

[(False, 81461), (True, 18545)]

#### VALIDATION SET SCORE : 

In [93]:
def score(top_100,label):
    if label in top_100:
        return (np.where(np.array(top_100) == label)[0][0] + 1)/100
    else:
        return 0

In [94]:
testing_data.map(lambda p: (score(custom_predict(model,p.features),p.label))).mean()

                                                                                

0.1222926624402534

# Competition Prediction

In [95]:
test_df = spark.read.load('/PROJ/Data/test_engineered_features.csv', 
                          format='com.databricks.spark.csv', 
                          header='true', 
                          inferSchema='true')

train_sessions_engineered = spark.read.load('../Data/TEMPtrain_sessions.csv', 
                          format='com.databricks.spark.csv', 
                          header='true', 
                          inferSchema='true')


                                                                                

In [96]:
test_df.take(1)

[Row(session_id=5400.0, session_time=76.0, season=2.0, day_period=1.0, month=5.0, year=1.0, item_most_time_spent=13.0, most_time_spent_on_item=65.0, most_frequently_bought_for_time_spent=1.0, least_frequently_bought_for_time_spent=13.0, mean_time=25.33333396911621, std_time=28.40578842163086, item_most_visited=13.0, number_o_visit=2.0, number_o_revisited_items=1.0, most_frequently_bought_for_most_revisited=5.0, first_item_visited=33.0, last_item_visited=13.0, normalized_features_vector=13.0, 1=6.0, 2=0.0, 3=0.0, 4=0.0, 5=0.0, 6=0.0, 7=0.0, 8=0.0, 9=0.0, 10=0.0, 11=0.0, 12=0.0, 13=0.0, 14=0.0, 15=0.0, 16=0.0, 17=0.0, 18=0.0, 19=0.0, 20=0.0, 21=0.0, 22=1.0, 23=0.0, 24=0.0, 25=0.0, _45=0.0)]

In [98]:
mrmr = [0,1, 2, 5, 11, 15, 16, 17, 19, 23, 26, 27, 28, 29, 33, 34, 36, 37, 39, 40, 42]
columns_to_keep = [test_df.columns[ind] for ind in mrmr] 
print(columns_to_keep)

for col in test_df.columns:
    if not col in columns_to_keep:
        test_df = test_df.drop(col)

['session_id', 'session_time', 'season', 'year', 'std_time', 'most_frequently_bought_for_most_revisited', 'first_item_visited', 'last_item_visited', '1', '5', '8', '9', '10', '11', '15', '16', '18', '19', '21', '22', '24']


In [99]:
# BINING 
from pyspark.ml.feature import Bucketizer
to_bin = ['session_time','std_time']#,'1', '5', '8', '9', '10', '11', '15', '16', '18', '19', '21', '22', '24']
for col in to_bin:
    print(col)
    column_values = train_sessions_engineered.select(col).collect()
    splits_list = [0]
    for i in range(10):
        p = 10 * (i+1)
        splits_list.append(np.percentile(column_values, p))
    splits_list.append(float('Inf'))
    bucketizer = Bucketizer(splits=list(np.unique(splits_list)),inputCol=col, outputCol=col+"_binned")
    test_df = bucketizer.setHandleInvalid("keep").transform(test_df)
    test_df = test_df.drop(col)
    test_df = test_df.withColumnRenamed(col+"_binned",col)

session_time


                                                                                

std_time


                                                                                

In [100]:
test_row = test_df.take(1)[0][1:]
print(test_row)

(2.0, 1.0, 5.0, 33.0, 13.0, 6.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 2.0)


In [101]:
test_pred = custom_predict(model,test_row)

In [102]:
test_pred # where last item = top1 prediction

array([22432.,  5509., 16767., 15705., 16910.,  9279.,  4065., 17106.,
       25975., 13215., 21929., 20156., 20124., 19107., 26483.,  7827.,
       12062., 20122., 10825., 27914.,  5303.,  6470., 23684., 25401.,
       23547., 20441., 16959., 26353., 22591., 19642.,  8136.,  7341.,
       19837., 26413., 21407., 16093., 13317., 24639.,  2071., 11380.,
       22326.,  5994., 13650., 17209., 20242., 17777.,  5913., 15939.,
       17570., 20533., 27882., 11512., 24913., 22554.,  1363.,  6268.,
       19175., 20274.,  5032., 21398., 26970.,  6844.,  1446., 14580.,
        2130., 18225., 18200.,  8330., 26249., 24406., 15731., 20160.,
       13602., 13631., 14196., 11511., 22131.,  4497., 16104.,  8800.,
       22855., 21975.,  5226., 11695.,  4930.,   650., 12253., 23315.,
        6094., 26574.,  3403.,  1214.,   185., 27414., 10970.,  9514.,
       14734.,   262., 22547.,  6253.])

In [103]:
rank = [i for i in range(100,0,-1)]
df = []
for count, input_row in enumerate(test_df.collect()):
    if not count+1 % 5000 :
        print(count)
    session_id = int(input_row[0])
    pred = custom_predict(model,input_row[1:])
    df += [[session_id, int(pred[i]), rank[i]] for i in range(100)]
output_df = spark.createDataFrame([[session_id, int(pred[i]), rank[i]] for i in range(100)])

0
5000
10000
15000
20000
25000
30000
35000
40000
45000


In [104]:
output_df = spark.createDataFrame(df,['session_id','item_id','rank'])

In [105]:
output_df.coalesce(1).write.option("header",True).csv('/PROJ/Data/leaderboard_prediction.csv')

22/05/28 09:08:16 WARN TaskSetManager: Stage 147 contains a task of very large size (58532 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [106]:
output_df.count()

22/05/28 09:08:20 WARN TaskSetManager: Stage 148 contains a task of very large size (14623 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

5000000

In [107]:
output_df = spark.read.load('/PROJ/Data/leaderboard_prediction.csv', 
                          format='com.databricks.spark.csv', 
                          header='true', 
                          inferSchema='true').cache()

22/05/28 09:08:23 WARN CacheManager: Asked to cache already cached data.        


In [108]:
output_df.take(10)

                                                                                

[Row(session_id=5400, item_id=22432, rank=100),
 Row(session_id=5400, item_id=5509, rank=99),
 Row(session_id=5400, item_id=16767, rank=98),
 Row(session_id=5400, item_id=15705, rank=97),
 Row(session_id=5400, item_id=16910, rank=96),
 Row(session_id=5400, item_id=9279, rank=95),
 Row(session_id=5400, item_id=4065, rank=94),
 Row(session_id=5400, item_id=17106, rank=93),
 Row(session_id=5400, item_id=25975, rank=92),
 Row(session_id=5400, item_id=13215, rank=91)]