In [1]:
import os 
import warnings
import time
import numpy as np
import scipy.stats
import sys
import sklearn
import sklearn.datasets

from pyspark.sql import SparkSession
warnings.filterwarnings('ignore')
import pandas as pd

# Point PySpark at the interpreter running this notebook; run this cell if py4j
# misbehaves on Windows (updating PATH may also be needed).
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = "notebook"

# Driver memory is requested through the submit arguments, set before the
# session is built below.
os.environ['PYSPARK_SUBMIT_ARGS'] = "--conf spark.driver.memory=4g  pyspark-shell"

# Start (or reuse) a local Spark session from the notebook.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("feature_selection")
    .getOrCreate()
)

sc = spark.sparkContext

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/27 10:37:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/27 10:37:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/05/27 10:37:42 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
# Engineered session features: CSV with a header row, types inferred, cached.
train_sessions_engineered = (
    spark.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load('../Data/session_engineered_features.csv')
    .cache()
)

# Purchases table, read with the same options but not cached.
train_purchases = (
    spark.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load('../Data/train_purchases.csv')
)

22/05/27 10:38:02 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [3]:
# Show the inferred column names and types of the purchases table.
train_purchases.printSchema()

root
 |-- session_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- date: string (nullable = true)



In [4]:
# Positional indices of the columns to retain (presumably the output of an mRMR
# feature-selection step — confirm against the notebook that produced them).
mrmr = [0, 1, 2, 5, 11, 15, 16, 17, 19, 23, 26, 27, 28, 29, 33, 34, 36, 37, 39, 40, 42]
all_columns = train_sessions_engineered.columns
columns_to_keep = [all_columns[ind] for ind in mrmr]
print(columns_to_keep)

# Remove every non-selected column in a single drop call.
columns_to_drop = [name for name in all_columns if name not in columns_to_keep]
train_sessions_engineered = train_sessions_engineered.drop(*columns_to_drop)

['session_id', 'session_time', 'season', 'year', 'std_time', 'most_frequently_bought_for_most_revisited', 'first_item_visited', 'last_item_visited', '1', '5', '8', '9', '10', '11', '15', '16', '18', '19', '21', '22', '24']


In [5]:
# BINNING: replace each listed continuous column by the index of its decile bucket.
from pyspark.ml.feature import Bucketizer
to_bin = ['session_time', 'std_time']  # the columns '1', '5', '8', ... are not binned
decile_levels = np.arange(10, 101, 10)  # 10th, 20th, ..., 100th percentile
for col in to_bin:
    print(col)
    # The whole column is pulled to the driver; collect() yields one-field Rows,
    # which numpy turns into an (n, 1) array and percentile flattens.
    column_values = np.array(train_sessions_engineered.select(col).collect())
    # One vectorized call computes all ten cut points instead of ten separate
    # percentile passes over the collected list.
    deciles = np.percentile(column_values, decile_levels)
    splits_list = [0, *deciles, float('Inf')]
    # np.unique sorts the cut points and removes duplicates before they are
    # handed to the Bucketizer.
    bucketizer = Bucketizer(splits=list(np.unique(splits_list)), inputCol=col, outputCol=col+"_binned")
    train_sessions_engineered = bucketizer.setHandleInvalid("keep").transform(train_sessions_engineered)
    # Swap the raw column for its binned version under the original name.
    train_sessions_engineered = train_sessions_engineered.drop(col)
    train_sessions_engineered = train_sessions_engineered.withColumnRenamed(col+"_binned", col)

session_time


                                                                                

std_time


                                                                                

In [6]:
# Inner-join features and purchases on the session id and cache the joined frame,
# then remove the columns that are neither model inputs nor the label.
same_session = train_sessions_engineered.session_id == train_purchases.session_id
train_df = train_sessions_engineered.join(train_purchases, same_session, "inner").cache()
train_df = train_df.drop('session_id', 'date')

In [7]:
# Peek at one joined row to check the remaining columns.
train_df.take(1) 

                                                                                

[Row(season=1.0, year=1.0, most_frequently_bought_for_most_revisited=5.0, first_item_visited=23.0, last_item_visited=23.0, 1=2.0, 5=0.0, 8=0.0, 9=0.0, 10=0.0, 11=0.0, 15=0.0, 16=0.0, 18=0.0, 19=0.0, 21=0.0, 22=0.0, 24=0.0, session_time=3.0, std_time=2.0, item_id=1640)]

In [9]:
# Persist the model-ready frame as CSV with a header row. Overwrite mode keeps this
# cell re-runnable: with the default save mode the write fails once the target
# path already exists.
train_df.write.mode("overwrite").option("header",True).csv('/PROJ/Data/data_processed_for_modelV2.csv')

                                                                                

# NAIVE BAYES

In [37]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.util import MLUtils

# Read the processed training data back from disk (header row, inferred types)
# and cache it.
train_df = (
    spark.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load('/PROJ/Data/data_processed_for_modelV2.csv')
    .cache()
)

def if_p(row):
    """Return the elements of ``row`` as a list, with every -1 replaced by 0.

    All other elements are passed through unchanged and keep their order.
    """
    return [0 if elem == -1 else elem for elem in row]

# Switch from the DataFrame to an RDD whose records are the lists produced by if_p.
# From here on the name train_df refers to an RDD, not a DataFrame.
train_df = train_df.rdd.map(if_p)

def labelData(data):
    """Map every record of ``data`` to a LabeledPoint.

    The last element of a record is cast to int and used as the label; all
    preceding elements form the feature vector.
    """
    def to_labeled_point(record):
        return LabeledPoint(int(record[-1]), record[:-1])

    return data.map(to_labeled_point)

# 90/10 train/validation split. A fixed seed makes the split — and therefore the
# validation figures computed from testing_data — repeatable across kernel restarts.
training_data, testing_data = labelData(train_df).randomSplit([0.9, 0.1], seed=42)

# Fit MLlib's Naive Bayes on the training part with its default parameters.
model = NaiveBayes.train(training_data)

22/05/27 10:57:51 WARN CacheManager: Asked to cache already cached data.        
                                                                                

In [38]:
import numpy
from pyspark.mllib.linalg import _convert_to_vector
from pyspark import RDD

# define custom predict function 
def custom_predict(model_bayes, x, top_k=100):
    """Return the ``top_k`` highest-scoring labels of a Naive Bayes model.

    The score of every class is ``pi + x . theta^T``, the same quantity whose
    argmax the model's own single-label prediction uses.

    Parameters
    ----------
    model_bayes : object exposing ``labels``, ``pi`` and ``theta``
        The trained model (here the result of ``NaiveBayes.train``).
    x : feature vector or RDD of feature vectors
        A single input, or an RDD that is mapped element-wise.
    top_k : int, default 100
        Number of labels to return per input.

    Returns
    -------
    Array of at most ``top_k`` labels ordered by increasing score, so the last
    element is the top-1 prediction. For an RDD input, an RDD of such arrays.

    Raises
    ------
    ValueError
        If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be a positive integer")
    if isinstance(x, RDD):
        return x.map(lambda v: custom_predict(model_bayes, v, top_k))
    features = _convert_to_vector(x)
    class_scores = model_bayes.pi + features.dot(model_bayes.theta.transpose())
    # argsort is ascending, so the tail holds the best-scoring classes.
    best_indices = np.argsort(class_scores)[-top_k:]
    return model_bayes.labels[best_indices]


In [39]:
# (top-1 prediction, true label) pairs for 20 validation points.
testing_data.map(lambda p: (model.predict(p.features), p.label)).take(20)

[(8025.0, 13063.0),
 (19452.0, 17760.0),
 (4193.0, 9846.0),
 (25007.0, 13712.0),
 (25007.0, 21004.0),
 (23972.0, 7566.0),
 (8060.0, 16876.0),
 (18981.0, 14592.0),
 (8060.0, 795.0),
 (8060.0, 11376.0),
 (19882.0, 3273.0),
 (8060.0, 20599.0),
 (8060.0, 4446.0),
 (15128.0, 23272.0),
 (4130.0, 23170.0),
 (4193.0, 26123.0),
 (25374.0, 12263.0),
 (2133.0, 22788.0),
 (9184.0, 22974.0),
 (8060.0, 21204.0)]

In [40]:
# Top-1 prediction for the fourth validation point returned by take(4).
model.predict(testing_data.take(4)[3].features)

25007.0

# Top-100 predictions with Naive Bayes

In [41]:
# Labels returned by custom_predict for the same validation point; they are
# ordered by increasing score, so the last entry is the top-1 prediction.
top100 = custom_predict(model,testing_data.take(4)[3].features)
print(top100)

[18483. 19940. 12179. 14550. 28133.  3741.  3373. 16064. 18723. 26433.
  4886.  2133. 19170.  6146.  4028. 13698. 10090. 11196. 11923. 20202.
  5414.  7640. 26876. 24367.  4917. 27330. 11227. 13124. 17089. 15501.
 24022. 23451. 10983. 19882.  6685.  2173. 27087. 24034. 23794.   753.
   972. 27835.  1367. 23088. 10223. 17316.  2570. 25516.   412. 28035.
  5810.   773.  8746. 20599. 12251.  1272.  7260. 17201. 25785.  9427.
 15107.  3235. 27225.  6035. 14392. 19955.  6412.  8959. 17648.  2366.
 21781. 27222.  6806.  1597.   434. 10717. 18156.  2072.  9522. 22607.
  1953. 23967. 20770.  1365. 24921. 13226. 10486. 19087.  8057.  2447.
  2314. 25811. 21148.  1199. 10390.  1358.  8398.  8622.  8060. 25007.]


In [42]:
# (hit, label) pairs: hit is True when the true label is among the labels
# returned by custom_predict for that validation point.
results_top100 = testing_data.map(lambda p: (p.label in custom_predict(model,p.features), p.label))

In [43]:
# Inspect the first ten (hit, label) pairs.
results_top100.take(10)

[(False, 13063.0),
 (False, 17760.0),
 (False, 9846.0),
 (False, 13712.0),
 (False, 21004.0),
 (False, 7566.0),
 (True, 16876.0),
 (False, 14592.0),
 (False, 795.0),
 (False, 11376.0)]

In [44]:
# Count misses (False) and hits (True): map each pair to (hit, 1) and sum per key.
results_top100.map(lambda x : (x[0],(1))).reduceByKey(lambda x,y : np.add(x,y)).take(2)

                                                                                

[(False, 80670), (True, 19417)]

#### Validation set score

In [45]:
def score(top_100,label):
    """Position-based score of ``label`` inside ``top_100``.

    Returns 0 when the label is absent. Otherwise returns the 1-based position
    of its first occurrence divided by 100, so later positions score higher.
    """
    if label not in top_100:
        return 0
    first_match = np.where(np.array(top_100) == label)[0][0]
    return (first_match + 1)/100

In [46]:
# Mean of score(...) over the validation split, using custom_predict's labels.
testing_data.map(lambda p: (score(custom_predict(model,p.features),p.label))).mean()

                                                                                

0.12866616044041698

# Competition Prediction

In [149]:
# Engineered features of the competition test sessions (header row, inferred
# types, cached).
test_df = (
    spark.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load('/PROJ/Data/test_engineered_features.csv')
    .cache()
)

# Reload the un-binned training features; the binning cell below computes its
# percentiles from this frame.
train_sessions_engineered = (
    spark.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load('../Data/session_engineered_features.csv')
    .cache()
)

22/05/27 14:03:25 WARN CacheManager: Asked to cache already cached data.
22/05/27 14:03:28 WARN CacheManager: Asked to cache already cached data.        


In [150]:
# Peek at one raw test row to check the available columns.
test_df.take(1)

[Row(session_id=5400.0, session_time=76.0, season=2.0, day_period=1.0, month=5.0, year=1.0, item_most_time_spent=13.0, most_time_spent_on_item=65.0, most_frequently_bought_for_time_spent=1.0, least_frequently_bought_for_time_spent=13.0, mean_time=25.33333396911621, std_time=28.40578842163086, item_most_visited=13.0, number_o_visit=2.0, number_o_revisited_items=1.0, most_frequently_bought_for_most_revisited=5.0, first_item_visited=33.0, last_item_visited=13.0, normalized_features_vector=13.0, 1=6.0, 2=0.0, 3=0.0, 4=0.0, 5=0.0, 6=0.0, 7=0.0, 8=0.0, 9=0.0, 10=0.0, 11=0.0, 12=0.0, 13=0.0, 14=0.0, 15=0.0, 16=0.0, 17=0.0, 18=0.0, 19=0.0, 20=0.0, 21=0.0, 22=1.0, 23=0.0, 24=0.0, 25=0.0, _45=0.0)]

In [151]:
# Same positional indices as used for the training features; this relies on the
# test frame having its columns in the same order.
mrmr = [0, 1, 2, 5, 11, 15, 16, 17, 19, 23, 26, 27, 28, 29, 33, 34, 36, 37, 39, 40, 42]
all_columns = test_df.columns
columns_to_keep = [all_columns[ind] for ind in mrmr]
print(columns_to_keep)

# Remove every non-selected column in a single drop call.
columns_to_drop = [name for name in all_columns if name not in columns_to_keep]
test_df = test_df.drop(*columns_to_drop)

['session_id', 'session_time', 'season', 'year', 'std_time', 'most_frequently_bought_for_most_revisited', 'first_item_visited', 'last_item_visited', '1', '5', '8', '9', '10', '11', '15', '16', '18', '19', '21', '22', '24']


In [152]:
# BINNING: bucket the test columns using cut points computed from the training
# features, so both frames share the same bucket boundaries.
from pyspark.ml.feature import Bucketizer
to_bin = ['session_time', 'std_time']  # the columns '1', '5', '8', ... are not binned
decile_levels = np.arange(10, 101, 10)  # 10th, 20th, ..., 100th percentile
for col in to_bin:
    print(col)
    # The whole training column is pulled to the driver; collect() yields
    # one-field Rows, which numpy turns into an (n, 1) array.
    column_values = np.array(train_sessions_engineered.select(col).collect())
    # One vectorized call computes all ten cut points instead of ten separate
    # percentile passes over the collected list.
    deciles = np.percentile(column_values, decile_levels)
    splits_list = [0, *deciles, float('Inf')]
    # np.unique sorts the cut points and removes duplicates before they are
    # handed to the Bucketizer.
    bucketizer = Bucketizer(splits=list(np.unique(splits_list)), inputCol=col, outputCol=col+"_binned")
    test_df = bucketizer.setHandleInvalid("keep").transform(test_df)
    # Swap the raw column for its binned version under the original name.
    test_df = test_df.drop(col)
    test_df = test_df.withColumnRenamed(col+"_binned", col)

session_time
std_time


In [153]:
# First test row without its leading element (session_id, per the column list
# printed above), i.e. only the model inputs.
test_row = test_df.take(1)[0][1:]
print(test_row)

(2.0, 1.0, 5.0, 33.0, 13.0, 6.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 2.0)


In [154]:
# Ranked candidate labels for that single test row.
test_pred = custom_predict(model,test_row)

In [155]:
test_pred # ordered by increasing score: the last item is the top-1 prediction

array([ 3679., 10372., 14782.,  1037., 25632., 21890., 22863., 23578.,
        5335., 23719., 26836., 18868., 26585.,  5433., 19402.,  5005.,
       10724., 26565.,    74.,  4899.,  4171., 21343., 22886., 27001.,
       21272., 26249., 10953., 19145., 24286., 26097.,  8881.,  9406.,
       24538.,  9284., 24989., 11275., 24870., 15593., 21616., 16967.,
       11923.,  4611., 22719.,  3438., 19372., 20449., 11804., 20612.,
        1148.,  7464., 12975., 20303.,  7902.,  7096., 17571., 16766.,
        6196., 26853.,  2967.,  7750., 22524.,  8188.,  6762., 18489.,
        2188., 15542., 12555.,  5021.,   955., 27393., 19348.,  6736.,
       13656., 23450., 18289.,  4364., 22175., 22209., 24736., 14648.,
       15249., 18509., 19528.,   340., 15424.,  4254., 16929., 10338.,
       16846., 20257., 21215., 13987.,   436.,  5463.,   403.,  8577.,
        8807.,  4193., 27852., 23789.])

In [None]:
# NOTE(review): this unexecuted cell holds what looks like a pasted prediction
# array. No `array` name is imported in the visible cells, so running it as-is
# would presumably raise NameError — confirm whether the cell should be removed.
array([10785., 15542., 27202., 12518., 24465., 15844., 15593., 19372.,
       21675., 17715.,   485., 10111., 24887., 14379.,  1641.,  5272.,
       10304.,   409., 10382.,  8807., 11249.,  3679., 17164., 10724.,
        5724., 21040.,  3334., 17585.,  9538.,  5570.,  1592., 12872.,
       20008., 14652.,  4990., 22736., 11313., 21160.,  4747., 14539.,
       22840., 23293., 10945., 15121., 23657.,  5369.,  4899.,  8434.,
       20724.,  4296., 20604., 17864., 25632., 16029., 12912., 19320.,
       10278., 26258.,  1513., 17005., 19757.,  3720.,  9344., 16441.,
       21001., 24888., 22811., 27086., 27989., 14988.,  5282., 13746.,
       13537., 10295.,  6157., 21819., 18258., 22331.,  3966.,  8017.,
        6762., 20258., 27937., 25308., 27403., 25081., 15188., 14160.,
       14451., 26373., 20083.,  1960., 13955., 16368., 17461.,  3319.,
        3184., 27819., 10976., 10605.])

In [156]:
# Build the submission rows: for every test session, the labels returned by
# custom_predict, with rank 1 assigned to the best one.
# custom_predict orders labels by increasing score, hence the descending ranks.
rank = list(range(100, 0, -1))
df = []
for count, input_row in enumerate(test_df.collect()):
    if count % 5000 == 0:
        print(count)  # progress indicator
    session_id = int(input_row[0])
    pred = custom_predict(model, input_row[1:])
    # Use the tail of the rank list so the last prediction always gets rank 1,
    # even if fewer than 100 labels come back.
    df.extend(
        [session_id, int(item_id), item_rank]
        for item_id, item_rank in zip(pred, rank[-len(pred):])
    )

# Create the frame from ALL accumulated rows, with named columns. The previous
# version built it from the last session's 100 rows only and left the columns
# unnamed.
output_df = spark.createDataFrame(df, ['session_id', 'item_id', 'rank'])

0
5000
10000
15000
20000
25000
30000
35000
40000
45000


In [157]:
# Build the submission DataFrame from the rows accumulated in df, naming its
# three columns.
output_df = spark.createDataFrame(df,['session_id','item_id','rank'])

In [158]:
# Write the submission through a single partition, with a header row. Overwrite
# mode keeps this cell re-runnable: with the default save mode the write fails
# once the target path already exists.
output_df.coalesce(1).write.mode("overwrite").option("header",True).csv('/PROJ/Data/leaderboard_prediction.csv')

22/05/27 14:07:13 WARN TaskSetManager: Stage 135 contains a task of very large size (58512 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [159]:
# Row count of the submission frame.
output_df.count()

22/05/27 14:07:16 WARN TaskSetManager: Stage 136 contains a task of very large size (14618 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

5000000

In [160]:
# Read the written submission back from disk (header row, inferred types) and
# cache it; output_df now refers to the re-read data.
output_df = (
    spark.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load('/PROJ/Data/leaderboard_prediction.csv')
    .cache()
)

22/05/27 14:07:20 WARN CacheManager: Asked to cache already cached data.        


In [161]:
# Peek at the first ten submission rows.
output_df.take(10)

                                                                                

[Row(session_id=5400, item_id=3679, rank=100),
 Row(session_id=5400, item_id=10372, rank=99),
 Row(session_id=5400, item_id=14782, rank=98),
 Row(session_id=5400, item_id=1037, rank=97),
 Row(session_id=5400, item_id=25632, rank=96),
 Row(session_id=5400, item_id=21890, rank=95),
 Row(session_id=5400, item_id=22863, rank=94),
 Row(session_id=5400, item_id=23578, rank=93),
 Row(session_id=5400, item_id=5335, rank=92),
 Row(session_id=5400, item_id=23719, rank=91)]