In [28]:
import os 
import warnings
import time
import numpy as np
import scipy.stats
import sys
import sklearn
import sklearn.datasets

from pyspark.sql import SparkSession
warnings.filterwarnings('ignore')
import pandas as pd

# Run this cell if py4j gives you trouble on Windows (you may also need to
# update your PATH). It makes the driver and the workers use the interpreter
# that is running this kernel, and passes the submit arguments requesting
# 4g of driver memory.
os.environ.update({
    'PYSPARK_DRIVER_PYTHON_OPTS': "notebook",
    'PYSPARK_DRIVER_PYTHON': sys.executable,
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_SUBMIT_ARGS': "--conf spark.driver.memory=4g  pyspark-shell",
})

# Start (or reuse) a local Spark session from the notebook, using all cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("feature_selection")
    .getOrCreate()
)

sc = spark.sparkContext

In [15]:
# Engineered session features: the file has no header row, so Spark names the
# columns _c0, _c1, ... and infers their types.
train_sessions_engineered = spark.read.csv(
    '../Data/session_engineered_features.txt',
    header=False,
    inferSchema=True,
)

# Purchases and clusters are read with the built-in CSV reader as well
# (the 'com.databricks.spark.csv' format name is the legacy Spark 1.x alias).
train_purchases = spark.read.csv(
    '../Data/train_purchases.csv',
    header=True,
    inferSchema=True,
)

# column 1 (2) = item_id, column 2 (136) = cluster
# NOTE(review): the "header" of clusters.csv is ("2", "136"), which looks like
# a data record rather than column names. If the file really has no header,
# header=True silently drops that first record -- confirm against the file.
clusters = (
    spark.read.csv('../Data/clusters.csv', header=True, inferSchema=True)
    .withColumnRenamed("2", "item_id_")
    .withColumnRenamed("136", "cluster_id")
)


                                                                                

In [16]:
# Offsets of the features to keep, shifted by one relative to the column
# position: -1 maps to _c0 (the session id used for the join below), 36 to
# _c37, and so on.
selected_features_indices = [-1, 36, 4, 0, 34, 17, 35, 18, 32, 5, 33, 16, 31, 21, 19, 28, 26, 30, 14, 23, 13]

# Build the column names directly instead of indexing `.columns` by position:
# the frame is overwritten below, so positional lookup would raise IndexError
# if this cell were run a second time. Selecting by name keeps it re-runnable.
columns_to_keep = ["_c{}".format(offset + 1) for offset in selected_features_indices]

train_sessions_engineered = train_sessions_engineered.select(columns_to_keep)
train_sessions_engineered.take(1)

[Row(_c0=720, _c37=0.0, _c5=2021.0, _c1=23943.0, _c35=0.0, _c18=0.0, _c36=0.0, _c19=0.0, _c33=0.0, _c6=21890.0, _c34=0.0, _c17=0.0, _c32=0.0, _c22=0.0, _c20=0.0, _c29=0.0, _c27=0.0, _c31=0.0, _c15=0.0, _c24=0.0, _c14=0.0)]

In [17]:
# Inner-join the engineered features with the purchases on the session id,
# then drop the columns that are no longer needed (both join keys and the date).
train_df = (
    train_sessions_engineered
    .join(
        train_purchases,
        train_sessions_engineered._c0 == train_purchases.session_id,
        "inner",
    )
    .drop('_c0', 'session_id', 'date')
)
    


In [18]:
# Inspect one joined row to check which columns remain after the drops.
train_df.take(1)

                                                                                

[Row(_c37=0.0, _c5=2020.0, _c1=0.0, _c35=0.0, _c18=0.0, _c36=0.0, _c19=1.0, _c33=0.0, _c6=15654.0, _c34=0.0, _c17=0.0, _c32=0.0, _c22=0.0, _c20=0.0, _c29=0.0, _c27=0.0, _c31=0.0, _c15=0.0, _c24=0.0, _c14=0.0, item_id=18626)]

In [21]:
# Number of rows in the joined training frame.
train_df.count()

                                                                                

1000000

# Naive Bayes (without the clustering approximation)

In [19]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.util import MLUtils

def labelData(data):
    """Convert a DataFrame into an RDD of ``LabeledPoint``.

    For every row, the last column is cast to ``int`` and used as the label;
    all preceding columns form the feature vector.
    """
    def to_labeled_point(row):
        return LabeledPoint(int(row[-1]), row[:-1])

    return data.rdd.map(to_labeled_point)

# Fixed seed so the 80/20 split, and every metric computed from it, is
# reproducible across kernel restarts.
SPLIT_SEED = 42

# The labelled RDD feeds both splits and is re-evaluated by every later action
# (training, each evaluation cell); caching avoids re-reading and re-joining
# the CSV files each time.
labeled_points = labelData(train_df).cache()

training_data, testing_data = labeled_points.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

model = NaiveBayes.train(training_data)

                                                                                

In [20]:
def comp(row):
    """Tag a ``(prediction, label)`` pair as a hit or a miss.

    Returns ``(True, 1)`` when the two elements compare equal and
    ``(False, 1)`` otherwise, so the pairs can be summed per key.
    """
    is_hit = bool(row[0] == row[1])
    return (is_hit, 1)
    
# Count correct (True) vs. incorrect (False) top-1 predictions on the test split.
(
    testing_data
    .map(lambda point: (model.predict(point.features), point.label))
    .map(comp)
    .reduceByKey(lambda left, right: np.add(left, right))
    .take(2)
)


                                                                                

[(False, 199250), (True, 8)]

In [22]:
import numpy
from pyspark.mllib.linalg import _convert_to_vector
from pyspark import RDD

# define custom predict function 
def custom_predict(model_bayes, x, top_k=100):
    """Return the ``top_k`` highest-scoring labels of a Naive Bayes model.

    Parameters
    ----------
    model_bayes : object exposing ``labels``, ``pi`` and ``theta`` arrays
        (as a trained ``NaiveBayesModel`` does).
    x : feature vector, or an RDD of feature vectors.
    top_k : int, default 100
        Number of labels to return. It is clamped to the number of labels the
        model knows, so a model with fewer than ``top_k`` classes returns all
        of them instead of failing inside ``np.argpartition``.

    Returns
    -------
    numpy.ndarray of labels for a single vector, or an RDD of such arrays when
    ``x`` is an RDD. The labels are the ``top_k`` best but are NOT sorted by
    score (``argpartition`` only partitions).

    Raises
    ------
    ValueError
        If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be a positive integer, got %r" % (top_k,))
    if isinstance(x, RDD):
        return x.map(lambda v: custom_predict(model_bayes, v, top_k))

    labels_ = model_bayes.labels
    # Per-class score: class prior plus the feature-weighted class parameters.
    scores = model_bayes.pi + _convert_to_vector(x).dot(model_bayes.theta.transpose())

    # Never ask argpartition for more entries than there are classes.
    k = min(top_k, len(labels_))
    indices = np.argpartition(scores, -k)[-k:]
    return labels_[indices]

In [23]:
# Show (predicted label, true label) pairs for 20 points of the test split.
testing_data.map(lambda p: (model.predict(p.features), p.label)).take(20)

                                                                                

[(21632.0, 8345.0),
 (20947.0, 4028.0),
 (7153.0, 25976.0),
 (20947.0, 25415.0),
 (18392.0, 8193.0),
 (21632.0, 1640.0),
 (15932.0, 717.0),
 (15743.0, 3774.0),
 (20297.0, 24200.0),
 (6753.0, 19157.0),
 (17321.0, 23185.0),
 (14369.0, 5642.0),
 (21571.0, 9262.0),
 (22108.0, 9184.0),
 (8322.0, 21943.0),
 (11495.0, 21781.0),
 (22831.0, 27499.0),
 (25936.0, 4917.0),
 (16297.0, 5146.0),
 (21715.0, 974.0)]

# Top-100 predictions with Naive Bayes

In [27]:
# Fetch the sample once: every take() launches a Spark job, and taking it a
# single time guarantees the printed label and the prediction belong to the
# same test point.
sample_point = testing_data.take(4)[3]
top100 = custom_predict(model, sample_point.features)
print(sample_point.label)
print(top100)

[Stage 82:>                                                         (0 + 1) / 1]

25415.0
[ 2877. 17937. 11495. 12702. 11093. 18971. 27176. 18317. 20396. 21841.
  4435. 12079. 19241. 13319. 18839. 23896. 15040.  9870.  8184. 22157.
   676. 15521.  7800. 21549. 18548.  9639. 27625.  8769. 20457. 25083.
 16466.   312.  5251. 15335.  3941.  4047.  7438. 14174.  1341. 20292.
  2521. 23677. 21815.  9340. 23087.  3539. 19947.  2980.  9278. 25989.
 14456.  6973. 27993.  3892. 15398. 17321. 23968.  4090. 20947. 11767.
  6734. 22624. 12542. 21734.  6614. 17773. 25638. 10825.  2417. 20984.
 24202. 27789. 13754. 15106. 10718. 25471. 24316. 13249. 15919.  8881.
  8841. 11285.  6281.  1269. 19703. 23031. 12468.  8578. 22842. 27019.
 27371.  5047.  6013. 21715. 18152.  5989.  8274. 14177. 16616. 20963.]


                                                                                

In [25]:
# For every test point: (is the true label among the top-100 predictions?, true label).
results_top100 = testing_data.map(
    lambda point: (point.label in custom_predict(model, point.features), point.label)
)

In [26]:
# Count how many test points have (True) / do not have (False) their label in the top 100.
(
    results_top100
    .map(lambda pair: (pair[0], 1))
    .reduceByKey(lambda left, right: np.add(left, right))
    .take(2)
)

                                                                                

[(False, 199036), (True, 222)]