In [9]:
import os 
import warnings
import time
import numpy as np
import scipy.stats
import sys
import sklearn
import sklearn.datasets

from pyspark.sql import SparkSession
warnings.filterwarnings('ignore')
import pandas as pd

# launch this cell if you have issues on windows with py4j (think about updating your PATH)

os.environ['PYSPARK_DRIVER_PYTHON_OPTS']= "notebook"
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['PYSPARK_PYTHON'] = sys.executable

# starts a spark session from notebook

os.environ['PYSPARK_SUBMIT_ARGS'] ="--conf spark.driver.memory=4g  pyspark-shell"


spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("feature_selection") \
    .getOrCreate()

sc=spark.sparkContext

In [10]:
train_sessions_engineered = spark.read.csv('../Data/session_engineered_features.txt',header=False,
                                          inferSchema=True)


train_purchases = spark.read.load('../Data/train_purchases.csv', 
                          format='com.databricks.spark.csv', 
                          header='true', 
                          inferSchema='true')


                                                                                

In [3]:
selected_features_indices = [-1, 36, 4, 0, 34, 17, 35, 18, 32, 5, 33, 16, 31, 21, 19, 28, 26, 30, 14, 23, 13]
columns_to_keep = []
for elem in selected_features_indices:
    columns_to_keep.append(train_sessions_engineered.columns[elem+1])
train_sessions_engineered = train_sessions_engineered.select(columns_to_keep)
train_sessions_engineered.take(1)

[Row(_c0=720, _c37=0.0, _c5=2021.0, _c1=23943.0, _c35=0.0, _c18=0.0, _c36=0.0, _c19=0.0, _c33=0.0, _c6=21890.0, _c34=0.0, _c17=0.0, _c32=0.0, _c22=0.0, _c20=0.0, _c29=0.0, _c27=0.0, _c31=0.0, _c15=0.0, _c24=0.0, _c14=0.0)]

In [11]:
# join the dataframes on the session ids and drop useless columns
train_df = train_sessions_engineered.join(train_purchases,train_sessions_engineered._c0 == train_purchases.session_id,"inner" )
for col in ['_c0','session_id','date']:
    train_df = train_df.drop(col)
    


In [12]:
summary = train_df.describe().toPandas().set_index('summary').transpose()
print(summary)

dic_val = {}
for i in range(1,39):
    row = summary.iloc[i-1]
    dic_val[i] = (float(row[1]),float(row[2]))



summary    count                   mean               stddev     min      max
_c1      1000000             2249.79697   7537.1895505977045     0.0  86006.0
_c2      1000000               1.388241   1.1001686853797183     0.0      3.0
_c3      1000000               3.032616   1.3751400247080612     0.0      5.0
_c4      1000000               6.078554    3.443764093052108     1.0     12.0
_c5      1000000            2020.312907  0.46367706903339845  2020.0   2021.0
_c6      1000000           13995.014149    8199.369861587444     3.0  28143.0
_c7      1000000            2576.144313   7783.3387749604935     0.0  85551.0
_c8      1000000      889.8452960704425    3567.530451927126     0.0  85551.0
_c9      1000000      801.0423318404028   2756.8595655441545     0.0  42422.5
_c10     1000000            3686.692673    7297.767575560808    -1.0  28143.0
_c11     1000000              -0.016195     1.66395357228394    -1.0     97.0
_c12     1000000               0.475118   1.1018965923588733    

22/05/26 07:57:56 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [13]:
broadcast_clusters = sc.broadcast(dic_val)
def standardize_column(row, br):
    dic = br.value
    temp = []
    for i in range(1,39):
        if i != 37:
            elem = dic.get(i)
            temp.append((row[i-1]-elem[0])/elem[1])
        else:
            temp.append(row[i-1])
    temp.append(row[-1])
    return tuple(temp)
#print(standardize_column(train_df.take(1)[0], broadcast_clusters))
train_df = train_df.rdd.map(lambda x: standardize_column(x,broadcast_clusters))



In [14]:
train_df.take(1)[0]

                                                                                

(-0.29849282081828354,
 -1.2618437685497792,
 0.7034803602675831,
 -0.8939503162284169,
 -0.6748382029162996,
 0.20233089603288254,
 -0.3248148880700406,
 -0.23597424252277663,
 -0.29056334310677573,
 -0.5053179119254991,
 -0.5912454628464368,
 -0.4311820213391315,
 0.7778015314192567,
 -0.21966859401106706,
 -0.14024313543965866,
 -0.28156845614887005,
 -0.11739063711586989,
 -0.27819747636939873,
 11.262389284075562,
 -0.1976933864145327,
 -0.2737890026602472,
 -0.0674906920240801,
 -0.16416137125151084,
 -0.1446736676778695,
 -0.7161474112180884,
 -0.3304611746766584,
 -0.08633323341511491,
 -0.30663512269034976,
 -0.13455510481389857,
 -0.3613905983939951,
 -0.14040132047455203,
 -0.2765116961929063,
 -0.1201674350042513,
 -0.09547745631666549,
 -0.18379670846051085,
 -0.07471282235560649,
 0.0,
 -0.4138766997718175,
 18626)

# KNN

In [15]:
training_data, testing_data = train_df.randomSplit([0.8, 0.2])

def distance_knn(x, input_row):
    add = 0
    for count, elem in enumerate(x) :
        add += np.abs(elem-input_row[count])
    return add

def pred_knn(data,input_row):
    distance_set = data.map(lambda x : (distance_knn(x[:-1], input_row),x[-1]))
    sorted_set = distance_set.sortBy(lambda x: x[0], ascending=False)
    return(sorted_set)

test_100 = testing_data.take(100)
for i in range(100):
    row = test_100[i]
    res = pred_knn(training_data,row).take(100)
    print(row[-1] in res.flatMap(lambda x: x[1]).collect(), row[-1])
    

                                                                                

AttributeError: 'list' object has no attribute 'flatMap'

In [None]:
testing_data.take(1)[0][-1]