In [2]:
import pandas as pd

In [3]:
#importing the findspark and entering the path where the spark 2.3.0 is installed
import findspark
findspark.init()

In [4]:
#importing the pyspark 
import pyspark

In [5]:
#creating a spark session from the pyspark.sql
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [6]:
#creating a sample dataframe to test whether the spark sql is running or not
df = spark.sql('''Select 'spark' as hello ''')
df.show()

+-----+
|hello|
+-----+
|spark|
+-----+



In [7]:
#Using the jupyter it automatically creates the SparkContext(sc) and now we need to create the SQLContext with in the spark context
from pyspark.sql import SQLContext

In [8]:
#initializing the spark session and spark context
sqlContext = SQLContext(sparkContext=spark.sparkContext, sparkSession=spark)

In [9]:
product = pd.read_csv('/Users/katie/Desktop/kz/products.csv')
product.head()

Unnamed: 0,product_id,product_name,department_id
0,1,Chocolate Sandwich Cookies,19
1,2,All-Seasons Salt,13
2,3,Robust Golden Unsweetened Oolong Tea,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,1
4,5,Green Chile Anytime Sauce,13


In [10]:
order = pd.read_csv('/Users/katie/Desktop/kz/order-product.csv')
order.head()

Unnamed: 0,order_id,product_id,add_to_cart_order
0,1,49302,1
1,1,11109,2
2,1,10246,3
3,1,49683,4
4,1,43633,5


In [11]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [12]:
prod_data =sqlContext.read.load('/Users/katie/Desktop/kz/products.csv',
                                format='com.databricks.spark.csv',header='true',inferSchema='true')

In [13]:
prod_data.describe().show()

+-------+------------------+--------------------+------------------+
|summary|        product_id|        product_name|     department_id|
+-------+------------------+--------------------+------------------+
|  count|             49688|               49688|             49688|
|   mean|           24844.5|              1493.0|11.728580916537524|
| stddev|14343.834424588147|   620.4329133758138| 5.850420587359203|
|    min|                 1|"""Constant Comme...|           Blunted|
|    max|             49688|with a Splash of ...|                 9|
+-------+------------------+--------------------+------------------+



In [14]:
order_data =sqlContext.read.load('/Users/katie/Desktop/kz/order-product.csv',
                                 format='com.databricks.spark.csv',header='true',inferSchema='true')

In [15]:
order_data.describe().show()

+-------+------------------+------------------+-----------------+
|summary|          order_id|        product_id|add_to_cart_order|
+-------+------------------+------------------+-----------------+
|  count|           1048575|           1048575|          1048575|
|   mean|1289710.0699845029| 25559.30180578404|  8.7418372553227|
| stddev| 748377.6731595509|14118.370817735338|7.411503175819715|
|    min|                 1|                 1|                1|
|    max|           2593147|             49688|               80|
+-------+------------------+------------------+-----------------+



In [16]:
final=order_data.join(prod_data, ["product_id"])

In [17]:
final.show()

+----------+--------+-----------------+--------------------+-------------+
|product_id|order_id|add_to_cart_order|        product_name|department_id|
+----------+--------+-----------------+--------------------+-------------+
|     49302|       1|                1|    Bulgarian Yogurt|           16|
|     11109|       1|                2|Organic 4% Milk F...|           16|
|     10246|       1|                3|Organic Celery He...|            4|
|     49683|       1|                4|      Cucumber Kirby|            4|
|     43633|       1|                5|Lightly Smoked Sa...|           15|
|     13176|       1|                6|Bag of Organic Ba...|            4|
|     47209|       1|                7|Organic Hass Avocado|            4|
|     22035|       1|                8|Organic Whole Str...|           16|
|     39612|      36|                1|Grated Pecorino R...|           16|
|     19660|      36|                2|        Spring Water|            7|
|     49235|      36|    

In [18]:
# Smaller dataset so we will use 0.8 / 0.2
(training, test) = order_data.randomSplit([0.9, 0.1])

In [19]:
# Build the recommendation model using ALS on the training data
als = ALS(maxIter=5, regParam=0.01, userCol="order_id", itemCol="product_id", ratingCol="add_to_cart_order", 
          seed = 0, nonnegative =True, rank = 10)
model = als.fit(training)

In [20]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)

In [21]:
predictions.show()

+--------+----------+-----------------+----------+
|order_id|product_id|add_to_cart_order|prediction|
+--------+----------+-----------------+----------+
|  732117|       148|               24| 14.726572|
|  982020|       148|                8| 16.189732|
|  134443|       148|               14|  7.490478|
| 2350833|       148|                8| 7.3470254|
| 2194412|       148|                8|  9.529991|
| 2399918|       148|                3| 5.3127213|
|  849006|       148|                7|  13.04732|
|  202617|       148|                4| 1.9624496|
| 1107031|       148|                5|   12.5258|
| 1455311|       148|                3| 3.4409857|
|  543415|       471|                5|  4.646825|
|  822673|       471|                6| 4.5432596|
| 2305080|       496|                6|0.71771514|
| 1215936|      1238|                7|0.98448896|
| 1289679|      2366|                1| 2.0956686|
| 2180050|      4818|               14|  4.277628|
| 2576430|      4818|          

In [23]:
#predictions.na().drop(["prediction"])

In [24]:
#evaluator = RegressionEvaluator(metricName="rmse", labelCol="add_to_cart_order",predictionCol="prediction")
#rmse = evaluator.evaluate(predictions)
#print("Root-mean-square error = " + str(rmse))

In [22]:
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("Python Spark Feedforward neural network example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()