In [2]:
import pyspark
from pyspark.sql import SparkSession

# REDIS CONFIGURATION
redis_host = "redis"
redis_port = "6379"

# Get or create a SparkSession on a local master. The Redis host/port and the
# spark-redis package coordinates are handed to Spark as session config.
spark = (
    SparkSession.builder
    .master("local")
    .appName("jupyter-pyspark")
    .config("spark.redis.host", redis_host)
    .config("spark.redis.port", redis_port)
    .config("spark.jars.packages", "com.redislabs:spark-redis_2.12:3.0.0")
    .getOrCreate()
)

# Keep a handle on the SparkContext and only log errors.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [4]:
# Read every CSV in the exam-scores folder, using the first line as the header
# and asking Spark to infer column types, then show the resulting schema.
ex_score = spark.read.csv(
    "file:///home/jovyan/datasets/exam-scores/*.csv",
    header=True,
    inferSchema=True,
)
ex_score.printSchema()

root
 |-- Class_Section: string (nullable = true)
 |-- Exam_Version: string (nullable = true)
 |-- Completion_Time: integer (nullable = true)
 |-- Made_Own_Study_Guide: string (nullable = true)
 |-- Did_Exam_Prep Assignment: string (nullable = true)
 |-- Studied_In_Groups: string (nullable = true)
 |-- Student_Score: integer (nullable = true)
 |-- Percentage: string (nullable = true)
 |-- Letter_Grade: string (nullable = true)



In [5]:
# Create a Temp View
# Load the exam-score CSVs (header row, inferred types) and register the
# result under the name "examscores" so it can be queried with spark.sql().
ex_scores2 = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("file:///home/jovyan/datasets/exam-scores/*.csv")
)
ex_scores2.createOrReplaceTempView("examscores")

In [7]:
# Create Ids for hash id
# rowid = row_number() + 1000, so the first row gets 1001.
# The window orders by every column rather than Class_Section alone: many rows
# share the same Class_Section, and row_number() over tied sort keys is not
# guaranteed to number the tied rows the same way on every run. Ordering by
# all columns leaves only fully identical rows tied, and those are
# interchangeable. The column with a space in its name needs backticks.
query = '''
select
row_number() over (
    order by
        Class_Section,
        Exam_Version,
        Completion_Time,
        Made_Own_Study_Guide,
        `Did_Exam_Prep Assignment`,
        Studied_In_Groups,
        Student_Score,
        Percentage,
        Letter_Grade
) + 1000 as rowid,
*
from examscores
'''

# Run the query against the "examscores" temp view and show the schema.
ex_sc = spark.sql(query)
ex_sc.printSchema()

root
 |-- rowid: integer (nullable = true)
 |-- Class_Section: string (nullable = true)
 |-- Exam_Version: string (nullable = true)
 |-- Completion_Time: integer (nullable = true)
 |-- Made_Own_Study_Guide: string (nullable = true)
 |-- Did_Exam_Prep Assignment: string (nullable = true)
 |-- Studied_In_Groups: string (nullable = true)
 |-- Student_Score: integer (nullable = true)
 |-- Percentage: string (nullable = true)
 |-- Letter_Grade: string (nullable = true)



In [8]:
# load redis with data and the rowid
# Writes ex_sc through the spark-redis data source into table "examscores",
# using the rowid column as the key column and replacing any existing data.
(
    ex_sc.write
    .format("org.apache.spark.sql.redis")
    .mode("overwrite")
    .option("table", "examscores")
    .option("key.column", "rowid")
    .save()
)


In [9]:
# Read the data from redis
# Loads table "examscores" back through the spark-redis data source, with
# rowid as the key column, matching the options used for the write.
ex_sc1 = (
    spark.read
    .format("org.apache.spark.sql.redis")
    .option("table", "examscores")
    .option("key.column", "rowid")
    .load()
)

# Convert to a pandas DataFrame so the notebook renders it as a table.
ex_sc1.toPandas()

Unnamed: 0,rowid,Class_Section,Exam_Version,Completion_Time,Made_Own_Study_Guide,Did_Exam_Prep Assignment,Studied_In_Groups,Student_Score,Percentage,Letter_Grade
0,1008,M01,B,15,Y,Y,Y,26,86.70%,B+
1,1062,M02,D,45,Y,Y,N,22,73.30%,C+
2,1040,M02,B,15,N,N,Y,19,63.30%,C-
3,1052,M02,C,35,N,Y,Y,23,76.70%,B-
4,1041,M02,B,25,?,?,?,25,83.30%,B
...,...,...,...,...,...,...,...,...,...,...
60,1025,M01,D,40,?,?,?,20,66.70%,C
61,1047,M02,B,55,Y,N,N,17,56.70%,D
62,1015,M01,C,20,Y,Y,Y,26,86.70%,B+
63,1006,M01,A,60,N,Y,Y,25,83.30%,B


In [18]:
from pyspark.sql.functions import *

# Aggregate over the whole table (groupBy() with no columns) to get a single
# row holding the minimum, average and maximum Student_Score.
# min / avg / max here are the pyspark.sql.functions versions brought in by
# the wildcard import above, not the Python builtins.
examsummary = ex_sc1.groupBy().agg(
    min("Student_Score").alias("min_score"),
    avg("Student_Score").alias("avg_score"),
    max("Student_Score").alias("max_score"),
)

# Store the summary in Redis table "examscoresummary", replacing any existing
# data. No key.column option is set for this write.
(
    examsummary.write
    .format("org.apache.spark.sql.redis")
    .mode("overwrite")
    .option("table", "examscoresummary")
    .save()
)
