In [1]:
pip install pyspark

Note: you may need to restart the kernel to use updated packages.


# Basic Spark Dataframe Operations - Part 1

In [None]:
# Testing of kernal setup
# import os
# os.environ['JAVA_HOME'] = '/Library/Java/JavaVirtualMachines/temurin-17.jdk/Contents/Home'

# from pyspark.sql import SparkSession
# spark = SparkSession.builder.appName('Basics').getOrCreate()

# # Test that spark.read is working
# print("Available methods on spark.read:")
# print([method for method in dir(spark.read) if not method.startswith('_')])


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/24 13:06:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Available methods on spark.read:
['csv', 'format', 'jdbc', 'json', 'load', 'option', 'options', 'orc', 'parquet', 'schema', 'table', 'text', 'xml']


In [1]:
import os
os.environ['JAVA_HOME'] = '/Library/Java/JavaVirtualMachines/temurin-17.jdk/Contents/Home'

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Basics').getOrCreate()


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/24 13:40:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
df = spark.read.json('data/people.json')

In [3]:
df.show() # Prints the dataframe

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
|  25|  Sarah|
|NULL|  David|
|  42|Jessica|
|  35|  Chris|
|  22|  Emily|
|NULL| Daniel|
|  51|  Laura|
|  29| Robert|
|  38|  Linda|
+----+-------+



In [4]:
df.printSchema() # Prints Schema of the dataframe attributes in a tree format

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [5]:
df.columns # Returns column names of dataframe

['age', 'name']

In [6]:
df.describe().show() # Provides a statistical summary of data

+-------+------------------+-----+
|summary|               age| name|
+-------+------------------+-----+
|  count|                 9|   12|
|   mean|32.333333333333336| NULL|
| stddev|10.222524150130436| NULL|
|    min|                19| Andy|
|    max|                51|Sarah|
+-------+------------------+-----+



# Chaning Schema Type

In [7]:
from pyspark.sql.types import (StructField, StringType, IntegerType, StructType)

In [8]:
data_schema = [StructField('age', IntegerType(), True),
               StructField('name', StringType(), True)]

In [9]:
final_struc = StructType(fields=data_schema)

In [11]:
df = spark.read.json('data/people.json', schema=final_struc)

df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



# Basic Spark Dataframe Operations - Part 2

In [12]:
df.show()

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
|  25|  Sarah|
|NULL|  David|
|  42|Jessica|
|  35|  Chris|
|  22|  Emily|
|NULL| Daniel|
|  51|  Laura|
|  29| Robert|
|  38|  Linda|
+----+-------+



In [13]:
type(df['age']) # Returning a column of age

pyspark.sql.classic.column.Column

In [14]:
df.select('age').show() # Returning a dataframe with single column age

+----+
| age|
+----+
|NULL|
|  30|
|  19|
|  25|
|NULL|
|  42|
|  35|
|  22|
|NULL|
|  51|
|  29|
|  38|
+----+



In [15]:
df.select('age', 'name').show() # Returning a dataframe with two columns age and name

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
|  25|  Sarah|
|NULL|  David|
|  42|Jessica|
|  35|  Chris|
|  22|  Emily|
|NULL| Daniel|
|  51|  Laura|
|  29| Robert|
|  38|  Linda|
+----+-------+



In [16]:
df.select(['age', 'name']).show()

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
|  25|  Sarah|
|NULL|  David|
|  42|Jessica|
|  35|  Chris|
|  22|  Emily|
|NULL| Daniel|
|  51|  Laura|
|  29| Robert|
|  38|  Linda|
+----+-------+



In [21]:
df.head(2)

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]

In [26]:
# withColumn function helps creating a new column or replace an existing column in a dataframe
# Not an inplace operation
df.withColumn('double_age', df['age']*2).show() 
# df.withColumn('triple_age', df['age']*3).show() 


+----+-------+----------+
| age|   name|double_age|
+----+-------+----------+
|NULL|Michael|      NULL|
|  30|   Andy|        60|
|  19| Justin|        38|
|  25|  Sarah|        50|
|NULL|  David|      NULL|
|  42|Jessica|        84|
|  35|  Chris|        70|
|  22|  Emily|        44|
|NULL| Daniel|      NULL|
|  51|  Laura|       102|
|  29| Robert|        58|
|  38|  Linda|        76|
+----+-------+----------+

+----+-------+----------+
| age|   name|triple_age|
+----+-------+----------+
|NULL|Michael|      NULL|
|  30|   Andy|        90|
|  19| Justin|        57|
|  25|  Sarah|        75|
|NULL|  David|      NULL|
|  42|Jessica|       126|
|  35|  Chris|       105|
|  22|  Emily|        66|
|NULL| Daniel|      NULL|
|  51|  Laura|       153|
|  29| Robert|        87|
|  38|  Linda|       114|
+----+-------+----------+



In [23]:
# New column not added, since the above operation is not an inplace operation
df.show()

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
|  25|  Sarah|
|NULL|  David|
|  42|Jessica|
|  35|  Chris|
|  22|  Emily|
|NULL| Daniel|
|  51|  Laura|
|  29| Robert|
|  38|  Linda|
+----+-------+



In [27]:
# Renaming a column
df.withColumnRenamed('age', 'new_age_renamed').show()

+---------------+-------+
|new_age_renamed|   name|
+---------------+-------+
|           NULL|Michael|
|             30|   Andy|
|             19| Justin|
|             25|  Sarah|
|           NULL|  David|
|             42|Jessica|
|             35|  Chris|
|             22|  Emily|
|           NULL| Daniel|
|             51|  Laura|
|             29| Robert|
|             38|  Linda|
+---------------+-------+



In [28]:
df.show()

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
|  25|  Sarah|
|NULL|  David|
|  42|Jessica|
|  35|  Chris|
|  22|  Emily|
|NULL| Daniel|
|  51|  Laura|
|  29| Robert|
|  38|  Linda|
+----+-------+



# Spark in SQL 

In [29]:
df.createOrReplaceTempView('people') # Registering it as a SQL temporaroy view

In [30]:
results = spark.sql("SELECT * FROM people")


+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
|  25|  Sarah|
|NULL|  David|
|  42|Jessica|
|  35|  Chris|
|  22|  Emily|
|NULL| Daniel|
|  51|  Laura|
|  29| Robert|
|  38|  Linda|
+----+-------+



In [23]:
results = spark.sql("SELECT age FROM people WHERE name = 'Andy'")
results.show()

+---+
|age|
+---+
| 30|
+---+



In [34]:
results = spark.sql("SELECT * FROM people " \
"WHERE age is NOT NULL ")
results.show()

+---+-------+
|age|   name|
+---+-------+
| 30|   Andy|
| 19| Justin|
| 25|  Sarah|
| 42|Jessica|
| 35|  Chris|
| 22|  Emily|
| 51|  Laura|
| 29| Robert|
| 38|  Linda|
+---+-------+



# Data Filtering in Spark Dataframes

In [37]:
spark = SparkSession.builder.appName('ops').getOrCreate()

df = spark.read.csv('data/appl_stock.csv', header=True, inferSchema=True)

In [38]:
df.show()

+----------+--------+--------+--------+--------+---------+---------+
|      Date|    Open|    High|     Low|   Close|Adj Close|   Volume|
+----------+--------+--------+--------+--------+---------+---------+
|1980-12-12|0.128348|0.128906|0.128348|0.128348| 0.100323|469033600|
|1980-12-15| 0.12221| 0.12221|0.121652|0.121652| 0.095089|175884800|
|1980-12-16|0.113281|0.113281|0.112723|0.112723|  0.08811|105728000|
|1980-12-17|0.115513|0.116071|0.115513|0.115513| 0.090291| 86441600|
|1980-12-18|0.118862| 0.11942|0.118862|0.118862| 0.092908| 73449600|
|1980-12-19|0.126116|0.126674|0.126116|0.126116| 0.098578| 48630400|
|1980-12-22|0.132254|0.132813|0.132254|0.132254| 0.103376| 37363200|
|1980-12-23|0.137835|0.138393|0.137835|0.137835| 0.107739| 46950400|
|1980-12-24|0.145089|0.145647|0.145089|0.145089| 0.113409| 48003200|
|1980-12-26|0.158482| 0.15904|0.158482|0.158482| 0.123877| 55574400|
|1980-12-29|0.160714|0.161272|0.160714|0.160714| 0.125622| 93161600|
|1980-12-30|0.157366|0.157366|0.15

In [39]:
df.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- Volume: long (nullable = true)



In [40]:
df.describe().show()

25/10/24 14:16:44 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+------------------+------------------+-----------------+------------------+------------------+--------------------+
|summary|              Open|              High|              Low|             Close|         Adj Close|              Volume|
+-------+------------------+------------------+-----------------+------------------+------------------+--------------------+
|  count|             10409|             10409|            10409|             10409|             10409|               10409|
|   mean|13.959909858007512|14.111935729945243|13.80916341963685|13.966756942837936| 13.35033663012778|3.3217784717071766E8|
| stddev|30.169243690613175|30.514877824393086|29.83505503777735|30.191696361026302|29.911131975662823| 3.393344185734637E8|
|    min|          0.049665|          0.049665|         0.049107|          0.049107|          0.038384|                   0|
|    max|        182.630005|        182.940002|       179.119995|        182.009995|        181.778397|          7421640800|


In [44]:
df.explain()

== Physical Plan ==
FileScan csv [Date#391,Open#392,High#393,Low#394,Close#395,Adj Close#396,Volume#397L] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/jiaxinhe.he/Desktop/Github/pyspark-practice/data/appl_stoc..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<Date:date,Open:double,High:double,Low:double,Close:double,Adj Close:double,Volume:bigint>




In [41]:
df.filter("Close < 500").show()

+----------+--------+--------+--------+--------+---------+---------+
|      Date|    Open|    High|     Low|   Close|Adj Close|   Volume|
+----------+--------+--------+--------+--------+---------+---------+
|1980-12-12|0.128348|0.128906|0.128348|0.128348| 0.100323|469033600|
|1980-12-15| 0.12221| 0.12221|0.121652|0.121652| 0.095089|175884800|
|1980-12-16|0.113281|0.113281|0.112723|0.112723|  0.08811|105728000|
|1980-12-17|0.115513|0.116071|0.115513|0.115513| 0.090291| 86441600|
|1980-12-18|0.118862| 0.11942|0.118862|0.118862| 0.092908| 73449600|
|1980-12-19|0.126116|0.126674|0.126116|0.126116| 0.098578| 48630400|
|1980-12-22|0.132254|0.132813|0.132254|0.132254| 0.103376| 37363200|
|1980-12-23|0.137835|0.138393|0.137835|0.137835| 0.107739| 46950400|
|1980-12-24|0.145089|0.145647|0.145089|0.145089| 0.113409| 48003200|
|1980-12-26|0.158482| 0.15904|0.158482|0.158482| 0.123877| 55574400|
|1980-12-29|0.160714|0.161272|0.160714|0.160714| 0.125622| 93161600|
|1980-12-30|0.157366|0.157366|0.15

In [42]:
df.filter("Close < 500").select('Open', 'Close').show()

+--------+--------+
|    Open|   Close|
+--------+--------+
|0.128348|0.128348|
| 0.12221|0.121652|
|0.113281|0.112723|
|0.115513|0.115513|
|0.118862|0.118862|
|0.126116|0.126116|
|0.132254|0.132254|
|0.137835|0.137835|
|0.145089|0.145089|
|0.158482|0.158482|
|0.160714|0.160714|
|0.157366|0.156808|
|0.152902|0.152344|
|0.154018|0.154018|
|0.151228| 0.15067|
|0.144531|0.143973|
|0.138393|0.137835|
|0.135603|0.135045|
|0.142299|0.142299|
|0.142299|0.141183|
+--------+--------+
only showing top 20 rows


In [45]:
df.filter("Close < 500").select(['Open', 'Close']).show()

+--------+--------+
|    Open|   Close|
+--------+--------+
|0.128348|0.128348|
| 0.12221|0.121652|
|0.113281|0.112723|
|0.115513|0.115513|
|0.118862|0.118862|
|0.126116|0.126116|
|0.132254|0.132254|
|0.137835|0.137835|
|0.145089|0.145089|
|0.158482|0.158482|
|0.160714|0.160714|
|0.157366|0.156808|
|0.152902|0.152344|
|0.154018|0.154018|
|0.151228| 0.15067|
|0.144531|0.143973|
|0.138393|0.137835|
|0.135603|0.135045|
|0.142299|0.142299|
|0.142299|0.141183|
+--------+--------+
only showing top 20 rows


In [49]:
df.filter(df['close'] < 500).select(['High', 'Close']).show()

+--------+--------+
|    High|   Close|
+--------+--------+
|0.128906|0.128348|
| 0.12221|0.121652|
|0.113281|0.112723|
|0.116071|0.115513|
| 0.11942|0.118862|
|0.126674|0.126116|
|0.132813|0.132254|
|0.138393|0.137835|
|0.145647|0.145089|
| 0.15904|0.158482|
|0.161272|0.160714|
|0.157366|0.156808|
|0.152902|0.152344|
|0.155134|0.154018|
|0.151228| 0.15067|
|0.144531|0.143973|
|0.138393|0.137835|
|0.135603|0.135045|
|0.142857|0.142299|
|0.142299|0.141183|
+--------+--------+
only showing top 20 rows


In [50]:
df.filter(df['close'] < 500).select(['High', 'Low', 'Open']).show()

+--------+--------+--------+
|    High|     Low|    Open|
+--------+--------+--------+
|0.128906|0.128348|0.128348|
| 0.12221|0.121652| 0.12221|
|0.113281|0.112723|0.113281|
|0.116071|0.115513|0.115513|
| 0.11942|0.118862|0.118862|
|0.126674|0.126116|0.126116|
|0.132813|0.132254|0.132254|
|0.138393|0.137835|0.137835|
|0.145647|0.145089|0.145089|
| 0.15904|0.158482|0.158482|
|0.161272|0.160714|0.160714|
|0.157366|0.156808|0.157366|
|0.152902|0.152344|0.152902|
|0.155134|0.154018|0.154018|
|0.151228| 0.15067|0.151228|
|0.144531|0.143973|0.144531|
|0.138393|0.137835|0.138393|
|0.135603|0.135045|0.135603|
|0.142857|0.142299|0.142299|
|0.142299|0.141183|0.142299|
+--------+--------+--------+
only showing top 20 rows


In [51]:
df.filter("Open < 500").select(['Open', 'High']).show()

+--------+--------+
|    Open|    High|
+--------+--------+
|0.128348|0.128906|
| 0.12221| 0.12221|
|0.113281|0.113281|
|0.115513|0.116071|
|0.118862| 0.11942|
|0.126116|0.126674|
|0.132254|0.132813|
|0.137835|0.138393|
|0.145089|0.145647|
|0.158482| 0.15904|
|0.160714|0.161272|
|0.157366|0.157366|
|0.152902|0.152902|
|0.154018|0.155134|
|0.151228|0.151228|
|0.144531|0.144531|
|0.138393|0.138393|
|0.135603|0.135603|
|0.142299|0.142857|
|0.142299|0.142299|
+--------+--------+
only showing top 20 rows


In [54]:
df.filter(df['Close'] < 0.12).filter(df['Open'] > 0.12).show()

+----------+--------+--------+--------+--------+---------+---------+
|      Date|    Open|    High|     Low|   Close|Adj Close|   Volume|
+----------+--------+--------+--------+--------+---------+---------+
|1981-05-14|0.121094|0.121094|0.119978|0.119978| 0.093781|  4928000|
|1984-01-09|0.123884|0.123884|0.113281|0.117188|   0.0916|215734400|
|1984-01-27|0.123326|0.123884|0.114397|0.116629| 0.091163|194096000|
|1984-02-23| 0.12221| 0.12221|0.116071|0.119978| 0.093781|155052800|
|1984-02-28|0.120536|0.121094|0.112165|0.113839| 0.088982|169926400|
|1984-03-05|0.121652| 0.12221|0.117746| 0.11942| 0.093345| 73606400|
|1984-03-14|0.120536|0.121094|0.118304|0.118862| 0.092908| 59606400|
|1984-06-26|0.121652| 0.12221|0.116071|0.116071| 0.090727|148646400|
|1984-07-30|0.121094|0.121652|0.112723|0.113839| 0.088982|125036800|
|1984-08-31|0.120536|0.121094|0.116629|0.118304| 0.092472|137849600|
|1984-09-21|0.121094|0.124442|0.118304|0.119978| 0.093781| 99836800|
|1984-12-10|0.121652|0.121652| 0.1

In [55]:
df.filter((df['Close'] < 0.12) & (df['Open'] > 0.12)).show()

+----------+--------+--------+--------+--------+---------+---------+
|      Date|    Open|    High|     Low|   Close|Adj Close|   Volume|
+----------+--------+--------+--------+--------+---------+---------+
|1981-05-14|0.121094|0.121094|0.119978|0.119978| 0.093781|  4928000|
|1984-01-09|0.123884|0.123884|0.113281|0.117188|   0.0916|215734400|
|1984-01-27|0.123326|0.123884|0.114397|0.116629| 0.091163|194096000|
|1984-02-23| 0.12221| 0.12221|0.116071|0.119978| 0.093781|155052800|
|1984-02-28|0.120536|0.121094|0.112165|0.113839| 0.088982|169926400|
|1984-03-05|0.121652| 0.12221|0.117746| 0.11942| 0.093345| 73606400|
|1984-03-14|0.120536|0.121094|0.118304|0.118862| 0.092908| 59606400|
|1984-06-26|0.121652| 0.12221|0.116071|0.116071| 0.090727|148646400|
|1984-07-30|0.121094|0.121652|0.112723|0.113839| 0.088982|125036800|
|1984-08-31|0.120536|0.121094|0.116629|0.118304| 0.092472|137849600|
|1984-09-21|0.121094|0.124442|0.118304|0.119978| 0.093781| 99836800|
|1984-12-10|0.121652|0.121652| 0.1

In [56]:
df.filter(df['Low'] == 0.142299).show()

+----------+--------+--------+--------+--------+---------+---------+
|      Date|    Open|    High|     Low|   Close|Adj Close|   Volume|
+----------+--------+--------+--------+--------+---------+---------+
|1981-01-09|0.142299|0.142857|0.142299|0.142299| 0.111228| 21504000|
|1981-01-20|0.142857|0.142857|0.142299|0.142299| 0.111228| 30083200|
|1982-12-01|0.142299| 0.15067|0.142299|0.145089| 0.113409|206841600|
+----------+--------+--------+--------+--------+---------+---------+



In [57]:
result = df.filter(df['Low'] == 0.142299).collect()

In [68]:
row = result[0]
# print( result)
row.asDict()

[Row(Date=datetime.date(1981, 1, 9), Open=0.142299, High=0.142857, Low=0.142299, Close=0.142299, Adj Close=0.111228, Volume=21504000), Row(Date=datetime.date(1981, 1, 20), Open=0.142857, High=0.142857, Low=0.142299, Close=0.142299, Adj Close=0.111228, Volume=30083200), Row(Date=datetime.date(1982, 12, 1), Open=0.142299, High=0.15067, Low=0.142299, Close=0.145089, Adj Close=0.113409, Volume=206841600)]


{'Date': datetime.date(1981, 1, 9),
 'Open': 0.142299,
 'High': 0.142857,
 'Low': 0.142299,
 'Close': 0.142299,
 'Adj Close': 0.111228,
 'Volume': 21504000}

In [61]:
# row.asDict()['Volume']
row.asDict()['Open']
# row.asDict()['High']

0.142299

# Data Aggregation in Spark Dataframes

In [69]:
from pyspark.sql import SparkSession

In [70]:
spark = SparkSession.builder.appName('aggs').getOrCreate()

25/10/24 15:31:27 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [71]:
df = spark.read.csv('data/sales_info.csv', header=True, inferSchema=True)

In [72]:
df.show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|  200|
|   GOOG|Charlie|  120|
|   MSFT|    Amy|  340|
|   MSFT|Vanessa|  124|
|     FB|   Carl|  243|
|     FB|  Sarah|  350|
|   APPL|   John|  250|
|   APPL|  Linda|  130|
|   APPL|   Mike|  750|
|   APPL|  Chris|  350|
+-------+-------+-----+



In [73]:
df.printSchema()

root
 |-- Company: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Sales: integer (nullable = true)



In [74]:
df.groupBy("Company")

GroupedData[grouping expressions: [Company], value: [Company: string, Person: string ... 1 more field], type: GroupBy]

In [75]:
df.groupBy("Company").mean().show()

+-------+----------+
|Company|avg(Sales)|
+-------+----------+
|   APPL|     370.0|
|   GOOG|     160.0|
|     FB|     296.5|
|   MSFT|     232.0|
+-------+----------+



In [76]:
df.groupBy("Company").count().show()

+-------+-----+
|Company|count|
+-------+-----+
|   APPL|    4|
|   GOOG|    2|
|     FB|    2|
|   MSFT|    2|
+-------+-----+



In [78]:
df.groupBy("Company").max().show()

+-------+----------+
|Company|max(Sales)|
+-------+----------+
|   APPL|       750|
|   GOOG|       200|
|     FB|       350|
|   MSFT|       340|
+-------+----------+



In [79]:
df.agg({'Sales':'sum'}).show()

+----------+
|sum(Sales)|
+----------+
|      2857|
+----------+



In [80]:
df.agg({'Sales':'max'}).show()

+----------+
|max(Sales)|
+----------+
|       750|
+----------+



In [81]:
df.agg({'Sales':'min'}).show()

+----------+
|min(Sales)|
+----------+
|       120|
+----------+



In [82]:
df.agg({'Sales':'avg'}).show()

+----------+
|avg(Sales)|
+----------+
|     285.7|
+----------+



In [83]:
group_data = df.groupBy("Company")
group_data.agg({'Sales':'max'}).show()

+-------+----------+
|Company|max(Sales)|
+-------+----------+
|   APPL|       750|
|   GOOG|       200|
|     FB|       350|
|   MSFT|       340|
+-------+----------+



---
### 📍 BOOKMARK: Continue from here
Last session ended: Data Aggregation section
Next: Work on functions and advanced aggregations
---

In [51]:
from pyspark.sql.functions import countDistinct,avg,stddev

df.select(avg('Sales').alias('Average Sales')).show()

+-----------------+
|    Average Sales|
+-----------------+
|360.5833333333333|
+-----------------+



In [52]:
df.select(countDistinct('Sales')).show()

+---------------------+
|count(DISTINCT Sales)|
+---------------------+
|                   11|
+---------------------+



In [53]:
df.orderBy('Sales').show() # OrderBy Ascending

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|Charlie|120.0|
|   MSFT|    Amy|124.0|
|   APPL|  Linda|130.0|
|   GOOG|    Sam|200.0|
|   MSFT|Vanessa|243.0|
|   APPL|   John|250.0|
|   GOOG|  Frank|340.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   MSFT|   Tina|600.0|
|   APPL|   Mike|750.0|
|     FB|   Carl|870.0|
+-------+-------+-----+



In [54]:
df.orderBy(df['Sales'].desc()).show() # OrderBy Descending

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|     FB|   Carl|870.0|
|   APPL|   Mike|750.0|
|   MSFT|   Tina|600.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   GOOG|  Frank|340.0|
|   APPL|   John|250.0|
|   MSFT|Vanessa|243.0|
|   GOOG|    Sam|200.0|
|   APPL|  Linda|130.0|
|   MSFT|    Amy|124.0|
|   GOOG|Charlie|120.0|
+-------+-------+-----+



# Handling Missing Data

In [55]:
df = spark.read.csv('/kaggle/input/null-data/ContainsNull.csv', inferSchema=True, header=True)

In [56]:
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [57]:
df.na.drop().show() # Drops all rows affected with atleast one Null Values

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [58]:
df.na.drop(thresh=2).show() # Drops rows affected with atleast 2 Null Values

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [59]:
df.na.drop(how='all').show() # Drop Rows if all values are Null

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [60]:
df.na.drop(how='any').show() # Drop Rows if any value is Null

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [61]:
df.na.drop(subset=['Sales']).show() # Drop only those rows where Sales is Null

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [62]:
df.na.fill(0).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [63]:
df.na.fill('unknown').show()

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| null|
|emp2|unknown| null|
|emp3|unknown|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



In [64]:
df.na.fill('unknown', subset=['Name']).show()

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| null|
|emp2|unknown| null|
|emp3|unknown|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



In [65]:
from pyspark.sql.functions import mean

In [66]:
mean_val = df.select(mean(df['Sales'])).collect()

In [67]:
mean_sales = mean_val[0][0]

In [68]:
df.na.fill(mean_sales, subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



# Dates and Timestamps in Spark Dataframe

In [69]:
df = spark.read.csv('/kaggle/input/apple-stock/appl_stock.csv', inferSchema=True, header=True)

df.show()

+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|               Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08 00:00:00|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|    

In [70]:
df.head(1)

[Row(Date=datetime.datetime(2010, 1, 4, 0, 0), Open=213.429998, High=214.499996, Low=212.38000099999996, Close=214.009998, Volume=123432400, Adj Close=27.727039)]

In [71]:
df.select(['Open', 'Date', 'Close']).show()

+------------------+-------------------+------------------+
|              Open|               Date|             Close|
+------------------+-------------------+------------------+
|        213.429998|2010-01-04 00:00:00|        214.009998|
|        214.599998|2010-01-05 00:00:00|        214.379993|
|        214.379993|2010-01-06 00:00:00|        210.969995|
|            211.75|2010-01-07 00:00:00|            210.58|
|        210.299994|2010-01-08 00:00:00|211.98000499999998|
|212.79999700000002|2010-01-11 00:00:00|210.11000299999998|
|209.18999499999998|2010-01-12 00:00:00|        207.720001|
|        207.870005|2010-01-13 00:00:00|        210.650002|
|210.11000299999998|2010-01-14 00:00:00|            209.43|
|210.92999500000002|2010-01-15 00:00:00|            205.93|
|        208.330002|2010-01-19 00:00:00|        215.039995|
|        214.910006|2010-01-20 00:00:00|            211.73|
|        212.079994|2010-01-21 00:00:00|        208.069996|
|206.78000600000001|2010-01-22 00:00:00|

In [72]:
from pyspark.sql.functions import (dayofmonth, hour, dayofyear, month, year, weekofyear, format_number, date_format)

df.select(dayofmonth(df['Date']).alias('Day'),
          month(df['Date']).alias('Month'),
          year(df['Date']).alias('Year')).show()

+---+-----+----+
|Day|Month|Year|
+---+-----+----+
|  4|    1|2010|
|  5|    1|2010|
|  6|    1|2010|
|  7|    1|2010|
|  8|    1|2010|
| 11|    1|2010|
| 12|    1|2010|
| 13|    1|2010|
| 14|    1|2010|
| 15|    1|2010|
| 19|    1|2010|
| 20|    1|2010|
| 21|    1|2010|
| 22|    1|2010|
| 25|    1|2010|
| 26|    1|2010|
| 27|    1|2010|
| 28|    1|2010|
| 29|    1|2010|
|  1|    2|2010|
+---+-----+----+
only showing top 20 rows



In [73]:
df_updated = df.withColumn('Year', year(df['Date']))

df_updated.groupBy('Year').mean().select(['Year', "avg(Close)"]).show()

+----+------------------+
|Year|        avg(Close)|
+----+------------------+
|2015|120.03999980555547|
|2013| 472.6348802857143|
|2014| 295.4023416507935|
|2012| 576.0497195640002|
|2016|104.60400786904763|
|2010| 259.8424600000002|
|2011|364.00432532142867|
+----+------------------+



In [74]:
df = spark.read.csv('/kaggle/input/walmart-dataset/walmart_stock.csv', inferSchema=True, header=True)

In [75]:
df.show()

+-------------------+------------------+------------------+------------------+------------------+--------+------------------+
|               Date|              Open|              High|               Low|             Close|  Volume|         Adj Close|
+-------------------+------------------+------------------+------------------+------------------+--------+------------------+
|2012-01-03 00:00:00|         59.970001|         61.060001|         59.869999|         60.330002|12668800|52.619234999999996|
|2012-01-04 00:00:00|60.209998999999996|         60.349998|         59.470001|59.709998999999996| 9593300|         52.078475|
|2012-01-05 00:00:00|         59.349998|         59.619999|         58.369999|         59.419998|12768200|         51.825539|
|2012-01-06 00:00:00|         59.419998|         59.450001|         58.869999|              59.0| 8069400|          51.45922|
|2012-01-09 00:00:00|         59.029999|         59.549999|         58.919998|             59.18| 6679300|51.616215000

In [76]:
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [77]:
display(df.head(5))

[Row(Date=datetime.datetime(2012, 1, 3, 0, 0), Open=59.970001, High=61.060001, Low=59.869999, Close=60.330002, Volume=12668800, Adj Close=52.619234999999996),
 Row(Date=datetime.datetime(2012, 1, 4, 0, 0), Open=60.209998999999996, High=60.349998, Low=59.470001, Close=59.709998999999996, Volume=9593300, Adj Close=52.078475),
 Row(Date=datetime.datetime(2012, 1, 5, 0, 0), Open=59.349998, High=59.619999, Low=58.369999, Close=59.419998, Volume=12768200, Adj Close=51.825539),
 Row(Date=datetime.datetime(2012, 1, 6, 0, 0), Open=59.419998, High=59.450001, Low=58.869999, Close=59.0, Volume=8069400, Adj Close=51.45922),
 Row(Date=datetime.datetime(2012, 1, 9, 0, 0), Open=59.029999, High=59.549999, Low=58.919998, Close=59.18, Volume=6679300, Adj Close=51.616215000000004)]

In [78]:
df.describe().show()

+-------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|summary|              Open|             High|              Low|            Close|           Volume|        Adj Close|
+-------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|  count|              1258|             1258|             1258|             1258|             1258|             1258|
|   mean| 72.35785375357709|72.83938807631165| 71.9186009594594|72.38844998012726|8222093.481717011|67.23883848728146|
| stddev|  6.76809024470826|6.768186808159218|6.744075756255496|6.756859163732991|  4519780.8431556|6.722609449996857|
|    min|56.389998999999996|        57.060001|        56.299999|        56.419998|          2094900|        50.363689|
|    max|         90.800003|        90.970001|            89.25|        90.470001|         80898100|84.91421600000001|
+-------+------------------+-----------------+--

In [79]:
df.select((df['High']/df['Volume']).alias('HV Ratio')).show()

+--------------------+
|            HV Ratio|
+--------------------+
|4.819714653321546E-6|
|6.290848613094555E-6|
|4.669412994783916E-6|
|7.367338463826307E-6|
|8.915604778943901E-6|
|8.644477436914568E-6|
|9.351828421515645E-6|
| 8.29141562102703E-6|
|7.712212102001476E-6|
|7.071764823529412E-6|
|1.015495466386981E-5|
|6.576354146362592...|
| 5.90145296180676E-6|
|8.547679455011844E-6|
|8.420709512685392E-6|
|1.041448341728929...|
|8.316075414862431E-6|
|9.721183814992126E-6|
|8.029436027707578E-6|
|6.307432259386365E-6|
+--------------------+
only showing top 20 rows



In [80]:
df.createOrReplaceTempView('walmart')

In [81]:
k = spark.sql('SELECT Date FROM walmart WHERE High = (SELECT MAX(High) FROM walmart)')
k.show()

+-------------------+
|               Date|
+-------------------+
|2015-01-13 00:00:00|
+-------------------+



In [82]:
from pyspark.sql.functions import mean,round, max, min

df.select(round(mean(df['Close']), 2).alias('Average Closing Price')).show()

+---------------------+
|Average Closing Price|
+---------------------+
|                72.39|
+---------------------+



In [83]:
df.select(max(df['Volume']).alias('Maximum Volume'), min(df['Volume']).alias('Minimum Volume')).show()

+--------------+--------------+
|Maximum Volume|Minimum Volume|
+--------------+--------------+
|      80898100|       2094900|
+--------------+--------------+



# Linear Regression

In [84]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

spark = SparkSession.builder.appName('Linear Regression Example').getOrCreate()

23/03/22 13:48:35 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [85]:
training = spark.read.format('libsvm').load('/kaggle/input/linear-reg/sample_linear_regression_data.txt')

23/03/22 13:48:35 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


In [86]:
 training.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

In [87]:
lr = LinearRegression(featuresCol='features', labelCol='label', predictionCol='prediction')

In [88]:
lr_model = lr.fit(training)

23/03/22 13:48:36 WARN Instrumentation: [ca59ab54] regParam is zero, which might cause numerical instability and overfitting.


In [89]:
lr_model.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [90]:
lr_model.intercept

0.14228558260358093

In [91]:
training_summary = lr_model.summary

In [92]:
print(training_summary.r2)
print(training_summary.rootMeanSquaredError)

0.027839179518600154
10.16309157133015


In [93]:
all_data = spark.read.format('libsvm').load('/kaggle/input/linear-reg/sample_linear_regression_data.txt')

train, test = all_data.randomSplit([0.7, 0.3])


23/03/22 13:48:38 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


In [94]:
train.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                346|
|   mean|0.38718282905994267|
| stddev| 10.547197679747212|
|    min|-28.571478869743427|
|    max|  27.78383192005107|
+-------+-------------------+



In [95]:
test.describe().show()

+-------+--------------------+
|summary|               label|
+-------+--------------------+
|  count|                 155|
|   mean|-0.03396102538742036|
| stddev|   9.813657929980804|
|    min| -22.837460416919342|
|    max|  27.111027963108548|
+-------+--------------------+



In [96]:
correct_model = lr.fit(train)

23/03/22 13:48:39 WARN Instrumentation: [fceb059a] regParam is zero, which might cause numerical instability and overfitting.


In [97]:
result = correct_model.evaluate(test)

print(result.rootMeanSquaredError)
print(result.r2)

10.017598939294926
-0.048760752319122025


In [98]:
unlabeled_data = test.select('features')
predictions = correct_model.transform(unlabeled_data)

In [99]:
predictions.show()

+--------------------+--------------------+
|            features|          prediction|
+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...|  -3.127773663860553|
|(10,[0,1,2,3,4,5,...|  1.6881636681447239|
|(10,[0,1,2,3,4,5,...| 0.24640815983713976|
|(10,[0,1,2,3,4,5,...|   2.105659966042688|
|(10,[0,1,2,3,4,5,...|  -1.169484186977471|
|(10,[0,1,2,3,4,5,...| -1.8206162296128197|
|(10,[0,1,2,3,4,5,...|  0.7850285287671219|
|(10,[0,1,2,3,4,5,...| 0.46019525634398684|
|(10,[0,1,2,3,4,5,...| 0.07778395022877493|
|(10,[0,1,2,3,4,5,...|  0.7448539880942082|
|(10,[0,1,2,3,4,5,...| 0.14002040055577314|
|(10,[0,1,2,3,4,5,...|   2.361226734051958|
|(10,[0,1,2,3,4,5,...|   3.765200081576966|
|(10,[0,1,2,3,4,5,...| -0.9186850194445468|
|(10,[0,1,2,3,4,5,...|  0.9065518377678516|
|(10,[0,1,2,3,4,5,...|   1.506037086060299|
|(10,[0,1,2,3,4,5,...| -1.8018227670246998|
|(10,[0,1,2,3,4,5,...|  2.2614952304186553|
|(10,[0,1,2,3,4,5,...|-0.21697435638175416|
|(10,[0,1,2,3,4,5,...|  2.458236

# How to use Linear Regression in Spark

In [100]:
from pyspark.sql import SparkSession

In [101]:
spark = SparkSession.builder.appName('lr_implementation').getOrCreate()

23/03/22 13:48:40 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [102]:
df = spark.read.csv('/kaggle/input/e-commerce-data/Ecommerce_Customers.csv', inferSchema=True, header=True)

df.show()

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

In [103]:
df.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [104]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

df.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [105]:
assembler = VectorAssembler(inputCols=['Avg Session Length', 'Time on App',
                                       'Time on Website', 'Length of Membership'], outputCol='features')


output = assembler.transform(df)

In [106]:
output.show()

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|            features|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|[34.4972677251122...|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|[31.9262720263601...|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37

In [107]:
final_data = output.select(['features', 'Yearly Amount Spent'])

In [108]:
final_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
|[31.9365486184489...|  427.1993848953282|
|[33.9925727749537...|  492.6060127179966|
|[33.8793608248049...|  522.3374046069357|
|[29.5324289670579...|  408.6403510726275|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...|  470.4527333009554|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...| 407.70454754954415|
|[32.1878120459321...|  452.3156754800354|
|[32.6178560628234...|   605.061038804892|
+----------

In [109]:
train, test = final_data.randomSplit([0.7, 0.3])

train.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[30.3931845423455...|  319.9288698031936|
|[30.5743636841713...| 442.06441375806565|
|[30.7377203726281...|  461.7807421962299|
|[30.8162006488763...|   266.086340948469|
|[30.8364326747734...|  467.5019004269896|
|[30.8794843441274...|  490.2065999848547|
|[30.9716756438877...|  494.6386097568927|
|[31.0472221394875...|  392.4973991890214|
|[31.0613251567161...|  487.5554580579016|
|[31.1239743499119...|  486.9470538397658|
|[31.1280900496166...|  557.2526867470547|
|[31.1695067987115...|  427.3565308022928|
|[31.2834474760581...|  591.7810894256675|
|[31.3091926408918...|  432.7207178399336|
|[31.3584771924370...|  495.1759504494754|
|[31.3662121671876...|  430.5888825564849|
|[31.3895854806643...|  410.0696110599829|
|[31.4459724827577...| 484.87696493512857|
|[31.5147378578019...|  489.8124879964614|
|[31.5257524169682...|  443.9656268098819|
+----------

In [110]:
train.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                351|
|   mean|  498.2960902479787|
| stddev|   76.1704543050622|
|    min|   266.086340948469|
|    max|  744.2218671047146|
+-------+-------------------+



In [111]:
lr = LinearRegression(featuresCol='features', labelCol='Yearly Amount Spent', predictionCol='Predicted Yearly Amount Spent')

In [112]:
lr_model = lr.fit(train)

23/03/22 13:48:42 WARN Instrumentation: [2c8e1c8c] regParam is zero, which might cause numerical instability and overfitting.


In [113]:
pred = lr_model.evaluate(test)
pred.r2

0.9854323441494157

In [114]:
pred.rootMeanSquaredError

10.404474594233239

In [115]:
df.describe().show()

+-------+-----------------+--------------------+-----------+------------------+------------------+------------------+--------------------+-------------------+
|summary|            Email|             Address|     Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+-------+-----------------+--------------------+-----------+------------------+------------------+------------------+--------------------+-------------------+
|  count|              500|                 500|        500|               500|               500|               500|                 500|                500|
|   mean|             null|                null|       null| 33.05319351819619|12.052487937166134| 37.06044542094859|   3.533461555915055|  499.3140382585909|
| stddev|             null|                null|       null|0.9925631110845354|0.9942156084725424|1.0104889067564033|  0.9992775024112585|   79.3147815497068|
|    min|aaron04@yahoo.com|0001 Mack MillNor..

In [116]:
pred.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| 10.193396704202314|
| -4.008824546384233|
|-12.695712328856814|
|-0.1998173484663539|
|  -2.91935629967611|
|  18.44089512656211|
|-3.9566226809558884|
| -8.034141163400022|
|  -4.18597125565185|
| -14.25149601128328|
| -7.794639646636938|
| -5.590927517401212|
|-17.156709285265663|
|  7.676298819273313|
|  12.32034652261865|
| -6.104736449012648|
|  8.406288411658693|
| -17.82246647004922|
|-4.9214358923010195|
|  6.112652291823679|
+-------------------+
only showing top 20 rows



# Linear Regression Excercise

In [117]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

spark = SparkSession.builder.appName('Practice').getOrCreate()

cruise_data = spark.read.csv('/kaggle/input/cruise-data/cruise_ship_info.csv', inferSchema=True, header=True)

print(cruise_data.show())

print(cruise_data.printSchema())

print(cruise_data.describe().show())

print(cruise_data.columns)


assembler = VectorAssembler(inputCols=['Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density'], outputCol='features')

cruise_data = assembler.transform(cruise_data)

print(cruise_data.show())

train, test = cruise_data.randomSplit([0.7, 0.3])

lr = LinearRegression(featuresCol='features', labelCol='crew', predictionCol='Estimated Number of Crew')

lr_model = lr.fit(train)
 
pred = lr_model.evaluate(test)

print("R squared attained on set is {x}".format(x = pred.r2))
print("RMS Error attained on the test set is {y}".format(y = pred.rootMeanSquaredError))

23/03/22 13:48:44 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    El

In [118]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer

spark = SparkSession.builder.appName('Practice').getOrCreate()

cruise_data = spark.read.csv('/kaggle/input/cruise-data/cruise_ship_info.csv', inferSchema=True, header=True)

print(cruise_data.show())

print(cruise_data.printSchema())

print(cruise_data.describe().show())

print(cruise_data.columns)


indexer = StringIndexer(inputCol='Cruise_line', outputCol='Categorical_Cruise_line')

cruise_data = indexer.fit(cruise_data).transform(cruise_data)

assembler = VectorAssembler(inputCols=['Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density', 'Categorical_Cruise_line'], outputCol='features')

cruise_data = assembler.transform(cruise_data)

print(cruise_data.show())

train, test = cruise_data.randomSplit([0.7, 0.3])

lr = LinearRegression(featuresCol='features', labelCol='crew', predictionCol='Estimated Number of Crew')

lr_model = lr.fit(train)
 
pred = lr_model.evaluate(test)

print("R squared attained on set is {x}".format(x = pred.r2))
print("RMS Error attained on the test set is {y}".format(y = pred.rootMeanSquaredError))

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 23| 

# Logistic Regression in Spark

In [119]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Logit').getOrCreate()

23/03/22 13:48:47 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [120]:
ttd = spark.read.csv('/kaggle/input/Titanic/titanic.csv', inferSchema=True, header=True)

In [121]:
ttd.describe().show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                null|  null| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

In [122]:
ttd.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [123]:
ttd.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [124]:
ttd.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [125]:
cols = ttd.select(['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked'])

In [126]:
final_data = cols.na.drop()

In [127]:
final_data.columns

['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

In [128]:
final_data.show()

+--------+------+------+----+-----+-----+-------+--------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+--------+------+------+----+-----+-----+-------+--------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|
|       1|     1|female|38.0|    1|    0|71.2833|       C|
|       1|     3|female|26.0|    0|    0|  7.925|       S|
|       1|     1|female|35.0|    1|    0|   53.1|       S|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|
|       0|     1|  male|54.0|    0|    0|51.8625|       S|
|       0|     3|  male| 2.0|    3|    1| 21.075|       S|
|       1|     3|female|27.0|    0|    2|11.1333|       S|
|       1|     2|female|14.0|    1|    0|30.0708|       C|
|       1|     3|female| 4.0|    1|    1|   16.7|       S|
|       1|     1|female|58.0|    0|    0|  26.55|       S|
|       0|     3|  male|20.0|    0|    0|   8.05|       S|
|       0|     3|  male|39.0|    1|    5| 31.275|       S|
|       0|     3|female|14.0|    0|    0| 7.8542|       

In [129]:
from pyspark.ml.feature import (StringIndexer, OneHotEncoder,
                                VectorAssembler, VectorIndexer)

gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')

# String Indexer
# Example
# A B C
# 0 1 2
# ONE HOT ENCODE
# KEY A B C
# For A, One hot encode looks like below
# [1, 0, 0]
# For B, One hot encode looks like below
# [0, 1, 0]

In [130]:
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkedIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkedIndex', outputCol='EmbarkedVec')

In [131]:
assembler = VectorAssembler(inputCols=['Pclass', 'SexVec', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkedVec'], outputCol='features')

In [132]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

log_reg_titanic = LogisticRegression(featuresCol='features', labelCol='Survived', predictionCol='Predicted_Survived')

pipeline = Pipeline(stages=[gender_indexer, embark_indexer, gender_encoder, embark_encoder, assembler, log_reg_titanic])

In [133]:
train, test = final_data.randomSplit([0.7, 0.3])

In [134]:
lr_model = pipeline.fit(train)

In [135]:
results = lr_model._transform(test)
my_eval = BinaryClassificationEvaluator(rawPredictionCol='Predicted_Survived', labelCol='Survived')

In [136]:
results.select('Survived', 'Predicted_Survived').show()

+--------+------------------+
|Survived|Predicted_Survived|
+--------+------------------+
|       0|               0.0|
|       0|               1.0|
|       0|               1.0|
|       0|               1.0|
|       0|               0.0|
|       0|               1.0|
|       0|               0.0|
|       0|               1.0|
|       0|               1.0|
|       0|               0.0|
|       0|               0.0|
|       0|               0.0|
|       0|               0.0|
|       0|               0.0|
|       0|               0.0|
|       0|               0.0|
|       0|               0.0|
|       0|               0.0|
|       0|               0.0|
|       0|               0.0|
+--------+------------------+
only showing top 20 rows



In [137]:
AUC = my_eval.evaluate(results)
AUC

0.7997169143665959

# Logistic Regression Practice

In [138]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer

spark=SparkSession.builder.appName('Logit').getOrCreate()
df = spark.read.csv('/kaggle/input/churn-data/customer_churn.csv', inferSchema=True, header=True)

print(df.show())

print(df.printSchema())

print(df.describe().show())

train, test = df.randomSplit([0.7, 0.3])

assembler = VectorAssembler(inputCols=['Age', 'Total_Purchase', 'Years', 'Num_Sites'], outputCol='features')

lr = LogisticRegression(featuresCol='features', labelCol='Churn', predictionCol='Predicted_Churn')

pipeline = Pipeline(stages=[assembler, lr])

lr_model_pipeline = pipeline.fit(train)

+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|              Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|   Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|      Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|        Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|      Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
|     

In [139]:
results = lr_model_pipeline.transform(test)
results.show()

my_eval = BinaryClassificationEvaluator(rawPredictionCol='Predicted_Churn', labelCol='Churn')
AUC = my_eval.evaluate(results)
AUC

+-----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+---------------+
|            Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|            features|       rawPrediction|         probability|Predicted_Churn|
+-----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+---------------+
|      Aaron Meyer|45.0|       9598.03|              0|  5.0|      7.0|2010-07-17 03:30:38|35821 Bailey Skyw...|Steele, Bates and...|    0|[45.0,9598.03,5.0...|[4.40743476302228...|[0.98796032106432...|            0.0|
|    Adriana James|36.0|      10448.09|              0| 4.13|      8.0|2016-04-09 09:02:14|USNV Ferguson FPO...|Doyle, Butle

0.8427586206896551

In [140]:
results.select('Company', 'Predicted_Churn').show()

+--------------------+---------------+
|             Company|Predicted_Churn|
+--------------------+---------------+
|Steele, Bates and...|            0.0|
|Doyle, Butler and...|            0.0|
|      Jones and Sons|            0.0|
|      Robinson-Perez|            1.0|
|         Hoffman Ltd|            0.0|
|Barry, Brown and ...|            0.0|
|        Jones-Fisher|            0.0|
|Davis, Mccormick ...|            0.0|
|         Larson-Hall|            1.0|
|          Knox-Davis|            0.0|
|          Parker PLC|            0.0|
|      Johnson-Nelson|            0.0|
|        Patton Group|            0.0|
|       Blackwell PLC|            0.0|
|          Bailey LLC|            0.0|
|Briggs, Cross and...|            0.0|
|Cruz, Carter and ...|            0.0|
|          Morris LLC|            0.0|
|      Miller-Ramirez|            0.0|
|         Hancock LLC|            0.0|
+--------------------+---------------+
only showing top 20 rows



# Decision Trees, Random Forest, GTB in Spark

In [141]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier

spark = SparkSession.builder.appName('Decision Tree Tutorial').getOrCreate()
df = spark.read.format('libsvm').load('/kaggle/input/sample-data-for-dt/sample_libsvm_data.txt')

df.show()

23/03/22 13:48:56 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
23/03/22 13:48:56 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.
+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|


In [142]:
train, test = df.randomSplit([0.7,0.3])

In [143]:
dt_classifier = DecisionTreeClassifier(featuresCol='features', labelCol='label', predictionCol='prediction')
random_forest_classifier = RandomForestClassifier(numTrees=100, featuresCol='features', labelCol='label', predictionCol='prediction')
boosted_trees = GBTClassifier(featuresCol='features', labelCol='label', predictionCol='prediction')

In [144]:
dtc = dt_classifier.fit(train)
rfc = random_forest_classifier.fit(train)
btc = boosted_trees.fit(train)

In [145]:
dtc_preds = dtc.transform(test)
rfc_preds = rfc.transform(test)
btc_preds = btc.transform(test)

In [146]:
dtc_preds.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[100,101,102...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[122,123,124...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[150,151,152...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[151,152,153...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[152,153,154...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[153,154,155...|   [29.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(69

In [147]:
rfc_preds.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[100,101,102...|  [62.0,38.0]|[0.62,0.38]|       0.0|
|  0.0|(692,[122,123,124...|  [100.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [98.0,2.0]|[0.98,0.02]|       0.0|
|  0.0|(692,[126,127,128...|   [97.0,3.0]|[0.97,0.03]|       0.0|
|  0.0|(692,[126,127,128...|   [96.0,4.0]|[0.96,0.04]|       0.0|
|  0.0|(692,[126,127,128...|  [100.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|  [100.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [92.0,8.0]|[0.92,0.08]|       0.0|
|  0.0|(692,[150,151,152...|  [87.0,13.0]|[0.87,0.13]|       0.0|
|  0.0|(692,[151,152,153...|   [99.0,1.0]|[0.99,0.01]|       0.0|
|  0.0|(692,[152,153,154...|   [93.0,7.0]|[0.93,0.07]|       0.0|
|  0.0|(692,[153,154,155...|   [96.0,4.0]|[0.96,0.04]|       0.0|
|  0.0|(69

In [148]:
btc_preds.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[100,101,102...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[122,123,124...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[127,128,129...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[127,128,129...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[150,151,152...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[151

In [149]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')

print(acc_eval.evaluate(dtc_preds))
print(acc_eval.evaluate(rfc_preds))
print(acc_eval.evaluate(btc_preds))

0.9655172413793104
0.9655172413793104
0.9655172413793104


In [150]:
rfc.featureImportances

SparseVector(692, {127: 0.0018, 155: 0.003, 204: 0.002, 206: 0.0006, 213: 0.0007, 231: 0.0008, 232: 0.0028, 243: 0.0005, 244: 0.0175, 245: 0.0031, 262: 0.0006, 264: 0.0013, 272: 0.0007, 273: 0.0007, 289: 0.0086, 299: 0.0039, 300: 0.0274, 322: 0.007, 323: 0.0134, 324: 0.0018, 327: 0.0012, 328: 0.0178, 329: 0.0092, 330: 0.0007, 331: 0.0025, 341: 0.001, 342: 0.0042, 344: 0.0147, 346: 0.0036, 350: 0.0203, 351: 0.0185, 353: 0.001, 357: 0.0128, 358: 0.0214, 359: 0.0064, 370: 0.001, 372: 0.0067, 373: 0.0052, 374: 0.0085, 375: 0.0036, 377: 0.0299, 378: 0.0116, 379: 0.0349, 382: 0.0007, 386: 0.0157, 387: 0.0014, 397: 0.0003, 398: 0.0059, 405: 0.0159, 406: 0.021, 407: 0.0187, 413: 0.0035, 414: 0.0075, 415: 0.0012, 426: 0.0066, 427: 0.02, 429: 0.0151, 433: 0.042, 434: 0.05, 440: 0.0098, 441: 0.0065, 444: 0.0012, 454: 0.0384, 455: 0.0199, 456: 0.0081, 459: 0.0007, 461: 0.0304, 462: 0.0211, 463: 0.0006, 467: 0.0014, 469: 0.0082, 482: 0.0142, 483: 0.0012, 484: 0.0098, 487: 0.0005, 489: 0.0324, 490: 

# Tree Based Method

In [151]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Tree Methods').getOrCreate()

df = spark.read.csv('/kaggle/input/college-data/College.csv', inferSchema=True, header=True)

df.show()

23/03/22 13:49:02 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|     

In [152]:
df.describe().show()

+-------+--------------------+-------+------------------+------------------+----------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+----------------+------------------+
|summary|              School|Private|              Apps|            Accept|          Enroll|         Top10perc|         Top25perc|      F_Undergrad|      P_Undergrad|          Outstate|        Room_Board|             Books|          Personal|               PhD|          Terminal|         S_F_Ratio|       perc_alumni|          Expend|         Grad_Rate|
+-------+--------------------+-------+------------------+------------------+----------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+-------

In [153]:
df.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [154]:
df.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [155]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder

assembler = VectorAssembler(inputCols=['Accept', 'Enroll', 'Top10perc', 'Top25perc',
                            'F_Undergrad','P_Undergrad','Outstate','Room_Board','Books','Personal','PhD','Terminal',
                            'S_F_Ratio','perc_alumni','Expend','Grad_Rate'], outputCol='features')

In [156]:
output = assembler.transform(df)

indexer = StringIndexer(inputCol='Private', outputCol='Indexed_Private')
output_fixed = indexer.fit(output).transform(output)

output_fixed.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- Indexed_Private: double (nullable = false)



In [157]:
train, test = output_fixed.select(['features', 'Indexed_Private']).randomSplit([0.7, 0.3])

In [158]:
train.show()

23/03/22 13:49:04 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+--------------------+---------------+
|            features|Indexed_Private|
+--------------------+---------------+
|[72.0,51.0,33.0,7...|            0.0|
|[118.0,55.0,12.0,...|            0.0|
|[128.0,75.0,17.0,...|            0.0|
|[146.0,55.0,16.0,...|            0.0|
|[155.0,75.0,28.0,...|            0.0|
|[158.0,132.0,10.0...|            0.0|
|[165.0,63.0,5.0,2...|            0.0|
|[166.0,85.0,13.0,...|            0.0|
|[182.0,99.0,7.0,2...|            0.0|
|[184.0,122.0,19.0...|            0.0|
|[185.0,91.0,16.0,...|            0.0|
|[192.0,111.0,15.0...|            0.0|
|[197.0,124.0,3.0,...|            0.0|
|[198.0,82.0,12.0,...|            0.0|
|[201.0,97.0,10.0,...|            0.0|
|[223.0,103.0,10.0...|            0.0|
|[228.0,137.0,10.0...|            0.0|
|[233.0,153.0,5.0,...|            1.0|
|

In [159]:
from pyspark.ml.classification import (DecisionTreeClassifier,
                                       RandomForestClassifier,
                                       GBTClassifier)

In [160]:
from pyspark.ml import pipeline

dtc = DecisionTreeClassifier(labelCol='Indexed_Private', featuresCol='features')
rfc = RandomForestClassifier(labelCol='Indexed_Private', featuresCol='features')
gbt = GBTClassifier(labelCol='Indexed_Private', featuresCol='features')

dtc_model = dtc.fit(train)
rfc_model = rfc.fit(train)
gbt_model = gbt.fit(train)

dt_pred = dtc_model.transform(test)
rf_pred = rfc_model.transform(test)
gb_pred = gbt_model.transform(test)

from pyspark.ml.evaluation import BinaryClassificationEvaluator

my_binary_eval = BinaryClassificationEvaluator(labelCol='Indexed_Private')

In [161]:
print('DTC Prediction AUC ROC is: {x}'.format(x = my_binary_eval.evaluate(dt_pred)))
print('RFC Prediction AUC ROC is: {x}'.format(x = my_binary_eval.evaluate(rf_pred)))
print('GBT Prediction AUC ROC is: {x}'.format(x = my_binary_eval.evaluate(gb_pred)))

DTC Prediction AUC ROC is: 0.918927738927739
RFC Prediction AUC ROC is: 0.9688578088578086
GBT Prediction AUC ROC is: 0.9516083916083916


# RFC Consulting Project

In [162]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

spark = SparkSession.builder.appName('Consulting').getOrCreate()

df = spark.read.csv('/kaggle/input/dog-food/dog_food.csv', inferSchema=True, header=True)

df.show()

df.printSchema()

df.describe().show()

23/03/22 13:49:12 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
| 10|  3|13.0|  9|    1.0|
|  8|  5|14.0|  5|    1.0|
|  5|  8|12.0|  8|    1.0|
|  6|  5|12.0|  9|    1.0|
|  3|  3|12.0|  1|    1.0|
|  9|  8|11.0|  3|    1.0|
|  1| 10|12.0|  3|    1.0|
|  1|  5|13.0| 10|    1.0|
|  2| 10|12.0|  6|    1.0|
|  1| 10|11.0|  4|    1.0|
|  5|  3|12.0|  2|    1.0|
|  4|  9|11.0|  8|    1.0|
|  5|  1|11.0|  1|    1.0|
|  4|  9|12.0| 10|    1.0|
|  5|  8|10.0|  9|    1.0|
+---+---+----+---+-------+
only showing top 20 rows

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)

+-------+-----------------

In [163]:
assembler = VectorAssembler(inputCols=['A', 'B', 'C', 'D'], outputCol='features') 
df = assembler.transform(df)
df.show()

+---+---+----+---+-------+-------------------+
|  A|  B|   C|  D|Spoiled|           features|
+---+---+----+---+-------+-------------------+
|  4|  2|12.0|  3|    1.0| [4.0,2.0,12.0,3.0]|
|  5|  6|12.0|  7|    1.0| [5.0,6.0,12.0,7.0]|
|  6|  2|13.0|  6|    1.0| [6.0,2.0,13.0,6.0]|
|  4|  2|12.0|  1|    1.0| [4.0,2.0,12.0,1.0]|
|  4|  2|12.0|  3|    1.0| [4.0,2.0,12.0,3.0]|
| 10|  3|13.0|  9|    1.0|[10.0,3.0,13.0,9.0]|
|  8|  5|14.0|  5|    1.0| [8.0,5.0,14.0,5.0]|
|  5|  8|12.0|  8|    1.0| [5.0,8.0,12.0,8.0]|
|  6|  5|12.0|  9|    1.0| [6.0,5.0,12.0,9.0]|
|  3|  3|12.0|  1|    1.0| [3.0,3.0,12.0,1.0]|
|  9|  8|11.0|  3|    1.0| [9.0,8.0,11.0,3.0]|
|  1| 10|12.0|  3|    1.0|[1.0,10.0,12.0,3.0]|
|  1|  5|13.0| 10|    1.0|[1.0,5.0,13.0,10.0]|
|  2| 10|12.0|  6|    1.0|[2.0,10.0,12.0,6.0]|
|  1| 10|11.0|  4|    1.0|[1.0,10.0,11.0,4.0]|
|  5|  3|12.0|  2|    1.0| [5.0,3.0,12.0,2.0]|
|  4|  9|11.0|  8|    1.0| [4.0,9.0,11.0,8.0]|
|  5|  1|11.0|  1|    1.0| [5.0,1.0,11.0,1.0]|
|  4|  9|12.0

In [164]:
df_clean = df.select(['features', 'Spoiled'])
train, test = df_clean.randomSplit([0.7, 0.3])

In [165]:
rfm = RandomForestClassifier(featuresCol='features', labelCol='Spoiled')
rfm_model = rfm.fit(train)
rfm_pred = rfm_model.transform(test)

In [166]:
rfm_pred.show()

+-------------------+-------+--------------------+--------------------+----------+
|           features|Spoiled|       rawPrediction|         probability|prediction|
+-------------------+-------+--------------------+--------------------+----------+
|  [1.0,2.0,9.0,4.0]|    0.0|[18.6522457276251...|[0.93261228638125...|       0.0|
|  [1.0,3.0,8.0,5.0]|    0.0|[19.8933161269102...|[0.99466580634551...|       0.0|
|  [1.0,4.0,8.0,1.0]|    0.0|[19.6779789744822...|[0.98389894872411...|       0.0|
|  [1.0,4.0,8.0,5.0]|    0.0|[19.8933161269102...|[0.99466580634551...|       0.0|
|  [1.0,4.0,8.0,7.0]|    0.0|[18.8856238192179...|[0.94428119096089...|       0.0|
|  [1.0,4.0,9.0,3.0]|    0.0|[19.6779789744822...|[0.98389894872411...|       0.0|
|[1.0,4.0,13.0,10.0]|    1.0|[0.56489092188599...|[0.02824454609429...|       1.0|
|[1.0,5.0,12.0,10.0]|    1.0|[1.45377981077488...|[0.07268899053874...|       1.0|
|[1.0,5.0,13.0,10.0]|    1.0|[1.45377981077488...|[0.07268899053874...|       1.0|
|[1.

In [167]:
# Predictions are depending on C majorly
# Model states C feature to be more effective in estimationg Spoiled food
rfm_model.featureImportances

SparseVector(4, {0: 0.0241, 1: 0.0288, 2: 0.9211, 3: 0.0259})

In [168]:
my_eval = BinaryClassificationEvaluator(labelCol='Spoiled')
my_eval.evaluate(rfm_pred)

0.9880811662726554

# Clustering

In [169]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Clustering').getOrCreate()

df = spark.read.csv('/kaggle/input/seed-dataset/seeds_dataset.csv', inferSchema=True, header=True)

df.printSchema()

df.describe().show()

df.show()

23/03/22 13:49:15 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)

+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+
|summary|              area|         perimeter|         compactness|   length_of_kernel|   width_of_kernel|asymmetry_coefficient|   length_of_groove|
+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+
|  count|               210|               210|                 210|                210|               210| 

In [170]:
df.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [171]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=df.columns, outputCol='features')

df = assembler.transform(df)

df.show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|[15.26,14.84,0.87...|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|[14.88,14.57,0.88...|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|[14.29,14.09,0.90...|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|             4.805|[13.84,13.94,0.89...|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355| 

In [172]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol='features', outputCol='scaled_features')

scaled_data = scaler.fit(df).transform(df)

In [173]:
scaled_data.show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+--------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|     scaled_features|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+--------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|[15.26,14.84,0.87...|[5.24452795332028...|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|[14.88,14.57,0.88...|[5.11393027165175...|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|[14.29,14.09,0.90...|[4.91116018695588...|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|     

In [174]:
km = KMeans(k=3, featuresCol='scaled_features')

model = km.fit(scaled_data)

model.clusterCenters()

[array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446,  9.7892399 ,
         2.41585013, 12.29286107]),
 array([ 4.07497225, 10.14410142, 35.89816849, 11.80812742,  7.54416916,
         3.15410901, 10.38031464]),
 array([ 4.96198582, 10.97871333, 37.30930808, 12.44647267,  8.62880781,
         1.80061978, 10.41913733])]

In [175]:
model.transform(scaled_data).select('prediction').show()

+----------+
|prediction|
+----------+
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         0|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         1|
+----------+
only showing top 20 rows



In [176]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.clustering import KMeans

spark = SparkSession.builder.appName('KMeans').getOrCreate()
df = spark.read.csv('/kaggle/input/hack-data/hack_data.csv', inferSchema=True, header=True)

df.show()

df.printSchema()

df.describe().show()

df.na.drop()

23/03/22 13:49:17 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58|
|                    2.0|           228.08|              1|             2.48|           

DataFrame[Session_Connection_Time: double, Bytes Transferred: double, Kali_Trace_Used: int, Servers_Corrupted: double, Pages_Corrupted: double, Location: string, WPM_Typing_Speed: double]

In [177]:
df.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [178]:
assembler = VectorAssembler(inputCols=['Session_Connection_Time',
                                         'Bytes Transferred',
                                         'Kali_Trace_Used',
                                         'Servers_Corrupted',
                                         'Pages_Corrupted',
                                         'WPM_Typing_Speed'], outputCol='feature')

df_new = assembler.transform(df)

In [179]:
df_new.show()

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+--------------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|             feature|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+--------------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|[8.0,391.09,1.0,2...|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|[20.0,720.99,0.0,...|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58|[31.0,356.32,1.0,...|
|                    2.0|           228.08|              1|             2.48|     

In [180]:
Scaler = StandardScaler(inputCol='feature', outputCol='scaled_features')

df2 = Scaler.fit(df_new).transform(df_new)

In [181]:
df2.show()

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+--------------------+--------------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|             feature|     scaled_features|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+--------------------+--------------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|[8.0,391.09,1.0,2...|[0.56785108466505...|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|[20.0,720.99,0.0,...|[1.41962771166263...|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58

In [182]:
km = KMeans(k=2, featuresCol='scaled_features')

m = km.fit(df2)

m.clusterCenters()

[array([1.26023837, 1.31829808, 0.99280765, 1.36491885, 2.5625043 ,
        5.26676612]),
 array([2.99991988, 2.92319035, 1.05261534, 3.20390443, 4.51321315,
        3.28474   ])]

In [183]:
m.transform(df2).select('prediction').show()

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
+----------+
only showing top 20 rows



# Recommendation System in Spark

In [184]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder.appName('Recommendation Engine').getOrCreate()

df = spark.read.csv('/kaggle/input/movie-recommendation/movielens_ratings.csv', inferSchema=True, header=True)

df.show()

23/03/22 13:49:20 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
|     12|   2.0|     0|
|     15|   1.0|     0|
|     17|   1.0|     0|
|     19|   1.0|     0|
|     21|   1.0|     0|
|     23|   1.0|     0|
|     26|   3.0|     0|
|     27|   1.0|     0|
|     28|   1.0|     0|
|     29|   1.0|     0|
|     30|   1.0|     0|
|     31|   1.0|     0|
|     34|   1.0|     0|
|     37|   1.0|     0|
|     41|   2.0|     0|
+-------+------+------+
only showing top 20 rows



In [185]:
df.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



In [186]:
train, test = df.randomSplit([0.8, 0.2])

als = ALS(maxIter=5, regParam=0.01, userCol='userId', itemCol='movieId', ratingCol='rating')

model = als.fit(train)

predictions = model.transform(test)

predictions.show()

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|      0|   1.0|    26|  1.8684434|
|      1|   1.0|    26|   2.434731|
|      4|   1.0|    12|  2.6393816|
|      5|   2.0|    22| 0.88908935|
|      4|   2.0|     1|  2.9217875|
|      0|   1.0|    20|  1.6813176|
|      2|   2.0|    20|  0.7120066|
|      1|   1.0|     5| 0.28369877|
|      5|   1.0|     5| 0.43618703|
|      0|   1.0|    19| 0.24309614|
|      1|   4.0|    15| 0.95611584|
|      2|   1.0|    17|  3.8301725|
|      6|   1.0|     9| 0.87634623|
|      2|   4.0|     8|  2.5052433|
|      4|   2.0|     8|  1.8031679|
|      4|   1.0|    23|-0.03727299|
|      7|   1.0|    10| 0.41790688|
|      6|   2.0|    11|  1.4597461|
|      4|   1.0|    14|   0.271206|
|      6|   1.0|     2|-0.90608966|
+-------+------+------+-----------+
only showing top 20 rows



In [187]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print(rmse)

1.7497589035931866


In [188]:
single_user = test.filter(test['userId'] == 11).select(['movieId', 'userId'])

single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|      6|    11|
|     11|    11|
|     23|    11|
|     32|    11|
|     41|    11|
|     45|    11|
|     61|    11|
|     66|    11|
|     69|    11|
|     70|    11|
|     71|    11|
|     75|    11|
|     81|    11|
|     89|    11|
|     97|    11|
|     99|    11|
+-------+------+



In [189]:
recommendations = model.transform(single_user)

recommendations.orderBy('prediction', ascending=False).show()

+-------+------+-----------+
|movieId|userId| prediction|
+-------+------+-----------+
|     23|    11|  4.3866963|
|     32|    11|  4.1463685|
|     71|    11|  3.2859325|
|     75|    11|  2.9265513|
|     61|    11|  2.8999562|
|     89|    11|   2.492686|
|     45|    11|  1.6563156|
|     97|    11|  1.6327345|
|      6|    11|  1.4597461|
|     81|    11|  1.0329089|
|     41|    11|  0.9547563|
|     66|    11| 0.77656454|
|     11|    11|  0.7721311|
|     99|    11| 0.66847277|
|     69|    11|-0.59268486|
|     70|    11| -2.9860172|
+-------+------+-----------+

