In [1]:
!pip install pyspark



In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Introduction to Spark")
    .getOrCreate()
)

In [3]:
# Evaluate the session object so the notebook can display it.
spark

# Connect to a JSON file in Spark

In [4]:
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
path='C:/Users/kojoo/Downloads/ApacheSparkforBigDataAnalytics/employees.json'
# Load the JSON file into a DataFrame; no schema is passed, so Spark infers it.
df=spark.read.json(path)

In [5]:
# The bare repr of a DataFrame lists only column names and types, not the rows.
df

DataFrame[age: bigint, name: string]

In [7]:
df.show(100)  # print up to 100 rows as a text table

+----+--------+
| age|    name|
+----+--------+
|null| Michael|
|  30|    Andy|
|  19|  Justin|
|null|    Jane|
|  10|     Joe|
|  29| Solomon|
|null|  neeraj|
|  30|  Sachin|
|  49|  Nitish|
|null|     Ama|
|  50|  Serwaa|
|  59|   James|
|null| Charles|
|  44|     Rox|
|  23|     Fox|
|null|     Moi|
|  45|     Fin|
|  29|    Mada|
|null|   Olive|
|  70|   Kehar|
|  69|   Lamar|
|null|Kendrick|
|  10|  Sister|
|  20| Brother|
+----+--------+



In [8]:
df.collect()  # returns all rows to the driver as a Python list of Row objects

[Row(age=None, name='Michael'),
 Row(age=30, name='Andy'),
 Row(age=19, name='Justin'),
 Row(age=None, name='Jane'),
 Row(age=10, name='Joe'),
 Row(age=29, name='Solomon'),
 Row(age=None, name='neeraj'),
 Row(age=30, name='Sachin'),
 Row(age=49, name='Nitish'),
 Row(age=None, name='Ama'),
 Row(age=50, name='Serwaa'),
 Row(age=59, name='James'),
 Row(age=None, name='Charles'),
 Row(age=44, name='Rox'),
 Row(age=23, name='Fox'),
 Row(age=None, name='Moi'),
 Row(age=45, name='Fin'),
 Row(age=29, name='Mada'),
 Row(age=None, name='Olive'),
 Row(age=70, name='Kehar'),
 Row(age=69, name='Lamar'),
 Row(age=None, name='Kendrick'),
 Row(age=10, name='Sister'),
 Row(age=20, name='Brother')]

# Print Schema 

In [9]:
# Print each column's name, type and nullability as a tree.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



# Check Columns

In [10]:
# Column names as a plain Python list of strings.
df.columns

['age', 'name']

# Descriptive Stats on DataFrame

In [11]:
# describe() returns a new DataFrame of summary statistics (all columns typed as string);
# nothing is printed until .show() is called on it.
df.describe()

DataFrame[summary: string, age: string, name: string]

In [12]:
# Show count/mean/stddev/min/max per column. The count for `age` is lower than
# for `name` because null ages are not counted.
df.describe().show()

+-------+------------------+------+
|summary|               age|  name|
+-------+------------------+------+
|  count|                16|    24|
|   mean|            36.625|  null|
| stddev|19.172462891692692|  null|
|    min|                10|   Ama|
|    max|                70|neeraj|
+-------+------------------+------+



# Change DataTypes in PySpark

In [None]:
# Types needed for an explicit schema: StringType, IntegerType
# (kept as a comment: the names are not imported at this point in the notebook,
# so evaluating them here would raise NameError).

In [13]:
# Schema Changes
from pyspark.sql.types import StructType,StructField,IntegerType,StringType

In [14]:
# Explicit schema: read `age` as an integer and `name` as a string, both nullable.
data_schema = [
    StructField('age', IntegerType(), True),
    StructField('name', StringType(), True),
]


In [15]:
# Wrap the field list in a StructType so it can be passed as a reader schema.
final_struc = StructType(data_schema)

In [16]:
# Re-read the same file with the explicit schema; this rebinds `df`.
df=spark.read.json(path,schema=final_struc)
# Preview the first 5 rows.
df.show(5)

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
|null|   Jane|
|  10|    Joe|
+----+-------+
only showing top 5 rows



In [17]:
# Confirm that the explicit schema was applied (age should now be integer).
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



# Selecting Columns in Spark

In [18]:
# Check the Python type of the object we are working with.
type(df)

pyspark.sql.dataframe.DataFrame

In [20]:
# Indexing a DataFrame by name yields a Column, not a DataFrame, and a Column
# cannot be shown directly: the call raises TypeError. Catch and print the error
# so the demonstration stays visible without halting "Run All".
# Use df.select('age').show() to display a single column.
try:
    df['age'].show()
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: 'Column' object is not callable

In [22]:
# select() returns a DataFrame, so show() works; with no argument it prints 20 rows.
df.select('age').show()

+----+
| age|
+----+
|null|
|  30|
|  19|
|null|
|  10|
|  29|
|null|
|  30|
|  49|
|null|
|  50|
|  59|
|null|
|  44|
|  23|
|null|
|  45|
|  29|
|null|
|  70|
+----+
only showing top 20 rows



In [23]:
# Select several columns by name and preview the first 5 rows.
df.select('age', 'name').show(5)

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
|null|   Jane|
|  10|    Joe|
+----+-------+
only showing top 5 rows



# Creating New Columns

In [24]:
# Add a calculated column `newage` (age * 2); rows with a null age get a null newage.
# Note: this rebinds `df`, so later cells see the extra column.
df=df.withColumn('newage',df['age']*2)

In [25]:
# Check the type Spark assigned to the derived column.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- newage: integer (nullable = true)



In [26]:
# Preview the first 5 rows including the new column.
df.show(5)

+----+-------+------+
| age|   name|newage|
+----+-------+------+
|null|Michael|  null|
|  30|   Andy|    60|
|  19| Justin|    38|
|null|   Jane|  null|
|  10|    Joe|    20|
+----+-------+------+
only showing top 5 rows



In [27]:
# Rename column `age` to `age_renamed`; the other columns are unchanged.
# Note: this rebinds `df`, so earlier cells that use df['age'] will fail if re-run afterwards.
df=df.withColumnRenamed('age','age_renamed')
df.show()

+-----------+-------+------+
|age_renamed|   name|newage|
+-----------+-------+------+
|       null|Michael|  null|
|         30|   Andy|    60|
|         19| Justin|    38|
|       null|   Jane|  null|
|         10|    Joe|    20|
|         29|Solomon|    58|
|       null| neeraj|  null|
|         30| Sachin|    60|
|         49| Nitish|    98|
|       null|    Ama|  null|
|         50| Serwaa|   100|
|         59|  James|   118|
|       null|Charles|  null|
|         44|    Rox|    88|
|         23|    Fox|    46|
|       null|    Moi|  null|
|         45|    Fin|    90|
|         29|   Mada|    58|
|       null|  Olive|  null|
|         70|  Kehar|   140|
+-----------+-------+------+
only showing top 20 rows



# Create Temporary SQL Views

In [28]:
# Register `df` as a temporary view named 'myview' so it can be queried with spark.sql().
df.createOrReplaceTempView('myview')

In [31]:
# Query the view with SQL and print the result.
spark.sql('select * from myview').show()

+-----------+-------+------+
|age_renamed|   name|newage|
+-----------+-------+------+
|       null|Michael|  null|
|         30|   Andy|    60|
|         19| Justin|    38|
|       null|   Jane|  null|
|         10|    Joe|    20|
|         29|Solomon|    58|
|       null| neeraj|  null|
|         30| Sachin|    60|
|         49| Nitish|    98|
|       null|    Ama|  null|
|         50| Serwaa|   100|
|         59|  James|   118|
|       null|Charles|  null|
|         44|    Rox|    88|
|         23|    Fox|    46|
|       null|    Moi|  null|
|         45|    Fin|    90|
|         29|   Mada|    58|
|       null|  Olive|  null|
|         70|  Kehar|   140|
+-----------+-------+------+
only showing top 20 rows



In [32]:
# Register the same DataFrame under a second view name.
df.createOrReplaceTempView('myview2')

In [34]:
# Stack the two views with UNION ALL (duplicates are kept). Both views come from
# the same `df`, so each row should appear twice in df2.
df2=spark.sql('select * from myview union all select * from myview2')

In [37]:
# Preview the first 3 rows of the combined result.
df2.show(3)

+-----------+-------+------+
|age_renamed|   name|newage|
+-----------+-------+------+
|       null|Michael|  null|
|         30|   Andy|    60|
|         19| Justin|    38|
+-----------+-------+------+
only showing top 3 rows



In [38]:
# Evaluate the session object again so the notebook can display it.
spark

# Read CSV In Spark 

In [39]:
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
path1='C:/Users/kojoo/Downloads/ApacheSparkforBigDataAnalytics/appl_stock.csv'
# header=True takes column names from the first line; inferSchema=True asks Spark to detect column types.
df3=spark.read.csv(path1,header=True,inferSchema=True)
df3.show()

+----------+------------------+------------------+------------------+------------------+---------+------------------+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+----------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|
|2010-01-11|212.79999700000002|        213.000002|      

In [41]:
# Inspect the types produced by inferSchema.
df3.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [42]:
# Select a single column and preview the first 5 rows.
df3.select("Open").show(5)

+----------+
|      Open|
+----------+
|213.429998|
|214.599998|
|214.379993|
|    211.75|
|210.299994|
+----------+
only showing top 5 rows



In [43]:
# Select the Open, High and Low columns and print them.
df3.select("Open", "High", "Low").show()

+------------------+------------------+------------------+
|              Open|              High|               Low|
+------------------+------------------+------------------+
|        213.429998|        214.499996|212.38000099999996|
|        214.599998|        215.589994|        213.249994|
|        214.379993|            215.23|        210.750004|
|            211.75|        212.000006|        209.050005|
|        210.299994|        212.000006|209.06000500000002|
|212.79999700000002|        213.000002|        208.450005|
|209.18999499999998|209.76999500000002|        206.419998|
|        207.870005|210.92999500000002|        204.099998|
|210.11000299999998|210.45999700000002|        209.020004|
|210.92999500000002|211.59999700000003|        205.869999|
|        208.330002|215.18999900000003|        207.240004|
|        214.910006|        215.549994|        209.500002|
|        212.079994|213.30999599999998|        207.210003|
|206.78000600000001|        207.499996|            197.1

 # Filtering 

In [45]:
# filter() accepts a SQL expression string; keep rows where Close is above 210.
df3.filter("Close > 210").show()

+----------+------------------+------------------+------------------+------------------+---------+------------------+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+----------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|
|2010-01-11|212.79999700000002|        213.000002|      

In [46]:
# Keep rows where Close is above 210, then show only three columns.
(
    df3.filter("Close > 210")
       .select("Open", "Close", "Volume")
       .show()
)

+------------------+------------------+---------+
|              Open|             Close|   Volume|
+------------------+------------------+---------+
|        213.429998|        214.009998|123432400|
|        214.599998|        214.379993|150476200|
|        214.379993|        210.969995|138040000|
|            211.75|            210.58|119282800|
|        210.299994|211.98000499999998|111902700|
|212.79999700000002|210.11000299999998|115557400|
|        207.870005|        210.650002|151473000|
|        208.330002|        215.039995|182501900|
|        214.910006|            211.73|153038200|
|        209.279997|210.71000299999997| 91510300|
|        214.940006|218.95000499999998|224905100|
|220.01000200000001|        219.079994|107472400|
|218.31000299999997|        223.020004|230064800|
|        223.829996|224.83999300000002|149054500|
|        223.909998|        225.500008|101425100|
|         227.37001|        226.600006|104080900|
|225.38000499999998|223.83999599999999|123375700|


# Filtering on Multiple Conditions

In [48]:
# Comparisons on a Column build Column expressions (not Python booleans);
# they are evaluated only when used in filter().
a=df3["Close"] > 200
b=df3["Close"] < 300

In [50]:
# Combine the two Column conditions with & (logical AND) and show the matching rows.
df3.filter(a & b).show()

+----------+------------------+------------------+------------------+------------------+---------+------------------+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+----------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|
|2010-01-11|212.79999700000002|        213.000002|      

In [51]:
# Same filter written inline. Each comparison needs its own parentheses because
# Python's & binds more tightly than > and <.
df3.filter(
    (df3["Close"] > 200) & (df3["Close"] < 300)
).show()

+----------+------------------+------------------+------------------+------------------+---------+------------------+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+----------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|
|2010-01-11|212.79999700000002|        213.000002|      

In [53]:
# Find the row(s) where Low is exactly 212.38000099999996.
# NOTE(review): this is an exact floating-point equality test; it presumably matches
# only because the literal is the full printed value of the stored double.
df3.filter("Low==212.38000099999996").show()

+----------+----------+----------+------------------+----------+---------+---------+
|      Date|      Open|      High|               Low|     Close|   Volume|Adj Close|
+----------+----------+----------+------------------+----------+---------+---------+
|2010-01-04|213.429998|214.499996|212.38000099999996|214.009998|123432400|27.727039|
+----------+----------+----------+------------------+----------+---------+---------+



In [54]:
# Same filter, keeping only the Date column of the matching row(s).
df3.filter("Low==212.38000099999996").select("Date").show()

+----------+
|      Date|
+----------+
|2010-01-04|
+----------+



# Collect 

In [56]:
# collect() returns the result as a Python list of Row objects instead of printing a table.
df3.filter("Low==212.38000099999996").select("Date").collect()

[Row(Date=datetime.date(2010, 1, 4))]