In [60]:
import os
from datetime import datetime

from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_date, lit, col
from pyspark.sql import functions as F

# SparkSession is the entry point into the PySpark API. getOrCreate() hands back
# the session that is already running, or starts a new one if there is none.
spark = (
    SparkSession.builder
    .appName('spark_walkthrough')
    .getOrCreate()
)

# Alternative with an explicit master URL. "local[*]" runs Spark locally on this
# machine (it is NOT a cluster setting); a cluster needs its own master URL.
# spark = (
#     SparkSession.builder
#     .master("local[*]")
#     .appName('PySpark_Tutorial')
#     .getOrCreate()
# )

In [12]:
# Raw load of the semicolon-separated CSV. No schema is supplied here, so the
# columns come back as strings (see the printSchema() output in the next cell).
df = spark.read.csv('cars.csv', sep=';', header=True)

# The reader has one method per format: read.csv, read.json, read.parquet, ...
df_parquet = spark.read.parquet('nba_tweets.parquet')

In [14]:
# Print column names, types and nullability as a tree. Every column is a string
# here because `df` was read without an explicit schema.
df.printSchema()

root
 |-- Car: string (nullable = true)
 |-- MPG: string (nullable = true)
 |-- Cylinders: string (nullable = true)
 |-- Displacement: string (nullable = true)
 |-- Horsepower: string (nullable = true)
 |-- Weight: string (nullable = true)
 |-- Acceleration: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Origin: string (nullable = true)



In [26]:
# Explicit schema for cars.csv as (column name, Spark type) pairs, so numeric
# columns are parsed as numbers instead of being left as strings.
column_types = [
    ('Car', StringType()),
    ('MPG', DoubleType()),
    ('Cylinders', IntegerType()),
    ('Displacement', DoubleType()),
    ('Horsepower', DoubleType()),
    ('Weight', DoubleType()),
    ('Acceleration', DoubleType()),
    ('Model', StringType()),
    ('Origin', StringType()),
]

# Third StructField argument = nullable; every column is declared nullable.
data_schema = [StructField(name, dtype, True) for name, dtype in column_types]
final_struc = StructType(fields=data_schema)

# Same file and options as the raw load, but typed via the schema above.
data = spark.read.csv('cars.csv', schema=final_struc, header=True, sep=';')

In [27]:
data.head(5)  # first 5 rows, returned as a list of Row objects

[Row(Car='Chevrolet Chevelle Malibu', MPG=18.0, Cylinders=8, Displacement=307.0, Horsepower=130.0, Weight=3504.0, Acceleration=12.0, Model='70', Origin='US'),
 Row(Car='Buick Skylark 320', MPG=15.0, Cylinders=8, Displacement=350.0, Horsepower=165.0, Weight=3693.0, Acceleration=11.5, Model='70', Origin='US'),
 Row(Car='Plymouth Satellite', MPG=18.0, Cylinders=8, Displacement=318.0, Horsepower=150.0, Weight=3436.0, Acceleration=11.0, Model='70', Origin='US'),
 Row(Car='AMC Rebel SST', MPG=16.0, Cylinders=8, Displacement=304.0, Horsepower=150.0, Weight=3433.0, Acceleration=12.0, Model='70', Origin='US'),
 Row(Car='Ford Torino', MPG=17.0, Cylinders=8, Displacement=302.0, Horsepower=140.0, Weight=3449.0, Acceleration=10.5, Model='70', Origin='US')]

In [28]:
data.schema  # the StructType that was passed to read.csv(schema=...)

StructType(List(StructField(Car,StringType,true),StructField(MPG,DoubleType,true),StructField(Cylinders,IntegerType,true),StructField(Displacement,DoubleType,true),StructField(Horsepower,DoubleType,true),StructField(Weight,DoubleType,true),StructField(Acceleration,DoubleType,true),StructField(Model,StringType,true),StructField(Origin,StringType,true)))

In [29]:
data.dtypes  # list of (column name, type name) tuples

[('Car', 'string'),
 ('MPG', 'double'),
 ('Cylinders', 'int'),
 ('Displacement', 'double'),
 ('Horsepower', 'double'),
 ('Weight', 'double'),
 ('Acceleration', 'double'),
 ('Model', 'string'),
 ('Origin', 'string')]

In [30]:
# NOTE(review): this displays the untyped `df`, not the schema'd `data` used by
# the surrounding cells -- confirm that is intended.
df.show(5)   # prints the top 5 rows like head(), but as an easier-to-read table

+--------------------+----+---------+------------+----------+------+------------+-----+------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+--------------------+----+---------+------------+----------+------+------------+-----+------+
|Chevrolet Chevell...|18.0|        8|       307.0|     130.0| 3504.|        12.0|   70|    US|
|   Buick Skylark 320|15.0|        8|       350.0|     165.0| 3693.|        11.5|   70|    US|
|  Plymouth Satellite|18.0|        8|       318.0|     150.0| 3436.|        11.0|   70|    US|
|       AMC Rebel SST|16.0|        8|       304.0|     150.0| 3433.|        12.0|   70|    US|
|         Ford Torino|17.0|        8|       302.0|     140.0| 3449.|        10.5|   70|    US|
+--------------------+----+---------+------------+----------+------+------------+-----+------+
only showing top 5 rows



In [31]:
data.describe().show()  # summary statistics (count, mean, stddev, ...) for each column

+-------+--------------------+------------------+-----------------+------------------+-----------------+------------------+------------------+------------------+------+
|summary|                 Car|               MPG|        Cylinders|      Displacement|       Horsepower|            Weight|      Acceleration|             Model|Origin|
+-------+--------------------+------------------+-----------------+------------------+-----------------+------------------+------------------+------------------+------+
|  count|                 406|               406|              406|               406|              406|               406|               406|               406|   406|
|   mean|                null|23.051231527093602|5.475369458128079| 194.7795566502463|103.5295566502463|2979.4137931034484|15.519704433497521| 75.92118226600985|  null|
| stddev|                null|   8.4017773522706|1.712159631548529|104.92245837948867|40.52065912106347| 847.0043282393513|2.8033588163425462|3.74873734545

In [32]:
# Column names as a plain Python list.
data.columns


['Car',
 'MPG',
 'Cylinders',
 'Displacement',
 'Horsepower',
 'Weight',
 'Acceleration',
 'Model',
 'Origin']

In [33]:
data.count()  # number of rows; the pandas equivalent is len(df)


406

In [45]:
# Stamp every row with today's date. withColumn() expects a Column expression,
# so the plain Python value must be wrapped in lit(); passing it bare fails.
today = datetime.today().date()
data = data.withColumn('date', lit(today))
data.show(n=5)

# Spark's built-in equivalent (current_date is imported in the first cell):
# data = data.withColumn("date", current_date())
# data.dtypes  # confirms that 'date' is a real date column

+--------------------+----+---------+------------+----------+------+------------+-----+------+----------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|      date|
+--------------------+----+---------+------------+----------+------+------------+-----+------+----------+
|Chevrolet Chevell...|18.0|        8|       307.0|     130.0|3504.0|        12.0|   70|    US|2022-02-27|
|   Buick Skylark 320|15.0|        8|       350.0|     165.0|3693.0|        11.5|   70|    US|2022-02-27|
|  Plymouth Satellite|18.0|        8|       318.0|     150.0|3436.0|        11.0|   70|    US|2022-02-27|
|       AMC Rebel SST|16.0|        8|       304.0|     150.0|3433.0|        12.0|   70|    US|2022-02-27|
|         Ford Torino|17.0|        8|       302.0|     140.0|3449.0|        10.5|   70|    US|2022-02-27|
+--------------------+----+---------+------------+----------+------+------------+-----+------+----------+
only showing top 5 rows



[('Car', 'string'),
 ('MPG', 'double'),
 ('Cylinders', 'int'),
 ('Displacement', 'double'),
 ('Horsepower', 'double'),
 ('Weight', 'double'),
 ('Acceleration', 'double'),
 ('Model', 'string'),
 ('Origin', 'string'),
 ('date', 'date')]

In [43]:
# NOTE(review): the saved output of this cell (date = 5) comes from an earlier
# run (execution count 43, before the cell above at 45); re-run to refresh it.
data.show(5)

+--------------------+----+---------+------------+----------+------+------------+-----+------+----+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|date|
+--------------------+----+---------+------------+----------+------+------------+-----+------+----+
|Chevrolet Chevell...|18.0|        8|       307.0|     130.0|3504.0|        12.0|   70|    US|   5|
|   Buick Skylark 320|15.0|        8|       350.0|     165.0|3693.0|        11.5|   70|    US|   5|
|  Plymouth Satellite|18.0|        8|       318.0|     150.0|3436.0|        11.0|   70|    US|   5|
|       AMC Rebel SST|16.0|        8|       304.0|     150.0|3433.0|        12.0|   70|    US|   5|
|         Ford Torino|17.0|        8|       302.0|     140.0|3449.0|        10.5|   70|    US|   5|
+--------------------+----+---------+------------+----------+------+------------+-----+------+----+
only showing top 5 rows



In [47]:
# Rename 'date' -> 'data_changed'. The renamed DataFrame is a new object, so it
# is assigned back to `data`.
data = data.withColumnRenamed(existing='date', new='data_changed')
data.show(n=5)

+--------------------+----+---------+------------+----------+------+------------+-----+------+------------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|data_changed|
+--------------------+----+---------+------------+----------+------+------------+-----+------+------------+
|Chevrolet Chevell...|18.0|        8|       307.0|     130.0|3504.0|        12.0|   70|    US|  2022-02-27|
|   Buick Skylark 320|15.0|        8|       350.0|     165.0|3693.0|        11.5|   70|    US|  2022-02-27|
|  Plymouth Satellite|18.0|        8|       318.0|     150.0|3436.0|        11.0|   70|    US|  2022-02-27|
|       AMC Rebel SST|16.0|        8|       304.0|     150.0|3433.0|        12.0|   70|    US|  2022-02-27|
|         Ford Torino|17.0|        8|       302.0|     140.0|3449.0|        10.5|   70|    US|  2022-02-27|
+--------------------+----+---------+------------+----------+------+------------+-----+------+------------+
only showing top 5 rows



In [50]:
# Delete the helper column again; drop() returns a new DataFrame without it.
data = data.drop('data_changed')
data.show(n=25)

+--------------------+----+---------+------------+----------+------+------------+-----+------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+--------------------+----+---------+------------+----------+------+------------+-----+------+
|Chevrolet Chevell...|18.0|        8|       307.0|     130.0|3504.0|        12.0|   70|    US|
|   Buick Skylark 320|15.0|        8|       350.0|     165.0|3693.0|        11.5|   70|    US|
|  Plymouth Satellite|18.0|        8|       318.0|     150.0|3436.0|        11.0|   70|    US|
|       AMC Rebel SST|16.0|        8|       304.0|     150.0|3433.0|        12.0|   70|    US|
|         Ford Torino|17.0|        8|       302.0|     140.0|3449.0|        10.5|   70|    US|
|    Ford Galaxie 500|15.0|        8|       429.0|     198.0|4341.0|        10.0|   70|    US|
|    Chevrolet Impala|14.0|        8|       454.0|     220.0|4354.0|         9.0|   70|    US|
|   Plymouth Fury iii|14.0|        8|       440.0|

In [None]:
# Null handling. DataFrames are immutable: na.drop() / na.fill() return NEW
# DataFrames, so the results must be assigned or they are silently discarded.
# `data` itself is left untouched so the cells below keep working.

# Option 1: drop every row that contains at least one null.
data_no_na = data.na.drop()

# Option 2: replace nulls in MPG with the mean of MPG. The previous version
# referenced an 'open' column, which does not exist in the cars schema, and
# filled every numeric column with that single value; `subset` restricts the
# fill to the column the mean was computed from.
mpg_mean = data.select(F.mean(col('MPG'))).first()[0]
data_filled = data.na.fill(mpg_mean, subset=['MPG'])

# NOTE(review): some rows in the sample output show MPG = 0.0, which looks like
# a placeholder for missing data; na.drop()/na.fill() only act on real nulls
# and will not touch those -- confirm how missing values are encoded.

In [58]:
# Keep selection and display as separate steps: show() only prints, so
#   df2 = data.select('MPG').show(5)
# would leave nothing usable in df2.
df2 = data.select(col('MPG'))
df2.show(n=5)

+----+
| MPG|
+----+
|18.0|
|15.0|
|18.0|
|16.0|
|17.0|
+----+
only showing top 5 rows

+----+
| MPG|
+----+
|18.0|
|15.0|
|18.0|
|16.0|
|17.0|
+----+
only showing top 5 rows



In [59]:
# Select several columns at once; names can be passed as separate arguments
# or as a single list.
data.select('Car', 'MPG', 'Horsepower').show(n=5)

+--------------------+----+----------+
|                 Car| MPG|Horsepower|
+--------------------+----+----------+
|Chevrolet Chevell...|18.0|     130.0|
|   Buick Skylark 320|15.0|     165.0|
|  Plymouth Satellite|18.0|     150.0|
|       AMC Rebel SST|16.0|     150.0|
|         Ford Torino|17.0|     140.0|
+--------------------+----+----------+
only showing top 5 rows



In [66]:
# Single condition: European cars only. where() is an alias of filter().
european = col('Origin') == 'Europe'
data.where(european).show(5)

# Combined conditions: each comparison needs its own parentheses (or its own
# variable, as here) because & binds tighter than != and >= in Python.
not_european = col('Origin') != 'Europe'
at_least_115_hp = col('Horsepower') >= 115.0
data.where(not_european & at_least_115_hp).show(5)


+--------------------+----+---------+------------+----------+------+------------+-----+------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+--------------------+----+---------+------------+----------+------+------------+-----+------+
|Citroen DS-21 Pallas| 0.0|        4|       133.0|     115.0|3090.0|        17.5|   70|Europe|
|Volkswagen 1131 D...|26.0|        4|        97.0|      46.0|1835.0|        20.5|   70|Europe|
|         Peugeot 504|25.0|        4|       110.0|      87.0|2672.0|        17.5|   70|Europe|
|         Audi 100 LS|24.0|        4|       107.0|      90.0|2430.0|        14.5|   70|Europe|
|            Saab 99e|25.0|        4|       104.0|      95.0|2375.0|        17.5|   70|Europe|
+--------------------+----+---------+------------+----------+------+------------+-----+------+
only showing top 5 rows

+--------------------+----+---------+------------+----------+------+------------+-----+------+
|                 Car| MP

In [68]:
# Range filter: between() includes both bounds (rows with exactly 175.0 and
# 215.0 horsepower appear in the output).
horsepower_in_range = col('Horsepower').between(175.0, 215.0)
data.where(horsepower_in_range).show()

+--------------------+----+---------+------------+----------+------+------------+-----+------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+--------------------+----+---------+------------+----------+------+------------+-----+------+
|    Ford Galaxie 500|15.0|        8|       429.0|     198.0|4341.0|        10.0|   70|    US|
|   Plymouth Fury iii|14.0|        8|       440.0|     215.0|4312.0|         8.5|   70|    US|
|  AMC Ambassador DPL|15.0|        8|       390.0|     190.0|3850.0|         8.5|   70|    US|
|Plymouth Satellit...| 0.0|        8|       383.0|     175.0|4166.0|        10.5|   70|    US|
|  AMC Rebel SST (sw)| 0.0|        8|       360.0|     175.0|3850.0|        11.0|   70|    US|
|           Ford F250|10.0|        8|       360.0|     215.0|4615.0|        14.0|   70|    US|
|           Chevy C20|10.0|        8|       307.0|     200.0|4376.0|        15.0|   70|    US|
|          Dodge D200|11.0|        8|       318.0|

In [76]:
# Basic if/else as a column expression: F.when(condition, value).otherwise(default).
# For a multi-branch CASE WHEN, chain further .when(...) calls before .otherwise(...).
is_high_hp = data.Horsepower >= 200.0

# String label per row.
label_col = F.when(is_high_hp, 'YEET').otherwise('OOOF').alias('my_func')
data.select('Horsepower', label_col).show(25)

# Same condition as a 0/1 flag.
flag_col = F.when(is_high_hp, 1).otherwise(0).alias('is_fast_car')
data.select('Horsepower', flag_col).show(25)

+----------+-------+
|Horsepower|my_func|
+----------+-------+
|     130.0|   OOOF|
|     165.0|   OOOF|
|     150.0|   OOOF|
|     150.0|   OOOF|
|     140.0|   OOOF|
|     198.0|   OOOF|
|     220.0|   YEET|
|     215.0|   YEET|
|     225.0|   YEET|
|     190.0|   OOOF|
|     115.0|   OOOF|
|     165.0|   OOOF|
|     153.0|   OOOF|
|     175.0|   OOOF|
|     175.0|   OOOF|
|     170.0|   OOOF|
|     160.0|   OOOF|
|     140.0|   OOOF|
|     150.0|   OOOF|
|     225.0|   YEET|
|      95.0|   OOOF|
|      95.0|   OOOF|
|      97.0|   OOOF|
|      85.0|   OOOF|
|      88.0|   OOOF|
+----------+-------+
only showing top 25 rows

+----------+-----------+
|Horsepower|is_fast_car|
+----------+-----------+
|     130.0|          0|
|     165.0|          0|
|     150.0|          0|
|     150.0|          0|
|     140.0|          0|
|     198.0|          0|
|     220.0|          1|
|     215.0|          1|
|     225.0|          1|
|     190.0|          0|
|     115.0|          0|
|     165.0|   