In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import time


In [3]:
spark = SparkSession.builder.getOrCreate()


In [4]:
sc = SparkContext.getOrCreate()


In [7]:
statesDf = spark.read.csv(
    "./data/statesPopulation.csv", header=True, inferSchema=True, sep=","
)
statesDf.printSchema()


[Stage 1:>                                                          (0 + 1) / 1]

root
 |-- State: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Population: integer (nullable = true)



                                                                                

In [8]:
statesDf.createOrReplaceTempView("states")


In [12]:
statesDf.sort("population", ascending=False).show(5)


+----------+----+----------+
|     State|Year|Population|
+----------+----+----------+
|California|2016|  39250017|
|California|2015|  38993940|
|California|2014|  38680810|
|California|2013|  38335203|
|California|2012|  38011074|
+----------+----+----------+
only showing top 5 rows



In [11]:
spark.sql("select * from states order by population desc limit 5").show()


+----------+----+----------+
|     State|Year|Population|
+----------+----+----------+
|California|2016|  39250017|
|California|2015|  38993940|
|California|2014|  38680810|
|California|2013|  38335203|
|California|2012|  38011074|
+----------+----+----------+



In [33]:
from pyspark.sql import Row
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DoubleType,
    DateType,
)


In [18]:
rdd = spark.sparkContext.parallelize(
    [Row(name="John", age=19), Row(name="Smith", age=29), Row(name="Adam", age=35)]
)
schema = StructType(
    [StructField("name", StringType(), False), StructField("age", IntegerType(), False)]
)
df = spark.createDataFrame(rdd, schema)
df.show()


+-----+---+
| name|age|
+-----+---+
| John| 19|
|Smith| 29|
| Adam| 35|
+-----+---+



In [19]:
rdd = spark.sparkContext.parallelize(
    [
        Row(first_name="reggie", stats=Row(runs=10, hits=20, errors=5)),
        Row(first_name="joe", stats=Row(runs=20, hits=30, errors=10)),
    ]
)
schema = StructType(
    [
        StructField("first_name", StringType(), False),
        StructField(
            "stats",
            StructType(
                [
                    StructField("runs", IntegerType(), False),
                    StructField("hits", IntegerType(), False),
                    StructField("errors", IntegerType(), False),
                ]
            ),
            False,
        ),
    ]
)
df = spark.createDataFrame(rdd, schema)
df.show()


+----------+------------+
|first_name|       stats|
+----------+------------+
|    reggie| {10, 20, 5}|
|       joe|{20, 30, 10}|
+----------+------------+



In [27]:
start = time.time()
statesDf = spark.read.csv(
    "./data/statesPopulation.csv", header=True, inferSchema=True, sep=","
)
stop = time.time()
print(f"Time taken to read csv file: {(stop - start) * 1000} ms")


Time taken to read csv file: 272.6104259490967 ms


In [40]:
start = time.time()
state_schema = StructType(
    [
        StructField("state", StringType(), False),
        StructField("year", IntegerType(), False),
        StructField("population", IntegerType(), False),
    ]
)
df_states = spark.read.csv(
    "./data/statesPopulation.csv", header=True, schema=state_schema
)
stop = time.time()
# print the time in milliseconds
print(f"Time taken to read the csv file: {(stop-start) * 1000} ms")
df_states.show(10)


Time taken to read the csv file: 42.0374870300293 ms
+--------------------+----+----------+
|               state|year|population|
+--------------------+----+----------+
|             Alabama|2010|   4785492|
|              Alaska|2010|    714031|
|             Arizona|2010|   6408312|
|            Arkansas|2010|   2921995|
|          California|2010|  37332685|
|            Colorado|2010|   5048644|
|            Delaware|2010|    899816|
|District of Columbia|2010|    605183|
|             Florida|2010|  18849098|
|             Georgia|2010|   9713521|
+--------------------+----+----------+
only showing top 10 rows



In [36]:
start = time.time()
movie_df = spark.read.csv(
    "./data/moviedata.csv", header=False, inferSchema=True, sep=","
)
stop = time.time()
print(f"Time taken to read csv file: {(stop - start) * 1000} ms")
movie_df.show(10)


Time taken to read csv file: 299.5157241821289 ms
+---+--------------------+----+---+----+
|_c0|                 _c1| _c2|_c3| _c4|
+---+--------------------+----+---+----+
|  1|The Nightmare Bef...|1993|3.9|4568|
|  2|           The Mummy|1932|3.5|4388|
|  3|Orphans of the Storm|1921|3.2|9062|
|  4|The Object of Beauty|1991|2.8|6150|
|  5|          Night Tide|1963|2.8|5126|
|  6| One Magic Christmas|1985|3.8|5333|
|  7|    Muriel's Wedding|1994|3.5|6323|
|  8|       Mother's Boys|1994|3.4|5733|
|  9|Nosferatu: Origin...|1929|3.5|5651|
| 10|        Nick of Time|1995|3.4|5333|
+---+--------------------+----+---+----+
only showing top 10 rows



In [38]:
start = time.time()
schema = StructType(
    [
        StructField("movieId", IntegerType(), False),
        StructField("title", StringType(), False),
        StructField("year", IntegerType(), False),
        StructField("rating", DoubleType(), False),
        StructField("popularity", IntegerType(), False),
    ]
)
movie_df = spark.read.csv("./data/moviedata.csv", header=True, schema=schema)
stop = time.time()
print(f"Time taken to read csv file: {(stop - start) * 1000} ms")
movie_df.show(10)


Time taken to read csv file: 20.772695541381836 ms
+-------+--------------------+----+------+----------+
|movieId|               title|year|rating|popularity|
+-------+--------------------+----+------+----------+
|      2|           The Mummy|1932|   3.5|      4388|
|      3|Orphans of the Storm|1921|   3.2|      9062|
|      4|The Object of Beauty|1991|   2.8|      6150|
|      5|          Night Tide|1963|   2.8|      5126|
|      6| One Magic Christmas|1985|   3.8|      5333|
|      7|    Muriel's Wedding|1994|   3.5|      6323|
|      8|       Mother's Boys|1994|   3.4|      5733|
|      9|Nosferatu: Origin...|1929|   3.5|      5651|
|     10|        Nick of Time|1995|   3.4|      5333|
|     11|     Broken Blossoms|1919|   3.3|      5367|
+-------+--------------------+----+------+----------+
only showing top 10 rows



In [41]:
df_states.createOrReplaceTempView("states")
df_states.sort("population", ascending=False).show(5)
spark.sql("select * from states order by population desc limit 5").show()


+----------+----+----------+
|     state|year|population|
+----------+----+----------+
|California|2016|  39250017|
|California|2015|  38993940|
|California|2014|  38680810|
|California|2013|  38335203|
|California|2012|  38011074|
+----------+----+----------+
only showing top 5 rows

+----------+----+----------+
|     state|year|population|
+----------+----+----------+
|California|2016|  39250017|
|California|2015|  38993940|
|California|2014|  38680810|
|California|2013|  38335203|
|California|2012|  38011074|
+----------+----+----------+



In [44]:
df_states.groupBy("state").sum("population").show(5)
spark.sql("select state, sum(population) from states group by state").show(5)


+---------+---------------+
|    state|sum(population)|
+---------+---------------+
|     Utah|       20333580|
|   Hawaii|        9810173|
|Minnesota|       37914011|
|     Ohio|       81020539|
| Arkansas|       20703849|
+---------+---------------+
only showing top 5 rows

+---------+---------------+
|    state|sum(population)|
+---------+---------------+
|     Utah|       20333580|
|   Hawaii|        9810173|
|Minnesota|       37914011|
|     Ohio|       81020539|
| Arkansas|       20703849|
+---------+---------------+
only showing top 5 rows



In [51]:
df_states.groupBy("state").agg({"population": "sum"}).withColumnRenamed(
    "sum(population)", "Total"
).show(5)
spark.sql("select state, sum(population) as Total from states group by state").show(5)


+---------+--------+
|    state|   Total|
+---------+--------+
|     Utah|20333580|
|   Hawaii| 9810173|
|Minnesota|37914011|
|     Ohio|81020539|
| Arkansas|20703849|
+---------+--------+
only showing top 5 rows

+---------+--------+
|    state|   Total|
+---------+--------+
|     Utah|20333580|
|   Hawaii| 9810173|
|Minnesota|37914011|
|     Ohio|81020539|
| Arkansas|20703849|
+---------+--------+
only showing top 5 rows



In [48]:
df_states.groupBy("state").agg({"population": "avg"}).withColumnRenamed(
    "avg(population)", "moy"
).show(5)
spark.sql("select state, avg(population) as moy from states group by state").show(5)


+---------+--------------------+
|    state|                 moy|
+---------+--------------------+
|     Utah|  2904797.1428571427|
|   Hawaii|  1401453.2857142857|
|Minnesota|   5416287.285714285|
|     Ohio|1.1574362714285715E7|
| Arkansas|   2957692.714285714|
+---------+--------------------+
only showing top 5 rows

+---------+--------------------+
|    state|                 moy|
+---------+--------------------+
|     Utah|  2904797.1428571427|
|   Hawaii|  1401453.2857142857|
|Minnesota|   5416287.285714285|
|     Ohio|1.1574362714285715E7|
| Arkansas|   2957692.714285714|
+---------+--------------------+
only showing top 5 rows



In [57]:
df_states.groupBy("state").agg(
    {"population": "sum", "population": "avg", "population": "min", "population": "max"}
).withColumnRenamed("sum(population)", "Total").withColumnRenamed(
    "avg(population)", "moy"
).withColumnRenamed(
    "min(population)", "min"
).withColumnRenamed(
    "max(population)", "max"
).show(
    5
)
spark.sql(
    "select state, sum(population) as Total, avg(population) as moy, min(population) as min, max(population) as max from states group by state"
).show(5)


+---------+--------+
|    state|     max|
+---------+--------+
|     Utah| 3051217|
|   Hawaii| 1428557|
|Minnesota| 5519952|
|     Ohio|11614373|
| Arkansas| 2988248|
+---------+--------+
only showing top 5 rows

+---------+--------+--------------------+--------+--------+
|    state|   Total|                 moy|     min|     max|
+---------+--------+--------------------+--------+--------+
|     Utah|20333580|  2904797.1428571427| 2775326| 3051217|
|   Hawaii| 9810173|  1401453.2857142857| 1363945| 1428557|
|Minnesota|37914011|   5416287.285714285| 5311147| 5519952|
|     Ohio|81020539|1.1574362714285715E7|11540983|11614373|
| Arkansas|20703849|   2957692.714285714| 2921995| 2988248|
+---------+--------+--------------------+--------+--------+
only showing top 5 rows



In [49]:
df_states.groupBy("state").sum("population").withColumnRenamed(
    "sum(population)", "Total"
).sort("Total", ascending=False).show(5)
spark.sql(
    "select state, sum(population) as Total from states group by state order by Total desc"
).show(5)


+----------+---------+
|     state|    Total|
+----------+---------+
|California|268280590|
|     Texas|185672865|
|   Florida|137618322|
|  New York|137409471|
|  Illinois| 89960023|
+----------+---------+
only showing top 5 rows

+----------+---------+
|     state|    Total|
+----------+---------+
|California|268280590|
|     Texas|185672865|
|   Florida|137618322|
|  New York|137409471|
|  Illinois| 89960023|
+----------+---------+
only showing top 5 rows

