In [1]:
from pyspark.sql import SparkSession

In [2]:
# Reuse the active Spark session if there is one; otherwise create a new one.
spark = SparkSession.builder.getOrCreate()

In [11]:
# Load the semicolon-delimited demographics file. The first line supplies the
# column names, and column types are inferred from the data.
df = spark.read.csv(
    "us-cities-demographics.csv",
    sep=";",
    header=True,
    inferSchema=True,
)

In [12]:
# Print the column names and inferred types of the loaded DataFrame.
df.printSchema()

root
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Median Age: double (nullable = true)
 |-- Male Population: integer (nullable = true)
 |-- Female Population: integer (nullable = true)
 |-- Total Population: integer (nullable = true)
 |-- Number of Veterans: integer (nullable = true)
 |-- Foreign-born: integer (nullable = true)
 |-- Average Household Size: double (nullable = true)
 |-- State Code: string (nullable = true)
 |-- Race: string (nullable = true)
 |-- Count: integer (nullable = true)



In [13]:
# Preview the first 10 rows of the raw data.
df.show(n=10)

+----------------+--------------+----------+---------------+-----------------+----------------+------------------+------------+----------------------+----------+--------------------+-----+
|            City|         State|Median Age|Male Population|Female Population|Total Population|Number of Veterans|Foreign-born|Average Household Size|State Code|                Race|Count|
+----------------+--------------+----------+---------------+-----------------+----------------+------------------+------------+----------------------+----------+--------------------+-----+
|   Silver Spring|      Maryland|      33.8|          40601|            41862|           82463|              1562|       30908|                   2.6|        MD|  Hispanic or Latino|25924|
|          Quincy| Massachusetts|      41.0|          44129|            49500|           93629|              4147|       32935|                  2.39|        MA|               White|58723|
|          Hoover|       Alabama|      38.5|          3

In [6]:
# Total number of rows in the loaded DataFrame.
df.count()

2891

In [14]:
# Number of rows per state code, largest first (top 10 shown).
(
    df.select("State Code")
    .groupBy("State Code")
    .count()
    .orderBy("count", ascending=False)
    .show(n=10)
)

+----------+-----+
|State Code|count|
+----------+-----+
|        CA|  676|
|        TX|  273|
|        FL|  222|
|        IL|   91|
|        WA|   85|
|        AZ|   80|
|        CO|   80|
|        MI|   79|
|        VA|   70|
|        NC|   70|
+----------+-----+
only showing top 10 rows



In [15]:
# Sort by city, then state, and preview 10 rows. The output shows the same
# city repeated once per Race value with identical city-level columns.
df.sort("City", "State").show(n=10)

+-------+-----+----------+---------------+-----------------+----------------+------------------+------------+----------------------+----------+--------------------+------+
|   City|State|Median Age|Male Population|Female Population|Total Population|Number of Veterans|Foreign-born|Average Household Size|State Code|                Race| Count|
+-------+-----+----------+---------------+-----------------+----------------+------------------+------------+----------------------+----------+--------------------+------+
|Abilene|Texas|      31.3|          65212|            60664|          125876|              9367|        8129|                  2.64|        TX|               White| 95487|
|Abilene|Texas|      31.3|          65212|            60664|          125876|              9367|        8129|                  2.64|        TX|American Indian a...|  1813|
|Abilene|Texas|      31.3|          65212|            60664|          125876|              9367|        8129|                  2.64|        

In [45]:
from pyspark.sql.functions import col

In [47]:
# City table: city, state, and the state code renamed to "State_Code".
# NOTE(review): the source has one row per (city, race), so this projection
# contains repeated city rows; confirm whether de-duplication is wanted.
df_city = df.select(
    "City",
    "State",
    col("State Code").alias("State_Code"),
)
df_city.show(n=10)

+----------------+--------------+----------+
|            City|         State|State_Code|
+----------------+--------------+----------+
|   Silver Spring|      Maryland|        MD|
|          Quincy| Massachusetts|        MA|
|          Hoover|       Alabama|        AL|
|Rancho Cucamonga|    California|        CA|
|          Newark|    New Jersey|        NJ|
|          Peoria|      Illinois|        IL|
|        Avondale|       Arizona|        AZ|
|     West Covina|    California|        CA|
|        O'Fallon|      Missouri|        MO|
|      High Point|North Carolina|        NC|
+----------------+--------------+----------+
only showing top 10 rows



In [48]:
# Age table: city plus its median age, renamed to "Median_age".
df_age = df.select(
    "City",
    col("Median Age").alias("Median_age"),
)
df_age.show(n=10)

+----------------+----------+
|            City|Median_age|
+----------------+----------+
|   Silver Spring|      33.8|
|          Quincy|      41.0|
|          Hoover|      38.5|
|Rancho Cucamonga|      34.5|
|          Newark|      34.6|
|          Peoria|      33.1|
|        Avondale|      29.1|
|     West Covina|      39.8|
|        O'Fallon|      36.0|
|      High Point|      35.5|
+----------------+----------+
only showing top 10 rows



In [49]:
# Population table: city plus male / female / total population under short names.
# Column names are spelled exactly as in the schema ("Female Population",
# "Total Population") so the lookup does not depend on Spark's default
# case-insensitive column resolution (it would fail if
# spark.sql.caseSensitive were enabled).
df_population = df.select(
    "City",
    col("Male Population").alias("Male"),
    col("Female Population").alias("Female"),
    col("Total Population").alias("total"),
)
df_population.show(10)

+----------------+------+------+------+
|            City|  Male|Female| total|
+----------------+------+------+------+
|   Silver Spring| 40601| 41862| 82463|
|          Quincy| 44129| 49500| 93629|
|          Hoover| 38040| 46799| 84839|
|Rancho Cucamonga| 88127| 87105|175232|
|          Newark|138040|143873|281913|
|          Peoria| 56229| 62432|118661|
|        Avondale| 38712| 41971| 80683|
|     West Covina| 51629| 56860|108489|
|        O'Fallon| 41762| 43270| 85032|
|      High Point| 51751| 58077|109828|
+----------------+------+------+------+
only showing top 10 rows



In [50]:
# Foreign-born table: city plus its foreign-born count, renamed to "Foreign_Born".
df_foreign = df.select(
    "City",
    col("Foreign-born").alias("Foreign_Born"),
)
df_foreign.show(n=10)

+----------------+------------+
|            City|Foreign_Born|
+----------------+------------+
|   Silver Spring|       30908|
|          Quincy|       32935|
|          Hoover|        8229|
|Rancho Cucamonga|       33878|
|          Newark|       86253|
|          Peoria|        7517|
|        Avondale|        8355|
|     West Covina|       37038|
|        O'Fallon|        3269|
|      High Point|       16315|
+----------------+------------+
only showing top 10 rows



In [39]:
# Race table: one row per (city, race) with that group's count.
df_race = df.select(*["City", "Race", "Count"])
df_race.show(n=10)

+----------------+--------------------+-----+
|            City|                Race|Count|
+----------------+--------------------+-----+
|   Silver Spring|  Hispanic or Latino|25924|
|          Quincy|               White|58723|
|          Hoover|               Asian| 4759|
|Rancho Cucamonga|Black or African-...|24437|
|          Newark|               White|76402|
|          Peoria|American Indian a...| 1343|
|        Avondale|Black or African-...|11592|
|     West Covina|               Asian|32716|
|        O'Fallon|  Hispanic or Latino| 2583|
|      High Point|               Asian|11060|
+----------------+--------------------+-----+
only showing top 10 rows



In [51]:
# Persist the city table as Parquet under "city", replacing any existing output.
df_city.write.mode("overwrite").parquet("city")

In [52]:
# Persist the age table as Parquet under "age", replacing any existing output.
df_age.write.mode("overwrite").parquet("age")

In [54]:
# Persist the population table as Parquet under "population", replacing any existing output.
df_population.write.mode("overwrite").parquet("population")

In [55]:
# Persist the foreign-born table in its own directory. The path used to be
# "population", so with mode="overwrite" it replaced the Parquet output of
# df_population written by the preceding cell.
df_foreign.write.parquet("foreign", mode="overwrite")

In [56]:
# Persist the race table in its own directory. The path used to be
# "population", so with mode="overwrite" it replaced whatever had been
# written to that directory by the earlier cells.
df_race.write.parquet("race", mode="overwrite")