# Starting a PySpark Session

- PySpark requires a SparkSession to interact with Spark

In [5]:
from pyspark.sql import SparkSession

# Build the session that every later cell uses as `spark`.
# getOrCreate() hands back an already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("PySparkLearning")
    .getOrCreate()
)

# Plain print shows the object's default repr, confirming the session exists.
print(spark)

<pyspark.sql.session.SparkSession object at 0x1081d0810>


# Loading a CSV File into a DataFrame

- `header=True` means the first row contains column names
- `inferSchema=True` lets Spark infer each column's data type from the data

In [11]:
# Location of the sample data, relative to the notebook's working directory.
csv_file_path = './resources/3_zipcodes.csv'

spark = (
    SparkSession.builder
    .appName("PySparkLearning")
    .getOrCreate()
)

# header=True: take column names from the first row.
# inferSchema=True: let Spark work out column types from the data.
df = spark.read.csv(
    csv_file_path,
    header=True,
    inferSchema=True,
)

# Preview the first five rows.
df.show(n=5)

+------------+-------+-----------+-----------+-----+------------+-------+---------+-----+------+------+-----------+-------+---------------+-------------------+-------------+---------------+-------------------+----------+-----------------+
|RecordNumber|Zipcode|ZipCodeType|       City|State|LocationType|    Lat|     Long|Xaxis| Yaxis| Zaxis|WorldRegion|Country|   LocationText|           Location|Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|            Notes|
+------------+-------+-----------+-----------+-----+------------+-------+---------+-----+------+------+-----------+-------+---------------+-------------------+-------------+---------------+-------------------+----------+-----------------+
|           1|  10001|   STANDARD|   New York|   NY|     PRIMARY|40.7128|  -74.006|  123|456.78|789.01|         NA|     US|   New York, NY| (40.7128,-74.0060)|        false|          50000|             211000|  50000000|       Urban area|
|           2|  90001|   STANDARD|Los Angele

# Basic DataFrame Operations

In [28]:



from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("PySparkLearning")
    .getOrCreate()
)

# Reload the CSV from the path defined in the loading cell.
df = spark.read.csv(
    csv_file_path,
    header=True,
    inferSchema=True,
)

# Show Schema
df.printSchema()

# Select column
df.select("City", "State").show(n=5)

# Applying filter
df.filter(df["City"] == 'Chicago').show()

# Group and count
df.groupBy("City").count().show()


root
 |-- RecordNumber: integer (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- ZipCodeType: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- LocationType: string (nullable = true)
 |-- Lat: double (nullable = true)
 |-- Long: double (nullable = true)
 |-- Xaxis: integer (nullable = true)
 |-- Yaxis: double (nullable = true)
 |-- Zaxis: double (nullable = true)
 |-- WorldRegion: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- LocationText: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Decommisioned: boolean (nullable = true)
 |-- TaxReturnsFiled: integer (nullable = true)
 |-- EstimatedPopulation: integer (nullable = true)
 |-- TotalWages: integer (nullable = true)
 |-- Notes: string (nullable = true)

+-----------+-----+
|       City|State|
+-----------+-----+
|   New York|   NY|
|Los Angeles|   CA|
|    Chicago|   IL|
|    Houston|   TX|
|    Phoenix|   AZ|
+-----------+-

# Writing Data to a File

In [29]:
# Persist the DataFrame as Parquet.
# The writer's default save mode raises an error when the target path already
# exists, so a second run of this cell would fail. "overwrite" replaces the
# previous output and keeps the notebook re-runnable from top to bottom.
df.write.mode("overwrite").parquet('./output/7_StartPySpark_Output.parquet')

# Reading Back the Saved Parquet file

In [30]:
# Load the Parquet output written by the previous cell into a new DataFrame.
df_parquet = spark.read.parquet(
    './output/7_StartPySpark_Output.parquet'
)

# Preview the first five rows to confirm the round trip.
df_parquet.show(n=5)

+------------+-------+-----------+-----------+-----+------------+-------+---------+-----+------+------+-----------+-------+---------------+-------------------+-------------+---------------+-------------------+----------+-----------------+
|RecordNumber|Zipcode|ZipCodeType|       City|State|LocationType|    Lat|     Long|Xaxis| Yaxis| Zaxis|WorldRegion|Country|   LocationText|           Location|Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|            Notes|
+------------+-------+-----------+-----------+-----+------------+-------+---------+-----+------+------+-----------+-------+---------------+-------------------+-------------+---------------+-------------------+----------+-----------------+
|           1|  10001|   STANDARD|   New York|   NY|     PRIMARY|40.7128|  -74.006|  123|456.78|789.01|         NA|     US|   New York, NY| (40.7128,-74.0060)|        false|          50000|             211000|  50000000|       Urban area|
|           2|  90001|   STANDARD|Los Angele

# Performing Data Transformations (ETL-like Processing)

### Removing rows with missing values

In [31]:
# Remove rows with NULLs.
# how="any" is dropna's default: a row is dropped if any of its columns is null.
df_cleaned = df_parquet.dropna(how="any")

df_cleaned.show()

+------------+-------+-----------+-----------+-----+------------+-------+---------+-----+------+------+-----------+-------+---------------+-------------------+-------------+---------------+-------------------+----------+-----------------+
|RecordNumber|Zipcode|ZipCodeType|       City|State|LocationType|    Lat|     Long|Xaxis| Yaxis| Zaxis|WorldRegion|Country|   LocationText|           Location|Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|            Notes|
+------------+-------+-----------+-----------+-----+------------+-------+---------+-----+------+------+-----------+-------+---------------+-------------------+-------------+---------------+-------------------+----------+-----------------+
|           1|  10001|   STANDARD|   New York|   NY|     PRIMARY|40.7128|  -74.006|  123|456.78|789.01|         NA|     US|   New York, NY| (40.7128,-74.0060)|        false|          50000|             211000|  50000000|       Urban area|
|           2|  90001|   STANDARD|Los Angele

### Fill missing values

In [34]:
# Replace nulls in the City column with a placeholder value.
# The dict maps column name -> replacement; columns not listed are left untouched.
df_filled = df.fillna(
    {"City": "UNKNOWN"}
)

df_filled.show(n=5)

+------------+-------+-----------+-----------+-----+------------+-------+---------+-----+------+------+-----------+-------+---------------+-------------------+-------------+---------------+-------------------+----------+-----------------+
|RecordNumber|Zipcode|ZipCodeType|       City|State|LocationType|    Lat|     Long|Xaxis| Yaxis| Zaxis|WorldRegion|Country|   LocationText|           Location|Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|            Notes|
+------------+-------+-----------+-----------+-----+------------+-------+---------+-----+------+------+-----------+-------+---------------+-------------------+-------------+---------------+-------------------+----------+-----------------+
|           1|  10001|   STANDARD|   New York|   NY|     PRIMARY|40.7128|  -74.006|  123|456.78|789.01|         NA|     US|   New York, NY| (40.7128,-74.0060)|        false|          50000|             211000|  50000000|       Urban area|
|           2|  90001|   STANDARD|Los Angele

### Adding a new column

In [39]:
from pyspark.sql.functions import lit  # lit() wraps a Python value as a literal column

# Append a CountryFullName column whose value is "USA" on every row.
df_new = df.withColumn(
    "CountryFullName",
    lit("USA"),
)

df_new.show()

+------------+-------+-----------+-----------+-----+------------+-------+---------+-----+------+------+-----------+-------+---------------+-------------------+-------------+---------------+-------------------+----------+-----------------+---------------+
|RecordNumber|Zipcode|ZipCodeType|       City|State|LocationType|    Lat|     Long|Xaxis| Yaxis| Zaxis|WorldRegion|Country|   LocationText|           Location|Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|            Notes|CountryFullName|
+------------+-------+-----------+-----------+-----+------------+-------+---------+-----+------+------+-----------+-------+---------------+-------------------+-------------+---------------+-------------------+----------+-----------------+---------------+
|           1|  10001|   STANDARD|   New York|   NY|     PRIMARY|40.7128|  -74.006|  123|456.78|789.01|         NA|     US|   New York, NY| (40.7128,-74.0060)|        false|          50000|             211000|  50000000|       Urban ar

### Modifying a Column

In [53]:
from pyspark.sql.functions import col

# Add a column holding TotalWages divided by 1000.
# The original TotalWages column is kept alongside the new one.
df_updated = df.withColumn(
    "TotalWages in Thousand",
    col("TotalWages") / 1000,
)

df_updated.show()

+------------+-------+-----------+-----------+-----+------------+-------+---------+-----+------+------+-----------+-------+---------------+-------------------+-------------+---------------+-------------------+----------+-----------------+----------------------+
|RecordNumber|Zipcode|ZipCodeType|       City|State|LocationType|    Lat|     Long|Xaxis| Yaxis| Zaxis|WorldRegion|Country|   LocationText|           Location|Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|            Notes|TotalWages in Thousand|
+------------+-------+-----------+-----------+-----+------------+-------+---------+-----+------+------+-----------+-------+---------------+-------------------+-------------+---------------+-------------------+----------+-----------------+----------------------+
|           1|  10001|   STANDARD|   New York|   NY|     PRIMARY|40.7128|  -74.006|  123|456.78|789.01|         NA|     US|   New York, NY| (40.7128,-74.0060)|        false|          50000|             211000|  500

### Aggregate and Summarize Data

There are a couple of options:
- Using `groupBy` in PySpark
- Using `pyspark.sql.functions`

In [54]:
# Option 1: dict form of agg(), mapping column name -> aggregate function name.
df_updated.groupBy("City").agg({"TotalWages": "avg"}).show()

# OR

# Option 2: explicit functions from pyspark.sql.functions, each with its own alias.
# The module is imported under a namespace instead of `from ... import avg, sum, count`,
# because importing `sum` by name would shadow Python's builtin sum() for the
# rest of the notebook.
from pyspark.sql import functions as F

df_updated.groupBy("City").agg(
    F.avg("TotalWages").alias("avg_wage"),
    F.sum("TotalWages").alias("sum_wage"),
    F.count("TotalWages").alias("count_wage")
).show()

+-----------+---------------+
|       City|avg(TotalWages)|
+-----------+---------------+
|    Phoenix|          5.8E7|
|Los Angeles|          7.5E7|
|    Chicago|          6.2E7|
|    Houston|          6.9E7|
|   New York|          5.0E7|
+-----------+---------------+

+-----------+--------+--------+----------+
|       City|avg_wage|sum_wage|count_wage|
+-----------+--------+--------+----------+
|    Phoenix|   5.8E7|58000000|         1|
|Los Angeles|   7.5E7|75000000|         1|
|    Chicago|   6.2E7|62000000|         1|
|    Houston|   6.9E7|69000000|         1|
|   New York|   5.0E7|50000000|         1|
+-----------+--------+--------+----------+



### Filter and Query Data (SparkSQL vs. PySpark API)

In [56]:
# Keep only rows whose EstimatedPopulation is above 220000.
# The column is taken from df_updated itself, the frame being filtered, rather
# than from a different DataFrame variable, so the cell does not rely on the
# two frames sharing lineage.
df_filter = df_updated.filter(df_updated["EstimatedPopulation"] > 220000)
df_filter.show()


+------------+-------+-----------+-----------+-----+------------+-------+---------+-----+------+------+-----------+-------+---------------+-------------------+-------------+---------------+-------------------+----------+--------------+----------------------+
|RecordNumber|Zipcode|ZipCodeType|       City|State|LocationType|    Lat|     Long|Xaxis| Yaxis| Zaxis|WorldRegion|Country|   LocationText|           Location|Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|         Notes|TotalWages in Thousand|
+------------+-------+-----------+-----------+-----+------------+-------+---------+-----+------+------+-----------+-------+---------------+-------------------+-------------+---------------+-------------------+----------+--------------+----------------------+
|           2|  90001|   STANDARD|Los Angeles|   CA|     PRIMARY|34.0522|-118.2437|  124|457.79|790.02|         NA|     US|Los Angeles, CA|(34.0522,-118.2437)|        false|          60000|             380000|  75000000|Wes

In [74]:
# Using SparkSQL
# Register the DataFrame as a temporary view so SQL can refer to it by name.
df_updated.createOrReplaceTempView("StateDetails")

# Row filter written as a SQL WHERE clause; spark.sql() returns a DataFrame.
sql = spark.sql(
    "select * from StateDetails where City='Los Angeles'"
)

sql.show()

+------------+-------+-----------+-----------+-----+------------+-------+---------+-----+------+------+-----------+-------+---------------+-------------------+-------------+---------------+-------------------+----------+--------------+----------------------+
|RecordNumber|Zipcode|ZipCodeType|       City|State|LocationType|    Lat|     Long|Xaxis| Yaxis| Zaxis|WorldRegion|Country|   LocationText|           Location|Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|         Notes|TotalWages in Thousand|
+------------+-------+-----------+-----------+-----+------------+-------+---------+-----+------+------+-----------+-------+---------------+-------------------+-------------+---------------+-------------------+----------+--------------+----------------------+
|           2|  90001|   STANDARD|Los Angeles|   CA|     PRIMARY|34.0522|-118.2437|  124|457.79|790.02|         NA|     US|Los Angeles, CA|(34.0522,-118.2437)|        false|          60000|             380000|  75000000|Wes

# Joins and Combining Data

In [80]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('PySparkJoin')
    .getOrCreate()
)

# Load the two inputs to be combined.
df1 = spark.read.csv(
    './resources/5_person.csv',
    header=True,
    inferSchema=True,
)
df2 = spark.read.csv(
    './resources/6_country.csv',
    header=True,
    inferSchema=True,
)

# Inner join on matching id values.
# Because the condition is a column expression, each side keeps its own `id`
# column, so the result contains two columns named `id`.
df_joined = df1.join(
    df2,
    on=df1["id"] == df2["id"],
    how="inner",
)

df_joined.show()

+---+-------+---+---+---------+
| id|   name|age| id|  country|
+---+-------+---+---+---------+
|  1|  Alice| 25|  1|      USA|
|  2|    Bob| 30|  2|   Canada|
|  3|Charlie| 35|  3|       UK|
|  4|  David| 40|  4|Australia|
|  5|    Eva| 45|  5|  Germany|
+---+-------+---+---+---------+

