# Starting a PySpark Session

- PySpark code interacts with Spark through a `SparkSession`, which is the entry point for creating and reading DataFrames

In [5]:
from pyspark.sql import SparkSession

# Build the session, or reuse the one already active in this kernel.
spark = (
    SparkSession.builder
    .appName("PySparkLearning")
    .getOrCreate()
)

# Printing the session confirms that it was created.
print(spark)

<pyspark.sql.session.SparkSession object at 0x1081d0810>


# Loading a CSV File into a DataFrame

- `header=True` means the first row contains column names
- `inferSchema=True` tells Spark to infer each column's data type from the data (this costs an extra pass over the file)

In [11]:
# Location of the sample data, relative to the notebook's working directory.
csv_file_path = './resources/3_zipcodes.csv'

# Reuse the `spark` session created earlier in this notebook; calling
# getOrCreate() again would only hand back that same session.
# header=True: the first row supplies the column names.
# inferSchema=True: Spark infers column types instead of reading every column as a string.
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Preview the first five rows.
df.show(5)

+------------+-------+-----------+-----------+-----+------------+-------+---------+-----+------+------+-----------+-------+---------------+-------------------+-------------+---------------+-------------------+----------+-----------------+
|RecordNumber|Zipcode|ZipCodeType|       City|State|LocationType|    Lat|     Long|Xaxis| Yaxis| Zaxis|WorldRegion|Country|   LocationText|           Location|Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|            Notes|
+------------+-------+-----------+-----------+-----+------------+-------+---------+-----+------+------+-----------+-------+---------------+-------------------+-------------+---------------+-------------------+----------+-----------------+
|           1|  10001|   STANDARD|   New York|   NY|     PRIMARY|40.7128|  -74.006|  123|456.78|789.01|         NA|     US|   New York, NY| (40.7128,-74.0060)|        false|          50000|             211000|  50000000|       Urban area|
|           2|  90001|   STANDARD|Los Angele

# Basic DataFrame Operations

In [28]:



from pyspark.sql import SparkSession

# Work on the `df` loaded from `csv_file_path` in the CSV-loading cell above.
# Rebuilding the session and re-reading the file here would only repeat that
# work (including a second schema-inference pass) and yield the same DataFrame.

df.printSchema()  # Show the inferred schema
df.select("City", "State").show(5)  # Select specific columns
df.filter(df["City"] == "Chicago").show()  # Keep only rows matching a condition
df.groupBy("City").count().show()  # Group by city and count rows per group


root
 |-- RecordNumber: integer (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- ZipCodeType: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- LocationType: string (nullable = true)
 |-- Lat: double (nullable = true)
 |-- Long: double (nullable = true)
 |-- Xaxis: integer (nullable = true)
 |-- Yaxis: double (nullable = true)
 |-- Zaxis: double (nullable = true)
 |-- WorldRegion: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- LocationText: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Decommisioned: boolean (nullable = true)
 |-- TaxReturnsFiled: integer (nullable = true)
 |-- EstimatedPopulation: integer (nullable = true)
 |-- TotalWages: integer (nullable = true)
 |-- Notes: string (nullable = true)

+-----------+-----+
|       City|State|
+-----------+-----+
|   New York|   NY|
|Los Angeles|   CA|
|    Chicago|   IL|
|    Houston|   TX|
|    Phoenix|   AZ|
+-----------+-