In [1]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 35 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 45.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=f3829ace7c72d74e04930f538177afbc9994cfab71efe01c5fc59517abf687bf
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [2]:
from pyspark.sql import SparkSession

In [3]:
# Obtain the SparkSession for this notebook under the application name 'basics'
# (getOrCreate hands back an existing session instead of starting a second one).
spark = (
    SparkSession.builder
    .appName('basics')
    .getOrCreate()
)

In [4]:
%%writefile user_simple.json
{"name":"Bob"}
{"name":"Jim", "age":40}
{"name":"Mary", "age": 24}

Writing user_simple.json


In [6]:
# Load the JSON-lines file written above; no schema is supplied, so Spark infers one.
df = spark.read.format("json").load("user_simple.json")

In [7]:
# Print the rows as a text table. Bob's record has no "age" key, so his age is null.
df.show()

+----+----+
| age|name|
+----+----+
|null| Bob|
|  40| Jim|
|  24|Mary|
+----+----+



In [8]:
# Inspect the inferred schema: age comes back as long, name as string, both nullable.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [9]:
# Column names as a plain Python list of strings.
df.columns

['age', 'name']

In [10]:
# describe() does not print statistics itself: it returns another DataFrame
# (columns summary, age, name - all strings). Call .show() on it to see the values.
df.describe()

DataFrame[summary: string, age: string, name: string]

In [11]:
# designing a custom table schema
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

In [12]:
# One StructField per column: column name, Spark data type, nullable flag.
data_schema = [
    StructField('age', IntegerType(), True),
    StructField('name', StringType(), True),
]

In [13]:
# Wrap the field list in a StructType, the object the reader accepts as a schema.
final_struc = StructType(data_schema)

In [14]:
# Re-read the same file, this time imposing the hand-written schema instead of
# letting Spark infer it. NOTE: this rebinds `df`, replacing the inferred-schema frame.
df = spark.read.schema(final_struc).json('user_simple.json')

In [15]:
# Display the rows loaded with the explicit schema.
df.show()

+----+----+
| age|name|
+----+----+
|null| Bob|
|  40| Jim|
|  24|Mary|
+----+----+



In [16]:
# Confirm the custom schema took effect: age is reported as integer here.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [18]:
# Indexing the DataFrame by column name yields a Column object, not the column's data.
df['age']

Column<'age'>

In [19]:
# Check the Python type of that lookup: pyspark.sql.column.Column.
type(df['age'])

pyspark.sql.column.Column

In [21]:
# To view a single column's values, select() it - the result is a DataFrame
# containing only that column, which can then be shown.
df.select('age').show()

+----+
| age|
+----+
|null|
|  40|
|  24|
+----+



In [24]:
# Select several columns at once; the result keeps the order requested here
# (name first, then age).
df.select('name', 'age').show()

+----+----+
|name| age|
+----+----+
| Bob|null|
| Jim|  40|
|Mary|  24|
+----+----+



In [25]:
# Add a derived column new_age = age * 1.5 and display the result.
# A null age produces a null new_age; multiplying by 1.5 gives fractional values (60.0, 36.0).
# The result is only shown, not assigned, so `df` is left as it was.
df.withColumn('new_age', df['age']*1.5).show()

+----+----+-------+
| age|name|new_age|
+----+----+-------+
|null| Bob|   null|
|  40| Jim|   60.0|
|  24|Mary|   36.0|
+----+----+-------+



In [26]:
# Rename the 'name' column to 'first_name' in the displayed result.
# The renamed frame is not assigned back, so `df` still uses 'name'.

df.withColumnRenamed('name','first_name').show()

+----+----------+
| age|first_name|
+----+----------+
|null|       Bob|
|  40|       Jim|
|  24|      Mary|
+----+----------+



In [27]:
# Add a derived column age_less = age - 5; a null age stays null, and the
# subtraction of an integer keeps whole-number values (35, 19).
df.withColumn('age_less', df['age']-5).show()

+----+----+--------+
| age|name|age_less|
+----+----+--------+
|null| Bob|    null|
|  40| Jim|      35|
|  24|Mary|      19|
+----+----+--------+



In [36]:
# Register the DataFrame under the view name 'customers' so it can be queried via spark.sql().
df.createOrReplaceTempView('customers')

In [37]:
# Run a SQL query against the 'customers' view; the result is itself a DataFrame.
sql_results = spark.sql("SELECT * FROM customers")

In [38]:
# Evaluating the DataFrame on its own displays only its column names and types, not the rows.
sql_results

DataFrame[age: int, name: string]

In [39]:
# show() prints the actual rows returned by the query.
sql_results.show()

+----+----+
| age|name|
+----+----+
|null| Bob|
|  40| Jim|
|  24|Mary|
+----+----+



In [41]:
# Filter with a SQL WHERE clause: only the row whose age equals 24 (Mary) is returned;
# Bob's null age does not match the comparison.
spark.sql("SELECT * FROM customers WHERE age = 24").show()

+---+----+
|age|name|
+---+----+
| 24|Mary|
+---+----+

