In [0]:
# Read the flights CSV into a DataFrame. header=True takes the column names
# from the first row; inferSchema=True lets Spark derive column types from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/Workspace/Users/harsh.reactdev@gmail.com/PySpark-databricks/data bricks/flights-larger.csv')
)

display(df)

In [0]:
# Print each column's name, data type and nullability as a tree.
df.printSchema()

In [0]:
# describe() computes basic statistics (count, mean, stddev, min, max) per column.
description = df.describe()
display(description)

In [0]:
# summary() with no arguments reports the describe() statistics plus approximate quartiles.
display(df.summary())

In [0]:
# Print the first 5 rows as a plain-text table; truncate=True shortens long cell values.
df.show(5, truncate=True)

In [0]:
# Number of rows in the DataFrame (an action: this runs a Spark job).
df.count()

In [0]:
# Render the DataFrame with the notebook's table viewer.
display(df)

In [0]:
# take(20) brings the first 20 rows back to the driver as a Python list of Row objects.
collect20 = df.take(20)
display(collect20)

In [0]:
# select(): keep only the 'mon' and 'dow' columns.
selects = df.select(['mon', 'dow'])
display(selects)

In [0]:
from pyspark.sql.functions import col

# select() also accepts Column expressions: the result has a single derived
# column holding 'mon' multiplied by 5.
selectsCond = df.select(col('mon') * 5)
display(selectsCond)

In [0]:
# selectExpr() takes SQL expression strings and evaluates them against the
# DataFrame; here 'mon * 10' is computed and named 'new_mon'.
selectExpr = df.selectExpr('mon * 10 as new_mon')
display(selectExpr)

In [0]:
# withColumn() appends a column (or replaces one of the same name).
# 'dowMon' is the row-wise sum of 'mon' and 'dow'.
dow_plus_mon = col('mon') + col('dow')
withCol = df.withColumn('dowMon', dow_plus_mon)
display(withCol)

In [0]:
# withColumnRenamed() returns a new DataFrame with 'mon' renamed to 'mon->tue'.
renameCol = df.withColumnRenamed(existing='mon', new='mon->tue')
display(renameCol)

In [0]:
# Chain several column operations in one expression:
#   1. keep only 'mon' and 'dow'
#   2. derive 'mon_new' as mon * 10
#   3. rename 'mon_new' to 'new_mon'
#   4. drop the original 'mon'
# The result has the columns 'dow' and 'new_mon'.
allOfEm = (
    df.select('mon', 'dow')
    .withColumn('mon_new', col('mon') * 10)
    .withColumnRenamed('mon_new', 'new_mon')
    .drop('mon')
)

display(allOfEm)

In [0]:
from pyspark.sql.functions import col, expr, lit

# lit() turns a Python value into a constant column: every row gets 'IN'.
literal = df.withColumn('country_code', lit('IN'))
display(literal.take(5))

# expr() parses a SQL expression string into a Column.
expression = df.withColumn('dist', expr('duration * 9'))
display(expression.take(5))

In [0]:
# Row filter written with a Column expression.
filtered = df.where(col('mon') > 10)
display(filtered.take(5))

# Row filter written as a SQL predicate string.
filteredSql = df.where('dom > 30 AND org == "SFO"')
display(filteredSql.take(5))

In [0]:
from pyspark.sql.functions import col, expr, when

# when()/otherwise() is the Column-API form of if / elif / else.
# Branches are tested top to bottom and the first match wins.
comfy_label = (
    when(col('mile') < 1000, 'short flight')
    .when(col('mile') < 4000, 'mild')
    .otherwise('long flight')
)
ifelse = df.withColumn('comfy', comfy_label)

display(ifelse.take(5))

In [0]:
# Conditional column built from a SQL CASE expression passed to expr().
# CASE returns the first branch whose condition holds, so the stricter
# threshold (> 600) must be tested before the looser one (> 360); in the
# opposite order every duration above 600 already matches '> 360' and
# 'Looong' can never be produced.
casewhen = df.withColumn(
    'comfy',
    expr('''
        CASE
            when duration > 600 then 'Looong'
            when duration > 360 then 'Long'
            else 'Doable'
        END
    ''')
)

display(casewhen.take(5))

In [0]:
from pyspark.sql.functions import col, concat

# Join the 'carrier' and 'flight' values into a single 'flightCode' column.
flight_code = concat(col('carrier'), col('flight'))
concatenated = df.withColumn('flightCode', flight_code)

display(concatenated.take(5))

In [0]:
from pyspark.sql.functions import current_date, current_timestamp, to_date, col

# Add a column holding the timestamp at which the query is evaluated.
timed = df.withColumn(
    'timestamp',
    current_timestamp()
)
display(timed.take(5))

# Reduce the timestamp to a calendar date. No format string is passed:
# a to_date() pattern describes how to parse *string* input, and this column
# is already a timestamp, so the default cast-to-date rules are what we want.
# When a pattern is needed for string input, note that in Spark datetime
# patterns 'MM' is month-of-year while 'mm' is minute-of-hour.
converted = timed.withColumn(
    'date',
    to_date(col('timestamp'))
)
display(converted.take(5))

# current_date() produces a date-typed column directly.
dated = df.withColumn(
    'date',
    current_date()
)
display(dated.take(5))

In [0]:
# `col` is imported here as well so the cell does not rely on an earlier cell's import.
from pyspark.sql.functions import array, array_contains, array_distinct, col, explode, arrays_zip

# array(): combine several columns into one array-typed column.
colArr = df.withColumn(
    'allFlights',
    array(col('mon'), col('dom'), col('dow'))
)

display(colArr.take(5))

# array_contains(): true when the array holds the given value. The array here
# has a single element, so this is true exactly when dom == 10.
arr_contain = df.withColumn(
    'doesContain10',
    array_contains(array(col('dom')), 10)
)
display(arr_contain.take(5))

# explode(): emit one output row per array element, so each input row
# becomes three rows (one each for mon, dom and dow).
expl = df.withColumn(
    'explode',
    explode(array(col('mon'), col('dom'), col('dow')))
)
display(expl.take(10))

# arrays_zip(): pair up the N-th elements of several arrays into structs.
# Each array to be zipped must be passed as its own argument; wrapping them
# in an outer array() hands arrays_zip a single array-of-arrays, and nothing
# gets zipped.
zipped = df.withColumn(
    'zipped',
    arrays_zip(array(col('mon')), array(col('dow')))
)
display(zipped.take(10))

In [0]:
from pyspark.sql.functions import col, lit, struct

# Pack three columns into a single struct-typed column.
structured = df.withColumn('structured', struct(col('mon'), col('dom'), col('dow')))
display(structured.take(5))

# Read one field of the struct using attribute (dot) access on the Column.
display(structured.select(col('structured').mon).take(5))

# Read one field of the struct using getField().
dow_field = col('structured').getField('dow')
display(structured.select(dow_field).take(5))

# Add (or overwrite) a struct field with withField(): 'wod' = mon + dom + dow.
with_wod = col('structured').withField('wod', col('mon') + col('dom') + col('dow'))
display(structured.withColumn('structured', with_wod).take(5))

# Rename a struct field by rebuilding the struct and aliasing that member.
rebuilt = struct(
    col('structured.mon').alias('AIRMON'),
    col('structured.dom'),
    col('structured.dow'),
)
display(structured.withColumn('structured', rebuilt).take(5))

In [0]:
from pyspark.sql.functions import col, create_map, lit, map_keys

# create_map() takes alternating key, value arguments. The keys here are
# string literals and the values are read from the row.
planes_map = create_map(
    lit('DOM'), col('dom'),
    lit('MON'), col('mon'),
    lit('DOW'), col('dow'),
)
mapped = df.withColumn('Planes', planes_map)

display(mapped.take(5))

# map_keys() returns the keys of a map column as an array.
# First appended next to the existing columns ...
with_keys = mapped.withColumn('plane-keys', map_keys(col('Planes')))
display(with_keys.take(5))

# ... then selected on its own.
display(mapped.select(map_keys(col('Planes'))).take(5))

In [0]:
from pyspark.sql.functions import collect_list, collect_set

# For each (mon, dow) group, gather the 'mon' values into a list (duplicates
# kept) and into a set (duplicates removed).
grouped = df.groupBy('mon', 'dow').agg(
    collect_list('mon'),
    collect_set('mon'),
)
display(grouped.take(5))

# rollup(): row counts per (mon, dow), per mon alone, and a grand total.
rolledup = df.rollup('mon', 'dow').count()
display(rolledup)

# cube(): row counts for every combination of the grouping columns, which
# adds the per-dow subtotals that rollup() leaves out.
cubed = df.cube('mon', 'dow').count()
display(cubed)