# PySpark DataFrames

In [5]:
from pyspark.sql import SparkSession

In [41]:
# Create the SparkSession for this notebook (getOrCreate reuses an existing session if there is one)
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)

In [7]:
# Evaluate the session object as the cell's last expression so the notebook displays its representation
spark

In [15]:
# Read the CSV, treating the first line as the column names
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('data/sample_data.csv')
)

In [16]:
# Print the loaded rows as a text table
df_pyspark.show()

+-------+---+----------+
|   name|age|experience|
+-------+---+----------+
|   Jijo| 25|         4|
|Adharsh| 25|         5|
| Sajjad| 23|         2|
+-------+---+----------+



In [17]:
# Print each column's name, type and nullability; with only the header option set, every column is read as string
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- experience: string (nullable = true)



The schema above shows `age` and `experience` as strings instead of integers. We can fix this by asking Spark to infer the column types when reading the file:

In [19]:
# Re-read the dataset, letting Spark infer the column types instead of reading everything as string
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('data/sample_data.csv', inferSchema=True)
)

In [20]:
# Confirm that age and experience are now typed as integer
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)



In [22]:
# Equivalent read without .option(): header and inferSchema are passed as keyword arguments to csv()
df_pyspark = spark.read.csv(
    'data/sample_data.csv',
    header=True,
    inferSchema=True,
)

In [23]:
# The schema should match the one produced by the option-based read
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)



In [24]:
# Column names as a plain Python list
df_pyspark.columns

['name', 'age', 'experience']

In [25]:
# head(2) returns the first two rows as a list of Row objects rather than printing a table
df_pyspark.head(2)

[Row(name='Jijo', age=25, experience=4),
 Row(name='Adharsh', age=25, experience=5)]

In [29]:
# select() returns a new DataFrame; without .show() only its repr (column names and types) is displayed
df_pyspark.select('name')

DataFrame[name: string]

In [30]:
# Chain .show() to print the rows of the selected column
df_pyspark.select('name').show()

+-------+
|   name|
+-------+
|   Jijo|
|Adharsh|
| Sajjad|
+-------+



In [32]:
# To select multiple columns, pass the column names as a list.
# Note that 'Age' resolves to the 'age' column: name matching is case-insensitive here
# (presumably the default spark.sql.caseSensitive=false setting — confirm for your cluster).
df_pyspark.select(['name','Age']).show()

+-------+---+
|   name|Age|
+-------+---+
|   Jijo| 25|
|Adharsh| 25|
| Sajjad| 23|
+-------+---+



In [34]:
# describe() returns a DataFrame of summary statistics; every column of that result is typed as string
df_pyspark.describe()

DataFrame[summary: string, name: string, age: string, experience: string]

In [35]:
# Print the summary statistics (count, mean, stddev, min, max); mean and stddev are null for the string column
df_pyspark.describe().show()

+-------+-------+------------------+------------------+
|summary|   name|               age|        experience|
+-------+-------+------------------+------------------+
|  count|      3|                 3|                 3|
|   mean|   null|24.333333333333332|3.6666666666666665|
| stddev|   null|1.1547005383792517|1.5275252316519465|
|    min|Adharsh|                23|                 2|
|    max| Sajjad|                25|                 5|
+-------+-------+------------------+------------------+



In [36]:
# Add a column derived from an existing one; withColumn returns a new DataFrame, so rebind the name
df_pyspark = df_pyspark.withColumn(
    'age after 5 years',
    df_pyspark['age'] + 5,
)

In [37]:
# Confirm that the 'age after 5 years' column has been added
df_pyspark.show()

+-------+---+----------+-----------------+
|   name|age|experience|age after 5 years|
+-------+---+----------+-----------------+
|   Jijo| 25|         4|               30|
|Adharsh| 25|         5|               30|
| Sajjad| 23|         2|               28|
+-------+---+----------+-----------------+



In [38]:
# Drop the derived column again; the result of drop() is assigned back to df_pyspark
df_pyspark = df_pyspark.drop('age after 5 years')

In [39]:
# Confirm that the 'age after 5 years' column has been removed
df_pyspark.show()

+-------+---+----------+
|   name|age|experience|
+-------+---+----------+
|   Jijo| 25|         4|
|Adharsh| 25|         5|
| Sajjad| 23|         2|
+-------+---+----------+



In [40]:
# Show the data with the 'name' column renamed to 'rename'.
# The result is not assigned, so df_pyspark itself is not rebound by this cell.
df_pyspark.withColumnRenamed('name', 'rename').show()

+-------+---+----------+
| rename|age|experience|
+-------+---+----------+
|   Jijo| 25|         4|
|Adharsh| 25|         5|
| Sajjad| 23|         2|
+-------+---+----------+

