# Data Frame Basics

- PySpark Dataframe
- Reading The Dataset
- Checking the Data Types of the Columns (Schema)
- Selecting Columns And Indexing
- Using describe() for Summary Statistics (similar to Pandas)
- Adding Columns
- Dropping columns
- Renaming Columns


In [52]:
from pyspark.sql import SparkSession

In [53]:
# Entry point for all DataFrame work in this notebook.
# getOrCreate() reuses an already-active session instead of starting a second one.
spark_session = (
    SparkSession.builder
    .appName("Lesson 2 - Data Frame")
    .getOrCreate()
)
# Bare last expression: display the session object.
spark_session

In [54]:
# Load the data set from CSV.
#   header='true'    -> the first line of the file supplies the column names.
#   inferSchema=True -> Spark infers each field's data type from the data;
#                       without it, every CSV column is read as a string.
data_frame = (
    spark_session.read
    .option('header', 'true')
    .csv('test1.csv', inferSchema=True)
)
data_frame.show()


+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [55]:
# Check the schema: each column's name, data type, and nullability
data_frame.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [56]:
# Equivalent read: the header and schema-inference settings are passed as
# keyword arguments to csv() instead of going through option().
data_frame = spark_session.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)
data_frame.show()
data_frame.printSchema()
# Bare last expression: display the class of the object that was read.
type(data_frame)

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



pyspark.sql.dataframe.DataFrame

Data Frames are Data Structures:

- https://spark.apache.org/docs/latest/sql-programming-guide.html


In [57]:
# Get the column names as a plain Python list of strings
data_frame.columns

['Name', 'age', 'Experience', 'Salary']

In [58]:
# Get the first two rows; head(n) returns them as a list of Row objects
data_frame.head(2)

[Row(Name='Krish', age=31, Experience=10, Salary=30000),
 Row(Name='Sudhanshu', age=30, Experience=8, Salary=25000)]

In [59]:
# Select a single column.
# select() gives back a DataFrame (here with one column), as the printed type confirms.
experiences = data_frame.select('Experience')
experiences.show()
print(type(experiences))
# Bare last expression: display the DataFrame's repr (column names and types).
experiences

+----------+
|Experience|
+----------+
|        10|
|         8|
|         4|
|         3|
|         1|
|         2|
+----------+

<class 'pyspark.sql.dataframe.DataFrame'>


DataFrame[Experience: int]

In [60]:
# Select multiple columns by passing several names to select().
# Despite the variable name, this is a plain projection of Name and Salary;
# no grouping or sorting by salary takes place.
by_salary = data_frame.select('Name', 'Salary')
by_salary.show()
print(type(by_salary))
# Bare last expression: display the DataFrame's repr (column names and types).
by_salary

+---------+------+
|     Name|Salary|
+---------+------+
|    Krish| 30000|
|Sudhanshu| 25000|
|    Sunny| 20000|
|     Paul| 20000|
|   Harsha| 15000|
|  Shubham| 18000|
+---------+------+

<class 'pyspark.sql.dataframe.DataFrame'>


DataFrame[Name: string, Salary: int]

In [61]:
# Isolate a column by indexing the DataFrame with its name.
# Unlike select(), this yields a column reference rather than a DataFrame.
# The result is only assigned here, so the cell displays nothing.
name_column = data_frame["Name"]

In [62]:
# Get the data types again, this time as a list of (column name, type name)
# tuples instead of the printed schema tree
data_frame.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

In [63]:
# Compute and show basic summary statistics (count, mean, stddev, min, max)
# for every column. describe() itself returns a DataFrame, hence the .show().
# For the string column, mean and stddev are null and min/max follow string ordering.
data_frame.describe().show()

+-------+------+------------------+-----------------+------------------+
|summary|  Name|               age|       Experience|            Salary|
+-------+------+------------------+-----------------+------------------+
|  count|     6|                 6|                6|                 6|
|   mean|  null|26.333333333333332|4.666666666666667|21333.333333333332|
| stddev|  null| 4.179314138308661|3.559026084010437| 5354.126134736337|
|    min|Harsha|                21|                1|             15000|
|    max| Sunny|                31|               10|             30000|
+-------+------+------------------+-----------------+------------------+



In [64]:
# Add a derived column holding each row's Experience plus 2.
# The result of withColumn is bound back to data_frame, so later cells see
# the extra column.
data_frame = data_frame.withColumn(
    'Experience After 2 Years',
    data_frame['Experience'] + 2,
)
data_frame.show()

+---------+---+----------+------+------------------------+
|     Name|age|Experience|Salary|Experience After 2 Years|
+---------+---+----------+------+------------------------+
|    Krish| 31|        10| 30000|                      12|
|Sudhanshu| 30|         8| 25000|                      10|
|    Sunny| 29|         4| 20000|                       6|
|     Paul| 24|         3| 20000|                       5|
|   Harsha| 21|         1| 15000|                       3|
|  Shubham| 23|         2| 18000|                       4|
+---------+---+----------+------+------------------------+



In [65]:
# Remove a column by name; the result is bound back to data_frame,
# which returns it to the original four columns
data_frame = data_frame.drop('Experience After 2 Years')
data_frame.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [66]:
# Rename a column: withColumnRenamed(existing_name, new_name).
# NOTE: data_frame is rebound, so from here on it has 'New Name' instead of
# 'Name'. Re-read the CSV before re-running earlier cells that refer to 'Name'.
data_frame = data_frame.withColumnRenamed('Name', 'New Name')
data_frame.show()

+---------+---+----------+------+
| New Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+

