# Important PySpark Functionalities
1. Pyspark DataFrame 
2. Reading the data 
3. Check Data Types
4. Select columns and indexing
5. Check Schema
6. Adding Columns 
7. Dropping Columns 

In [1]:
import pyspark
from pyspark.sql import SparkSession

## Create Session

In [2]:
# Build a SparkSession named "DataFrame", reusing an existing session if one is already running.
spark = (
    SparkSession.builder
    .appName("DataFrame")
    .getOrCreate()
)
# Bare expression so the notebook displays the session summary.
spark

## Read the dataset

In [5]:
# Load the CSV, using its first line as the column names.
df_pyspark = (
    spark.read
    .option("header", True)
    .csv("heart.csv")
)

In [7]:
# Preview the first five rows as a text table.
df_pyspark.show(n=5)

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
| 63|  1|  3|     145| 233|  1|      0|    150|    0|    2.3|    0|  0|   1|     1|
| 37|  1|  2|     130| 250|  0|      1|    187|    0|    3.5|    0|  0|   2|     1|
| 41|  0|  1|     130| 204|  0|      0|    172|    0|    1.4|    2|  0|   2|     1|
| 56|  1|  1|     120| 236|  0|      1|    178|    0|    0.8|    2|  0|   2|     1|
| 57|  0|  0|     120| 354|  0|      1|    163|    1|    0.6|    2|  0|   2|     1|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
only showing top 5 rows



Every column in the schema below is reported as a string, even though most of the values are clearly integers. Why? By default, PySpark's CSV reader does not infer column types and reads every column as a string; to get the actual data types we need to set **inferSchema = True**.

In [8]:
# Print each column's name, data type and nullability for the DataFrame loaded above.
df_pyspark.printSchema()

root
 |-- age: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- cp: string (nullable = true)
 |-- trestbps: string (nullable = true)
 |-- chol: string (nullable = true)
 |-- fbs: string (nullable = true)
 |-- restecg: string (nullable = true)
 |-- thalach: string (nullable = true)
 |-- exang: string (nullable = true)
 |-- oldpeak: string (nullable = true)
 |-- slope: string (nullable = true)
 |-- ca: string (nullable = true)
 |-- thal: string (nullable = true)
 |-- target: string (nullable = true)



Set **inferSchema = True** when reading the file so that the schema shows the actual data types.

In [9]:
# Re-read the file with schema inference enabled so column types are detected from the data.
df_pyspark = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("heart.csv")
)
df_pyspark.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- slope: integer (nullable = true)
 |-- ca: integer (nullable = true)
 |-- thal: integer (nullable = true)
 |-- target: integer (nullable = true)



In [11]:
# header and inferSchema can also be passed directly as keyword arguments to csv().
df_pyspark = spark.read.csv(
    "heart.csv",
    header=True,
    inferSchema=True,
)
df_pyspark.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- slope: integer (nullable = true)
 |-- ca: integer (nullable = true)
 |-- thal: integer (nullable = true)
 |-- target: integer (nullable = true)



In [13]:
# Confirm that the reader returned a PySpark DataFrame.
print(type(df_pyspark))

<class 'pyspark.sql.dataframe.DataFrame'>


## Columns of the dataframe

In [14]:
# List the column names in order.
df_pyspark.columns

['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal',
 'target']

In [16]:
# Fetch the first three rows as a list of Row objects.
df_pyspark.head(n=3)

[Row(age=63, sex=1, cp=3, trestbps=145, chol=233, fbs=1, restecg=0, thalach=150, exang=0, oldpeak=2.3, slope=0, ca=0, thal=1, target=1),
 Row(age=37, sex=1, cp=2, trestbps=130, chol=250, fbs=0, restecg=1, thalach=187, exang=0, oldpeak=3.5, slope=0, ca=0, thal=2, target=1),
 Row(age=41, sex=0, cp=1, trestbps=130, chol=204, fbs=0, restecg=0, thalach=172, exang=0, oldpeak=1.4, slope=2, ca=0, thal=2, target=1)]

## Select a pyspark column

In [19]:
# Project a single column and preview its first five values.
df_pyspark.select("age").show(n=5)

+---+
|age|
+---+
| 63|
| 37|
| 41|
| 56|
| 57|
+---+
only showing top 5 rows



In [20]:
# Project several columns at once; names are passed as separate arguments.
df_pyspark.select("age", "sex").show(n=5)

+---+---+
|age|sex|
+---+---+
| 63|  1|
| 37|  1|
| 41|  0|
| 56|  1|
| 57|  0|
+---+---+
only showing top 5 rows



In [21]:
# List (column name, type name) pairs for every column.
df_pyspark.dtypes

[('age', 'int'),
 ('sex', 'int'),
 ('cp', 'int'),
 ('trestbps', 'int'),
 ('chol', 'int'),
 ('fbs', 'int'),
 ('restecg', 'int'),
 ('thalach', 'int'),
 ('exang', 'int'),
 ('oldpeak', 'double'),
 ('slope', 'int'),
 ('ca', 'int'),
 ('thal', 'int'),
 ('target', 'int')]

## Add and Drop Columns 

In [23]:
# Add a derived column computed from `age`. The source column is referenced by its
# exact schema name so the lookup does not rely on case-insensitive resolution.
# NOTE: the new column's name ends with a trailing space; later cells must use the
# identical string to select or drop it.
df_pyspark = df_pyspark.withColumn("Age after 2 years ", df_pyspark["age"] + 2)

In [24]:
# Check that the new column was appended to the column list.
df_pyspark.columns

['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal',
 'target',
 'Age after 2 years ']

In [27]:
# Display the derived column (name includes its trailing space); 20 rows is show()'s default.
df_pyspark.select("Age after 2 years ").show(n=20)

+------------------+
|Age after 2 years |
+------------------+
|                65|
|                39|
|                43|
|                58|
|                59|
|                59|
|                58|
|                46|
|                54|
|                59|
|                56|
|                50|
|                51|
|                66|
|                60|
|                52|
|                60|
|                68|
|                45|
|                71|
+------------------+
only showing top 20 rows



In [28]:
# Remove the derived column again; the result is reassigned to df_pyspark.
df_pyspark = df_pyspark.drop("Age after 2 years ")

In [29]:
# Check that the column list is back to the original set.
df_pyspark.columns

['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal',
 'target']

## Renaming a Column

In [31]:
# Rename `age` to "Meri Age". The existing column is referenced by its exact schema
# name: withColumnRenamed does nothing (and raises no error) when the name does not
# match, so a case mismatch could otherwise pass unnoticed.
df_pyspark = df_pyspark.withColumnRenamed("age", "Meri Age")

In [32]:
# Check that the first column now carries the new name.
df_pyspark.columns

['Meri Age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal',
 'target']

In [33]:
# Display the renamed column; 20 rows is show()'s default.
df_pyspark.select("Meri Age").show(n=20)

+--------+
|Meri Age|
+--------+
|      63|
|      37|
|      41|
|      56|
|      57|
|      57|
|      56|
|      44|
|      52|
|      57|
|      54|
|      48|
|      49|
|      64|
|      58|
|      50|
|      58|
|      66|
|      43|
|      69|
+--------+
only showing top 20 rows

