In [None]:
# Install PySpark if it is not already available in your environment (uncomment to run).
# %pip install pyspark

In [2]:
# Import pyspark after installation
import pyspark as ps

The entry point into all functionality in Spark is the 'SparkSession' class. 

In [3]:
# Build the SparkSession for this notebook; getOrCreate() reuses an already
# running session instead of starting a second one.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('Practise1')
    .getOrCreate()
)

In [4]:
# Display the session object to confirm that it was created successfully
spark

In [5]:
# Load the dataset with Spark's CSV reader. No header option is set here, so the
# header line is read as a data row and the columns get default names (_c0, _c1, _c2).
df_spark = spark.read.csv('testset1.csv')

In [6]:
# Preview the DataFrame loaded in the previous cell.
# show() prints the rows and returns None, so its result must NOT be assigned back
# to `df_spark` -- doing so would replace the DataFrame with None and break every
# later cell that uses it.
df_spark.show()

+------+---+----------+
|   _c0|_c1|       _c2|
+------+---+----------+
|  Name|Age|Experience|
|Haroon| 25|         2|
|  khan| 26|         3|
|  Awan| 27|         4|
+------+---+----------+



In [16]:
# Return the first row of the DataFrame as a Row object.
# NOTE(review): the recorded result uses header-derived field names (Name, Age,
# Experience), which suggests this cell was last executed after the header-aware
# reload rather than in document order -- confirm with a fresh Restart & Run All.
df_spark.head()

Row(Name='Haroon', Age='25', Experience='2')

In [7]:
# Reload the CSV, treating its first row as the header so the columns are named
# Name/Age/Experience instead of the default _c0, _c1, ...
df_spark = (
    spark.read
    .option('header', 'true')
    .csv('testset1.csv')
)

In [8]:
# Run the line below only to preview the loaded dataset, then reload it without `.show()`: `.show()` returns None, so this assignment would leave `df_spark` as None instead of a DataFrame.
#df_spark = spark.read.option('header','true').csv('testset1.csv').show()

In [9]:
# Check the type of the created DataFrame
type(df_spark)

pyspark.sql.dataframe.DataFrame

In [10]:
# Print the schema in a tree format. Every column is reported as a string here
# because the CSV was loaded without schema inference.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [14]:
# Project the DataFrame onto a subset of its columns (one or several names) and print it
(
    df_spark
    .select(['Name', 'Experience'])
    .show()
)

+------+----------+
|  Name|Experience|
+------+----------+
|Haroon|         2|
|  khan|         3|
|  Awan|         4|
+------+----------+



In [24]:
# describe() builds a summary DataFrame; without show() only its repr is displayed
df_spark.describe()

DataFrame[summary: string, Name: string, Age: string, Experience: string]

In [25]:
# Print the summary statistics (count, mean, stddev, min, max) for each column
df_spark.describe().show()

+-------+----+----+----------+
|summary|Name| Age|Experience|
+-------+----+----+----------+
|  count|   3|   3|         3|
|   mean|null|26.0|       3.0|
| stddev|null| 1.0|       1.0|
|    min|Awan|  25|         2|
|    max|khan|  27|         4|
+-------+----+----+----------+



In [20]:
# Add a derived column. withColumn() returns a new DataFrame rather than modifying
# the existing one, so the result has to be assigned back to `df_spark`.
# NOTE(review): the column name says "after 3 yrs" but the expression adds 2 --
# confirm which of the two is intended.
df_spark = df_spark.withColumn(
    'Experience after 3 yrs',
    df_spark['Experience'] + 2,
)

In [21]:
# Inspect the DataFrame, including the newly added column
df_spark.show()

+------+---+----------+----------------------+
|  Name|Age|Experience|Experience after 3 yrs|
+------+---+----------+----------------------+
|Haroon| 25|         2|                   4.0|
|  khan| 26|         3|                   5.0|
|  Awan| 27|         4|                   6.0|
+------+---+----------+----------------------+



In [22]:
# Drop the derived column; drop() returns a new DataFrame, so reassign the result
df_spark = df_spark.drop('Experience after 3 yrs')

In [28]:
# Rename the 'Age' column to 'Ages'; the result is reassigned to keep the change
df_spark = df_spark.withColumnRenamed('Age','Ages')

In [29]:
# Confirm the rename by printing the DataFrame
df_spark.show()

+------+----+----------+
|  Name|Ages|Experience|
+------+----+----------+
|Haroon|  25|         2|
|  khan|  26|         3|
|  Awan|  27|         4|
+------+----+----------+



In [31]:
# Load a second CSV, using its first line as the header and asking Spark to infer
# the column types from the data.
df2 = spark.read.csv(
    'testset2.csv',
    header=True,
    inferSchema=True,
)

In [37]:
# Inspect the schema of the second dataset.
# NOTE(review): the recorded output lists every column as string even though the
# load cell passes inferSchema=True -- confirm against a fresh run.
df2.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [38]:
# Display the second dataset, including its null entries
df2.show()

+------+----+----------+
|  Name| Age|Experience|
+------+----+----------+
|Haroon|  23|      null|
|  khan|  22|         3|
|   ali|  25|         2|
| ahmad|null|         3|
| noman|  27|         2|
|  null|  28|         2|
| bilal|null|         1|
+------+----+----------+



In [40]:
# Remove every row that contains a null in any column and print the result.
# NOTE(review): the recorded output keeps the row whose Name displays as "null" --
# presumably that cell holds a literal 'null' string rather than a missing value; verify.
df2.dropna(how='any').show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
| khan| 22|         3|
|  ali| 25|         2|
|noman| 27|         2|
| null| 28|         2|
+-----+---+----------+



In [41]:
# Restrict the null check to the Age column via `subset`: only rows whose Age is
# null are removed; nulls in other columns are kept.
df2.dropna(how="any", subset=['Age']).show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
|Haroon| 23|      null|
|  khan| 22|         3|
|   ali| 25|         2|
| noman| 27|         2|
|  null| 28|         2|
+------+---+----------+



In [48]:
# Replace missing values with a placeholder string and print the result.
# A string fill value is only applied to string-typed columns.
df2.fillna('Missing value').show()

+------+-------------+-------------+
|  Name|          Age|   Experience|
+------+-------------+-------------+
|Haroon|           23|Missing value|
|  khan|           22|            3|
|   ali|           25|            2|
| ahmad|Missing value|            3|
| noman|           27|            2|
|  null|           28|            2|
| bilal|Missing value|            1|
+------+-------------+-------------+

