
---


# **Experiment Name:** Understanding the pyspark data frame
# **Experiment No:** 02
# **Experiment Date:** 13/08/2023


---


In [4]:
# Firstly installing all the tools once again like Lab 01

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar -xf spark-3.4.1-bin-hadoop3.tgz

!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.4.1-bin-hadoop3"

import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

spark

In [8]:
# Importing dataset to the code. The file type must be in .csv

from google.colab import files
files.upload()

Saving lab2.csv to lab2 (2).csv


{'lab2 (2).csv': b'Name,Age,Experience ,Salary\rMafiul,27,4,50000\rTanvir,19,2,30000\rPrema,25,5,60000\rDipjol,55,10,100000\rMousumi,45,15,120000\rShakib,38,20,200000\rPori Moni,16,12,10000\r\n'}

In [9]:
# For reading the dataset(.csv file) by Apache Spark. Here, "df" is not fixed. We can select any name instead of "df". For example, If we use the "bigdata" word, it will work.

df = spark.read.format('csv').option("header", "true").option("inferschema", "true").option("mode", "failfast").load("lab2.csv")

In [10]:
# Apache spark to display the schema

df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience : integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [11]:
# For showing the dataset table

df.show()

+---------+---+-----------+------+
|     Name|Age|Experience |Salary|
+---------+---+-----------+------+
|   Mafiul| 27|          4| 50000|
|   Tanvir| 19|          2| 30000|
|    Prema| 25|          5| 60000|
|   Dipjol| 55|         10|100000|
|  Mousumi| 45|         15|120000|
|   Shakib| 38|         20|200000|
|Pori Moni| 16|         12| 10000|
+---------+---+-----------+------+



In [12]:
#For knowing the type of the variable of df.

type(df)

pyspark.sql.dataframe.DataFrame

In [13]:
# For showing the top 2 rows with data.

df.head(2)

[Row(Name='Mafiul', Age=27, Experience =4, Salary=50000),
 Row(Name='Tanvir', Age=19, Experience =2, Salary=30000)]

In [14]:
# For showing specific columns from the dataset. Here, it shows the Name, Age, and Experience.

df.select(['Name', 'Age', 'Experience ']).show()

+---------+---+-----------+
|     Name|Age|Experience |
+---------+---+-----------+
|   Mafiul| 27|          4|
|   Tanvir| 19|          2|
|    Prema| 25|          5|
|   Dipjol| 55|         10|
|  Mousumi| 45|         15|
|   Shakib| 38|         20|
|Pori Moni| 16|         12|
+---------+---+-----------+



In [16]:
# To know about specific data. For example, here is an attempt to know the information about the "Name".

df['Name']

Column<'Name'>

In [17]:
# To know the data type of the first column (titles) of the dataset.

df.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience ', 'int'), ('Salary', 'int')]

In [18]:
# To generate summary statistics for the columns

df.describe().show()

+-------+------+------------------+-----------------+-----------------+
|summary|  Name|               Age|      Experience |           Salary|
+-------+------+------------------+-----------------+-----------------+
|  count|     7|                 7|                7|                7|
|   mean|  null|32.142857142857146|9.714285714285714|81428.57142857143|
| stddev|  null|14.334440710268067|6.499084184567487|  64660.284417503|
|    min|Dipjol|                16|                2|            10000|
|    max|Tanvir|                55|               20|           200000|
+-------+------+------------------+-----------------+-----------------+



In [19]:
# To increment all the values in a particular column at the same rate. Here, the experience is extended to 2 years.

df=df.withColumn('Experience After 2 year', df['Experience ']+2)

In [20]:
# To show the whole table

df.show()

+---------+---+-----------+------+-----------------------+
|     Name|Age|Experience |Salary|Experience After 2 year|
+---------+---+-----------+------+-----------------------+
|   Mafiul| 27|          4| 50000|                      6|
|   Tanvir| 19|          2| 30000|                      4|
|    Prema| 25|          5| 60000|                      7|
|   Dipjol| 55|         10|100000|                     12|
|  Mousumi| 45|         15|120000|                     17|
|   Shakib| 38|         20|200000|                     22|
|Pori Moni| 16|         12| 10000|                     14|
+---------+---+-----------+------+-----------------------+



In [21]:
# To decrease anything we can use drop. Here the experience is decreased.

df = df.drop('Experience After 2 year')

In [22]:
df.show()

+---------+---+-----------+------+
|     Name|Age|Experience |Salary|
+---------+---+-----------+------+
|   Mafiul| 27|          4| 50000|
|   Tanvir| 19|          2| 30000|
|    Prema| 25|          5| 60000|
|   Dipjol| 55|         10|100000|
|  Mousumi| 45|         15|120000|
|   Shakib| 38|         20|200000|
|Pori Moni| 16|         12| 10000|
+---------+---+-----------+------+



In [23]:
# To rename any column

df.withColumnRenamed('Name', 'New Name').show()

+---------+---+-----------+------+
| New Name|Age|Experience |Salary|
+---------+---+-----------+------+
|   Mafiul| 27|          4| 50000|
|   Tanvir| 19|          2| 30000|
|    Prema| 25|          5| 60000|
|   Dipjol| 55|         10|100000|
|  Mousumi| 45|         15|120000|
|   Shakib| 38|         20|200000|
|Pori Moni| 16|         12| 10000|
+---------+---+-----------+------+

