In [1]:
## Add PySpark to your environment
# Uncomment to install. `%pip` (rather than `!pip`) installs into the running kernel's environment.
# %pip install pyspark

In [3]:
## Import SparkSession, the entry point used to start a Spark session
from pyspark.sql import SparkSession

In [4]:
# Create the Spark session for this notebook (or reuse one that is already
# running), registered under the application name 'Practice'.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

24/07/10 19:33:50 WARN Utils: Your hostname, Sarthaks-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.29.106 instead (on interface en0)
24/07/10 19:33:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/10 19:33:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Read the CSV with default options. No header handling is requested, so the
# columns are auto-named _c0, _c1, ... and every column is typed as string.
df_pyspark = spark.read.csv('plant_growth_data.csv')

In [6]:
# Evaluating the DataFrame only displays its column names and types, not any rows.
df_pyspark

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string]

In [11]:
# Read the same file again, this time treating the first line as column names.
df_pyspark_new = spark.read.csv('plant_growth_data.csv', header=True)

In [14]:
# Preview the dataset: show() prints only the first 20 rows by default, not the whole dataset.
# NOTE: this is the frame read WITHOUT the header option, so the header line
# (Soil_Type, Sunlight_Hours, ...) appears as the first data row under _c0.._c6.
df_pyspark.show()

+---------+------------------+---------------+---------------+------------------+------------------+----------------+
|      _c0|               _c1|            _c2|            _c3|               _c4|               _c5|             _c6|
+---------+------------------+---------------+---------------+------------------+------------------+----------------+
|Soil_Type|    Sunlight_Hours|Water_Frequency|Fertilizer_Type|       Temperature|          Humidity|Growth_Milestone|
|     loam| 5.192294089205035|      bi-weekly|       chemical|31.719602410244118| 61.59186060848997|               0|
|    sandy| 4.033132702741614|         weekly|        organic| 28.91948412187396| 52.42227609891599|               1|
|     loam| 8.892768570729004|      bi-weekly|           none|23.179058888285397| 44.66053858490323|               0|
|     loam| 8.241144063085702|      bi-weekly|           none|18.465886401416917|  46.4332272684958|               0|
|    sandy| 8.374043008245923|      bi-weekly|        or

In [15]:
# Confirm what kind of object the CSV reader returned.
type(df_pyspark_new)

pyspark.sql.dataframe.DataFrame

In [18]:
# head(3) returns the first three rows as a list of Row objects.
# All values are still strings here because no schema inference was requested.
df_pyspark_new.head(3)

[Row(Soil_Type='loam', Sunlight_Hours='5.192294089205035', Water_Frequency='bi-weekly', Fertilizer_Type='chemical', Temperature='31.719602410244118', Humidity='61.59186060848997', Growth_Milestone='0'),
 Row(Soil_Type='sandy', Sunlight_Hours='4.033132702741614', Water_Frequency='weekly', Fertilizer_Type='organic', Temperature='28.91948412187396', Humidity='52.42227609891599', Growth_Milestone='1'),
 Row(Soil_Type='loam', Sunlight_Hours='8.892768570729004', Water_Frequency='bi-weekly', Fertilizer_Type='none', Temperature='23.179058888285397', Humidity='44.66053858490323', Growth_Milestone='0')]

In [19]:
# Print the column names and types; with header only, every column is a string.
df_pyspark_new.printSchema()

root
 |-- Soil_Type: string (nullable = true)
 |-- Sunlight_Hours: string (nullable = true)
 |-- Water_Frequency: string (nullable = true)
 |-- Fertilizer_Type: string (nullable = true)
 |-- Temperature: string (nullable = true)
 |-- Humidity: string (nullable = true)
 |-- Growth_Milestone: string (nullable = true)



# Topics covered
1. PySpark DataFrame
2. Reading a dataset
3. Checking data types (schema)
4. Selecting columns and indexing
5. The `describe` option, similar to pandas
6. Adding columns
7. Dropping columns
8. Renaming columns

In [20]:
# SparkSession was already imported earlier in this notebook; this re-import is redundant but harmless.
from pyspark.sql import SparkSession

# A session is already running, so getOrCreate() hands back that existing session
# (Spark logs "Using an existing Spark session") instead of starting a new one.
spark = SparkSession.builder.appName('Practice_DataFrame').getOrCreate()

24/07/10 19:42:16 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [26]:
# Reading the dataset: use the first line as the header and infer column types

df_pyspark = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('plant_growth_data.csv')
)

In [27]:
## Check the schema: with inferSchema=True the numeric columns are typed
## as double/integer instead of string

df_pyspark.printSchema()

root
 |-- Soil_Type: string (nullable = true)
 |-- Sunlight_Hours: double (nullable = true)
 |-- Water_Frequency: string (nullable = true)
 |-- Fertilizer_Type: string (nullable = true)
 |-- Temperature: double (nullable = true)
 |-- Humidity: double (nullable = true)
 |-- Growth_Milestone: integer (nullable = true)



In [28]:
# Equivalent read with both options passed as keyword arguments to csv().
df_pyspark = spark.read.csv(
    'plant_growth_data.csv',
    header=True,
    inferSchema=True,
)

# Print the first rows of the typed DataFrame (show() defaults to 20 rows).
df_pyspark.show()

+---------+------------------+---------------+---------------+------------------+------------------+----------------+
|Soil_Type|    Sunlight_Hours|Water_Frequency|Fertilizer_Type|       Temperature|          Humidity|Growth_Milestone|
+---------+------------------+---------------+---------------+------------------+------------------+----------------+
|     loam| 5.192294089205035|      bi-weekly|       chemical|31.719602410244118| 61.59186060848997|               0|
|    sandy| 4.033132702741614|         weekly|        organic| 28.91948412187396| 52.42227609891599|               1|
|     loam| 8.892768570729004|      bi-weekly|           none|23.179058888285397| 44.66053858490323|               0|
|     loam| 8.241144063085702|      bi-weekly|           none|18.465886401416917|  46.4332272684958|               0|
|    sandy| 8.374043008245923|      bi-weekly|        organic| 18.12874085342172| 63.62592280385192|               0|
|    sandy| 8.627622080115675|      bi-weekly|          

In [29]:
# The schema is the same as with the option()-style read: header names plus inferred types.
df_pyspark.printSchema()

root
 |-- Soil_Type: string (nullable = true)
 |-- Sunlight_Hours: double (nullable = true)
 |-- Water_Frequency: string (nullable = true)
 |-- Fertilizer_Type: string (nullable = true)
 |-- Temperature: double (nullable = true)
 |-- Humidity: double (nullable = true)
 |-- Growth_Milestone: integer (nullable = true)



In [30]:
# The result is a PySpark DataFrame, not a pandas one.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [32]:
# First three rows as Row objects; numeric fields are now Python floats/ints rather than strings.
df_pyspark.head(3)

[Row(Soil_Type='loam', Sunlight_Hours=5.192294089205035, Water_Frequency='bi-weekly', Fertilizer_Type='chemical', Temperature=31.719602410244118, Humidity=61.59186060848997, Growth_Milestone=0),
 Row(Soil_Type='sandy', Sunlight_Hours=4.033132702741614, Water_Frequency='weekly', Fertilizer_Type='organic', Temperature=28.91948412187396, Humidity=52.42227609891599, Growth_Milestone=1),
 Row(Soil_Type='loam', Sunlight_Hours=8.892768570729004, Water_Frequency='bi-weekly', Fertilizer_Type='none', Temperature=23.179058888285397, Humidity=44.66053858490323, Growth_Milestone=0)]

In [37]:
# Print only the first five rows as a formatted table.
df_pyspark.show(5)

+---------+-----------------+---------------+---------------+------------------+-----------------+----------------+
|Soil_Type|   Sunlight_Hours|Water_Frequency|Fertilizer_Type|       Temperature|         Humidity|Growth_Milestone|
+---------+-----------------+---------------+---------------+------------------+-----------------+----------------+
|     loam|5.192294089205035|      bi-weekly|       chemical|31.719602410244118|61.59186060848997|               0|
|    sandy|4.033132702741614|         weekly|        organic| 28.91948412187396|52.42227609891599|               1|
|     loam|8.892768570729004|      bi-weekly|           none|23.179058888285397|44.66053858490323|               0|
|     loam|8.241144063085702|      bi-weekly|           none|18.465886401416917| 46.4332272684958|               0|
|    sandy|8.374043008245923|      bi-weekly|        organic| 18.12874085342172|63.62592280385192|               0|
+---------+-----------------+---------------+---------------+-----------

In [40]:
# Selecting one column: select() yields a DataFrame with just that column,
# so show() can be called on the result (here limited to 2 rows).
df_pyspark.select('Soil_Type').show(2)

+---------+
|Soil_Type|
+---------+
|     loam|
|    sandy|
+---------+
only showing top 2 rows



In [41]:
# Selecting two columns: the names are passed as separate arguments
df_pyspark.select('Soil_Type', 'Sunlight_Hours').show(2)

+---------+-----------------+
|Soil_Type|   Sunlight_Hours|
+---------+-----------------+
|     loam|5.192294089205035|
|    sandy|4.033132702741614|
+---------+-----------------+
only showing top 2 rows



In [43]:
# Indexing by column name does not display any data: it returns a Column
# object (see the output below). Use select(...).show() to view the values.
df_pyspark['Soil_Type']

Column<'Soil_Type'>

In [44]:
# dtypes lists (column name, type name) pairs for every column.
df_pyspark.dtypes

[('Soil_Type', 'string'),
 ('Sunlight_Hours', 'double'),
 ('Water_Frequency', 'string'),
 ('Fertilizer_Type', 'string'),
 ('Temperature', 'double'),
 ('Humidity', 'double'),
 ('Growth_Milestone', 'int')]

In [46]:
# Summary statistics per column (count, mean, stddev, min, max).
# mean/stddev come back as NULL for the string columns.
df_pyspark.describe().show()

24/07/10 19:53:25 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+---------+------------------+---------------+---------------+------------------+------------------+-------------------+
|summary|Soil_Type|    Sunlight_Hours|Water_Frequency|Fertilizer_Type|       Temperature|          Humidity|   Growth_Milestone|
+-------+---------+------------------+---------------+---------------+------------------+------------------+-------------------+
|  count|      193|               193|            193|            193|               193|               193|                193|
|   mean|     NULL|6.8264843391026435|           NULL|           NULL| 25.07608654204958| 58.09892681973317|0.49740932642487046|
| stddev|     NULL|1.5995085200445005|           NULL|           NULL|5.3541699082740335| 12.63179883787259| 0.5012936632230302|
|    min|     clay| 4.033132702741614|      bi-weekly|       chemical|              15.2|30.567682238370953|                  0|
|    max|    sandy| 9.913902724663604|         weekly|        organic| 34.81010284001347| 79.6482

In [47]:
## Adding a new column to the dataframe: sunlight expressed in minutes (hours * 60)
df_pyspark = df_pyspark.withColumn(
    'Sunlight_Minutes',
    df_pyspark.Sunlight_Hours * 60,
)

In [48]:
# Verify that Sunlight_Minutes now appears as the last column.
df_pyspark.show()

+---------+------------------+---------------+---------------+------------------+------------------+----------------+------------------+
|Soil_Type|    Sunlight_Hours|Water_Frequency|Fertilizer_Type|       Temperature|          Humidity|Growth_Milestone|  Sunlight_Minutes|
+---------+------------------+---------------+---------------+------------------+------------------+----------------+------------------+
|     loam| 5.192294089205035|      bi-weekly|       chemical|31.719602410244118| 61.59186060848997|               0| 311.5376453523021|
|    sandy| 4.033132702741614|         weekly|        organic| 28.91948412187396| 52.42227609891599|               1|241.98796216449688|
|     loam| 8.892768570729004|      bi-weekly|           none|23.179058888285397| 44.66053858490323|               0| 533.5661142437402|
|     loam| 8.241144063085702|      bi-weekly|           none|18.465886401416917|  46.4332272684958|               0|494.46864378514215|
|    sandy| 8.374043008245923|      bi-we

In [49]:
## Drop the column added above; the result of drop() is assigned back to df_pyspark
df_pyspark = df_pyspark.drop('Sunlight_Minutes')

In [50]:
# Verify that Sunlight_Minutes is gone (first five rows).
df_pyspark.show(5)

+---------+-----------------+---------------+---------------+------------------+-----------------+----------------+
|Soil_Type|   Sunlight_Hours|Water_Frequency|Fertilizer_Type|       Temperature|         Humidity|Growth_Milestone|
+---------+-----------------+---------------+---------------+------------------+-----------------+----------------+
|     loam|5.192294089205035|      bi-weekly|       chemical|31.719602410244118|61.59186060848997|               0|
|    sandy|4.033132702741614|         weekly|        organic| 28.91948412187396|52.42227609891599|               1|
|     loam|8.892768570729004|      bi-weekly|           none|23.179058888285397|44.66053858490323|               0|
|     loam|8.241144063085702|      bi-weekly|           none|18.465886401416917| 46.4332272684958|               0|
|    sandy|8.374043008245923|      bi-weekly|        organic| 18.12874085342172|63.62592280385192|               0|
+---------+-----------------+---------------+---------------+-----------

In [52]:
## Rename the column: only the letter case changes ('Growth_Milestone' -> 'Growth_milestone')
df_pyspark = df_pyspark.withColumnRenamed('Growth_Milestone', 'Growth_milestone')

In [53]:
# Display the column names and types to confirm the rename.
df_pyspark

DataFrame[Soil_Type: string, Sunlight_Hours: double, Water_Frequency: string, Fertilizer_Type: string, Temperature: double, Humidity: double, Growth_milestone: int]