In [1]:
from pyspark.sql import SparkSession
import pandas as pd

## Opening a CSV file in Pandas

In [2]:
# Parse the employee CSV into a pandas DataFrame, then display it as the cell output.
pd_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/employee-dataset/Employee.csv',
)
pd_df

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1
...,...,...,...,...,...,...,...,...,...
4648,Bachelors,2013,Bangalore,3,26,Female,No,4,0
4649,Masters,2013,Pune,2,37,Male,No,2,1
4650,Masters,2018,New Delhi,3,27,Male,No,5,1
4651,Bachelors,2012,Bangalore,3,30,Male,Yes,2,0


In [3]:
# Column dtypes as inferred by pandas while parsing the CSV (no dtypes were passed to read_csv).
pd_df.dtypes

Education                    object
JoiningYear                   int64
City                         object
PaymentTier                   int64
Age                           int64
Gender                       object
EverBenched                  object
ExperienceInCurrentDomain     int64
LeaveOrNot                    int64
dtype: object

## Opening a CSV file in Spark

In [4]:
# Obtain a Spark session running against the 'local[*]' master; getOrCreate()
# returns the already-running session if there is one instead of building a new one.
spark = SparkSession.builder.master('local[*]').appName('PySpark Practice').getOrCreate()

# Display the session object as the cell output.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/24 19:28:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Load the same CSV with Spark: the first line supplies column names and
# column types are inferred from the data rather than left as strings.
df = spark.read.csv(
    '/kaggle/input/employee-dataset/Employee.csv',
    header=True,
    inferSchema=True,
)
df

DataFrame[Education: string, JoiningYear: int, City: string, PaymentTier: int, Age: int, Gender: string, EverBenched: string, ExperienceInCurrentDomain: int, LeaveOrNot: int]

In [6]:
# Print the schema as a tree: one line per column with its type and nullability.
df.printSchema()

root
 |-- Education: string (nullable = true)
 |-- JoiningYear: integer (nullable = true)
 |-- City: string (nullable = true)
 |-- PaymentTier: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- EverBenched: string (nullable = true)
 |-- ExperienceInCurrentDomain: integer (nullable = true)
 |-- LeaveOrNot: integer (nullable = true)



In [7]:
# Print the first 20 rows as a text table (these are show()'s defaults, spelled out).
df.show(n=20, truncate=True)

+---------+-----------+---------+-----------+---+------+-----------+-------------------------+----------+
|Education|JoiningYear|     City|PaymentTier|Age|Gender|EverBenched|ExperienceInCurrentDomain|LeaveOrNot|
+---------+-----------+---------+-----------+---+------+-----------+-------------------------+----------+
|Bachelors|       2017|Bangalore|          3| 34|  Male|         No|                        0|         0|
|Bachelors|       2013|     Pune|          1| 28|Female|         No|                        3|         1|
|Bachelors|       2014|New Delhi|          3| 38|Female|         No|                        2|         0|
|  Masters|       2016|Bangalore|          3| 27|  Male|         No|                        5|         1|
|  Masters|       2017|     Pune|          3| 24|  Male|        Yes|                        2|         1|
|Bachelors|       2016|Bangalore|          3| 22|  Male|         No|                        0|         0|
|Bachelors|       2015|New Delhi|          3| 

In [8]:
# Number of partitions of the RDD underlying the DataFrame.
df.rdd.getNumPartitions()

1

In [9]:
# Keep only employees older than 30. This is a transformation: it returns a new
# DataFrame describing the filter, and displaying it shows the schema, not rows.
df_filtered = df.filter("Age > 30")
df_filtered

DataFrame[Education: string, JoiningYear: int, City: string, PaymentTier: int, Age: int, Gender: string, EverBenched: string, ExperienceInCurrentDomain: int, LeaveOrNot: int]

- #### Spark evaluates lazily: data only shows up when an action such as `show()` or `collect()` is run

In [10]:
# show() is an action, so the filter is actually executed here; prints the first 20 rows.
df_filtered.show(n=20, truncate=True)

+---------+-----------+---------+-----------+---+------+-----------+-------------------------+----------+
|Education|JoiningYear|     City|PaymentTier|Age|Gender|EverBenched|ExperienceInCurrentDomain|LeaveOrNot|
+---------+-----------+---------+-----------+---+------+-----------+-------------------------+----------+
|Bachelors|       2017|Bangalore|          3| 34|  Male|         No|                        0|         0|
|Bachelors|       2014|New Delhi|          3| 38|Female|         No|                        2|         0|
|Bachelors|       2015|New Delhi|          3| 38|  Male|         No|                        0|         0|
|Bachelors|       2016|Bangalore|          3| 34|Female|         No|                        2|         1|
|  Masters|       2017|New Delhi|          2| 37|  Male|         No|                        2|         0|
|Bachelors|       2016|     Pune|          3| 34|  Male|         No|                        3|         0|
|Bachelors|       2018|     Pune|          3| 

In [11]:
# Persist the filtered rows as CSV. Note that Spark creates a *directory* at this
# path containing part files, not a single .csv file.
#   - mode('overwrite'): the default save mode raises if the path already exists,
#     which would make this cell fail on every re-run of the notebook.
#   - header=True: keep the column names so the output can be read back the same
#     way the input was loaded (header=True).
(
    df_filtered
        .write
        .format('csv')
        .mode('overwrite')
        .option('header', True)
        .save('/kaggle/working/df_filtered.csv')
)

In [12]:
# Two equivalent ways to reference a column: attribute access and bracket indexing.
# Both evaluate to Column objects (expressions), not to the column's values.
df.City, df['Age']

(Column<'City'>, Column<'Age'>)

In [13]:
# Project the City and Age columns (referenced via Column objects) and print
# the first rows of the result.
(
    df
    .select(df.City, df['Age'])
    .show()
)

+---------+---+
|     City|Age|
+---------+---+
|Bangalore| 34|
|     Pune| 28|
|New Delhi| 38|
|Bangalore| 27|
|     Pune| 24|
|Bangalore| 22|
|New Delhi| 38|
|Bangalore| 34|
|     Pune| 23|
|New Delhi| 37|
|Bangalore| 27|
|     Pune| 34|
|     Pune| 32|
|Bangalore| 39|
|Bangalore| 37|
|Bangalore| 29|
|Bangalore| 34|
|     Pune| 34|
|     Pune| 30|
|New Delhi| 22|
+---------+---+
only showing top 20 rows

