Creating a Spark session and reading the CSV file

In [25]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("ReadCSV")
    .getOrCreate()
)

# Load the placement dataset: the first row holds the column names and
# column types are inferred from the data.
# NOTE(review): the path is absolute (looks like a Colab upload location) --
# adjust it when running in another environment.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/college_student_placement_dataset.csv")
)


In [26]:
# Preview the first 20 rows of the freshly loaded data (long values are truncated).
df.show(n=20, truncate=True)

+----------+---+---------------+-----+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|College_ID| IQ|Prev_Sem_Result| CGPA|Academic_Performance|Internship_Experience|Extra_Curricular_Score|Communication_Skills|Projects_Completed|Placement|
+----------+---+---------------+-----+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|   CLG0030|107|           6.61| 6.28|                   8|                   No|                     8|                   8|                 4|       No|
|   CLG0061| 97|           5.52| 5.37|                   8|                   No|                     7|                   8|                 0|       No|
|   CLG0036|109|           5.36| 5.83|                   9|                   No|                     3|                   1|                 1|       No|
|   CLG0055|122|           5.47| 5.75|                   6|           

Count the missing values in each column

In [27]:
from pyspark.sql.functions import col, count, when
# Build one aggregate per column. when() without otherwise() yields NULL for
# rows that do not match, and count() skips NULLs, so each aggregate counts
# only the rows where that column is NULL.
null_count_exprs = [
    count(when(col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in df.columns
]
df.select(null_count_exprs).show()


+----------+---+---------------+----+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|College_ID| IQ|Prev_Sem_Result|CGPA|Academic_Performance|Internship_Experience|Extra_Curricular_Score|Communication_Skills|Projects_Completed|Placement|
+----------+---+---------------+----+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|         0|  0|              0|   0|                   0|                    0|                     0|                   0|                 0|        0|
+----------+---+---------------+----+--------------------+---------------------+----------------------+--------------------+------------------+---------+



Checking and fixing the data types: convert the Yes/No columns to numbers (Yes → 1, No → 0)

In [28]:
from pyspark.sql.functions import when

# Encode the two Yes/No columns as integers: exactly "Yes" becomes 1 and
# every other value becomes 0.
# NOTE: the columns are overwritten in place, so re-running this cell on the
# already-encoded df will not reproduce this result -- reload the data first.
df = (
    df
    .withColumn(
        "Internship_Experience",
        when(col("Internship_Experience") == "Yes", 1).otherwise(0),
    )
    .withColumn(
        "Placement",
        when(col("Placement") == "Yes", 1).otherwise(0),
    )
)
df.show()

+----------+---+---------------+-----+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|College_ID| IQ|Prev_Sem_Result| CGPA|Academic_Performance|Internship_Experience|Extra_Curricular_Score|Communication_Skills|Projects_Completed|Placement|
+----------+---+---------------+-----+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|   CLG0030|107|           6.61| 6.28|                   8|                    0|                     8|                   8|                 4|        0|
|   CLG0061| 97|           5.52| 5.37|                   8|                    0|                     7|                   8|                 0|        0|
|   CLG0036|109|           5.36| 5.83|                   9|                    0|                     3|                   1|                 1|        0|
|   CLG0055|122|           5.47| 5.75|                   6|           

In [None]:
# Number of duplicated rows: total row count minus the count of distinct rows
# (0 means every row is unique).
df.count() - df.distinct().count()


0

Renaming a few columns


In [29]:
# Give two columns more descriptive names.
#
# The previous version of this cell also renamed "Placement_Status" to
# "Placed", but the dataset has no column with that name (the column is called
# "Placement"), and withColumnRenamed silently does nothing when the source
# column is missing. That dead rename is removed; "Placement" keeps its name
# because the later cells refer to it as col("Placement").
df = (
    df
    .withColumnRenamed("Prev_Sem_Result", "Previous_Sem_Percentage")
    .withColumnRenamed("Projects_Completed", "Project_Count")
)
df.show(20)

+----------+---+-----------------------+-----+--------------------+---------------------+----------------------+--------------------+-------------+---------+
|College_ID| IQ|Previous_Sem_Percentage| CGPA|Academic_Performance|Internship_Experience|Extra_Curricular_Score|Communication_Skills|Project_Count|Placement|
+----------+---+-----------------------+-----+--------------------+---------------------+----------------------+--------------------+-------------+---------+
|   CLG0030|107|                   6.61| 6.28|                   8|                    0|                     8|                   8|            4|        0|
|   CLG0061| 97|                   5.52| 5.37|                   8|                    0|                     7|                   8|            0|        0|
|   CLG0036|109|                   5.36| 5.83|                   9|                    0|                     3|                   1|            1|        0|
|   CLG0055|122|                   5.47| 5.75|      

In [31]:
from pyspark.sql.functions import round, expr

# Add "Average_Score": the mean of CGPA, previous-semester result and IQ,
# rounded to two decimals (round here is pyspark.sql.functions.round).
# NOTE(review): IQ values (~100) are on a much larger scale than the two grade
# columns (~5-7 in the preview), so IQ dominates this average -- confirm that
# this is the intended metric.
score_sum = col("CGPA") + col("Previous_Sem_Percentage") + col("IQ")
df = df.withColumn("Average_Score", round(score_sum / 3, 2))
df.show()

+----------+---+-----------------------+-----+--------------------+---------------------+----------------------+--------------------+-------------+---------+-------------+
|College_ID| IQ|Previous_Sem_Percentage| CGPA|Academic_Performance|Internship_Experience|Extra_Curricular_Score|Communication_Skills|Project_Count|Placement|Average_Score|
+----------+---+-----------------------+-----+--------------------+---------------------+----------------------+--------------------+-------------+---------+-------------+
|   CLG0030|107|                   6.61| 6.28|                   8|                    0|                     8|                   8|            4|        0|        39.96|
|   CLG0061| 97|                   5.52| 5.37|                   8|                    0|                     7|                   8|            0|        0|        35.96|
|   CLG0036|109|                   5.36| 5.83|                   9|                    0|                     3|                   1|       

Loading the data into a new CSV file

Converting the processed data and writing it to a new CSV file

In [32]:
from pyspark.sql.functions import when

# Turn the numeric placement flag back into a readable label for the export:
# 1 -> "Placed", anything else -> "Not Placed".
placement_label = when(col("Placement") == 1, "Placed").otherwise("Not Placed")
df = df.withColumn("Placement", placement_label)
df.show()

+----------+---+-----------------------+-----+--------------------+---------------------+----------------------+--------------------+-------------+----------+-------------+
|College_ID| IQ|Previous_Sem_Percentage| CGPA|Academic_Performance|Internship_Experience|Extra_Curricular_Score|Communication_Skills|Project_Count| Placement|Average_Score|
+----------+---+-----------------------+-----+--------------------+---------------------+----------------------+--------------------+-------------+----------+-------------+
|   CLG0030|107|                   6.61| 6.28|                   8|                    0|                     8|                   8|            4|Not Placed|        39.96|
|   CLG0061| 97|                   5.52| 5.37|                   8|                    0|                     7|                   8|            0|Not Placed|        35.96|
|   CLG0036|109|                   5.36| 5.83|                   9|                    0|                     3|                   1|  

In [33]:
import shutil
import os
# Spark writes a *directory* of part files rather than a single file, so the
# data is first coalesced to one partition (one part file) and that file is
# then moved out of the directory under a friendly name.
output_dir = "college_output_single"
final_csv = "FinalCollegeDetails.csv"

df.coalesce(1).write.csv(output_dir, header=True, mode="overwrite")

# Find the part file inside the folder and move it to its final name.
for part_name in os.listdir(output_dir):
    if part_name.startswith("part") and part_name.endswith(".csv"):
        shutil.move(os.path.join(output_dir, part_name), final_csv)
        break
else:
    # Without this, a missing part file was silently ignored and the notebook
    # finished with no (or a stale) FinalCollegeDetails.csv.
    raise FileNotFoundError(
        f"No 'part*.csv' file found in '{output_dir}'; '{final_csv}' was not created."
    )
