<a href="https://colab.research.google.com/github/hargagan/EDA-NYC-Taxi-Data-Analysis/blob/main/pyspark/eda_iris_pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from pyspark.sql import SparkSession
import time
spark = SparkSession.builder.appName('FileTypeComparison').getOrCreate()



In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [29]:
csv_df = spark.read.option("header", "true").csv("/content/drive/MyDrive/Assignments/EDA/csv_data_iris.csv")
json_df = spark.read.json("/content/drive/MyDrive/Assignments/EDA/json_data_iris.json")
parquet_df = spark.read.parquet("/content/drive/MyDrive/Assignments/EDA/parquet_data_iris.parquet")

In [5]:
print("CSV Data Schema:")
csv_df.printSchema()
print("JSON Data Schema:")
json_df.printSchema()
print("Parquet Data Schema:")
parquet_df.printSchema()

CSV Data Schema:
root
 |-- sepal length (cm): string (nullable = true)
 |-- sepal width (cm): string (nullable = true)
 |-- petal length (cm): string (nullable = true)
 |-- petal width (cm): string (nullable = true)
 |-- target: string (nullable = true)
 |-- class: string (nullable = true)

JSON Data Schema:
root
 |-- class: string (nullable = true)
 |-- petal length (cm): double (nullable = true)
 |-- petal width (cm): double (nullable = true)
 |-- sepal length (cm): double (nullable = true)
 |-- sepal width (cm): double (nullable = true)
 |-- target: long (nullable = true)

Parquet Data Schema:
root
 |-- sepal length (cm): double (nullable = true)
 |-- sepal width (cm): double (nullable = true)
 |-- petal length (cm): double (nullable = true)
 |-- petal width (cm): double (nullable = true)
 |-- target: long (nullable = true)
 |-- class: string (nullable = true)



In [7]:
print("CSV Data Summary:")
csv_df.describe().show()
print("JSON Data Summary:")
json_df.describe().show()
print("Parquet Data Summary:")
parquet_df.describe().show()

CSV Data Summary:
+-------+------------------+-------------------+------------------+------------------+------------------+---------+
|summary| sepal length (cm)|   sepal width (cm)| petal length (cm)|  petal width (cm)|            target|    class|
+-------+------------------+-------------------+------------------+------------------+------------------+---------+
|  count|               150|                150|               150|               150|               150|      150|
|   mean| 5.843333333333335|  3.057333333333334|3.7580000000000027| 1.199333333333334|               1.0|     NULL|
| stddev|0.8280661279778637|0.43586628493669793|1.7652982332594662|0.7622376689603467|0.8192319205190406|     NULL|
|    min|               4.3|                2.0|               1.0|               0.1|                 0|   setosa|
|    max|               7.9|                4.4|               6.9|               2.5|                 2|virginica|
+-------+------------------+-------------------+------

In [8]:
# print("CSV Data Count:")
csv_count = csv_df.count()
# print("JSON Data Count:")
json_count = json_df.count()
# print("Parquet Data Count:")
parquet_count = parquet_df.count()
print("CSV Data Count:", csv_count)
print("JSON Data Count:", json_count)
print("Parquet Data Count:", parquet_count)

CSV Data Count: 150
JSON Data Count: 150
Parquet Data Count: 150


In [9]:
print("First 5 rows of CSV Data:")
csv_df.show(5)
print("First 5 rows of JSON Data:")
json_df.show(5)
print("First 5 rows of Parquet Data:")
parquet_df.show(5)

First 5 rows of CSV Data:
+-----------------+----------------+-----------------+----------------+------+------+
|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|target| class|
+-----------------+----------------+-----------------+----------------+------+------+
|              5.1|             3.5|              1.4|             0.2|     0|setosa|
|              4.9|             3.0|              1.4|             0.2|     0|setosa|
|              4.7|             3.2|              1.3|             0.2|     0|setosa|
|              4.6|             3.1|              1.5|             0.2|     0|setosa|
|              5.0|             3.6|              1.4|             0.2|     0|setosa|
+-----------------+----------------+-----------------+----------------+------+------+
only showing top 5 rows
First 5 rows of JSON Data:
+------+-----------------+----------------+-----------------+----------------+------+
| class|petal length (cm)|petal width (cm)|sepal length (cm)|se

In [10]:
csv_df.write.parquet("/content/drive/MyDrive/Assignments/EDA/csv_converted_to_parquet.parquet")


In [11]:
converted_df = spark.read.parquet("/content/drive/MyDrive/Assignments/EDA/csv_converted_to_parquet.parquet")

In [12]:
converted_df.printSchema()

root
 |-- sepal length (cm): string (nullable = true)
 |-- sepal width (cm): string (nullable = true)
 |-- petal length (cm): string (nullable = true)
 |-- petal width (cm): string (nullable = true)
 |-- target: string (nullable = true)
 |-- class: string (nullable = true)



In [13]:
converted_df.show(5)

+-----------------+----------------+-----------------+----------------+------+------+
|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|target| class|
+-----------------+----------------+-----------------+----------------+------+------+
|              5.1|             3.5|              1.4|             0.2|     0|setosa|
|              4.9|             3.0|              1.4|             0.2|     0|setosa|
|              4.7|             3.2|              1.3|             0.2|     0|setosa|
|              4.6|             3.1|              1.5|             0.2|     0|setosa|
|              5.0|             3.6|              1.4|             0.2|     0|setosa|
+-----------------+----------------+-----------------+----------------+------+------+
only showing top 5 rows


In [14]:
start = time.time()
spark.read.csv("/content/drive/MyDrive/Assignments/EDA/csv_data_iris.csv")
end = time.time()
print("Time taken to read CSV data:", end - start)

Time taken to read CSV data: 0.32649970054626465


In [15]:
start = time.time()
spark.read.csv("/content/drive/MyDrive/Assignments/EDA/json_data_iris.json")
end = time.time()
print("Time taken to read JSON data:", end - start)

Time taken to read JSON data: 0.5117824077606201


In [16]:
start = time.time()
spark.read.csv("/content/drive/MyDrive/Assignments/EDA/parquet_data_iris.parquet")
end = time.time()
print("Time taken to read Parquet data:", end - start)

Time taken to read Parquet data: 0.7168805599212646


In [17]:
for column in csv_df.columns:
    print(column)

sepal length (cm)
sepal width (cm)
petal length (cm)
petal width (cm)
target
class


In [30]:
iris_csv_spark = csv_df.withColumnRenamed("sepal length (cm)", "sepal_length").withColumnRenamed("sepal width (cm)", "sepal_width").withColumnRenamed("petal length (cm)", "petal_length").withColumnRenamed("petal width (cm)", "petal_width")

In [31]:
iris_csv_spark.show(5)

+------------+-----------+------------+-----------+------+------+
|sepal_length|sepal_width|petal_length|petal_width|target| class|
+------------+-----------+------------+-----------+------+------+
|         5.1|        3.5|         1.4|        0.2|     0|setosa|
|         4.9|        3.0|         1.4|        0.2|     0|setosa|
|         4.7|        3.2|         1.3|        0.2|     0|setosa|
|         4.6|        3.1|         1.5|        0.2|     0|setosa|
|         5.0|        3.6|         1.4|        0.2|     0|setosa|
+------------+-----------+------------+-----------+------+------+
only showing top 5 rows


In [33]:
df = iris_csv_spark.select("sepal_length", "sepal_width", "petal_length", "petal_width", "class")

In [34]:
df.show(5)

+------------+-----------+------------+-----------+------+
|sepal_length|sepal_width|petal_length|petal_width| class|
+------------+-----------+------------+-----------+------+
|         5.1|        3.5|         1.4|        0.2|setosa|
|         4.9|        3.0|         1.4|        0.2|setosa|
|         4.7|        3.2|         1.3|        0.2|setosa|
|         4.6|        3.1|         1.5|        0.2|setosa|
|         5.0|        3.6|         1.4|        0.2|setosa|
+------------+-----------+------------+-----------+------+
only showing top 5 rows


In [35]:
df.createOrReplaceGlobalTempView("iris")

In [37]:
filtered = spark.sql("SELECT * FROM global_temp.iris WHERE petal_length > 1.50")
filtered.show(5)

+------------+-----------+------------+-----------+------+
|sepal_length|sepal_width|petal_length|petal_width| class|
+------------+-----------+------------+-----------+------+
|         5.4|        3.9|         1.7|        0.4|setosa|
|         4.8|        3.4|         1.6|        0.2|setosa|
|         5.7|        3.8|         1.7|        0.3|setosa|
|         5.4|        3.4|         1.7|        0.2|setosa|
|         5.1|        3.3|         1.7|        0.5|setosa|
+------------+-----------+------------+-----------+------+
only showing top 5 rows


In [40]:
avg_sepal_width = spark.sql("""SELECT class, ROUND(AVG(sepal_width), 2) as avg_sepal_width
                     FROM global_temp.iris
                     GROUP BY class
                     ORDER BY class """)
avg_sepal_width.show()

+----------+---------------+
|     class|avg_sepal_width|
+----------+---------------+
|    setosa|           3.43|
|versicolor|           2.77|
| virginica|           2.97|
+----------+---------------+



In [41]:
student_df = spark.read.option("header", "true").csv("/content/drive/MyDrive/Assignments/EDA/global_student_migration.csv")

In [42]:
student_df.show(5)

+----------+--------------+-------------------+----------------+--------------------+--------------------+---------------+------------------+--------------------+-------------------+---------------+----------------+-----------------+-----------------+-------------------+------------+--------------------+--------------------+-------------------------+----------+
|student_id|origin_country|destination_country|destination_city|     university_name|         course_name| field_of_study|year_of_enrollment|scholarship_received|  enrollment_reason|graduation_year|placement_status|placement_country|placement_company|starting_salary_usd|gpa_or_score|         visa_status|post_graduation_visa|language_proficiency_test|test_score|
+----------+--------------+-------------------+----------------+--------------------+--------------------+---------------+------------------+--------------------+-------------------+---------------+----------------+-----------------+-----------------+-------------------+-

In [45]:
student_df.printSchema()

root
 |-- student_id: string (nullable = true)
 |-- origin_country: string (nullable = true)
 |-- destination_country: string (nullable = true)
 |-- destination_city: string (nullable = true)
 |-- university_name: string (nullable = true)
 |-- course_name: string (nullable = true)
 |-- field_of_study: string (nullable = true)
 |-- year_of_enrollment: string (nullable = true)
 |-- scholarship_received: string (nullable = true)
 |-- enrollment_reason: string (nullable = true)
 |-- graduation_year: string (nullable = true)
 |-- placement_status: string (nullable = true)
 |-- placement_country: string (nullable = true)
 |-- placement_company: string (nullable = true)
 |-- starting_salary_usd: string (nullable = true)
 |-- gpa_or_score: string (nullable = true)
 |-- visa_status: string (nullable = true)
 |-- post_graduation_visa: string (nullable = true)
 |-- language_proficiency_test: string (nullable = true)
 |-- test_score: string (nullable = true)



In [49]:
from pyspark.sql.functions import col

filter_student_with_scholarship = student_df.filter(col("scholarship_received") == "Yes")
filter_student_with_scholarship.cache()

filter_student_with_scholarship.show(5)

+----------+--------------+-------------------+----------------+--------------------+--------------------+---------------+------------------+--------------------+-------------------+---------------+----------------+-----------------+-----------------+-------------------+------------+--------------------+--------------------+-------------------------+----------+
|student_id|origin_country|destination_country|destination_city|     university_name|         course_name| field_of_study|year_of_enrollment|scholarship_received|  enrollment_reason|graduation_year|placement_status|placement_country|placement_company|starting_salary_usd|gpa_or_score|         visa_status|post_graduation_visa|language_proficiency_test|test_score|
+----------+--------------+-------------------+----------------+--------------------+--------------------+---------------+------------------+--------------------+-------------------+---------------+----------------+-----------------+-----------------+-------------------+-

In [52]:
filter_student_with_scholarship.count()

2577

In [59]:
# Average GPA of students placed in United states
selected_df = student_df.select("student_id", "placement_country", "gpa_or_score")
avg_gpa = selected_df.groupby(col("placement_country")).agg({"gpa_or_score": "avg"})
avg_gpa.show()
avg_gpa.filter(col("placement_country") == "USA").show()

+-----------------+------------------+
|placement_country| avg(gpa_or_score)|
+-----------------+------------------+
|           Russia|  3.25509505703422|
|          Germany|3.2152244897959177|
|          Finland|3.2236440677966116|
|            India| 3.223171641791047|
|          Ireland|3.2235655737704914|
|              USA|3.2619246861924682|
|              N/A| 3.256543556804493|
|               UK|3.2173529411764683|
|              UAE|3.2739312977099253|
|           Canada|  3.27264822134387|
|     South Africa|3.2009691629955954|
+-----------------+------------------+

+-----------------+------------------+
|placement_country| avg(gpa_or_score)|
+-----------------+------------------+
|              USA|3.2619246861924682|
+-----------------+------------------+



In [79]:
# Write a function that returns the top 5 destination countries for a given year of enrolment.
def top5_counteries_for_enrollment_year(df, year):
  df = df.select("year_of_enrollment", "destination_country")
  df = df.filter(col("year_of_enrollment") == year)
  agg_df = df.groupby(col("destination_country")).agg({"destination_country": "count"})

  df = agg_df.sort(col("count(destination_country)").desc())
  df = df.limit(5)
  return df

top5_counteries_for_enrollment_year(student_df, 2023).show()
top5_counteries_for_enrollment_year(student_df, 2020).show()

+-------------------+--------------------------+
|destination_country|count(destination_country)|
+-------------------+--------------------------+
|                UAE|                       118|
|             Russia|                       113|
|                 UK|                       104|
|             Canada|                       103|
|            Germany|                       102|
+-------------------+--------------------------+

+-------------------+--------------------------+
|destination_country|count(destination_country)|
+-------------------+--------------------------+
|            Ireland|                       107|
|                 UK|                       107|
|            Germany|                       105|
|            Finland|                       101|
|                USA|                       100|
+-------------------+--------------------------+



In [98]:
# Write a function that calculates the average starting salary for students who studied at
# universities with ‘Technology’ or ‘Engineering’ in their name and got placed.

def average_starting_salary(df):
  selected_df = df.select("student_id", "university_name", "starting_salary_usd", "placement_status")
  filtered_df = df.filter((col("placement_status") == "Placed") & \
          (col('university_name').contains('Technology') | col('university_name').contains('Engineering')))

  avg_salary_df = filtered_df.groupBy(col("university_name")).agg({"starting_salary_usd": "avg"})
  return avg_salary_df

average_starting_salary(student_df).show()

# student_df.printSchema()
# average_starting_salary(student_df)

+--------------------+------------------------+
|     university_name|avg(starting_salary_usd)|
+--------------------+------------------------+
|Moscow Institute ...|       93310.02564102564|
+--------------------+------------------------+



In [106]:
# Write a function that returns the average TOEFL score by origin country for students
# who were placed in a different country and took the TOEFL test.

def student_tofel_score(df):
  selected_df = df.select("placement_country", "origin_country", "language_proficiency_test", "test_score")
  filtered_df = df.filter((col("placement_country") != col("origin_country")) & (col("language_proficiency_test") == "TOEFL"))
  filtered_df = filtered_df.groupby(col("origin_country")).agg({"test_score": "avg"})
  return filtered_df

# student_df.printSchema()
# student_df.show(5)
student_tofel_score(student_df).show()

+--------------+------------------+
|origin_country|   avg(test_score)|
+--------------+------------------+
|        Russia| 7.008602150537632|
|       Germany| 7.038679245283016|
|       Finland| 7.218367346938778|
|         India| 7.123404255319148|
|       Ireland|             6.892|
|           USA| 6.984615384615381|
|            UK|7.0120370370370395|
|           UAE| 6.961176470588237|
|        Canada| 7.106382978723405|
|  South Africa|  6.64607843137255|
+--------------+------------------+

