In [25]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [16]:

from pyspark.sql import SparkSession


In [17]:
# Build the session used by every later cell: local master, application
# name "DatasetLoader"; getOrCreate() hands back the resulting session.
spark = (
    SparkSession.builder
    .master("local")
    .appName("DatasetLoader")
    .getOrCreate()
)


In [18]:
# Load the survey CSV: the first line supplies the column names and Spark
# is asked to infer each column's type from the data.
df = spark.read.csv(
    "Student Mental health.csv",
    header=True,
    inferSchema=True,
)


In [50]:
# Print up to 1000 rows of the loaded DataFrame.
# NOTE(review): the saved output below already shows the short column names
# ("Gender", "Study Year", ...) that are only assigned by the later rename
# cell, so this cell appears to have last been run out of order - confirm
# with Restart & Run All.
df.show(n=1000)


+-------------------+------+----+--------------------+----------+------------+--------------+--------------+-----------+----------------+--------------------+
|          Timestamp|Gender| Age|              Course|Study Year|        CGPA|Marital status|Has Depression|Has Anxiety|Has Panic Attack|Specialist Treatment|
+-------------------+------+----+--------------------+----------+------------+--------------+--------------+-----------+----------------+--------------------+
|     8/7/2020 12:02|Female|  18|         Engineering|    Year 1| 3.00 - 3.49|            No|           Yes|         No|             Yes|                  No|
|     8/7/2020 12:04|  Male|  21|   Islamic education|    Year 2| 3.00 - 3.49|            No|            No|        Yes|              No|                  No|
|     8/7/2020 12:05|  Male|  19|                 BIT|    Year 1| 3.00 - 3.49|            No|           Yes|        Yes|             Yes|                  No|
|     8/7/2020 12:06|Female|  22|             

In [None]:
# Survey question headers (as they appear in the CSV) paired with the short
# column names used by the rest of the notebook.
new_column_names = dict(
    [
        ("Choose your gender", "Gender"),
        ("Your current year of Study", "Study Year"),
        ("Do you have Depression?", "Has Depression"),
        ("Do you have Anxiety?", "Has Anxiety"),
        ("Do you have Panic attack?", "Has Panic Attack"),
        ("Did you seek any specialist for a treatment?", "Specialist Treatment"),
        ("What is your course?", "Course"),
        ("What is your CGPA?", "CGPA"),
    ]
)

# Apply the renames one column at a time, rebinding `df` after each step.
# (Per the PySpark documentation, withColumnRenamed is a no-op when the old
# name is absent, which is what makes re-running this cell harmless.)
for old_name, new_name in new_column_names.items():
    df = df.withColumnRenamed(old_name, new_name)

# Preview the frame with its new headers.
df.show()


In [None]:
# Project the four columns of interest.
# DataFrame.show() only prints and returns None, so the projection is bound
# to `selected_df` first and displayed in a separate statement; chaining
# .show() onto the assignment would leave `selected_df` set to None.
selected_df = df.select("Course", "Study Year", "Has Depression", "Specialist Treatment")
selected_df.show(n=1000)


In [None]:
from pyspark.sql.functions import initcap, col

# Project the four columns of interest. select() always returns a DataFrame,
# so no None check is needed before using the result.
selected_df = df.select("Course", "Study Year", "Has Depression", "Specialist Treatment")

# Normalise the capitalisation of "Study Year" with initcap, replacing the
# column in place (same column name), then display the result.
sentence_df = selected_df.withColumn("Study Year", initcap(col("Study Year")))
sentence_df.show(n=1000)


In [None]:
# Sort by study year. The sorted DataFrame is bound to `ordered_df` before
# it is displayed: show() returns None, so chaining it onto the assignment
# would store None instead of the sorted frame.
ordered_df = sentence_df.orderBy("Study Year")
ordered_df.show(n=1000)


In [53]:
from pyspark.sql.functions import col, when, sum


In [64]:
# Add "Yes_Count": how many of the three condition columns hold "Yes" for
# each row. Each comparison yields a boolean column; casting it to integer
# gives 1/0 so the three can be added together.
# "Has Panic Attack" is spelled exactly like the renamed column (capital
# "A"), so the lookup no longer depends on Spark resolving column names
# case-insensitively.
df_with_count = df.withColumn(
    "Yes_Count",
    (col("Has Depression") == "Yes").cast("integer")
    + (col("Has Anxiety") == "Yes").cast("integer")
    + (col("Has Panic Attack") == "Yes").cast("integer"),
)

In [65]:
# Keep only the rows whose Yes_Count is at least 2.
filtered_df = df_with_count.where(col("Yes_Count") >= 2)


In [66]:
# Print up to 1000 of the rows held in `filtered_df`.
filtered_df.show(n=1000)

+-------------------+------+---+----------------+----------+------------+--------------+--------------+-----------+----------------+--------------------+---------+
|          Timestamp|Gender|Age|          Course|Study Year|        CGPA|Marital status|Has Depression|Has Anxiety|Has Panic Attack|Specialist Treatment|Yes_Count|
+-------------------+------+---+----------------+----------+------------+--------------+--------------+-----------+----------------+--------------------+---------+
|     8/7/2020 12:02|Female| 18|     Engineering|    Year 1| 3.00 - 3.49|            No|           Yes|         No|             Yes|                  No|        2|
|     8/7/2020 12:05|  Male| 19|             BIT|    Year 1| 3.00 - 3.49|            No|           Yes|        Yes|             Yes|                  No|        3|
|     8/7/2020 12:32|Female| 23|Pendidikan islam|    Year 2|3.50 - 4.00 |           Yes|           Yes|         No|             Yes|                  No|        2|
|     8/7/2020 1

In [67]:
# Count the rows in `filtered_df` and report the total.
row_count = filtered_df.count()
print("Number of rows:", row_count)


Number of rows: 28


In [68]:
# Rows per study year within `filtered_df`, listed in study-year order.
study_year_counts = (
    filtered_df
    .groupBy("Study Year")
    .count()
    .orderBy("Study Year")
)
study_year_counts.show()

+----------+-----+
|Study Year|count|
+----------+-----+
|    Year 1|   13|
|    Year 2|    7|
|    Year 3|    7|
|    Year 4|    1|
+----------+-----+



In [69]:
# Rows per study year across the whole of `df`, listed in study-year order.
# Note: this rebinds `study_year_counts`, which previously held the counts
# computed from `filtered_df`.
study_year_counts = (
    df
    .groupBy("Study Year")
    .count()
    .orderBy("Study Year")
)
study_year_counts.show()

+----------+-----+
|Study Year|count|
+----------+-----+
|    Year 1|   43|
|    Year 2|   26|
|    Year 3|   24|
|    Year 4|    8|
+----------+-----+



In [70]:
# Rows whose "Specialist Treatment" value is "Yes".
sought_treatment = col("Specialist Treatment") == "Yes"
treatment_df = df.where(sought_treatment)

treatment_df.show()

+-------------------+------+---+-----------+----------+-----------+--------------+--------------+-----------+----------------+--------------------+
|          Timestamp|Gender|Age|     Course|Study Year|       CGPA|Marital status|Has Depression|Has Anxiety|Has Panic Attack|Specialist Treatment|
+-------------------+------+---+-----------+----------+-----------+--------------+--------------+-----------+----------------+--------------------+
|     8/7/2020 13:58|Female| 24|        BIT|    Year 3|3.50 - 4.00|           Yes|           Yes|        Yes|             Yes|                 Yes|
|     8/7/2020 14:31|  Male| 18|        BCS|    Year 2|3.50 - 4.00|           Yes|           Yes|        Yes|              No|                 Yes|
|     8/7/2020 14:56|Female| 24|Engineering|    Year 2|2.50 - 2.99|           Yes|           Yes|         No|             Yes|                 Yes|
|     8/7/2020 15:27|Female| 23|        ALA|    Year 1|2.50 - 2.99|           Yes|           Yes|         No|   

In [73]:
def _yes_count_by_gender(frame, condition_column):
    """Return per-"Gender" row counts of `frame` where `condition_column` equals "Yes"."""
    return frame.filter(col(condition_column) == "Yes").groupBy("Gender").count()


# One grouped count per condition column.
panic_attack_count = _yes_count_by_gender(df, "Has Panic Attack")
depression_count = _yes_count_by_gender(df, "Has Depression")
anxiety_count = _yes_count_by_gender(df, "Has Anxiety")

In [74]:
# Print each heading followed by its per-gender table, in the order
# panic attack, depression, anxiety.
for heading, counts_by_gender in (
    ("Panic Attack count by gender:", panic_attack_count),
    ("Depression count by gender:", depression_count),
    ("Anxiety count by gender:", anxiety_count),
):
    print(heading)
    counts_by_gender.show()

Panic Attack count by gender:
+------+-----+
|Gender|count|
+------+-----+
|Female|   25|
|  Male|    8|
+------+-----+

Depression count by gender:
+------+-----+
|Gender|count|
+------+-----+
|Female|   29|
|  Male|    6|
+------+-----+

Anxiety count by gender:
+------+-----+
|Gender|count|
+------+-----+
|Female|   24|
|  Male|   10|
+------+-----+

