In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Attach to the already-running Spark session if there is one; otherwise start one
# with the default configuration.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [None]:
#Load Data
# Each split is read from its matching file. Previously the two paths were swapped,
# so `df_train` held the test file and `df_test` held the training file.
TRAIN_CSV = "/content/Classification_Train.csv"
TEST_CSV = "/content/Classification_Test.csv"

# header=True takes column names from the first row; "inferschema" asks Spark to
# detect column types instead of reading everything as strings.
df_train = spark.read.option("inferschema", "true").csv(TRAIN_CSV, header=True)
df_test = spark.read.option("inferschema", "true").csv(TEST_CSV, header=True)

df_train.show(3)
df_test.show(3)

+----------------+------+------+---------------+---------+-------+-------------+---------+
|            Name|Gender|Height|Education Level|Eye Color|Married|Salary Income|Depressed|
+----------------+------+------+---------------+---------+-------+-------------+---------+
|    Lila Bracher|  Male|   151|            Low|    Black|     No|     74000000|      Yes|
|Archibaldo Bigly|  Male|   162|           High|     Blue|    Yes|     29000000|       No|
|    Dion Stopher|  Male|   155|   Intermediate|    Brown|    Yes|    134000000|       No|
+----------------+------+------+---------------+---------+-------+-------------+---------+
only showing top 3 rows

+--------------+------+------+---------------+---------+-------+-------------+---------+
|          Name|Gender|Height|Education Level|Eye Color|Married|Salary Income|Depressed|
+--------------+------+------+---------------+---------+-------+-------------+---------+
| Sax Tesseyman|Female|   174|   Intermediate|     Blue|    Yes|     85

In [None]:
#Select Features
# Three predictor columns plus the label column ("Depressed"); everything else is dropped.
selected_columns = ["Married", "Education Level", "Salary Income", "Depressed"]

df_train = df_train.select(*selected_columns)
df_test = df_test.select(*selected_columns)

# Preview the first rows of each split, training first.
for frame in (df_train, df_test):
    frame.show(3)

+-------+---------------+-------------+---------+
|Married|Education Level|Salary Income|Depressed|
+-------+---------------+-------------+---------+
|     No|            Low|     74000000|      Yes|
|    Yes|           High|     29000000|       No|
|    Yes|   Intermediate|    134000000|       No|
+-------+---------------+-------------+---------+
only showing top 3 rows

+-------+---------------+-------------+---------+
|Married|Education Level|Salary Income|Depressed|
+-------+---------------+-------------+---------+
|    Yes|   Intermediate|     85000000|       No|
|     No|   Intermediate|     14000000|       No|
|     No|            Low|    148000000|      Yes|
+-------+---------------+-------------+---------+
only showing top 3 rows



In [None]:
# Data Preprocessing
# Discard every row that has a null in any of the selected columns, in both splits.
df_train, df_test = df_train.dropna(), df_test.dropna()

In [None]:
# Transform Data

# Integer code assigned to each text label, per categorical column.
CATEGORY_CODES = {
    "Married": {"No": 0, "Yes": 1},
    "Education Level": {"Low": 0, "Intermediate": 1, "High": 2},
    "Depressed": {"No": 0, "Yes": 1},
}


def encode_categories(frame):
    """Return `frame` with every column listed in CATEGORY_CODES replaced by its integer code.

    Each column is rewritten with a chained `when(...)` expression that has no
    `otherwise` branch, so a label missing from the mapping matches no branch
    (Spark yields null in that case).
    """
    for column_name, codes in CATEGORY_CODES.items():
        encoded = None
        for label, code in codes.items():
            is_label = frame[column_name] == label
            # The first branch starts the chain; later branches are appended to it.
            encoded = when(is_label, code) if encoded is None else encoded.when(is_label, code)
        frame = frame.withColumn(column_name, encoded)
    return frame


df_train = encode_categories(df_train)
df_test = encode_categories(df_test)

df_train.show(3)
df_test.show(3)

+-------+---------------+-------------+---------+
|Married|Education Level|Salary Income|Depressed|
+-------+---------------+-------------+---------+
|      0|              0|     74000000|        1|
|      1|              2|     29000000|        0|
|      1|              1|    134000000|        0|
+-------+---------------+-------------+---------+
only showing top 3 rows

+-------+---------------+-------------+---------+
|Married|Education Level|Salary Income|Depressed|
+-------+---------------+-------------+---------+
|      1|              1|     85000000|        0|
|      0|              1|     14000000|        0|
|      0|              0|    148000000|        1|
+-------+---------------+-------------+---------+
only showing top 3 rows



In [None]:
# Normalization

# Every column except the label is used as a model feature.
cols = [name for name in df_train.columns if name != "Depressed"]

# Pack the feature columns into a single vector column, for both splits.
assembler = VectorAssembler(inputCols=cols, outputCol="vector")
df_train = assembler.transform(df_train)
df_test = assembler.transform(df_test)
df_test.show(5)

# Fit the scaler on the training split only and apply that same fitted scaler to
# both splits, so that no statistics from the held-out rows influence preprocessing.
# (The output column name "standar_scaler" is kept as-is because later cells use it.)
scaler = StandardScaler(inputCol="vector", outputCol="standar_scaler")
scaler_model = scaler.fit(df_train)
df_train = scaler_model.transform(df_train)
df_test = scaler_model.transform(df_test)
df_test.show(5)

+-------+---------------+-------------+---------+----------------+
|Married|Education Level|Salary Income|Depressed|          vector|
+-------+---------------+-------------+---------+----------------+
|      1|              1|     85000000|        0| [1.0,1.0,8.5E7]|
|      0|              1|     14000000|        0| [0.0,1.0,1.4E7]|
|      0|              0|    148000000|        1|[0.0,0.0,1.48E8]|
|      0|              2|     50000000|        1| [0.0,2.0,5.0E7]|
|      1|              1|    101000000|        0|[1.0,1.0,1.01E8]|
+-------+---------------+-------------+---------+----------------+
only showing top 5 rows

+-------+---------------+-------------+---------+----------------+--------------------+
|Married|Education Level|Salary Income|Depressed|          vector|      standar_scaler|
+-------+---------------+-------------+---------+----------------+--------------------+
|      1|              1|     85000000|        0| [1.0,1.0,8.5E7]|[2.00042918101203...|
|      0|           

In [None]:
# Generate Model

# Train on df_train and score df_test. Fitting and predicting on the same DataFrame
# would only measure how well the model memorised the rows it was trained on.
#
# Features are rebuilt here from the raw input columns so the scaler statistics are
# guaranteed to come from the training split alone; any previously derived feature
# columns on either DataFrame are ignored by the select().
label_col = "Depressed"
train_vec = assembler.transform(df_train.select(*cols, label_col))
test_vec = assembler.transform(df_test.select(*cols, label_col))

scaler_fitted = scaler.fit(train_vec)
train_scaled = scaler_fitted.transform(train_vec)
test_scaled = scaler_fitted.transform(test_vec)

model = LogisticRegression(
    featuresCol="standar_scaler", labelCol=label_col, maxIter=10
).fit(train_scaled)

# `prediction` adds rawPrediction / probability / prediction columns to the test rows.
prediction = model.transform(test_scaled)
prediction.select("Depressed", "standar_scaler", "prediction").show(3)

+-------+---------------+-------------+---------+----------------+--------------------+--------------------+--------------------+----------+
|Married|Education Level|Salary Income|Depressed|          vector|      standar_scaler|       rawPrediction|         probability|prediction|
+-------+---------------+-------------+---------+----------------+--------------------+--------------------+--------------------+----------+
|      1|              1|     85000000|        0| [1.0,1.0,8.5E7]|[2.00042918101203...|[1.85289437098755...|[0.86446657727832...|       0.0|
|      0|              1|     14000000|        0| [0.0,1.0,1.4E7]|[0.0,1.2959574236...|[-2.1776527379103...|[0.10177530729006...|       1.0|
|      0|              0|    148000000|        1|[0.0,0.0,1.48E8]|[0.0,0.0,3.572913...|[-0.7377856531176...|[0.32348855011231...|       1.0|
+-------+---------------+-------------+---------+----------------+--------------------+--------------------+--------------------+----------+
only showing 

In [None]:
# Model Testing and Evaluation

# BinaryClassificationEvaluator reports area under the ROC curve by default, not
# accuracy, so the metric is named explicitly and printed under its real name.
evaluator = BinaryClassificationEvaluator(labelCol="Depressed", metricName="areaUnderROC")
print("Area under ROC: {:.4f}".format(evaluator.evaluate(prediction)))

# Accuracy = share of rows whose predicted class equals the true label.
total_rows = prediction.count()
correct_rows = prediction.filter(prediction["Depressed"] == prediction["prediction"]).count()
if total_rows:
    print("Accuracy {}%".format(correct_rows / total_rows * 100))
else:
    # Avoid dividing by zero when there is nothing to score.
    print("Accuracy undefined: no rows to evaluate")