In [1]:
%pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=a79d78bfc3c0a4729b950481c59d6475374b372280ef6e33464650a142caef74
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler , StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [3]:
spark = SparkSession.builder.getOrCreate()

In [41]:
#Import Data from CSV
df_train= spark.read.option("inferSchema", "true").csv("Classification_Train.csv", header=True)
df_test = spark.read.option("inferSchema", "true").csv("Classification_Test.csv", header = True)

df_train.show(10)
df_test.show(10)


+----------------+------+------+---------------+---------+-------+-------------+---------+
|            Name|Gender|Height|Education Level|Eye Color|Married|Salary Income|Depressed|
+----------------+------+------+---------------+---------+-------+-------------+---------+
|   Sax Tesseyman|Female|   174|   Intermediate|     Blue|    Yes|     85000000|       No|
|     Niels Greet|  Male|   165|   Intermediate|    Black|     No|     14000000|       No|
|  Minetta Santry|Female|   160|            Low|    Black|     No|    148000000|      Yes|
|  Sherm Gossipin|Female|   144|           High|    Black|     No|     50000000|      Yes|
|Cathie Blackmuir|  Male|   168|   Intermediate|    Black|    Yes|    101000000|       No|
|  Early Cardenas|  Male|   151|            Low|    Black|    Yes|    145000000|      Yes|
|Willard Pendrick|Female|   141|   Intermediate|    Brown|     No|     55000000|      Yes|
|Penelopa Spensly|Female|   144|   Intermediate|     Blue|    Yes|     51000000|       No|

In [33]:
#Select Important Data
df_train= df_train.select("Gender", "Education Level", "Married","Salary Income", "Depressed")
df_test= df_test.select("Gender", "Education Level", "Married","Salary Income", "Depressed")
print("df_train")
df_train.show(10)
print("df_test")
df_train.show(10)

df_train
+------+---------------+-------+-------------+---------+
|Gender|Education Level|Married|Salary Income|Depressed|
+------+---------------+-------+-------------+---------+
|Female|   Intermediate|    Yes|     85000000|       No|
|  Male|   Intermediate|     No|     14000000|       No|
|Female|            Low|     No|    148000000|      Yes|
|Female|           High|     No|     50000000|      Yes|
|  Male|   Intermediate|    Yes|    101000000|       No|
|  Male|            Low|    Yes|    145000000|      Yes|
|Female|   Intermediate|     No|     55000000|      Yes|
|Female|   Intermediate|    Yes|     51000000|       No|
|Female|           High|     No|     97000000|      Yes|
|  Male|            Low|     No|     41000000|      Yes|
+------+---------------+-------+-------------+---------+
only showing top 10 rows

df_test
+------+---------------+-------+-------------+---------+
|Gender|Education Level|Married|Salary Income|Depressed|
+------+---------------+-------+-------------

In [34]:
#Drop Blank Data
df_train = df_train.na.drop()
df_test = df_train.na.drop()

**Data Cleasing**

In [35]:
#Transform Data
#df_train
df_train = df_train.withColumn("Education Level", when(df_train["Education Level"] == "Low",0)
                                                  .when(df_train["Education Level"] == "Intermediate",1)
                                                  .when(df_train["Education Level"] == "High",2))

df_train = df_train.withColumn("Married", when(df_train["Married"] == "No",0)
                                          .when(df_train["Married"] == "Yes",1))

df_train = df_train.withColumn("Gender", when(df_train["Gender"] == "Male",0)
                                        .when(df_train["Gender"] == "Female",1))

df_train = df_train.withColumn("Depressed", when(df_train["Depressed"] == "No",0)
                                        .when(df_train["Depressed"] == "Yes",1))


#df_test
df_test = df_test.withColumn("Education Level", when(df_test["Education Level"] == "Low",0)
                                                  .when(df_test["Education Level"] == "Intermediate",1)
                                                  .when(df_test["Education Level"] == "High",2))

df_test = df_test.withColumn("Married", when(df_test["Married"] == "No",0)
                                          .when(df_test["Married"] == "Yes",1))

df_test = df_test.withColumn("Gender", when(df_test["Gender"] == "Male",0)
                                        .when(df_test["Gender"] == "Female",1))

df_test = df_test.withColumn("Depressed", when(df_test["Depressed"] == "No",0)
                                        .when(df_test["Depressed"] == "Yes",1))
print("df_train")
df_train.show(5)
print("df_test")
df_test.show(5)

df_train
+------+---------------+-------+-------------+---------+
|Gender|Education Level|Married|Salary Income|Depressed|
+------+---------------+-------+-------------+---------+
|     1|              1|      1|     85000000|        0|
|     0|              1|      0|     14000000|        0|
|     1|              0|      0|    148000000|        1|
|     1|              2|      0|     50000000|        1|
|     0|              1|      1|    101000000|        0|
+------+---------------+-------+-------------+---------+
only showing top 5 rows

df_test
+------+---------------+-------+-------------+---------+
|Gender|Education Level|Married|Salary Income|Depressed|
+------+---------------+-------+-------------+---------+
|     1|              1|      1|     85000000|        0|
|     0|              1|      0|     14000000|        0|
|     1|              0|      0|    148000000|        1|
|     1|              2|      0|     50000000|        1|
|     0|              1|      1|    101000000|

In [36]:
#Normalization

#Normalization - df_train
trainColumns = df_train.columns
trainColumns.remove("Depressed")

df_train = VectorAssembler(inputCols= trainColumns, outputCol ="Features").transform(df_train)
df_train = StandardScaler(inputCol="Features", outputCol ="NormalizedFeature").fit(df_train).transform(df_train)
df_train.show(5)

#Normalization - df_test
trainColumns = df_test.columns
trainColumns.remove("Depressed")

df_test = VectorAssembler(inputCols= trainColumns, outputCol ="Features").transform(df_test)
df_test = StandardScaler(inputCol="Features", outputCol ="NormalizedFeature").fit(df_test).transform(df_test)
df_test.show(5)

+------+---------------+-------+-------------+---------+--------------------+--------------------+
|Gender|Education Level|Married|Salary Income|Depressed|            Features|   NormalizedFeature|
+------+---------------+-------+-------------+---------+--------------------+--------------------+
|     1|              1|      1|     85000000|        0| [1.0,1.0,1.0,8.5E7]|[1.99995775711396...|
|     0|              1|      0|     14000000|        0| [0.0,1.0,0.0,1.4E7]|[0.0,1.2959574236...|
|     1|              0|      0|    148000000|        1|[1.0,0.0,0.0,1.48E8]|[1.99995775711396...|
|     1|              2|      0|     50000000|        1| [1.0,2.0,0.0,5.0E7]|[1.99995775711396...|
|     0|              1|      1|    101000000|        0|[0.0,1.0,1.0,1.01E8]|[0.0,1.2959574236...|
+------+---------------+-------+-------------+---------+--------------------+--------------------+
only showing top 5 rows

+------+---------------+-------+-------------+---------+--------------------+-------

**Model Classification & PredictResult**

In [37]:
#Create Model Classification
model = LogisticRegression(featuresCol="NormalizedFeature", labelCol="Depressed", maxIter =1000).fit(df_train)

predictResult = model.transform(df_test)

In [39]:
predictResult.select("Depressed","prediction").show(10)

+---------+----------+
|Depressed|prediction|
+---------+----------+
|        0|       0.0|
|        0|       1.0|
|        1|       1.0|
|        1|       1.0|
|        0|       0.0|
|        1|       0.0|
|        1|       1.0|
|        0|       0.0|
|        1|       1.0|
|        1|       1.0|
+---------+----------+
only showing top 10 rows



In [40]:
evaluator = BinaryClassificationEvaluator(labelCol ="Depressed")
accuracy = evaluator.evaluate(predictResult)

print(f"Accuracy: {accuracy * 100}%")

Accuracy: 88.0983871355141%
