## PYSPARK

### IMPORT LIBRARY FROM PYSPARK 

In [107]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

### INIT SESSION SPARK 

In [108]:
# Inisialisasi sesi Spark
spark = SparkSession.builder.appName("appbg").getOrCreate()

In [109]:
spark

### Merge Data

In [110]:
# Baca file CSV pertama (data-user.csv)
df_user = spark.read.csv("file:///_script\_csv\data-user.csv", header=True, inferSchema=True)

# Baca file CSV kedua (plat.csv)
df_plat = spark.read.csv("file:///_script\_csv\data-plates.csv", header=True, inferSchema=True)

##### Cleansing

In [111]:
df_user = df_user.withColumnRenamed("_id", "id")
df_user = df_user.withColumnRenamed("plateNumber", "plate")

df_plat = df_plat.withColumnRenamed("_id", "id")

In [112]:
df_user.show(5)

+--------------------+------------+--------------------+--------------------+--------+-----+--------+-------------------+--------------------+
|                  id|    username|               email|      profilePicture|hourStay| cost|   plate|        phoneNumber|           createdAt|
+--------------------+------------+--------------------+--------------------+--------+-----+--------+-------------------+--------------------+
|6640195b43df89dc0...|Well-placeda|bernita99@hotmail...|https://picsum.ph...|       4|20000|B4W3T9S1|(861) 213-9322 x777|2024-05-12 01:20:...|
|6640195b43df89dc0...|Duskysuperhi|felicita42@yahoo.com|https://picsum.ph...|       7|35000|B8C2X4V5|       794-485-6055|2024-05-12 01:20:...|
|6640195c43df89dc0...|Rapidspatula|macie.kemmer29@ya...|https://picsum.ph...|       2|10000|B7J8Z5T2|1-700-810-0785 x142|2024-05-12 01:20:...|
|6640195c43df89dc0...|Riotousgamet|cordia30@hotmail.com|https://picsum.ph...|       4|20000|B6J1W4T9|     (287) 811-3914|2024-05-12 01:20:...|

In [113]:
df_plat.show(5)

+--------------------+--------+----------+
|                  id|   plate|    source|
+--------------------+--------+----------+
|6646db6ff89c18be7...|B3O23KEZ|motorcycle|
|6646db6ff89c18be7...|B3A4R5U2|motorcycle|
|6646db6ff89c18be7...|B8C2X4V5|motorcycle|
|6646db6ff89c18be7...|B1M3U8R7|motorcycle|
|6646db6ff89c18be7...|B5M2W3C9|motorcycle|
+--------------------+--------+----------+
only showing top 5 rows



In [114]:
df_merge = df_user.join(df_plat, "plate")

In [115]:
# Menampilkan hasil data yang sudah di gabungkan 
df_merge.show(5)

+--------+--------------------+------------+--------------------+--------------------+--------+-----+-------------------+--------------------+--------------------+----------+
|   plate|                  id|    username|               email|      profilePicture|hourStay| cost|        phoneNumber|           createdAt|                  id|    source|
+--------+--------------------+------------+--------------------+--------------------+--------+-----+-------------------+--------------------+--------------------+----------+
|B4W3T9S1|6640195b43df89dc0...|Well-placeda|bernita99@hotmail...|https://picsum.ph...|       4|20000|(861) 213-9322 x777|2024-05-12 01:20:...|6646db6ff89c18be7...|motorcycle|
|B8C2X4V5|6640195b43df89dc0...|Duskysuperhi|felicita42@yahoo.com|https://picsum.ph...|       7|35000|       794-485-6055|2024-05-12 01:20:...|6646db6ff89c18be7...|motorcycle|
|B7J8Z5T2|6640195c43df89dc0...|Rapidspatula|macie.kemmer29@ya...|https://picsum.ph...|       2|10000|1-700-810-0785 x142|2024

### SPARK MACHINE LEARNING

In [116]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

##### Vektor

In [117]:
# Membuat vektor fitur
assembler = VectorAssembler(inputCols=["hourStay"], outputCol="features")
df_merge = assembler.transform(df_merge)

##### Preprocessing

In [118]:
# Bagi data menjadi set pelatihan dan set pengujian
(train_data, test_data) = df_merge.randomSplit([0.8, 0.2], seed=42)

In [119]:
# Inisialisasi model Regresi Linear
lr = LinearRegression(featuresCol="features", labelCol="cost")

In [120]:
# Latih model
lr_model = lr.fit(train_data)

##### Evaluate model

In [121]:
# Prediksi pada set pengujian
predictions = lr_model.transform(test_data)

In [122]:
# Evaluasi model menggunakan Mean Squared Error
evaluator = RegressionEvaluator(labelCol="cost", predictionCol="prediction", metricName="mse")
mse = evaluator.evaluate(predictions)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 5.307742263881927e-22


##### Test result 

In [123]:
# Prediksi pada set pengujian
predictions = lr_model.transform(test_data)

# Menyimpan hasil prediksi ke dalam DataFrame baru
predictions_df = predictions.select("plate", "hourStay", "cost", "prediction")

# Menampilkan hasil prediksi
predictions_df.show()

+--------+--------+-----+------------------+
|   plate|hourStay| cost|        prediction|
+--------+--------+-----+------------------+
|B1H9V5R8|       3|15000| 14999.99999999998|
|B1S5R2M9|       3|15000| 14999.99999999998|
|B1S8U5R4|      10|50000| 50000.00000000004|
|B2R7W9J4|       2|10000|  9999.99999999997|
|B2Z8R5V7|       1| 5000| 4999.999999999963|
|B3K5H8Z7|       6|30000|30000.000000000007|
|B4B7X2J6|      10|50000| 50000.00000000004|
|B4K7N1S8|       7|35000|35000.000000000015|
|B5F2J9X6|       7|35000|35000.000000000015|
|B5F3V2M8|       6|30000|30000.000000000007|
|B5H1W3Z9|       6|30000|30000.000000000007|
|B5M2W3C9|       5|25000|24999.999999999996|
|B5Z3X6J4|       7|35000|35000.000000000015|
|B6F3C8S1|       9|45000| 45000.00000000003|
|B6P2Z4U1|       3|15000| 14999.99999999998|
|B6X2R9C1|       1| 5000| 4999.999999999963|
|B8F3Z9W7|       7|35000|35000.000000000015|
|B9X2N5S1|       7|35000|35000.000000000015|
+--------+--------+-----+------------------+

