<a href="https://colab.research.google.com/github/ilya-lykov/google_colab_labs/blob/main/3_lab/Laba_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark



In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import pyspark.sql.functions as F
from pyspark.sql import Window as W
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, MinMaxScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from functools import reduce
from pyspark.ml import Pipeline

import matplotlib.pyplot as plt

In [2]:
spark = (SparkSession
    .builder
    .appName("Test")
    .getOrCreate()
)

In [3]:
acc = spark.read.csv("/content/drive/MyDrive/University/data/2014/DfTRoadSafety_Accidents_2014.csv", header=True, inferSchema=True)
veh = spark.read.csv("/content/drive/MyDrive/University/data/2014/DfTRoadSafety_Vehicles_2014.csv", header=True, inferSchema=True)

In [17]:
df = acc.join(veh, ["Accident_Index"], "inner")

In [13]:
columns = [
    "Age_of_Vehicle",
    "Speed_limit",
    "Engine_Capacity_(CC)",
    "Number_of_Vehicles",
    "Number_of_Casualties",
    "Weather_Conditions",
    "Light_Conditions",
    "Road_Surface_Conditions",
    "Vehicle_Type",
    "Vehicle_Manoeuvre",
    "1st_Point_of_Impact",
    "Sex_of_Driver",
    "Accident_Severity",
    "Age_of_Driver",
    "Age_Band_of_Driver",
    "Urban_or_Rural_Area",
    "Vehicle_Reference",
    "Skidding_and_Overturning",
    "Pedestrian_Crossing-Human_Control",
    "Pedestrian_Crossing-Physical_Facilities",
    "2nd_Road_Class",
    "1st_Road_Class",
    "Road_Type"
    ]



filter_condition = reduce(
    lambda acc, column: acc & ((F.col(column) > 0) & F.col(column).isNotNull()),
    columns[1:],
    (F.col(columns[0]) > 0) & F.col(columns[0]).isNotNull()
)

accidents = df.select(columns).filter(filter_condition)

In [14]:
categorical_columns = [
    "Weather_Conditions",
    "Light_Conditions",
    "Road_Surface_Conditions",
    "Vehicle_Type",
    "Vehicle_Manoeuvre",
    "1st_Point_of_Impact",
    "Sex_of_Driver",
    "Speed_limit",
    "Urban_or_Rural_Area",
    "Engine_Capacity_(CC)",
    "Vehicle_Reference",
    "Pedestrian_Crossing-Human_Control",
    "Pedestrian_Crossing-Physical_Facilities",
    "2nd_Road_Class",
    "1st_Road_Class",
    "Road_Type"
    ]

numercical_columns = [
    "Age_of_Vehicle",
    "Age_Band_of_Driver",
    "Age_of_Driver",
]

encoders = [
    OneHotEncoder(inputCol=col, outputCol=f"{col}_OneHot")
    for col in categorical_columns
]

pipeline = Pipeline(stages = encoders)
accidents_encoded = pipeline.fit(accidents).transform(accidents)

In [15]:
feature_columns = [f"{col}_OneHot" for col in categorical_columns] + numercical_columns

accidents_accembler = VectorAssembler(inputCols=feature_columns, outputCol="features").transform(accidents_encoded)

In [18]:
labels=[
    "Number_of_Vehicles",
    "Number_of_Casualties",
    "Accident_Severity",
    "Skidding_and_Overturning"
]

def getLr(feature,label):
  lr = (LinearRegression(featuresCol=feature, labelCol=label)
  .fit(accidents_accembler))
  print(lr)
  print(f"lr.intercept: {lr.intercept}")
  lr=lr.transform(accidents_accembler)
  evaluator = RegressionEvaluator(labelCol=label, predictionCol="prediction", metricName="r2")
  r2 = evaluator.evaluate(lr)
  print(f"for column: {label}  R2: {r2}")
  lr.select(label, "prediction").show(truncate=False)


for label in labels:
  getLr("features", label)

LinearRegressionModel: uid=LinearRegression_baf931938759, numFeatures=2644
lr.intercept: 3.392893397514974
for column: Number_of_Vehicles  R2: 0.9999999999999921
+------------------+------------------+
|Number_of_Vehicles|prediction        |
+------------------+------------------+
|2                 |1.9999999302221099|
|1                 |1.000000122305173 |
|2                 |2.000000006245931 |
|2                 |1.9999998578108933|
|2                 |1.9999999127082577|
|2                 |2.000000026413287 |
|2                 |2.0000000153208926|
|2                 |1.9999999413709122|
|1                 |1.0000000149403387|
|2                 |2.0000000712078507|
|2                 |1.9999999414845218|
|5                 |4.999999963695359 |
|2                 |1.9999999644450013|
|2                 |1.9999999163455437|
|2                 |1.9999999351484978|
|2                 |2.000000101418289 |
|2                 |1.9999999906905552|
|2                 |2.000000008086849 