# Machine Learning Predictive Maintenance

In [13]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, OneHotEncoder,  MinMaxScaler
from pyspark.ml import Pipeline
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

**Author:** Jean-Sebastien

## Load Data

In [2]:
spark = SparkSession.builder.getOrCreate()

# Every input file lives in the same GCS folder; keep the prefix in one place.
ARCHIVE_DIR = "gs://msca-bdp-student-gcs/Group4_Final_Project/archive"


def read_archive_csv(path):
    """Read one CSV that has a header row, letting Spark infer the column types.

    Args:
        path: Location of the CSV file (here: a ``gs://`` URI).

    Returns:
        A Spark DataFrame with one column per CSV header field.
    """
    return spark.read.csv(path, header=True, inferSchema=True)


# One flights file per year, 2018 through 2022.
datapath_18 = f"{ARCHIVE_DIR}/Combined_Flights_2018.csv"
df_18 = read_archive_csv(datapath_18)
datapath_19 = f"{ARCHIVE_DIR}/Combined_Flights_2019.csv"
df_19 = read_archive_csv(datapath_19)
datapath_20 = f"{ARCHIVE_DIR}/Combined_Flights_2020.csv"
df_20 = read_archive_csv(datapath_20)
datapath_21 = f"{ARCHIVE_DIR}/Combined_Flights_2021.csv"
df_21 = read_archive_csv(datapath_21)
datapath_22 = f"{ARCHIVE_DIR}/Combined_Flights_2022.csv"
df_22 = read_archive_csv(datapath_22)

# Airline lookup table.
datapath_air = f"{ARCHIVE_DIR}/Airlines.csv"
df_airlines = read_archive_csv(datapath_air)

# Stack the yearly frames. `union` matches columns purely by position, so a
# file with a different column order would be silently misaligned;
# `unionByName` matches on column names and raises if a column is missing.
df_all = (
    df_18
    .unionByName(df_19)
    .unionByName(df_20)
    .unionByName(df_21)
    .unionByName(df_22)
)

                                                                                

## PreProcessing

### Regression

In [16]:
# Parameters of the synthetic maintenance label built below (units: flight hours).
flight_hours_threshold_low = 400
flight_hours_threshold_high = 600
maintenance_interval_low = 200
maintenance_interval_high = 300

selected_columns = ['FlightDate','Tail_Number','ActualElapsedTime', 'DepDelayMinutes', 'ArrDelayMinutes', 'AirTime', 'Distance', 'DepDel15', 'TaxiOut', 'TaxiIn', 'CRSArrTime', 'Cancelled']

# Keep only the columns used below and drop every row that has a null in any of them.
df = df_all.select(selected_columns).na.drop()

# Binary delay indicators as integers.
df = df.withColumn('DepDel15', F.col('DepDel15').cast('int'))
df = df.withColumn('LateArrival', (F.col('ArrDelayMinutes') > 15).cast('int'))

# NOTE(review): rows with a null Tail_Number were already removed by na.drop()
# above, so this fill can never fire in its current position.
df = df.withColumn(
    'Tail_Number',
    F.when(F.col('Tail_Number').isNull(), 'Unknown').otherwise(F.col('Tail_Number'))
)

# Numeric index for each tail number.
string_indexer = StringIndexer(inputCol='Tail_Number', outputCol='Tail_Number_Index')
df = string_indexer.fit(df).transform(df)

# ActualElapsedTime is divided by 60 to express it in hours.
df = df.withColumn('FlightHours', F.col('ActualElapsedTime') / 60.0)

# Running total of flight hours per aircraft, ordered by date.
# Fix: the original called the Python builtin `sum` and a bare `when`, neither
# of which is the Spark function (only `F` and `col` are imported), so the cell
# failed on a fresh kernel. Use the Spark SQL functions via `F`.
# With orderBy and no explicit frame, Spark uses a RANGE frame up to the
# current row, so all flights sharing a FlightDate receive the same total.
window_spec = Window.partitionBy('Tail_Number').orderBy('FlightDate')
df = df.withColumn('CumulativeFlightHours', F.sum('FlightHours').over(window_spec))

# Cancelled flights get 0; otherwise the cumulative hours modulo the short
# interval when the departure was delayed, or the long interval when it was not.
df = df.withColumn(
    'AdjustedMaintenanceInterval',
    F.when(F.col('Cancelled') == 1, 0).otherwise(
        F.when(
            F.col('DepDelayMinutes') > 0,
            F.col('CumulativeFlightHours') % maintenance_interval_low
        ).otherwise(
            F.col('CumulativeFlightHours') % maintenance_interval_high
        )
    )
)

# NOTE(review): CumulativeFlightHours already includes the current row's
# FlightHours, so adding FlightHours again counts the current flight twice -
# confirm this is intended.
projected_hours = F.col('CumulativeFlightHours') + F.col('FlightHours')
in_threshold_band = (
    (projected_hours >= flight_hours_threshold_low)
    & (projected_hours <= flight_hours_threshold_high)
)

# Label: 0 inside the threshold band or when the adjusted interval is 0;
# otherwise the adjusted interval, floored at maintenance_interval_low.
df = df.withColumn(
    'HoursBeforeMaintenance',
    F.when(
        in_threshold_band | (F.col('AdjustedMaintenanceInterval') == 0),
        0
    ).otherwise(
        F.when(
            F.col('AdjustedMaintenanceInterval') < maintenance_interval_low,
            maintenance_interval_low
        ).otherwise(F.col('AdjustedMaintenanceInterval'))
    )
)

# Extra engineered features.
df = df.withColumn('TotalTaxiTime', F.col('TaxiOut') + F.col('TaxiIn'))
# NOTE(review): CRSArrTime looks like a scheduled clock time rather than a
# duration; subtracting minute counts from it may not be meaningful - confirm.
df = df.withColumn('ScheduledArrivalDelay', F.col('CRSArrTime') - (F.col('DepDelayMinutes') + F.col('AirTime')))
df = df.withColumn('DistancePerMinute', F.col('Distance') / (F.col('AirTime') + 1))  # Adding 1 to avoid division by zero

df.select('Tail_Number', 'FlightDate', 'ActualElapsedTime', 'Cancelled', 'DepDelayMinutes', 'ArrDelayMinutes', 'HoursBeforeMaintenance').show(10)


[Stage 45:>                                                         (0 + 1) / 1]

+-----------+----------+-----------------+---------+---------------+---------------+----------------------+
|Tail_Number|FlightDate|ActualElapsedTime|Cancelled|DepDelayMinutes|ArrDelayMinutes|HoursBeforeMaintenance|
+-----------+----------+-----------------+---------+---------------+---------------+----------------------+
|      219NV|2018-01-01|            161.0|    false|            0.0|            0.0|                 200.0|
|      219NV|2018-01-01|            181.0|    false|            0.0|            9.0|                 200.0|
|      219NV|2018-01-01|            135.0|    false|            0.0|            0.0|                 200.0|
|      219NV|2018-01-01|            132.0|    false|            0.0|            3.0|                 200.0|
|      219NV|2018-01-02|            124.0|    false|            0.0|            4.0|                 200.0|
|      219NV|2018-01-02|            124.0|    false|            4.0|           10.0|                 200.0|
|      219NV|2018-01-02|    

                                                                                

In [17]:
# Column holding the regression target. It must NOT appear among the features.
label_column = 'HoursBeforeMaintenance'

# Fix: the original listed the label in `numerical_columns` and also appended it
# to the final assembler, so the model could read the answer straight from its
# inputs (target leakage). The label is now excluded from both places.
# NOTE(review): the label is computed from CumulativeFlightHours, FlightHours
# and AdjustedMaintenanceInterval, which remain features - decide whether these
# should be available to the model.
numerical_columns = ['DepDelayMinutes', 'ArrDelayMinutes', 'TotalTaxiTime', 'ScheduledArrivalDelay', 'DistancePerMinute', 'Tail_Number_Index', 'CumulativeFlightHours', 'FlightHours', 'AdjustedMaintenanceInterval']
categorical_columns = ['DepDel15', 'Cancelled']

# Numeric columns -> one vector -> min-max scaled vector.
assembler_numerical = VectorAssembler(inputCols=numerical_columns, outputCol='numerical_features')
scaler = MinMaxScaler(inputCol='numerical_features', outputCol='scaled_numerical_features')

# One encoder per categorical column. The loop variable is deliberately not
# called `col`, to avoid confusion with the imported pyspark `col` function.
encoders = [
    OneHotEncoder(inputCol=cat_name, outputCol=f'{cat_name}_encoded')
    for cat_name in categorical_columns
]

# Final feature vector: encoded categoricals followed by the scaled numerics.
assembler_all = VectorAssembler(
    inputCols=[f'{cat_name}_encoded' for cat_name in categorical_columns] + ['scaled_numerical_features'],
    outputCol='features'
)
pipeline = Pipeline(stages=[assembler_numerical, scaler] + encoders + [assembler_all])

# Cast Cancelled to an integer before it reaches OneHotEncoder.
df = df.withColumn('Cancelled', col('Cancelled').cast('int'))

transformed_df = pipeline.fit(df).transform(df)
transformed_df = transformed_df.select(['features', label_column])
transformed_df.printSchema()
transformed_df.show(5)


                                                                                

root
 |-- features: vector (nullable = true)
 |-- HoursBeforeMaintenance: double (nullable = true)



[Stage 58:>                                                         (0 + 1) / 1]

+--------------------+----------------------+
|            features|HoursBeforeMaintenance|
+--------------------+----------------------+
|[1.0,0.0,0.0,0.08...|                 200.0|
|[1.0,0.0,0.001244...|                 200.0|
|[1.0,0.0,0.0,0.03...|                 200.0|
|[1.0,0.0,4.148230...|                 200.0|
|[1.0,0.0,5.530973...|                 200.0|
+--------------------+----------------------+
only showing top 5 rows



                                                                                

In [None]:


# 80/20 train/test split with a fixed seed.
train_data, test_data = transformed_df.randomSplit([0.8, 0.2], seed=42)

lr = LinearRegression(labelCol='HoursBeforeMaintenance', featuresCol='features', maxIter=10)

# 3 x 3 grid over regularisation strength and elastic-net mixing.
param_grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 0.5]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

# One RMSE evaluator, used both for model selection and for the final score.
evaluator = RegressionEvaluator(labelCol='HoursBeforeMaintenance', metricName='rmse')

# Fix: the original CrossValidator had no seed, so fold assignment (and with it
# the selected hyper-parameters and the reported RMSE) changed from run to run
# even though the train/test split was seeded.
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=param_grid,
                          evaluator=evaluator,
                          numFolds=3,
                          seed=42)

# NOTE: this rebinds `pipeline`, replacing the feature pipeline defined in an
# earlier cell. The name is kept so existing references keep working.
pipeline = Pipeline(stages=[crossval])

model = pipeline.fit(train_data)
predictions = model.transform(test_data)

rmse = evaluator.evaluate(predictions)

print(f"Root Mean Squared Error (RMSE): {rmse}")
