In [None]:
# Download required Libraries 
!pip install pandas numpy seaborn matplotlib scipy scikit-learn openpyxl

In [2]:
# Importing Libraries
import pandas as pd 
import numpy as np 
import seaborn as sns   
import matplotlib.pyplot as plt 

In [132]:
# Load the three raw inputs: sensor readings and maintenance logs from CSV,
# machine metadata from an Excel workbook.
df_sensor = pd.read_csv('sensor_readings.csv')
df_maintenance = pd.read_csv('maintenance_logs.csv')
df_machine_excel = pd.read_excel('machine_metadata.xlsx')

In [None]:
# Preview the first rows of the sensor readings
df_sensor.head()

In [None]:
# Preview the first rows of the maintenance logs
df_maintenance.head()

In [None]:
# Preview the first rows of the machine metadata.
# The frame is loaded as df_machine_excel, and head must be called to show rows.
df_machine_excel.head()

In [None]:
# Inspect the column dtypes of the sensor dataset before any conversion
df_sensor.dtypes

In [223]:
# Parse Timestamp into datetime64 and derive a calendar-date column from it.
# .dt.date yields Python date objects, so 'Date' ends up with object dtype
# rather than datetime64.
df_sensor['Timestamp'] = pd.to_datetime(df_sensor['Timestamp'])
df_sensor['Date'] = df_sensor['Timestamp'].dt.date

In [None]:
# Re-check the sensor dataset dtypes after parsing Timestamp and adding Date
df_sensor.dtypes

In [None]:
# Group by date and calculate daily averages of the sensor channels.
# df_sensor is used here because cleaned_sensor_readings is only created further
# down (after outlier removal); referencing it at this point raises NameError on
# a fresh top-to-bottom run. These averages therefore still include outliers.
daily_avg = df_sensor.groupby('Date').agg({
    'Temperature (°C)': 'mean',
    'Vibration (mm/s)': 'mean',
    'Pressure (psi)': 'mean',
    'RuntimeHours': 'mean'
}).reset_index()

print(daily_avg)

In [None]:
# Count missing values per column in the sensor dataset
df_sensor.isna().sum()

In [None]:
# Fill missing vibration readings with the column median.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the in-place form is chained assignment and does not update df_sensor
# when pandas Copy-on-Write is active.
df_sensor['Vibration (mm/s)'] = df_sensor['Vibration (mm/s)'].fillna(
    df_sensor['Vibration (mm/s)'].median()
)

In [37]:
# Detecting outliers using IQR 
def detect_outliers(series):
    """Return the entries of ``series`` that fall outside the 1.5 * IQR fences.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, where Q1 and Q3 are the
    0.25 and 0.75 quantiles of ``series``. The result keeps the original index
    labels of the selected entries.
    """
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    spread = third_quartile - first_quartile
    lower_fence = first_quartile - 1.5 * spread
    upper_fence = third_quartile + 1.5 * spread
    beyond_fences = series.gt(upper_fence) | series.lt(lower_fence)
    return series[beyond_fences]

In [38]:
# Collect the temperature readings that fall outside the IQR fences
temp_outlier = detect_outliers(df_sensor['Temperature (°C)'])

In [None]:
# Report how many temperature readings were flagged as outliers
print({"Outlier_In_Temperature": len(temp_outlier)})

In [40]:
# Locate (not yet drop) the outlier rows: this helper only returns their index labels
def remove_outliers(series):
    """Return the index labels of entries outside the 1.5 * IQR fences.

    Despite the name, nothing is removed here; the caller is expected to pass
    the returned labels to ``DataFrame.drop``. The fences are computed the same
    way as in ``detect_outliers``: Q1 - 1.5 * IQR and Q3 + 1.5 * IQR.
    """
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    spread = third_quartile - first_quartile
    lower_fence = first_quartile - 1.5 * spread
    upper_fence = third_quartile + 1.5 * spread
    beyond_fences = series.gt(upper_fence) | series.lt(lower_fence)
    return series[beyond_fences].index

In [41]:
# Index labels of the temperature outliers (nothing is dropped yet)
outlier_removed = remove_outliers(df_sensor['Temperature (°C)'])

In [56]:
# Drop the flagged rows by index label; drop() returns a new frame, so
# df_sensor itself is left unchanged
cleaned_sensor_readings = df_sensor.drop(outlier_removed)

In [59]:
# Save the cleaned sensor readings to CSV (index column omitted)
cleaned_sensor_readings.to_csv('cleaned_sensor_readings.csv',index=False)

In [None]:
# Inspect the column dtypes of the maintenance dataset before date parsing
df_maintenance.dtypes

In [98]:
# Coverting dtype of date in maintenance dataset

# Define the function to try multiple date formats
def try_parse_dates(date_str):
    """Parse one date value by trying several explicit formats in order.

    The formats tried are day-month-year with dashes, year-month-day with
    dashes, and year/month/day with slashes. The first format that produces a
    non-missing result wins; if none does, ``pd.NaT`` is returned.
    """
    for candidate_format in ('%d-%m-%Y', '%Y-%m-%d', '%Y/%m/%d'):
        try:
            # errors='coerce' turns a non-matching value into NaT instead of raising
            result = pd.to_datetime(date_str, format=candidate_format, errors='coerce')
            if pd.notna(result):
                return result
        except ValueError:
            continue
    return pd.NaT

# Parse every entry with the multi-format parser, then pass the result through
# pd.to_datetime once more so the column is stored as datetime64[ns]
df_maintenance['Date'] = pd.to_datetime(
    df_maintenance['Date'].apply(try_parse_dates),
    errors='coerce',
)

In [None]:
# Re-check the maintenance dataset dtypes after parsing the Date column
df_maintenance.dtypes

In [None]:
# Count missing values per column in the maintenance dataset
df_maintenance.isna().sum()

In [None]:
# Fill missing RepairType entries with the most frequent value (the first mode).
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the in-place form is chained assignment and does not update
# df_maintenance when pandas Copy-on-Write is active.
df_maintenance['RepairType'] = df_maintenance['RepairType'].fillna(
    df_maintenance['RepairType'].mode()[0]
)

In [112]:
# Save the cleaned maintenance logs to CSV (index column omitted) and keep them
# in memory under the name that the later SQL-export and failure-labelling cells
# reference (cleaned_maintenance_logs).
df_maintenance.to_csv('cleaned_maintenance_logs.csv',index=False)
cleaned_maintenance_logs = df_maintenance.copy()

In [None]:
# Inspect the column dtypes of the machine metadata
df_machine_excel.dtypes

In [None]:
# Count missing values per column in the machine metadata
df_machine_excel.isna().sum()

In [16]:
# Getting days_since_last_overhaul feature using python

# Attach the machine metadata to every sensor reading (left join keeps all readings)
df_merge = cleaned_sensor_readings.merge(df_machine_excel, on="MachineID", how="left")

# 'Date' was derived with .dt.date and therefore holds Python date objects, which
# cannot be subtracted from a datetime64 column. Convert both operands to
# datetime64 on the merged frame before taking the difference.
df_merge["Date"] = pd.to_datetime(df_merge["Date"], errors="coerce")
df_merge["LastOverhaulDate"] = pd.to_datetime(df_merge["LastOverhaulDate"])

# Whole days elapsed between the last overhaul and the reading date
df_merge["DaysSinceOverhaul"] = (df_merge["Date"] - df_merge["LastOverhaulDate"]).dt.days


In [151]:
# Convert Date on cleaned_sensor_readings from object to datetime64
# (unparseable values become NaT).
# This reassigns the column on cleaned_sensor_readings only; a df_merge that was
# already built from it via merge() is a separate frame and is not updated.
cleaned_sensor_readings['Date'] = pd.to_datetime(cleaned_sensor_readings['Date'],errors='coerce')

In [None]:
# Preview the merged frame to confirm the DaysSinceOverhaul column exists
df_merge.head()

In [55]:
# importing sqlite3 and making connection for database
import sqlite3, csv 
# Open the SQLite database file (sqlite3 creates it if it does not exist yet)
# and get a cursor on the connection.
con = sqlite3.connect('Maintenance.db')
curr = con.cursor()

In [None]:
# Install pretty table library for to show tables when executing sql query
!pip install ipython-sql pretty
import prettytable
prettytable.DEFAULT = 'DEFAULT'

In [None]:
# Load the ipython-sql extension, which registers the %sql magic
# (this does not load any database by itself)
%load_ext sql

In [110]:
# Point the %sql magic at the SQLite database file Maintenance.db
%sql sqlite:///Maintenance.db

In [None]:
# Write the cleaned sensor readings DataFrame to the 'Sensor_Data' table,
# replacing the table if it already exists
cleaned_sensor_readings.to_sql('Sensor_Data',con,if_exists='replace',index=False)

In [None]:
# Write the cleaned maintenance logs DataFrame to the 'Maintenance_Data' table,
# replacing the table if it already exists.
# Expects `cleaned_maintenance_logs` to already exist in the kernel namespace.
cleaned_maintenance_logs.to_sql('Maintenance_Data',con,if_exists='replace',index=False)

In [None]:
# Write the machine metadata DataFrame to the 'Machine_Data' table,
# replacing the table if it already exists
df_machine_excel.to_sql('Machine_Data',con,if_exists='replace',index=False)

In [None]:
# Getting rolling averages of sensor feature using sql
df_merge_sql = %sql SELECT *, \
                    (JULIANDAY(s.Date) - JULIANDAY(m.LastOverhaulDate)) AS DaysSinceOverhaul, \
                    AVG(s.Temperature) OVER ( \
                        PARTITION BY s.MachineID \
                        ORDER BY s.Timestamp \
                        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW \
                    ) AS Temp_3DayAvg, \
                    AVG(s.Vibration) OVER ( \
                        PARTITION BY s.MachineID \
                        ORDER BY s.Timestamp \
                        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW \
                    ) AS Vib_3DayAvg, \
                    AVG(s.Pressure) OVER ( \
                        PARTITION BY s.MachineID \
                        ORDER BY s.Timestamp \
                        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW \
                    ) AS Pres_3DayAvg \
                FROM Sensor_Data s \
                JOIN Machine_Data m ON s.MachineID = m.MachineID \
                ORDER BY s.MachineID, s.Timestamp

In [112]:
# Convert the %sql result set into a pandas DataFrame.
# The same name is rebound, so re-running this cell without re-running the query
# would call .DataFrame() on the already-converted object.
df_merge_sql = df_merge_sql.DataFrame()

In [None]:
# Preview the SQL-derived feature table
df_merge_sql.head()

In [20]:
# Strip the unit suffixes from the sensor column names for easier downstream use
df_merge = df_merge.rename(
    columns={
        'Temperature (°C)': 'Temperature',
        'Vibration (mm/s)': 'Vibration',
        'Pressure (psi)': 'Pressure',
    }
)

In [21]:
# Rolling averages of the sensor features, computed in pandas.
# Order rows chronologically within each machine so the windows follow time.
df_merge = df_merge.sort_values(by=["MachineID", "Timestamp"])

# Each average covers the current and the two preceding readings of the same
# machine (3 observations, assuming daily or near-daily frequency); with
# min_periods=1 the first readings of a machine average over fewer values.
df_merge["Temp_3DayAvg"] = (
    df_merge.groupby("MachineID")["Temperature"]
    .transform(lambda readings: readings.rolling(window=3, min_periods=1).mean())
)

df_merge["Vib_3DayAvg"] = (
    df_merge.groupby("MachineID")["Vibration"]
    .transform(lambda readings: readings.rolling(window=3, min_periods=1).mean())
)

df_merge["Pres_3DayAvg"] = (
    df_merge.groupby("MachineID")["Pressure"]
    .transform(lambda readings: readings.rolling(window=3, min_periods=1).mean())
)

In [None]:
# Preview the merged frame with the rolling-average columns added
df_merge.head()

In [23]:
# Ensure the maintenance Date column is datetime64, then keep only the rows
# where Failure == "Y".
# Expects `cleaned_maintenance_logs` to already exist in the kernel namespace.
cleaned_maintenance_logs["Date"] = pd.to_datetime(cleaned_maintenance_logs["Date"])
df_maintenance_failure = cleaned_maintenance_logs[cleaned_maintenance_logs["Failure"] == "Y"]

In [25]:
from datetime import timedelta

def check_failure_within_30_days(row, window_days=30):
    """Flag whether the row's machine has a logged failure shortly after the row's date.

    Looks in the module-level ``df_maintenance_failure`` frame for entries with
    the same ``MachineID`` whose ``Date`` is strictly after ``row["Date"]`` and
    at most ``window_days`` days later.

    Parameters
    ----------
    row : mapping-like (for example a DataFrame row)
        Must provide the keys ``"MachineID"`` and ``"Date"``.
    window_days : int, default 30
        Length of the look-ahead window in days.

    Returns
    -------
    int
        1 if at least one matching failure exists, otherwise 0.
    """
    machine_id = row["MachineID"]
    # Normalise to a pandas Timestamp so the comparison below also works when
    # the row carries a plain datetime.date (as produced by .dt.date).
    window_start = pd.Timestamp(row["Date"])
    window_end = window_start + pd.Timedelta(days=window_days)

    failure_dates = df_maintenance_failure.loc[
        df_maintenance_failure["MachineID"] == machine_id, "Date"
    ]
    in_window = (failure_dates > window_start) & (failure_dates <= window_end)
    return int(in_window.any())

# Build the target label by applying the check to every row (axis=1); the
# failure table is filtered once per row.
df_merge["FailureNext30Days"] = df_merge.apply(check_failure_within_30_days, axis=1)


In [26]:
from sklearn.model_selection import train_test_split

# Model inputs: raw sensor readings plus the engineered features
# (update this list with your final engineered features)
feature_cols = [
    "Temperature",
    "Vibration",
    "Pressure",
    "RuntimeHours",
    "DaysSinceOverhaul",
    "Temp_3DayAvg",
    "Vib_3DayAvg",
    "Pres_3DayAvg",
]

# Keep only rows where every feature is present; the target is selected with
# the same index labels so X and y stay aligned
X = df_merge[feature_cols].dropna()
y = df_merge.loc[X.index, "FailureNext30Days"]

# Stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a 100-tree random forest with a fixed seed on the training split
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

# Positive-class probabilities and hard class labels for the held-out split
y_prob = rf_model.predict_proba(X_test)[:, 1]
y_pred = rf_model.predict(X_test)

# Summary metrics; the AUC is computed once and reused in the plot legend
auc_score = roc_auc_score(y_test, y_prob)
print("AUC-ROC:", auc_score)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# ROC curve with the chance diagonal as a dashed reference line
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure(figsize=(8,5))
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {auc_score:.2f})")
plt.plot([0, 1], [0, 1], linestyle='--')
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Pair each feature with its importance score, most important first
feature_importance = pd.DataFrame(
    {"Feature": X_train.columns, "Importance": rf_model.feature_importances_}
).sort_values(by="Importance", ascending=False)

# Horizontal bar chart, one bar per feature
plt.figure(figsize=(10, 6))
ax = sns.barplot(x="Importance", y="Feature", data=feature_importance, palette="viridis")

# Write the score just to the right of the end of each bar
for bar_position, score in enumerate(feature_importance["Importance"]):
    ax.text(score + 0.0001, bar_position, f"{score:.3f}", va='center', ha='left', fontsize=9)

plt.title("Random Forest Feature Importance")
plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()


In [68]:
# Add FailureProbability if no earlier cell has created it: the model's
# positive-class probability for every row that has all features present
# (rows with a missing feature get NaN).
# NOTE(review): this scores rows the model was also trained on, so the
# probabilities are optimistic — confirm that is acceptable for ranking.
if "FailureProbability" not in df_merge.columns:
    scorable = df_merge[feature_cols].dropna()
    df_merge["FailureProbability"] = pd.Series(
        rf_model.predict_proba(scorable)[:, 1], index=scorable.index
    )

# Newest reading first within each machine
df_sorted = df_merge.sort_values(["MachineID", "Timestamp"], ascending=[True, False])

# Keep only the latest record per machine. drop_duplicates keeps the whole first
# row per MachineID, whereas groupby().first() takes the first non-null value of
# each column separately and can mix values from different rows.
latest_status = df_sorted.drop_duplicates(subset="MachineID", keep="first").reset_index(drop=True)


In [69]:
# Keep the rows of latest_status whose FailureProbability exceeds 0.7; .copy()
# makes high_risk an independent frame for the column assignments that follow.
high_risk = latest_status[latest_status["FailureProbability"] > 0.7].copy()


In [70]:
from datetime import timedelta
import numpy as np

# Lead time in whole days, interpolated linearly from 7 days at p = 0.7 to
# 14 days at p = 1.0 and truncated towards zero.
# NOTE(review): this schedules the highest-risk machines latest — confirm the
# direction of the mapping is intended.
lead_days = np.interp(
    high_risk["FailureProbability"].to_numpy(dtype=float), [0.7, 1.0], [7, 14]
).astype(int)

# Vectorised date arithmetic: works on an empty high_risk frame and guarantees a
# datetime64 result even if Date currently holds plain date objects.
high_risk["RecommendedMaintenanceDate"] = pd.to_datetime(high_risk["Date"]) + pd.Series(
    pd.to_timedelta(lead_days, unit="D"), index=high_risk.index
)


In [71]:
# Final recommendation table: highest failure probability first, probability
# rounded to 3 decimals, recommended date reduced to a plain calendar date
maintenance_recommendations = (
    high_risk[["MachineID", "Date", "FailureProbability", "RecommendedMaintenanceDate"]]
    .sort_values(by="FailureProbability", ascending=False)
    .assign(
        FailureProbability=lambda frame: frame["FailureProbability"].round(3),
        RecommendedMaintenanceDate=lambda frame: frame["RecommendedMaintenanceDate"].dt.date,
    )
)


In [None]:
# Show the top rows of the recommendation table
maintenance_recommendations.head()