In [None]:
# Import all libraries
!pip install seaborn
!pip install pandas
!pip install plotly
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from scipy import stats
from io import StringIO
import time


In [None]:
# Import dataset

# Read the vehicle maintenance records from the CSV in the working directory
dataset_path = "vehicle_maintenance_data.csv"
df = pd.read_csv(dataset_path)

# Preview the first five rows as a quick check that the load succeeded
df.head(n=5)

# **Data Preprocessing**

In [None]:
# Dataset details and summary statistics

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
df.info()

# Categorical Summary Statistics
# The describe() call sits inside braces so the f-string evaluates it and
# prints the table rather than the text of the expression.
print(f'Categorical Summary Statistics \n{df.describe(include=object).T}\n')

# Numerical Summary Statistics
print(f'Numerical Summary Statistics \n{df.describe(include=np.number).T}\n')

# Isolate numerical variables
numeric_df = df.select_dtypes(include=np.number)

# Isolate categorical variables
categorical_df = df.select_dtypes(include=object)




In [None]:
# Confirm outliers for Numerical variables
def highlight_outliers(s):
    '''
    Build one CSS style string per value of ``s``, in the list form that
    ``Styler.apply`` expects.

    A value is flagged when its absolute distance from the mean of ``s`` is
    strictly greater than three times the standard deviation of ``s``.

    Parameters
    ----------
    s : object exposing ``mean()`` and ``std()`` and supporting arithmetic,
        e.g. the pandas Series handed over by ``Styler.apply``.

    Returns
    -------
    list of str
        ``'background-color: red'`` for flagged values, ``''`` otherwise,
        in the same order as the values of ``s``.
    '''
    center = s.mean()
    limit = 3 * s.std()

    styles = []
    for distance in np.abs(s - center):
        # Comparisons involving NaN are False, so missing values stay unstyled
        if distance > limit:
            styles.append('background-color: red')
        else:
            styles.append('')
    return styles

# Apply function
numeric_df = df.select_dtypes(include=np.number)

# highlight_outliers has to see the raw observations of each variable.
# Styling the describe() table directly would compare summary statistics of
# unrelated variables with one another (for example the mean of one column
# against the means of all other columns), which says nothing about outliers
# in the data. So: count, per variable, how many observations the function
# flags, and attach that count to the summary table.
outlier_counts = numeric_df.apply(
    lambda values: sum(1 for css in highlight_outliers(values) if css)
)
summary_df = numeric_df.describe().T.assign(outliers=outlier_counts)

# Shade the whole row of every variable with at least one flagged observation
styled_df = summary_df.style.apply(
    lambda row: ['background-color: red' if row['outliers'] > 0 else ''] * len(row),
    axis=1,
)
styled_df


# **Univariate Analysis**

In [None]:
# Distribution of numeric variables

import plotly.subplots as sp
import plotly.graph_objects as go

# Keep only the numeric columns of the dataset
numeric_df = df.select_dtypes(include=np.number)

# One summary printout and one two-panel figure per numeric variable
for column, values in numeric_df.items():
    summary = values.describe()
    print(f"Summary Statistics for {column}:\n{summary}\n")

    # Left panel: histogram, right panel: box plot
    panel_titles = (f"Histogram of {column}", f"Box Plot of {column}")
    fig = sp.make_subplots(rows=1, cols=2, subplot_titles=panel_titles)
    fig.add_trace(go.Histogram(x=values), row=1, col=1)
    fig.add_trace(go.Box(y=values), row=1, col=2)

    fig.update_layout(
        height=600,
        width=1000,
        title_text=f"Plots for {column}",
        showlegend=False,
    )
    fig.show()

In [None]:
# Distribution of Categorical variables
import pandas as pd
import plotly.express as px

# Keep only the object-typed (categorical) columns of 'df'
categorical_df = df.select_dtypes(include=object)
total_rows = len(categorical_df)

for column, values in categorical_df.items():
    # Frequency of each category and its share of all rows, in percent
    value_counts = values.value_counts()
    percentages = value_counts.div(total_rows).mul(100).round(2)
    print(f"Value Counts and Percentages for {column}:\n{value_counts}\n{percentages}\n")

    # Bar chart of the counts, labelled with the percentage of each category
    percentage_labels = percentages.astype(str) + "%"
    fig = px.bar(
        x=value_counts.index,
        y=value_counts.to_numpy(),
        title=f"Bar Plot of {column}",
        labels={"x": column, "y": "Count"},
        text=percentage_labels,
    )
    # Draw the percentage labels above the bars instead of inside them
    fig.update_traces(textposition="outside")
    fig.show()

# **Correlation Analysis**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# Load your DataFrame
df = pd.read_csv('vehicle_maintenance_data.csv')  # Reload so this cell starts from the raw data

# Calculate correlation matrix, considering only numeric columns
correlation_matrix = df.corr(numeric_only=True)

# Create heatmap
plt.figure(figsize=(12, 8))  # Adjust figure size if needed
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix of Vehicle Maintenance Dataset")
plt.show()

# Find top 5 and bottom 5 correlated feature pairs
correlation_pairs = correlation_matrix.unstack().reset_index()
correlation_pairs.columns = ['Feature 1', 'Feature 2', 'Correlation']

# The matrix is symmetric, so unstack() lists every pair twice (A/B and B/A)
# in addition to the diagonal. Keeping only rows whose first feature comes
# strictly before the second in column order leaves each pair exactly once
# and removes the self-correlations at the same time.
feature_rank = {name: rank for rank, name in enumerate(correlation_matrix.columns)}
is_unique_pair = (
    correlation_pairs['Feature 1'].map(feature_rank)
    < correlation_pairs['Feature 2'].map(feature_rank)
)
correlation_pairs = correlation_pairs[is_unique_pair]

# Undefined correlations (NaN, e.g. from a constant column) would be sorted to
# the end and show up as "bottom" pairs, so they are left out of the ranking.
correlation_pairs = correlation_pairs.dropna(subset=['Correlation'])
correlation_pairs = correlation_pairs.sort_values(by='Correlation', ascending=False)

# Top 5 correlations
top_5 = correlation_pairs.head(5)
# Bottom 5 correlations
bottom_5 = correlation_pairs.tail(5)

# Combine top and bottom correlations; with fewer than ten pairs the two
# slices overlap, so rows that appear in both are listed only once.
correlation_summary = pd.concat([top_5, bottom_5])
correlation_summary = correlation_summary[~correlation_summary.index.duplicated(keep='first')]

# Create Plotly table
table = go.Figure(data=[go.Table(
    header=dict(values=list(correlation_summary.columns),
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[correlation_summary['Feature 1'], correlation_summary['Feature 2'], correlation_summary['Correlation']],
               fill_color='lavender',
               align='left'))
])

table.update_layout(title='Top 5 and Bottom 5 Correlations in Vehicle Maintenance Dataset')
table.show()


In [None]:
!pip install researchpy
import researchpy as rp
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import chi2_contingency

# Isolate the categorical variables
categorical_df = df.select_dtypes(include=['object', 'category'])  # Select only categorical columns
cat_cols = list(categorical_df.columns)

# Results of the chi-square and Cramer's V calculations, one entry per pair,
# plus the matching cross-tabulations so they are not rebuilt for plotting
results = []
crosstabs = []

# Visit every unordered pair once: (A, B) and (B, A) give the same chi-square
# statistic, so testing both orderings would only duplicate rows and plots.
for position, col1 in enumerate(cat_cols):
    for col2 in cat_cols[position + 1:]:
        # Create a cross-tabulation of the two variables
        crosstab = pd.crosstab(df[col1], df[col2])

        # Perform the chi-square test on the crosstab
        chi2, p, dof, expected = chi2_contingency(crosstab)

        # Calculate Cramer's V for association strength
        n = crosstab.sum().sum()  # Total number of observations
        smaller_dim = min(crosstab.shape) - 1
        # A variable with a single level makes the denominator zero;
        # the statistic is undefined in that case
        cramer_v = (chi2 / (n * smaller_dim)) ** 0.5 if smaller_dim > 0 else float('nan')

        results.append({
            'col1': col1,
            'col2': col2,
            'chi-square': chi2,
            'p-value': p,
            'cramer_v': cramer_v
        })
        crosstabs.append(crosstab)

# Convert the results into a DataFrame for easier manipulation
# (explicit columns keep the frame well-formed even when there are no pairs)
results_df = pd.DataFrame(results, columns=['col1', 'col2', 'chi-square', 'p-value', 'cramer_v'])

if results_df.empty:
    # make_subplots cannot build a figure with zero rows
    print("Fewer than two categorical variables found; no chi-square tests to display.")
else:
    # Create one subplot per pair of categorical variables
    fig = make_subplots(
        rows=len(results_df),
        cols=1,
        subplot_titles=[f"{row['col1']} vs {row['col2']}" for _, row in results_df.iterrows()]
    )

    # Add a heatmap and its test results to each subplot
    for i, row in results_df.iterrows():
        crosstab = crosstabs[i]

        # Create a heatmap for the crosstab
        heatmap = go.Heatmap(
            z=crosstab.values,
            x=crosstab.columns,
            y=crosstab.index,
            colorscale='Viridis',
            colorbar=dict(title='Count')
        )

        fig.add_trace(heatmap, row=i + 1, col=1)

        # Anchor the annotation to this subplot's own axis domain. Paper
        # coordinates with a fixed step per subplot drift away from the
        # subplots and leave the figure once there are more than a few rows.
        fig.add_annotation(
            text=f"Chi-square: {row['chi-square']:.3f}<br>P-value: {row['p-value']:.3f}<br>Cramer's V: {row['cramer_v']:.3f}",
            xref="x domain", yref="y domain",
            x=0.05, y=0.95,
            showarrow=False,
            bgcolor="rgba(255, 255, 255, 0.7)",  # keep the text legible on top of the heatmap
            row=i + 1, col=1
        )

    # Update the layout and display the figure
    fig.update_layout(
        height=400 * len(results_df),  # Set the height dynamically based on the number of subplots
        width=800,
        title_text="Chi-Square Test Results Between Categorical Variables"
    )

    # Show the plot
    fig.show()


# **Chi-Squared Analysis**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency

# Function to map Need_Maintenance
def map_maintenance(dataframe):
    """Translate the ``Need_Maintenance`` column of ``dataframe`` into text labels.

    Returns the result of mapping that column through a lookup in which
    1 becomes 'Yes' and 0 becomes 'No'; values without an entry in the lookup
    come back as NaN. Nothing is assigned back to ``dataframe`` here.
    """
    yes_no_labels = {1: 'Yes', 0: 'No'}
    flag_column = dataframe['Need_Maintenance']
    return flag_column.map(yes_no_labels)

# Load your DataFrame
df = pd.read_csv('vehicle_maintenance_data.csv')

# First Analysis: Chi-Square Tests for Tire and Brake Conditions

# Work on a copy so the freshly loaded frame keeps its numeric target
df_analysis_1 = df.copy()
df_analysis_1['Need_Maintenance'] = map_maintenance(df_analysis_1)

# Restrict the analysis to the two condition columns and the target
relevant_columns = ['Tire_Condition', 'Brake_Condition', 'Need_Maintenance']
df_filtered = df_analysis_1[relevant_columns]

# Report how many values are missing in each of those columns
missing_values = df_filtered.isna().sum()
print("Missing Values in Analysis 1:\n", missing_values)

# Keep only the rows that are complete in all three columns
df_filtered = df_filtered.dropna()

if not df_filtered.empty:
    maintenance_labels = df_filtered['Need_Maintenance']

    # Chi-square test of independence: tire condition vs. maintenance need
    tire_crosstab = pd.crosstab(df_filtered['Tire_Condition'], maintenance_labels)
    chi2_tire, p_tire, dof_tire, expected_tire = chi2_contingency(tire_crosstab)

    # Chi-square test of independence: brake condition vs. maintenance need
    brake_crosstab = pd.crosstab(df_filtered['Brake_Condition'], maintenance_labels)
    chi2_brake, p_brake, dof_brake, expected_brake = chi2_contingency(brake_crosstab)

    # Report both test results
    print(f"Tire Condition vs Need Maintenance: Chi-square = {chi2_tire:.3f}, p-value = {p_tire:.3f}")
    print(f"Brake Condition vs Need Maintenance: Chi-square = {chi2_brake:.3f}, p-value = {p_brake:.3f}")

    # Visualization 1: vehicle counts per tire condition, split by maintenance need
    _, tire_ax = plt.subplots(figsize=(10, 5))
    sns.countplot(x='Tire_Condition', hue='Need_Maintenance', data=df_filtered, palette='viridis', ax=tire_ax)
    tire_ax.set_title('Influence of Tire Condition on Maintenance Needs')
    tire_ax.set_xlabel('Tire Condition')
    tire_ax.set_ylabel('Count of Vehicles')
    tire_ax.legend(title='Need Maintenance', loc='upper right')
    plt.show()

    # Visualization 2: vehicle counts per brake condition, split by maintenance need
    _, brake_ax = plt.subplots(figsize=(10, 5))
    sns.countplot(x='Brake_Condition', hue='Need_Maintenance', data=df_filtered, palette='plasma', ax=brake_ax)
    brake_ax.set_title('Influence of Brake Condition on Maintenance Needs')
    brake_ax.set_xlabel('Brake Condition')
    brake_ax.set_ylabel('Count of Vehicles')
    brake_ax.legend(title='Need Maintenance', loc='upper right')
    plt.show()
else:
    # Nothing left to test once incomplete rows are removed
    print("Filtered DataFrame is empty after dropping missing values in Analysis 1.")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay,
                             roc_curve, roc_auc_score, accuracy_score,
                             precision_score, recall_score, f1_score)
from sklearn.preprocessing import LabelEncoder

# Load your DataFrame
df = pd.read_csv('vehicle_maintenance_data.csv')

# Show the raw target values before they are touched
print("Initial unique values in 'Need_Maintenance':", df['Need_Maintenance'].unique())

# Keep only the 0/1 codes (still numeric); any other value turns into NaN
binary_codes = {1: 1, 0: 0}
df['Need_Maintenance'] = df['Need_Maintenance'].map(binary_codes)

# Show the target values again and warn when the mapping produced NaN
print("Unique values after mapping:", df['Need_Maintenance'].unique())
has_unmapped_target = df['Need_Maintenance'].isnull().any()
if has_unmapped_target:
    print("NaN values found in 'Need_Maintenance' after mapping, check your data.")

# Discard every row that has a missing value in any column
df.dropna(inplace=True)

# Object-typed columns are the ones that need encoding
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

# One fitted LabelEncoder per categorical column, kept for later reuse
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

# Target (y) and everything else as features (X)
y = df['Need_Maintenance']
X = df.drop(columns=['Need_Maintenance'])

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the Random Forest on the training part
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Class predictions and the predicted probability of the positive class
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Which classes occur in the truth and in the predictions
print("Unique values in y_test:", np.unique(y_test))
print("Unique values in y_pred:", np.unique(y_pred))

# Confusion matrix, labelled with the classes present in y_test
labels = np.unique(y_test)
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)

# ROC curve points and the area under it
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Scalar evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Feature importances, with indices ordered from most to least important
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

# 2x2 figure: confusion matrix, ROC curve, feature importances, metrics table
fig, axs = plt.subplots(2, 2, figsize=(12, 10))
fig.subplots_adjust(hspace=0.4, wspace=0.4)  # Leave room between the panels
(cm_ax, roc_ax), (importance_ax, metrics_ax) = axs

# Panel 1: confusion matrix
cm_display.plot(ax=cm_ax, cmap='Blues', values_format='d')
cm_ax.set_title('Confusion Matrix')

# Panel 2: ROC curve with the chance diagonal for reference
roc_ax.plot(fpr, tpr, color='blue', label=f'ROC curve (area = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], color='red', linestyle='--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc='lower right')

# Panel 3: horizontal bars, one per feature
bar_positions = range(X.shape[1])
importance_ax.barh(bar_positions, importances[indices], align='center')
importance_ax.set_yticks(bar_positions)
importance_ax.set_yticklabels(X.columns[indices])
importance_ax.invert_yaxis()  # Most important feature at the top
importance_ax.set_xlabel('Feature Importance')
importance_ax.set_title('Feature Importances')

# Panel 4: the metrics rendered as a table
metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
    'Score': [accuracy, precision, recall, f1]
})
metrics_ax.axis('tight')
metrics_ax.axis('off')
metrics_ax.table(cellText=metrics_df.values, colLabels=metrics_df.columns, cellLoc='center', loc='center')
metrics_ax.set_title('Model Performance Metrics')

plt.show()


# **XGBoost Analysis**

In [None]:
# Install required packages if not already installed
!pip install xgboost shap
import joblib

# Persist the currently bound `model` and `label_encoders` objects to disk.
# NOTE(review): neither name is assigned in this cell before this point, so
# they refer to whatever an earlier cell left in the kernel - presumably the
# Random Forest classifier and its encoders from the preceding cell. Confirm
# that this, and not the XGBoost model trained further down, is what should
# be saved; on a fresh kernel these lines fail unless that cell ran first.
joblib.dump(model, 'model.pkl')
joblib.dump(label_encoders, 'encoders.pkl')  # Save the label encoders separately

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay,
                             roc_curve, roc_auc_score, accuracy_score,
                             precision_score, recall_score, f1_score)
from sklearn.preprocessing import LabelEncoder
import shap  # For SHAP explanations

# Load your DataFrame
df = pd.read_csv("vehicle_maintenance_data.csv")  # Update with actual data path

# Keep only the 0/1 codes of the target column; any other value becomes NaN
df['Need_Maintenance'] = df['Need_Maintenance'].map({1: 1, 0: 0})

# Drop rows with missing values, as the Random Forest cell does, so that both
# models are trained and scored on the same rows and no NaN target or NaN
# category reaches LabelEncoder / XGBClassifier.fit.
df.dropna(inplace=True)
# The mapping yields floats whenever it produced NaN; restore integer labels
df['Need_Maintenance'] = df['Need_Maintenance'].astype(int)

# Encode categorical variables (one fitted LabelEncoder per object column)
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Define features (X) and target (y)
X = df.drop(columns=['Need_Maintenance'])
y = df['Need_Maintenance']
# Preview the encoded features instead of dumping the whole frame
print(X.head())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the XGBoost model
model = XGBClassifier(random_state=42)
model.fit(X_train, y_train)

# Get feature importances, sorted from most to least important
importances = model.feature_importances_
feature_names = X.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Print the feature importances
print(importance_df)

# Predictions
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class
# Show the first few predictions only; the full array is one entry per test row
print(y_pred[:10])

# ROC Curve and AUC calculation
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Plotting
fig, axs = plt.subplots(2, 2, figsize=(12, 10))  # Create a 2x2 subplot layout

# Confusion Matrix
labels = np.unique(y_test)  # Use unique values from y_test for the labels
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
cm_display.plot(ax=axs[0, 0], cmap='Blues', values_format='d')
axs[0, 0].set_title('Confusion Matrix')

# ROC Curve (the dashed diagonal marks a classifier that guesses at random)
axs[0, 1].plot(fpr, tpr, color='blue', label=f'ROC curve (area = {roc_auc:.2f})')
axs[0, 1].plot([0, 1], [0, 1], color='red', linestyle='--')
axs[0, 1].set_xlabel('False Positive Rate')
axs[0, 1].set_ylabel('True Positive Rate')
axs[0, 1].set_title('Receiver Operating Characteristic (ROC) Curve')
axs[0, 1].legend(loc='lower right')

# Feature Importance
sns.barplot(x='Importance', y='Feature', data=importance_df, palette='viridis', ax=axs[1, 0])
axs[1, 0].set_title('Feature Importance Analysis using XGBoost')
axs[1, 0].set_xlabel('Importance Score')
axs[1, 0].set_ylabel('Features')
axs[1, 0].grid()

# Displaying Metrics in a Table
metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
    'Score': [accuracy, precision, recall, f1]
})

axs[1, 1].axis('tight')
axs[1, 1].axis('off')
axs[1, 1].table(cellText=metrics_df.values, colLabels=metrics_df.columns, cellLoc='center', loc='center')
axs[1, 1].set_title('Model Performance Metrics')

plt.tight_layout()
plt.show()

# SHAP Analysis
# Initialize the SHAP explainer with the model and training data
explainer = shap.Explainer(model, X_train)

# Compute SHAP values for the test set
shap_values = explainer(X_test)

# Summary plot for overall feature importance using SHAP values
shap.summary_plot(shap_values, X_test, plot_type="bar")

# Explanation for a single prediction (change the index to check different instances)
instance_index = 0  # You can modify this index to examine other instances
shap.initjs()  # Initialize JS for interactive plots (useful if in Jupyter)

# Waterfall plot showing how each feature moved the prediction for that one row
shap.waterfall_plot(shap_values[instance_index], max_display=10)