# Introduction 
This project serves as a re-introduction to data science and machine learning for me. I want to go back to the fundamentals, understanding why each move is made and how it impacts the project overall.

## Dataset

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [2]:
# Read the predictive-maintenance CSV (path is relative to the working directory) into the working frame.
df = pd.read_csv(filepath_or_buffer="predictive_maintenance.csv")

# EDA

In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

#### Quick Observations on the data
- There is an identification number (UDI) that is unnecessary for the analysis.
- The Product ID column is also unnecessary for any analysis carried out as it is a unique identifier.
- This dataset has two target columns: `Target` and `Failure Type`.

In [4]:
# Drop the identifier columns ('UDI' and 'Product ID'); they are row identifiers, not features.
# Rebinding `df` with `errors='ignore'` (instead of `inplace=True`) keeps this cell
# idempotent: re-running it after the columns are gone no longer raises KeyError.
df = df.drop(columns=['UDI', 'Product ID'], errors='ignore')



In [5]:
# Map the raw CSV headers (with spaces and units) onto the snake_case names used from here on.
column_aliases = {
    'Type': 'type',
    'Air temperature [K]': 'air_temperature',
    'Process temperature [K]': 'process_temperature',
    'Rotational speed [rpm]': 'rotational_speed',
    'Torque [Nm]': 'torque',
    'Tool wear [min]': 'tool_wear',
    'Target': 'target',
    'Failure Type': 'failure_type',
}
df.rename(columns=column_aliases, inplace=True)

In [None]:
# Print the frame summary (column dtypes and non-null counts) to check types and missing values.
df.info()

In [7]:
# Store these two columns as float64 so that the later `dtype == 'float64'` filter
# (used to build `num_features`) treats them as numeric features.
for column_to_cast in ('tool_wear', 'rotational_speed'):
    df[column_to_cast] = df[column_to_cast].astype('float64')

In [None]:
# Summary statistics of the numeric columns, one row per column.
df.describe().T

In [None]:
# Share of each machine type, in percent of all rows.
type_counts = df['type'].value_counts()
type_percentage = 100 * type_counts / len(df['type'])

# Draw the shares as a pie chart, one tab10 colour per machine type.
fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(
    type_percentage.values,
    labels=type_percentage.index,
    colors=sns.color_palette('tab10')[:len(type_percentage.index)],
    autopct='%.0f%%',
    startangle=90,
)
ax.set_title('Machine Type Percentage')
ax.axis('equal')  # equal aspect ratio keeps the pie circular

plt.show()

This split is acceptable: no machine type is so over-represented that it needs correcting.

### Data Anomalies

Prior knowledge of this dataset confirms there are anomalies such as: 
- Rows that are classified as a failure by the `target` column but as `No Failure` by the `failure_type` column.
- Rows that are classified as `Random Failures` by the `failure_type` column but as no failure by the `target` column.

We have to eliminate these contradictory rows.

In [None]:
# Count how many rows report each failure type (before removing contradictory rows).
df['failure_type'].value_counts()

In [None]:
# Keep only the rows flagged as failures (target == 1), then tally the failure type they report.
df_failure = df.loc[df['target'].eq(1)]
df_failure['failure_type'].value_counts()

This confirms the first anomaly and those values will be dropped. 

In [12]:
# Rows of `df_failure` whose failure_type reads 'No Failure': these contradict the target flag.
reports_no_failure = df_failure['failure_type'].eq('No Failure')
position_wrong_failures = df_failure.loc[reports_no_failure].index

# Remove those rows from the main frame by index label.
df.drop(index=position_wrong_failures, inplace=True)

In [None]:
# Now look at the rows flagged as healthy (target == 0) and tally the failure type they report.
# NOTE: `df_failure` is re-bound here and holds the non-failure rows from this point on.
df_failure = df.loc[df['target'].eq(0)]
df_failure['failure_type'].value_counts()

In [14]:
# Rows of `df_failure` whose failure_type reads 'Random Failures': these contradict the target flag.
reports_random_failure = df_failure['failure_type'].eq('Random Failures')
position_wrong_random_failures = df_failure.loc[reports_random_failure].index

# Remove those rows (not columns) from the main frame by index label.
df.drop(index=position_wrong_random_failures, inplace=True)

In [15]:
# Rebuild a contiguous 0..n-1 index after the row drops; drop=True discards the old labels
# instead of keeping them as a new column.
df.reset_index(inplace=True, drop=True)

Dropping 27 data points out of 10,000 (0.27%) is acceptable.

### Outlier Inspection

In [None]:
# Summary statistics of the cleaned frame, one row per numeric column.
df.describe().T

The maximum values of `rotational_speed`, `torque` and `tool_wear` are significantly different from their typical values (mean and quartiles), which could indicate outliers.

In [None]:
# Numeric sensor features to inspect; `features` is reused by later cells.
features = ['air_temperature', 'process_temperature', 'rotational_speed', 'torque', 'tool_wear']
colors = ['#E1728F', '#409E7D']  # one colour per `target` value in the histograms

# Two rows of panels, one column per feature:
# top row = histogram with KDE split by target, bottom row = boxplot of the same feature.
fig, axes = plt.subplots(2, len(features), figsize=[25, 10])

for hist_ax, box_ax, feature in zip(axes[0], axes[1], features):
    sns.histplot(data=df, x=feature, kde=True, ax=hist_ax, hue='target', palette=colors)

    # The boxplot has no `hue`, so give it a single `color`; passing `palette`
    # without `hue` is deprecated in seaborn 0.13+.
    sns.boxplot(data=df, x=feature, ax=box_ax, color='#976EBD')

# Adjust layout for better spacing
plt.tight_layout()
plt.show()

There are obviously outliers in the data to be dealt with later

## Preprocessing

### Data Augmentation

As we could see, the classes are imbalanced, which is a serious issue in machine learning problems. Some ways of dealing with class imbalance are:

- Under-sampling by deleting some data points from the majority class.
- Over-sampling by duplicating rows that belong to the minority class.
- Over-Sampling with SMOTE (Synthetic Minority Oversampling Technique).

In [None]:
# Restrict to the rows that report an actual failure type.
df_fail = df.loc[df['failure_type'].ne('No Failure')]

# Share of each failure type among those rows, in percent.
failure_counts = df_fail['failure_type'].value_counts()
df_fail_percentage = 100 * failure_counts / failure_counts.sum()

# Share of rows flagged as failed (target == 1) in the whole frame, in percent.
overall_failure_percentage = round(100 * df['target'].sum() / len(df), 2)

print('Failures percentage in data:', overall_failure_percentage)
print('Percentage of no failure in data:', 100 - overall_failure_percentage)

# Pie chart of the failure-type shares.
fig, ax = plt.subplots()
ax.set_title('Reasons for Machine Failures')
ax.pie(
    x=df_fail_percentage,
    labels=df_fail_percentage.index,
    colors=sns.color_palette('tab10')[0:4],
    autopct='%.0f%%',
)
plt.show()

With roughly 96% of the rows belonging to the no-failure class, the data is heavily imbalanced; we correct this with SMOTE oversampling. A link to an article explaining the reasoning behind this form of data augmentation will be included here.

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTENC

# Target composition after resampling: the untouched 'No Failure' rows should represent
# 80% of the frame, and the remaining rows are shared equally between the failure types.
no_failure_share = 0.8
failure_classes = (
    'Overstrain Failure',
    'Heat Dissipation Failure',
    'Power Failure',
    'Tool Wear Failure',
)

number_no_failure = df['failure_type'].value_counts()['No Failure']
desired_length = round(number_no_failure / no_failure_share)

# Number of rows each failure type should have after oversampling; the divisor follows
# the class list instead of a hard-coded 4.
samples_per_class = round((desired_length - number_no_failure) / len(failure_classes))

# Resampling strategy: keep 'No Failure' as is, bring every failure type to the same size.
resampling_strategy = {'No Failure': number_no_failure}
resampling_strategy.update({failure_class: samples_per_class for failure_class in failure_classes})

# SMOTENC takes the positions of the categorical columns. Look them up by name so the
# cell keeps working if the column order of `df` changes.
categorical_positions = [df.columns.get_loc(column) for column in ('type', 'failure_type')]
smote = SMOTENC(categorical_features=categorical_positions, sampling_strategy=resampling_strategy, random_state=0)

# The whole frame (label columns included) is passed as X so that the resampled frame
# still carries 'target' and 'failure_type'.
# NOTE(review): oversampling runs before the train/validation/test split in the modelling
# section, so synthetic rows can land in the evaluation sets — consider resampling the
# training split only.
df_resampled, y_resampled = smote.fit_resample(df, df['failure_type'])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Failed rows of the resampled frame and the share of each failure type among them (in %).
df_res_fail = df_resampled.loc[df_resampled['failure_type'].ne('No Failure')]
fail_res_percentage = 100 * df_res_fail['failure_type'].value_counts() / len(df_res_fail)

# Relative growth of the dataset caused by oversampling, and the resulting share of
# failed rows (both in %).
percentage_increment = round((len(df_resampled) - len(df)) * 100 / len(df), 2)
smote_resampled_failures_percentage = round(len(df_res_fail) * 100 / len(df_resampled), 2)

print('Percentage increment of observations after oversampling:', percentage_increment)
print('SMOTE Resampled Failures percentage:', smote_resampled_failures_percentage)

# Side-by-side pies: failure-type mix before (left) and after (right) resampling.
fig, axs = plt.subplots(ncols=2, figsize=(12, 4))
fig.suptitle('Causes Involved in Machine Failures')

panels = [
    ('Original', df_fail_percentage),
    ('After Resampling', fail_res_percentage),
]
for ax, (panel_title, shares) in zip(axs, panels):
    ax.pie(
        x=shares,
        labels=shares.index,
        colors=sns.color_palette('tab10')[0:4],
        autopct='%.0f%%',
    )
    ax.title.set_text(panel_title)

plt.show()

## Feature Scaling and Encoding

Scaling data is a difficult process as you have to choose the proper scaler to use 
- Use MinMaxScaler as your default
- Use RobustScaler if you have outliers and can handle a larger range
- Use StandardScaler if you need normalized features
- Use Normalizer sparingly - it normalizes rows, not columns

In [21]:
# Of the inspected features, keep the names of those stored as float64 in `df`.
num_features = df[features].select_dtypes(include='float64').columns.tolist()

In [None]:
from sklearn.preprocessing import StandardScaler

# Integer codes for the two categorical columns.
type_encoding = {'L': 0, 'M': 1, 'H': 2}
cause_encoding = {
    'No Failure': 0,
    'Power Failure': 1,
    'Overstrain Failure': 2,
    'Heat Dissipation Failure': 3,
    'Tool Wear Failure': 4
}

# Work on a copy so the resampled frame stays untouched.
df_preprocessed = df_resampled.copy()

# Encode with `map` rather than `replace`: replacing strings by ints relies on an implicit
# object -> int downcast that pandas has deprecated, and without it the columns would stay
# `object` and break the numeric steps that follow (e.g. `.corr()`). The explicit int64 cast
# also fails loudly if a label is missing from the encoding (map yields NaN for it).
df_preprocessed['type'] = df_preprocessed['type'].map(type_encoding).astype('int64')
df_preprocessed['failure_type'] = df_preprocessed['failure_type'].map(cause_encoding).astype('int64')

# Standardise the numeric features.
# NOTE(review): the scaler is fitted on the full frame, before the train/validation/test
# split done in the modelling section — consider fitting it on the training split only.
scaler = StandardScaler()
df_preprocessed[num_features] = scaler.fit_transform(df_preprocessed[num_features])

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA with as many components as there are numeric features.
n_components = len(num_features)
component_names = [f'PC{k}' for k in range(1, n_components + 1)]
pca = PCA(n_components=n_components)

# Project the numeric features onto the principal components.
X_pca = pd.DataFrame(
    data=pca.fit_transform(df_preprocessed[num_features]),
    columns=component_names,
)

# Share of variance explained by each component, in percent.
var_exp = pd.Series(data=100 * pca.explained_variance_ratio_, index=component_names)

print('Explained variance ratio per component:')
print(var_exp.round(2), sep='\n')

# Cumulative share explained by the first three components.
explained_variance_three_components = round(var_exp.values[:3].sum(), 2)
print(f'Explained variance ratio with 3 components: {explained_variance_three_components}')

In [None]:
# Three-component PCA for data visualisation.
pca3 = PCA(n_components=3)
X_pca3 = pd.DataFrame(
    data=pca3.fit_transform(df_preprocessed[num_features]),
    columns=['PC1', 'PC2', 'PC3'],
)

# Loadings table: one row per component, one column per original feature.
pca_loadings = pd.DataFrame(data=pca3.components_, columns=num_features)

# One bar chart per component showing its loading on each feature.
fig, axs = plt.subplots(ncols=3, figsize=(18, 4))
fig.suptitle('Loadings magnitude')
for component_number, (ax, loadings) in enumerate(zip(axs, pca_loadings.values), start=1):
    sns.barplot(ax=ax, x=pca_loadings.columns, y=loadings)
    ax.tick_params(axis='x', rotation=90)
    ax.title.set_text('PC' + str(component_number))
plt.show()

In [None]:
# Compute the correlation matrix once and reuse it for both the mask and the heatmap.
correlations = df_preprocessed.corr()

# Mask the upper triangle (diagonal included), which only mirrors the lower one.
# Use the builtin `bool`: the `np.bool` alias was removed in NumPy 1.24 and raises
# AttributeError there.
mask = np.triu(np.ones_like(correlations, dtype=bool))

plt.figure(figsize=(16, 6))
heatmap = sns.heatmap(correlations, mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Triangle Correlation Heatmap', fontdict={'fontsize':18}, pad=16);

## Modelling 

This dataset allows us to perform two different types of classification: binary (failure vs. no failure) and multi-class (which type of failure).

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, fbeta_score
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.inspection import permutation_importance
import time

# Feature matrix and the two label columns (binary `target`, multi-class `failure_type`).
# NOTE(review): `features` holds only the five sensor columns, so the encoded `type`
# column is not part of X — confirm this is intended.
X = df_preprocessed[features]
y = df_preprocessed[['target', 'failure_type']]

# First hold out 10% as the test set, stratified on the failure type.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y['failure_type'], random_state=0
)

# Then carve the validation set out of the remainder: 11% of the remaining 90%,
# again stratified on the failure type.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.11, stratify=y_trainval['failure_type'], random_state=0
)

In [None]:
# NOTE(review): bare `!pip` with no subcommand; presumably a leftover from installing a
# package (the next cell imports xgboost) — TODO remove, or replace with a pinned `%pip install`.
!pip 

In [None]:
import xgboost