In [None]:
! pip install xgboost

In [115]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report, f1_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Create a dataframe


In [None]:
# Read the engine dataset from its CSV file (path is relative to the notebook).
df = pd.read_csv(filepath_or_buffer='../Datasets/engine_data.csv')

# Preview the first five rows as the cell output.
df.head(n=5)

# Inspect, Balance, and Split the Data

In [None]:
# Print the DataFrame summary produced by DataFrame.info()
# (column names, dtypes and non-null counts).
df.info()

In [None]:
# Undersampling (not imputation) to resolve class imbalance: rows of the larger
# 'Engine Condition' class are randomly dropped until both classes have the
# same number of rows. No missing values are filled in here.

filter_1 = df[df['Engine Condition']==1]
filter_0 = df[df['Engine Condition']==0]

# Positive when class 1 is the majority, negative when class 0 is.
surplus = len(filter_1) - len(filter_0)

# A fixed random_state keeps the same rows on every run, so the downstream
# metrics are reproducible after Restart & Run All.
if surplus > 0:
    filter_1 = filter_1.drop(filter_1.sample(n=surplus, random_state=42).index)
elif surplus < 0:
    filter_0 = filter_0.drop(filter_0.sample(n=-surplus, random_state=42).index)

filter_1 = filter_1.reset_index(drop=True)

# ignore_index=True gives the balanced frame a fresh 0..n-1 index; otherwise
# the two halves could carry overlapping index labels.
df = pd.concat([filter_0, filter_1], ignore_index=True)

# Show the class counts to confirm the classes are now balanced.
df['Engine Condition'].value_counts()

In [71]:
# Separate the target column from the features (every other column).
y = df['Engine Condition']
X = df.drop(columns=['Engine Condition'])

In [72]:
# Hold out half of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

# Data Analysis

In [121]:
def hist_plot(engine_health):
    """Plot one histogram per numeric column for a single engine condition.

    Reads the notebook-level ``df`` and keeps only the rows whose
    'Engine Condition' equals ``engine_health``.

    Parameters
    ----------
    engine_health
        Value compared against the 'Engine Condition' column
        (this notebook uses 0 and 1).

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    filtered_df = df[df['Engine Condition'] == engine_health]
    numeric_columns = filtered_df.select_dtypes(include=['float64', 'int64']).columns

    num_columns = len(numeric_columns)
    # squeeze=False always returns a 2-D array of axes, even for a single
    # column, so no special case is needed before iterating.
    fig, axes = plt.subplots(
        1, num_columns, figsize=(6 * num_columns, 10), squeeze=False
    )

    # One histogram per numeric column, laid out side by side.
    for ax, col in zip(axes[0], numeric_columns):
        filtered_df[col].hist(ax=ax, bins=15, edgecolor='black')
        # Title reports the condition actually plotted (it used to be hard-coded to 1).
        ax.set_title(f'{col} Histogram (Condition={engine_health})')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()



In [None]:
def box_plot(engine_health):
    """Draw one box plot per numeric column for a single engine condition.

    Reads the notebook-level ``df`` and keeps only the rows whose
    'Engine Condition' equals ``engine_health``.

    Parameters
    ----------
    engine_health
        Value compared against the 'Engine Condition' column
        (this notebook uses 0 and 1).

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    filtered_df = df[df['Engine Condition'] == engine_health]
    numeric_columns = filtered_df.select_dtypes(include=['float64', 'int64']).columns

    num_columns = len(numeric_columns)
    # squeeze=False always returns a 2-D array of axes, even for a single
    # column, so no special case is needed before iterating.
    fig, axes = plt.subplots(
        1, num_columns, figsize=(6 * num_columns, 10), squeeze=False
    )

    # One box plot per numeric column, laid out side by side.
    for ax, col in zip(axes[0], numeric_columns):
        filtered_df[col].plot(kind='box', ax=ax)
        # Title reports the condition actually plotted (it used to be hard-coded to 1).
        ax.set_title(f'{col} Box Plot (Condition={engine_health})')
        ax.set_xlabel(col)
        # The y-axis of a box plot is the column's value, not a frequency.
        ax.set_ylabel('Value')

    plt.tight_layout()
    plt.show()

box_plot(0)

# Scaling the Data

In [None]:
# Learn the per-column mean and standard deviation from the training split only.
scaler = StandardScaler()
scaler.fit(X=X_train)

In [80]:
# Standardize both splits with the statistics learned from the training data:
# each value has its column mean subtracted and is divided by the column std.
X_test_trans = scaler.transform(X=X_test)
X_train_trans = scaler.transform(X=X_train)

# Logistic Regression

In [None]:
# Logistic regression with default hyperparameters, fitted on the scaled training data.
log_reg = LogisticRegression()
log_reg.fit(X=X_train_trans, y=y_train)

In [None]:
# Classification report for logistic regression on the scaled test split.
print(classification_report(y_true=y_test, y_pred=log_reg.predict(X=X_test_trans)))

# KNN

In [92]:
def train_model(n_neighbors):
    """Fit a KNN classifier with ``n_neighbors`` neighbours and score it.

    The model is trained and evaluated on the standardized features
    (``X_train_trans`` / ``X_test_trans``). KNN is distance-based, and the
    final KNN model in this notebook is trained on the scaled data, so the
    sweep has to use the same representation to be meaningful.

    Parameters
    ----------
    n_neighbors : int
        Number of neighbours passed to ``KNeighborsClassifier``.

    Returns
    -------
    float
        F1 score of the predictions on the test split.
    """
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train_trans, y_train)

    # Predict on the (scaled) test set
    y_pred = knn.predict(X_test_trans)

    # Score with F1 (not accuracy)
    return f1_score(y_test, y_pred)


# Candidate neighbour counts: 1 through 99.
sizes = np.arange(1, 100)

# NOTE(review): k is compared on the test split here; picking k from these
# scores would leak test information — consider a validation split or
# cross-validation on the training data.
f1 = [train_model(i) for i in sizes]


In [None]:
# Plot the test F1 score for every candidate number of neighbours.
# The x-axis is k (n_neighbors), so it is labelled as such rather than
# as "Model Size".
fig, ax = plt.subplots()
ax.scatter(sizes, f1, marker='o')
ax.set_title('F1 Score vs Number of Neighbors (k)')
ax.set_xlabel('Number of Neighbors (k)')
ax.set_ylabel('F1 Score')
ax.grid(True)
plt.show()

In [None]:
# Final KNN model with k=3, fitted on the scaled training data.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X=X_train_trans, y=y_train)

In [None]:
# Classification report for the KNN model on the scaled test split.
print(classification_report(y_true=y_test, y_pred=knn.predict(X=X_test_trans)))

# Random Forest

In [None]:
# Random forest with a fixed seed, fitted on the scaled training data.
clf = RandomForestClassifier(random_state=0)
clf.fit(X=X_train_trans, y=y_train)

In [None]:
# Classification report for the random forest on the scaled test split.
print(classification_report(y_true=y_test, y_pred=clf.predict(X=X_test_trans)))

# SVM

In [None]:
# Support-vector classifier with gamma='auto', fitted on the scaled training data.
svc = SVC(gamma='auto')
svc.fit(X=X_train_trans, y=y_train)

In [None]:
# Classification report for the SVM on the scaled test split.
print(classification_report(y_true=y_test, y_pred=svc.predict(X=X_test_trans)))

# XGBoost

In [None]:
# XGBoost classifier with default hyperparameters, fitted on the scaled training data.
bst = XGBClassifier()
bst.fit(X=X_train_trans, y=y_train)


In [None]:
# Classification report for the XGBoost model on the scaled test split.
print(classification_report(y_true=y_test, y_pred=bst.predict(X=X_test_trans)))