In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the competition's train and test splits, one DataFrame per CSV file.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s3e17/train.csv',
        '/kaggle/input/playground-series-s3e17/test.csv',
    )
)


In [None]:
# Overview of the training frame: column dtypes, non-null counts, memory use.
train_df.info()

In [None]:
# Summary statistics of the training frame's numeric columns.
train_df.describe()

In [None]:
# Peek at the first rows of the training frame.
train_df.head()

In [None]:
# Count missing values per column in the training frame.
train_df.isna().sum()

In [None]:
# Sensor columns whose headers end with a bracketed unit suffix, e.g. ' [K]'.
original_columns = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
]

# Map every header to its unit-free form: the text before the first ' ['.
new_columns = dict(
    (header, header.partition(' [')[0]) for header in original_columns
)

# Rename both splits with the same mapping so their column names stay aligned.
train_df = train_df.rename(columns=new_columns)
test_df = test_df.rename(columns=new_columns)

print(test_df.columns)


In [None]:
# Count fully duplicated rows in the training frame.
train_df.duplicated().sum()

In [None]:
# Count missing values per column in the test frame.
test_df.isna().sum()

In [None]:
# Count fully duplicated rows in the test frame.
test_df.duplicated().sum()

In [None]:
# List the distinct 'Type' categories of each split (train first, then test)
# so the encoding step can be checked against both.
print(train_df['Type'].unique(), test_df['Type'].unique(), sep='\n')

In [None]:
# Integer codes for the 'Type' categories (same values the per-letter
# replacements assigned: L -> 0, M -> 1, H -> 2).
TYPE_CODES = {'L': 0, 'M': 1, 'H': 2}


def encode_type(type_column):
    """Return the 'Type' column as int64 codes.

    Parameters
    ----------
    type_column : pandas.Series
        Column holding the letters 'L', 'M', 'H' (or codes already produced
        by a previous run of this cell).

    Returns
    -------
    pandas.Series
        The same column with letters replaced by their integer code and an
        explicit int64 dtype.

    Raises
    ------
    ValueError
        If the column contains a value that is neither a known letter nor an
        integer, so an unexpected category cannot slip into the features.
    """
    # Values without a code (e.g. integers from an earlier run) pass through
    # unchanged, which keeps the cell safe to re-run. The explicit cast pins
    # the dtype instead of relying on pandas' silent downcast in `replace`.
    return type_column.map(lambda tier: TYPE_CODES.get(tier, tier)).astype('int64')


train_df["Type"] = encode_type(train_df["Type"])
test_df["Type"] = encode_type(test_df["Type"])

In [None]:
# NOTE: disabled experiment, kept for reference only. It label-encoded 'Type'
# and 'Product ID'; the notebook instead maps 'Type' by hand and drops
# 'Product ID'. It is stored as comments rather than a bare string literal:
# the original literal ended with six quotes, which closed the string and
# immediately opened a second, unterminated one (a SyntaxError for the cell).
#
# # Creating a LabelEncoder instance
# label_encoder = LabelEncoder()
# train_df['Type'] = label_encoder.fit_transform(train_df['Type'])
# test_df['Type'] = label_encoder.transform(test_df['Type'])
#
# # Encoding Product ID
# train_df['Product ID'] = label_encoder.fit_transform(train_df['Product ID'])
#
# # Handle unseen labels in the test set
# test_df['Product ID'] = test_df['Product ID'].map(
#     lambda s: label_encoder.transform([s])[0] if s in label_encoder.classes_ else -1
# )


In [None]:
# Remove the 'id' column from the modelling frames. test_df itself keeps
# 'id', which the submission cell reads later.
train_df = train_df.drop(columns=['id'])
X_test = test_df.drop(columns=['id'])

In [None]:
# Remove the 'Product ID' column from both feature frames.
train_df = train_df.drop(columns=['Product ID'])
X_test = X_test.drop(columns=['Product ID'])

In [None]:
# Separate the label from the predictors: 'Machine failure' is the target,
# every other remaining column is a feature.
y_train = train_df['Machine failure']
X_train = train_df.drop(columns=['Machine failure'])

In [None]:
# Disabled alternative: standardisation with StandardScaler. The code sits
# inside a string literal, so this cell only evaluates a string; the
# MinMaxScaler cell is the scaling step that is actually applied.
'''scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)'''

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the training data only, then rescale
# both splits with those same bounds. Both results are NumPy arrays.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Class imbalance: samples outside the positive class per positive sample
# (the label sum counts positives, assuming 0/1 labels).
positive_count = sum(y_train)
(len(y_train) - positive_count) / positive_count

In [None]:
# Cross-validated ROC-AUC for four candidate classifiers.
#
# The rest of the notebook treats 'Machine failure' as a binary target (class
# ratio, positive-class probability column), so the plain 'roc_auc' scorer is
# used; the one-vs-one variant is only meaningful for multi-class targets.
CV_FOLDS = 5
CV_SCORING = 'roc_auc'
RANDOM_STATE = 42  # fixed seed so the stochastic models score the same on every run


def report_cv_auc(label, model):
    """Cross-validate ``model`` on the training data and print its mean ROC-AUC.

    Parameters
    ----------
    label : str
        Model name used in the printed line.
    model : estimator
        Unfitted scikit-learn compatible classifier.

    Returns
    -------
    numpy.ndarray
        One ROC-AUC score per fold.
    """
    scores = cross_val_score(model, X_train, y_train, cv=CV_FOLDS, scoring=CV_SCORING)
    print(f'{label} Cross-Validation ROC-AUC: {scores.mean():.4f}')
    return scores


# Gradient-boosted trees.
xgb_model = xgb.XGBClassifier(random_state=RANDOM_STATE)
xgb_scores = report_cv_auc('XGBoost', xgb_model)

# Bagged trees; seeded because bootstrap and feature sampling are random.
rf_model = RandomForestClassifier(random_state=RANDOM_STATE)
rf_scores = report_cv_auc('Random Forest', rf_model)

# k-nearest neighbours with scikit-learn's default settings.
knn_model = KNeighborsClassifier()
knn_scores = report_cv_auc('KNN', knn_model)

# Logistic regression with class weights inversely proportional to class frequency.
logreg_model = LogisticRegression(class_weight='balanced')
logreg_scores = report_cv_auc('Logistic Regression', logreg_model)

In [None]:
# Comparison run: logistic regression WITHOUT class weighting.
# This rebinds logreg_model / logreg_scores, replacing the values from the
# class_weight='balanced' run, so the printed label says which variant it is.
# 'roc_auc' is the scorer for a binary target; the one-vs-one variant is only
# meaningful for multi-class problems.
logreg_model = LogisticRegression()
logreg_scores = cross_val_score(logreg_model, X_train, y_train, cv=5, scoring='roc_auc')
print(f'Logistic Regression (no class weighting) Cross-Validation ROC-AUC: {logreg_scores.mean():.4f}')

In [None]:
# Train an XGBoost classifier with default settings on the full training set.
xgb_model = xgb.XGBClassifier().fit(X_train, y_train)

# Class-probability matrix for the test rows: one column per class.
y_pred_proba = xgb_model.predict_proba(X_test)

In [None]:
# Keep column 1 of the probability matrix, i.e. the second class in the
# model's class order (presumably the failure class, assuming 0/1 labels;
# TODO confirm against xgb_model.classes_).
proba_class_1 = y_pred_proba[:, 1]
# Bare expression so the notebook displays the array.
proba_class_1

In [None]:
# Show how many test ids there are; the submission pairs one prediction with each.
test_df['id'].shape

In [None]:
# Build the submission table: one row per test id with its predicted
# probability, then write it without the DataFrame index.
submission_columns = {
    'id': test_df['id'],
    'Machine failure': proba_class_1,
}
df_submit = pd.DataFrame(submission_columns)
df_submit.to_csv('submission.csv', index=False)