In [59]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Load the datasets
train_data = pd.read_csv("serviceTrainData.csv")
test_data = pd.read_csv("serviceTestData.csv")


In [46]:
import plotly.express as px
import pandas as pd

df = pd.read_csv("serviceTrainData.csv")
fig = px.histogram(df, x=['OilQual', 'EnginePerf', 'NormMileage', 'TyreWear', 'HVACwear'])

fig.update_layout(
title='Distrubition Plot')
fig.show()

In [58]:
train_data['Service'].value_counts()

No     232
Yes     83
Name: Service, dtype: int64

In [47]:
# Train empty values
pd.DataFrame({
    'column':list((train_data.isnull().mean() * 100).index),
    'null_perc': list((train_data.isnull().mean() * 100))
}).sort_values('null_perc',ascending=False)

Unnamed: 0,column,null_perc
0,OilQual,0.0
1,EnginePerf,0.0
2,NormMileage,0.0
3,TyreWear,0.0
4,HVACwear,0.0
5,Service,0.0


In [48]:
# Test empty values
pd.DataFrame({
    'column':list((test_data.isnull().mean() * 100).index),
    'null_perc': list((test_data.isnull().mean() * 100))
}).sort_values('null_perc',ascending=False)

Unnamed: 0,column,null_perc
0,OilQual,0.0
1,EnginePerf,0.0
2,NormMileage,0.0
3,TyreWear,0.0
4,HVACwear,0.0
5,Service,0.0


In [49]:

# Preprocessing: Split into features and target variables
X_train = train_data.drop("Service", axis=1)
y_train = train_data["Service"]

X_test = test_data.drop("Service", axis=1)
y_test = test_data["Service"]



In [50]:

# Label Encoding for the target variable
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)


# Hyperparameter Tuning

In [51]:
# Hyperparameter Tuning
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

# Create a Random Forest classifier
model = RandomForestClassifier()

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best model with tuned hyperparameters
best_model = grid_search.best_estimator_


In [43]:
best_model = RandomForestClassifier(bootstrap= True,
 ccp_alpha= 0.0,
 class_weight= None,
 criterion= 'gini',
 max_depth= None,
 max_features= 'auto',
 max_leaf_nodes= None,
 max_samples= None,
 min_impurity_decrease= 0.0,
 min_impurity_split= None,
 min_samples_leaf= 1,
 min_samples_split= 2,
 min_weight_fraction_leaf= 0.0,
 n_estimators= 100,
 n_jobs= None,
 oob_score= False,
 random_state= None,
 verbose= 0,
 warm_start= False)

#grid_search = GridSearchCV(model, param_grid, cv=5)
best_model.fit(X_train, y_train)

# Get the best model with tuned hyperparameters
#best_model = grid_search.best_estimator_

RandomForestClassifier()

In [44]:

# Model Evaluation
predictions = best_model.predict(X_test)
accuracy = best_model.score(X_test, y_test)
report = classification_report(y_test, predictions)

print("Best Model:")
print(best_model)
print("\nAccuracy: {:.2f}%".format(accuracy * 100))
print("\nClassification Report:")
print(report)

Best Model:
RandomForestClassifier()

Accuracy: 100.00%

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        99
           1       1.00      1.00      1.00        36

    accuracy                           1.00       135
   macro avg       1.00      1.00      1.00       135
weighted avg       1.00      1.00      1.00       135



In [36]:
valid_df = pd.DataFrame(test_data.drop('Service',axis=1).iloc[7]).T

In [66]:
best_model.predict(valid_df)[0]

1

# Decision Tree Classifier

In [61]:
# Hyperparameter Tuning
param_grid = {
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

# Create a Random Forest classifier
model = DecisionTreeClassifier()

# Perform grid search to find the best hyperparameters
grid_search_dt = GridSearchCV(model, param_grid, cv=5)
grid_search_dt.fit(X_train, y_train)

# Get the best model with tuned hyperparameters
dt_best_model = grid_search.best_estimator_


In [62]:
dt_best_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [63]:

# Model Evaluation
predictions = dt_best_model.predict(X_test)
accuracy = dt_best_model.score(X_test, y_test)
report = classification_report(y_test, predictions)

print("Best Model:")
print(best_model)
print("\nAccuracy: {:.2f}%".format(accuracy * 100))
print("\nClassification Report:")
print(report)

Best Model:
RandomForestClassifier()

Accuracy: 100.00%

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        99
           1       1.00      1.00      1.00        36

    accuracy                           1.00       135
   macro avg       1.00      1.00      1.00       135
weighted avg       1.00      1.00      1.00       135



# Logistic Regression

In [67]:
from sklearn.linear_model import LogisticRegression

In [69]:
lr_best_model = LogisticRegression()

lr_best_model.fit(X_train, y_train)


LogisticRegression()

In [70]:

# Model Evaluation
predictions = lr_best_model.predict(X_test)
accuracy = lr_best_model.score(X_test, y_test)
report = classification_report(y_test, predictions)

print("Best Model:")
print(lr_best_model)
print("\nAccuracy: {:.2f}%".format(accuracy * 100))
print("\nClassification Report:")
print(report)

Best Model:
LogisticRegression()

Accuracy: 91.11%

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94        99
           1       0.85      0.81      0.83        36

    accuracy                           0.91       135
   macro avg       0.89      0.88      0.88       135
weighted avg       0.91      0.91      0.91       135



In [73]:
report = classification_report(y_test, predictions, output_dict=True)
df = pd.DataFrame(report).transpose()

In [74]:
df

Unnamed: 0,precision,recall,f1-score,support
0,0.930693,0.949495,0.94,99.0
1,0.852941,0.805556,0.828571,36.0
accuracy,0.911111,0.911111,0.911111,0.911111
macro avg,0.891817,0.877525,0.884286,135.0
weighted avg,0.909959,0.911111,0.910286,135.0
