In [None]:
import UTILS
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import shap
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor 
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, mean_squared_error
import itertools
import functools
import re

<h2>Data Reading</h2>

In [None]:
# Load the dataset from a CSV file resolved relative to the notebook's working
# directory, then preview the first rows.
data=pd.read_csv("megastore-classification-dataset.csv")
data.head()

In [None]:
# Replace each raw date column with two calendar features (year and quarter).
# Each tuple is (source date column, year column, quarter column).
for date_col, year_col, quarter_col in (
    ("Order Date", "Order year", "Order quarter"),
    ("Ship Date", "Ship year", "Ship quarter"),
):
    parsed_dates = pd.to_datetime(data[date_col])
    data[year_col] = parsed_dates.dt.year
    data[quarter_col] = parsed_dates.dt.quarter
    data = data.drop(columns=date_col)

# Split the CategoryTree text into its two components with a regex; the two
# capture groups are assigned positionally to the new columns. Rows that do not
# match the pattern get NaN in both columns (pandas `str.extract` behaviour).
category_pattern = "'MainCategory': '(?P<Main_Category>[^']*)', 'SubCategory': '(?P<Sub_Category>[^']*)'"
data[["Main Category", "Sub Category"]] = data["CategoryTree"].str.extract(category_pattern)
data = data.drop(columns=["CategoryTree"])

<h2>Handling Nulls</h2>

In [None]:
# Count missing values per column.
data.isna().sum()

In [None]:
# Remove the columns that the rest of the notebook does not use. This includes
# the "Ship year" / "Ship quarter" features derived from "Ship Date" earlier.
data = data.drop(
    columns=[
        "Row ID",
        "Order ID",
        "Customer ID",
        "City",
        "Postal Code",
        "Ship year",
        "Ship quarter",
        "Product Name",
        "Country",
        "Segment",
    ]
)


In [None]:
# List the columns that remain after the drop.
data.columns

In [None]:
# Inspect the distinct target labels before mapping them to integers.
data["ReturnCategory"].unique()

In [None]:
# Integer codes for the target labels, ordered from worst ("High Loss" = 0)
# to best ("High Profit" = 4).
transform_target = {
    "High Loss": 0,
    "Low Loss": 1,
    "Low Profit": 2,
    "Medium Profit": 3,
    "High Profit": 4,
}
data["ReturnCategoryNominal"] = data["ReturnCategory"].map(transform_target)

# Series.map leaves NaN for any label that is not a key of the dict (including
# missing labels). Fail here with a clear message instead of letting NaN
# targets reach the encoders and classifiers further down.
unmapped_labels = data.loc[data["ReturnCategoryNominal"].isna(), "ReturnCategory"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"ReturnCategory values without an integer code: {unmapped_labels.tolist()}"
    )

In [None]:
# Preview the frame with the new ReturnCategoryNominal column.
data.head()

In [None]:
# Encode "State" through a mapping that UTILS.transform_ordinal_means builds
# from the "State" column and the integer-coded target.
# NOTE(review): the mapping is computed from ReturnCategoryNominal over the
# whole frame, before the train/test split further down, so test-row targets
# may influence this feature (target leakage) — confirm against UTILS and
# consider fitting the encoding on the training rows only.
data["state_mean"]=data["State"].map(UTILS.transform_ordinal_means(data,"State","ReturnCategoryNominal"))

In [None]:
# Preview the frame with the new state_mean column.
data.head()

In [None]:
# Correlation (pandas default: Pearson) between the encoded state feature and
# the integer-coded target.
data["state_mean"].corr(data["ReturnCategoryNominal"])

In [None]:
# Encode "Sub Category" through a mapping that UTILS.transform_ordinal builds
# from the "Sub Category" column and the integer-coded target.
# NOTE(review): as with state_mean, the mapping is derived from the target over
# the whole frame before the train/test split further down — possible target
# leakage; confirm against UTILS. Also note this uses transform_ordinal while
# the State encoding uses transform_ordinal_means — verify that is intended.
data["sub_cat_encoded"]=data["Sub Category"].map(UTILS.transform_ordinal(data,"Sub Category","ReturnCategoryNominal"))

In [None]:
# Correlation (pandas default: Pearson) between the encoded sub-category
# feature and the integer-coded target.
data["sub_cat_encoded"].corr(data["ReturnCategoryNominal"])

In [None]:
# Preview the frame with the new sub_cat_encoded column.
data.head()

In [None]:
# Pass the frame through UTILS.remove_outliers for the "Sales" column with
# threshold=2.5 and rebind `data` to the result.
# NOTE(review): the meaning of the threshold (presumably a z-score cut-off) and
# whether the returned frame keeps the original index are defined in UTILS —
# confirm. The filter runs on the whole frame, before the train/test split.
data=UTILS.remove_outliers(data,["Sales"],threshold=2.5)

In [None]:
# Start the modelling frame from `data` without the raw columns listed below.
# "State" and "Sub Category" are already represented by the state_mean and
# sub_cat_encoded columns added earlier.
df_iter = data.drop(
    columns=[
        "Ship Mode",
        "Customer Name",
        "State",
        "Product ID",
        "Sub Category",
    ]
)

In [None]:
# Append, column-wise, whatever UTILS.one_hot_encode_columns returns for
# "Region" and "Main Category".
# NOTE(review): pd.concat(axis=1) aligns rows on the index; this presumably
# relies on the helper returning a frame with the same index as df_iter —
# confirm, otherwise misaligned rows would be filled with NaN.
# NOTE(review): re-running this cell appends the one-hot columns again.
df_iter=pd.concat([df_iter,UTILS.one_hot_encode_columns(["Region","Main Category"],df_iter)],axis=1)

In [None]:
# Display the modelling frame after the one-hot columns were appended.
df_iter

In [None]:
# "Main Category" and "Region" were one-hot encoded in the previous step and
# "ReturnCategory" is represented by ReturnCategoryNominal, so drop the raw
# text columns.
df_iter = df_iter.drop(columns=["Main Category", "Region", "ReturnCategory"])

In [None]:
# Preview the modelling frame after the raw text columns were dropped.
df_iter.head()

In [None]:
# Target vector: the integer-coded return category.
y = df_iter["ReturnCategoryNominal"]

# Feature matrix: every remaining column except the target and "Order year",
# passed through UTILS.normalize_feature. The helper's second return value is
# kept as `scaler`.
# NOTE(review): normalisation is applied to the full frame before the
# train/test split — presumably the scaler is fitted here; confirm in UTILS.
X, scaler = UTILS.normalize_feature(
    df_iter.drop(columns=["ReturnCategoryNominal", "Order year"])
)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)


In [None]:
# Preview the training features.
X_train.head()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline model: logistic regression with the lbfgs solver and an iteration
# cap of 50000.
lgreg = LogisticRegression(max_iter=50000, solver="lbfgs")
lgreg.fit(X_train, y_train)
y_pred = lgreg.predict(X_test)

# accuracy_score expects (y_true, y_pred). Accuracy itself is symmetric, but
# the documented order is used here so this cell matches the later model cells
# and stays correct if it is swapped for an asymmetric metric.
accuracy_score(y_test, y_pred)

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Support-vector classifier whose hyperparameters are chosen by grid search.
classifier = SVC()

# Search space: 3 values of C x 3 kernels x 2 gamma settings.
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto'],
}

# 5-fold cross-validated search, fitted on the training split only.
grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Take the best estimator found by the search and evaluate it on the
# held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report: test accuracy, the winning hyperparameters, per-class metrics.
print("Accuracy score:", score)
print("Best parameters:", grid_search.best_params_)
print(
    "Classification Report:",
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Decision tree tuned by grid search. random_state is pinned (same seed as the
# train/test split) because the tree randomly permutes candidate features at
# each split, so equally good splits could otherwise be resolved differently
# from run to run and change the reported scores and best parameters.
classifier = DecisionTreeClassifier(random_state=42)

# Search space: 2 criteria x 4 depth limits x 3 split minimums x 3 leaf minimums.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# 5-fold cross-validated search, fitted on the training split only.
grid_search = GridSearchCV(classifier, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Evaluate the best estimator found by the search on the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
score = accuracy_score(y_test, y_pred)

print("Accuracy score:", score)
print("Best parameters:", grid_search.best_params_)
print(classification_report(y_test, y_pred))