In [1]:
# !pip install ydata_profiling

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt

# from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline

# Classification task

## Gather and preprocess your regression dataset.

In [3]:
# Using dataset given to you in previous self practice task.
# Loading Data from a JSON File
with open('/content/drive/MyDrive/Colab Notebooks/ML/Lab 4/order_cancellation_data.json', 'r') as file:
    data = pd.read_json(file)

In [4]:
# report = ProfileReport(data)
# report.to_file('data_profile_report.html')

In [5]:
# Replacing the categorical feature in the 'order_status' column.
data['order_status'] = data['order_status'].replace({'F':1, 'C':0})

In [6]:
# Sorting the data by date and dropping this column.
data.sort_values(by='order_create_time', inplace=True)
data.drop('order_create_time', axis=1, inplace=True)

In [7]:
# Dropping rows with zero values in the "total_order_items" column.
data = data[data['total_order_items'] > 0]
# Removing outliers (dropping all values greater than the 95th percentile).
data = data[data['total_order_items'] <= 8]

In [8]:
# I'm filling the missing (NaN) values in the "cost(USD)" column with the mean values.
data['cost(USD)'].fillna(data['cost(USD)'].mean(), inplace=True)
# I'm dropping rows with values less than 8 in the "cost(USD)" column.
data = data[data['cost(USD)'] >= 8]
# I'm removing outliers by dropping all values that are greater than the 95th percentile in the "cost(USD)" column.
data = data[data['cost(USD)'] <= 21.32]

In [9]:
# I'm dropping rows with zero values in the "payment_type" column.
data['payment_type'].dropna(inplace=True)

# We will convert categorical features into numerical ones using one-hot encoding.
data = pd.get_dummies(data, columns=['payment_type'])

In [10]:
# Filling in missing (NaN) values in the "vendor_client_distance" column with the mean values.
data['vendor_client_distance'].fillna(data['vendor_client_distance'].mean(), inplace=True)
# Removing outliers by dropping all values greater than the 95th percentile in the "vendor_client_distance" column.
data = data[data['vendor_client_distance'] <= 9818]

In [11]:
# Filling missing (NaN) values in the "estimated_delivery_time" column with the mean values.
data['estimated_delivery_time'].fillna(data['estimated_delivery_time'].mean(), inplace=True)
# Removing outliers by dropping all rows with values greater than the 95th percentile in the "estimated_delivery_time" column.
data = data[data['estimated_delivery_time'] <= 102]

In [12]:
# Filling missing (NaN) values in the "predicted_order_preparation_time" column with the mean values.
data['predicted_order_preparation_time'].fillna(data['predicted_order_preparation_time'].mean(), inplace=True)
# Removing outliers by dropping all rows with values greater than the 95th percentile in the "predicted_order_preparation_time" column.
data = data[data['predicted_order_preparation_time'] <= 31]

In [13]:
data.drop(columns=['vendor_id'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=['vendor_id'], axis=1, inplace=True)


## Normalize or standardize your data (optional) and split it into training and testing sets.

In [14]:
# We split the data into features and the target variable.
X = data.drop('order_status', axis=1)
y = data[['order_status']]

In [15]:
# Divide the data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, shuffle=False)

In [16]:
# # Data balancing
# smote = SMOTE()
# X_train_normalized_resampled, X_test_normalized_resampled = smote.fit_resample(X_train_normalized, X_test_normalized)

In [17]:
# # Scaling the data
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train_resampled)
# X_test = scaler.transform(X_test)

In [18]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## <center>Self-practice <center>

Using Dataset from assignment 1
1. Define, train and evaluate an ANN for Regression and Classification
1. Plot the loss and accuracy of the model for each training iteration
    
ANN should be implemented in PyTorch

## Define the architecture of your ANN for classification, including the number of layers, units in each layer, and activation functions.

In [19]:
import tensorflow as tf
from tensorflow import keras

num_rows, num_features = X_train.shape
num_classes = 2

model = keras.Sequential([
    keras.layers.Input(shape=(num_features,)),  # Входной слой, num_features - количество признаков
    keras.layers.Dense(64, activation='relu'),  # Первый скрытый слой с 64 нейронами и функцией активации ReLU
    keras.layers.Dense(32, activation='relu'),  # Второй скрытый слой с 32 нейронами и функцией активации ReLU
    keras.layers.Dense(num_classes, activation='softmax')  # Выходной слой с функцией активации softmax для классификации
])

## Compile the Model:

*   Choose an appropriate loss function for classification, such as categorical cross-entropy.
*   Select an optimizer and specify evaluation metrics like accuracy.


In [20]:
model.compile(loss='categorical_crossentropy',  # Функция потерь для многоклассовой классификации
              optimizer='adam',  # Выбор оптимизатора, например, Adam
              metrics=['accuracy'])  # Метрика для оценки производительности, например, точность

## Training:

Train the model on your training data.

*   Train the model on your training data.
*   Monitor training progress and make adjustments as necessary to minimize the loss.

In [21]:
epochs = 10
batch_size = 16

In [22]:
X_train

array([[0.28571429, 0.28571429, 0.44594595, ..., 0.53846154, 1.        ,
        0.        ],
       [1.        , 1.        , 0.18768769, ..., 0.61538462, 1.        ,
        0.        ],
       [0.71428571, 0.71428571, 0.42117117, ..., 0.46153846, 0.        ,
        1.        ],
       ...,
       [0.42857143, 0.42857143, 0.17492492, ..., 0.88461538, 1.        ,
        0.        ],
       [0.42857143, 0.42857143, 0.32282282, ..., 0.88461538, 0.        ,
        1.        ],
       [0.28571429, 0.28571429, 0.26126126, ..., 0.92307692, 0.        ,
        1.        ]])

In [23]:
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)

Epoch 1/10


ValueError: ignored

## Evaluation:

*   Use the trained model to make predictions on your test data.
*   Calculate accuracy, precision, recall, F1-score, and confusion matrices to evaluate classification performance.

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Получить прогнозы модели на тестовом наборе данных
y_pred = model.predict(X_test)

# Преобразовать вероятностные прогнозы в бинарные классы (0 или 1) с использованием порога (например, 0.5)
y_pred_binary = (y_pred > 0.5).astype(int)

# Вычислить метрики производительности
accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)
recall = recall_score(y_test, y_pred_binary)
f1 = f1_score(y_test, y_pred_binary)

# Вывести результаты
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

# Построить матрицу ошибок
confusion = confusion_matrix(y_test, y_pred_binary)
print('Confusion Matrix:\n', confusion)