# Livrare Pachete (Package Delivery)

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## Load data

In [2]:
# Read the train and test splits from CSV files in the working directory.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ("train_data.csv", "test_data.csv")
)
# Display the training frame as the cell output.
df_train

Unnamed: 0,id,distance_km,package_weight_kg,traffic_level,on_time
0,74,407.92,5.62,2,1
1,19,216.54,56.35,1,1
2,119,597.06,102.39,13,0
3,79,535.19,141.95,9,0
4,77,589.68,107.75,12,0
...,...,...,...,...,...
95,90,550.09,142.10,12,0
96,9,300.96,6.82,7,1
97,14,106.96,88.78,5,1
98,120,574.54,128.30,13,0


## Subtask 1

In [3]:
# Subtask 1: mean of traffic_level over the test split, as a plain Python float.
mean_traffic_level = float(df_test["traffic_level"].mean())
mean_traffic_level

8.02

In [4]:
# Single submission row laid out as (subtaskID, datapointID, answer).
subtask1_rows = [
    (1, 1, mean_traffic_level),
]

## Subtask 2

In [5]:
# Subtask 2: sample standard deviation of traffic_level over the test split.
# ddof=1 is the pandas default; it is spelled out here to make the choice visible.
std_traffic_level = df_test["traffic_level"].std(ddof=1).item()
std_traffic_level

3.559924328478036

In [6]:
# Single submission row laid out as (subtaskID, datapointID, answer).
subtask2_rows = [
    (2, 1, std_traffic_level),
]

## Subtask 3

In [7]:
# Target column predicted in subtask 3.
ans_col = "on_time"
# Column treated as categorical (one-hot encoded downstream).
cat_cols = ["traffic_level"]
# Columns treated as continuous (standardised downstream).
num_cols = ["distance_km", "package_weight_kg"]

In [8]:
# Feature matrices keep numeric columns first, then categorical ones;
# the target is only taken from the training frame.
X_train = df_train[num_cols + cat_cols]
y_train = df_train[ans_col]
X_test = df_test[num_cols + cat_cols]

In [9]:
# Standardise the numeric features and one-hot encode the categorical ones.
# Both transformers are fitted on the training split only and then re-applied
# to the test split, so the test data never influences the fitted statistics.
scaler = StandardScaler()
# handle_unknown="ignore": a category value that occurs only in the test split
# is encoded as all zeros (the same encoding as the dropped first category;
# scikit-learn emits a warning) instead of making transform() raise.
# When every test category was seen during fit, the output is unchanged.
encoder = OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False)

# Column order in the stacked arrays: scaled numeric columns, then one-hot columns.
X_train_scaled = np.hstack([
    scaler.fit_transform(X_train[num_cols]),
    encoder.fit_transform(X_train[cat_cols]),
])
X_test_scaled = np.hstack([
    scaler.transform(X_test[num_cols]),
    encoder.transform(X_test[cat_cols]),
])

In [10]:
# Logistic regression with library-default hyperparameters, fitted on the
# preprocessed training features. The fit call stays as the last expression
# so the cell still displays the fitted estimator.
lr_clf = LogisticRegression()
lr_clf.fit(X=X_train_scaled, y=y_train)

In [11]:
# Predicted on_time labels for the test split, one per row of X_test_scaled.
preds = lr_clf.predict(X=X_test_scaled)
preds

array([1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 0])

In [12]:
# One submission row per test datapoint: (subtaskID=3, datapoint id, prediction).
subtask3_rows = [
    (3, datapoint_id, label)
    for datapoint_id, label in zip(df_test["id"], preds)
]

## Save answers

In [13]:
# Concatenate the per-subtask rows in subtask order and write them out as a
# CSV without the DataFrame index.
submission_rows = [*subtask1_rows, *subtask2_rows, *subtask3_rows]
df_submission = pd.DataFrame(
    submission_rows,
    columns=["subtaskID", "datapointID", "answer"],
)
df_submission.to_csv("submission.csv", index=False)

## Submission results

Subtask 1:
- Equal: 1
- Score: 20/20

Subtask 2:
- Equal: 1
- Score: 20/20

Subtask 3:
- Accuracy: 0.98
- Score: 60/60