# 🤾‍♂️ Consum Caloric (Caloric Consumption)

In [1]:
import csv
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

## Load data

In [2]:
# Read the labelled training set and the test set from the working directory,
# then display the training frame.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train_data.csv", "test_data.csv"))
df_train

Unnamed: 0,User_ID,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Gender,Calories
0,14498311,79,165,73,13,79,40.0,male,54
1,16700865,28,148,48,6,91,39.4,female,28
2,17197650,27,187,82,25,101,40.8,male,128
3,16400540,40,173,71,16,93,40.4,male,69
4,15539631,40,183,90,5,80,39.0,male,14
...,...,...,...,...,...,...,...,...,...
8995,11236706,38,181,82,21,103,40.5,male,125
8996,15526188,63,172,70,3,85,38.6,female,13
8997,13474684,30,168,70,28,103,41.1,female,160
8998,13764532,45,164,61,5,84,39.1,female,20


In [3]:
# Display the test frame; its Subtask column is used further down to select
# the rows belonging to Subtask 5 and Subtask 6.
df_test

Unnamed: 0,Subtask,User_ID,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Gender
0,5,12618012,67,185,91,17,97,40.4,male
1,5,19091177,34,189,87,23,98,40.5,male
2,5,14793975,26,167,66,20,94,40.1,female
3,5,17338539,78,175,73,21,107,40.6,female
4,5,16351902,50,187,81,4,82,39.2,female
...,...,...,...,...,...,...,...,...,...
2495,6,12838170,27,207,100,8,88,39.7,
2496,6,19782827,28,192,86,26,108,40.8,
2497,6,19941237,20,177,79,5,84,38.9,
2498,6,18089588,28,193,96,11,85,40.1,


## Subtask 1

In [4]:
# Subtask 1 answer: number of rows in the training set.
num_rows_train = df_train.shape[0]
num_rows_train

9000

In [5]:
# Submission row layout: (subtaskID, datapointID, answer).
subtask1_rows = [(1, 1, num_rows_train)]

## Subtask 2

In [6]:
# Subtask 2 answer: number of training rows whose Gender is "male",
# converted to a plain Python int.
is_male = df_train["Gender"].eq("male")
num_males = int(is_male.sum())
num_males

4443

In [7]:
# Submission row layout: (subtaskID, datapointID, answer).
subtask2_rows = [(2, 1, num_males)]

## Subtask 3

In [8]:
# Subtask 3 answer: mean of the Duration column over the training set,
# converted to a plain Python float.
avg_duration = float(df_train["Duration"].mean())
avg_duration

15.510666666666667

In [9]:
# Submission row layout: (subtaskID, datapointID, answer).
subtask3_rows = [(3, 1, avg_duration)]

## Subtask 4

In [10]:
# Subtask 4 answer: number of training rows with Age >= 75 (inclusive),
# converted to a plain Python int.
is_75_or_older = df_train["Age"].ge(75)
num_ge_75_yo = int(is_75_or_older.sum())
num_ge_75_yo

412

In [11]:
# Submission row layout: (subtaskID, datapointID, answer).
subtask4_rows = [(4, 1, num_ge_75_yo)]

## Model training

In [12]:
# Split the training frame into the regression target (Calories) and the
# feature matrix. User_ID is excluded from the features; drop() returns a
# new frame, so df_train itself is left unmodified.
y_train = df_train["Calories"]
X_train = df_train.drop(columns=["User_ID", "Calories"])

In [13]:
# Check the column dtypes to see which features need encoding before modelling.
X_train.dtypes

Age             int64
Height          int64
Weight          int64
Duration        int64
Heart_Rate      int64
Body_Temp     float64
Gender         object
dtype: object

In [14]:
# List the distinct Gender values present in the training features.
X_train["Gender"].unique()

array(['male', 'female'], dtype=object)

In [15]:
# Encode Gender as a boolean feature: True for "male", False otherwise.
# The values are derived from the untouched df_train column rather than from
# X_train itself so that re-running this cell is safe; re-encoding an already
# boolean X_train["Gender"] would compare booleans with "male" and silently
# turn every value into False. X_train shares df_train's index, so the
# assignment aligns row by row.
X_train["Gender"] = df_train["Gender"] == "male"

In [16]:
# CatBoost regressor optimising MAE, with a fixed seed and silenced training log.
catboost_reg = CatBoostRegressor(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    loss_function="MAE",
    verbose=0,
    random_state=42,
)

# Two-step pipeline: standardise the features ("pre"), then regress ("reg").
pipeline = Pipeline(steps=[("pre", StandardScaler()), ("reg", catboost_reg)])

In [17]:
# 5-fold cross-validated MAE. The scorer reports negated MAE, so the sign is
# flipped to show positive errors (one value per fold).
-cross_val_score(
    pipeline,
    X_train,
    y_train,
    cv=5,
    scoring="neg_mean_absolute_error",
)

array([0.76192223, 0.79804302, 0.76956166, 0.7829987 , 0.7837851 ])

In [18]:
# Fit the pipeline on the full training set; the fitted pipeline is used
# below to predict the Subtask 5 and Subtask 6 rows.
pipeline.fit(X_train, y_train)

## Subtask 5

In [19]:
# Features for Subtask 5: select its rows, drop the non-feature columns and
# encode Gender the same way as in training (True for "male", False otherwise).
is_subtask5 = df_test["Subtask"] == 5
X_test5 = df_test.loc[is_subtask5].drop(columns=["Subtask", "User_ID"])
X_test5["Gender"] = X_test5["Gender"].eq("male").astype(bool)

In [20]:
# Predict the target for the Subtask 5 rows with the fitted pipeline.
preds5 = pipeline.predict(X_test5)

In [21]:
# One submission row per Subtask 5 datapoint: (subtaskID, User_ID, prediction).
# User_IDs and predictions are paired positionally, in df_test row order.
user_ids5 = df_test.loc[df_test["Subtask"] == 5, "User_ID"]
subtask5_rows = [(5, user_id, pred) for user_id, pred in zip(user_ids5, preds5)]

## Subtask 6

In [22]:
# Features for Subtask 6: select its rows and drop the non-feature columns.
is_subtask6 = df_test["Subtask"] == 6
X_test6 = df_test.loc[is_subtask6].drop(columns=["Subtask", "User_ID"])

In [23]:
# Summary statistics of the numeric Subtask 6 features.
X_test6.describe()

Unnamed: 0,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
count,500.0,500.0,500.0,500.0,500.0,500.0
mean,27.438,183.802,84.722,15.43,95.582,40.0272
std,4.983154,10.042135,9.963697,8.23071,9.521586,0.762353
min,20.0,155.0,60.0,1.0,73.0,37.4
25%,23.0,177.0,78.0,8.0,88.0,39.6
50%,27.0,183.5,84.0,16.0,96.0,40.2
75%,32.0,191.0,92.0,22.0,103.0,40.6
max,36.0,208.0,110.0,30.0,118.0,41.2


In [24]:
# Per column: does the Subtask 6 feature frame contain any missing value?
X_test6.isna().any()

Age           False
Height        False
Weight        False
Duration      False
Heart_Rate    False
Body_Temp     False
Gender         True
dtype: bool

In [25]:
# Inspect the Gender column of the Subtask 6 rows.
X_test6["Gender"]

2000    NaN
2001    NaN
2002    NaN
2003    NaN
2004    NaN
       ... 
2495    NaN
2496    NaN
2497    NaN
2498    NaN
2499    NaN
Name: Gender, Length: 500, dtype: object

In [26]:
# Gender is missing for the Subtask 6 rows. Encode every row as male (True),
# on the assumption that this is a male handball team.
X_test6 = X_test6.assign(Gender=True)

In [27]:
# Predict the target for the Subtask 6 rows with the fitted pipeline.
preds6 = pipeline.predict(X_test6)

In [28]:
# One submission row per Subtask 6 datapoint: (subtaskID, User_ID, prediction).
# User_IDs and predictions are paired positionally, in df_test row order.
user_ids6 = df_test.loc[df_test["Subtask"] == 6, "User_ID"]
subtask6_rows = [(6, user_id, pred) for user_id, pred in zip(user_ids6, preds6)]

## Save answers

In [29]:
# Assemble the submission: header row first, then the rows of each subtask
# in ascending subtask order.
submission_header = ("subtaskID", "datapointID", "answer")
submission_rows = [
    submission_header,
    *subtask1_rows,
    *subtask2_rows,
    *subtask3_rows,
    *subtask4_rows,
    *subtask5_rows,
    *subtask6_rows,
]

# Written with the csv module on purpose: the judge has a weird setup,
# so don't use df.to_csv here.
with open("submission.csv", mode="w", newline="") as submission_file:
    csv.writer(submission_file).writerows(submission_rows)

## Submission results

Subtask 1:
- Equality: 1
- Score: 3/3

Subtask 2:
- Equality: 1
- Score: 4/4

Subtask 3:
- Equality: 1
- Score: 6/6

Subtask 4:
- Equality: 1
- Score: 7/7

Subtask 5:
- MAE: 0.8435
- Score: 60/60

Subtask 6:
- MAE: 0.908
- Score: 20/20