## In this notebook

- Load images and create features.

In [1]:
import os
import datetime
from tqdm import tqdm

# analytics
import pandas as pd
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
import numpy as np

# plot
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# model
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [2]:
PROJECT_FOLDER = "/Users/strvmac/projects/competition-drivendata-tick-tick-bloom/"

FEATURES_FOLDER = f"{PROJECT_FOLDER}data/features/"
METADATA_FILEPATH = f"{PROJECT_FOLDER}data/metadata.csv"
TRAIN_LABELS_FILEPATH = f"{PROJECT_FOLDER}data/train_labels.csv"
TEST_FORMAT_FILEPATH = f"{PROJECT_FOLDER}data/submission_format.csv"
SUBMISSIONS_FOLDER = f"{PROJECT_FOLDER}data/submissions/"

## Load metadata and labels

In [3]:
%%time

df = (
    pd
    .read_csv(METADATA_FILEPATH)
    .sort_values(by=["split", "date"], ascending=[False, True])
    .reset_index(drop=True)
)

df.date = pd.to_datetime(df.date)
df.tail()

CPU times: user 23.5 ms, sys: 2.96 ms, total: 26.4 ms
Wall time: 25.8 ms


Unnamed: 0,uid,latitude,longitude,date,split
23565,howu,36.7085,-121.749,2021-12-29,test
23566,nsoi,36.7368,-121.734,2021-12-29,test
23567,prfi,36.7518,-121.742,2021-12-29,test
23568,teuu,36.7723,-121.788,2021-12-29,test
23569,thki,36.7254,-121.73,2021-12-29,test


In [4]:
# check the split distribution

(
    df
    .split
    .value_counts()
)

train    17060
test      6510
Name: split, dtype: int64

In [5]:
# load train labels

df_train_labels = (
    pd
    .read_csv(TRAIN_LABELS_FILEPATH)
)

df_train_labels.tail()

Unnamed: 0,uid,region,severity,density
17055,zzsv,south,3,113125.0
17056,zzuq,south,3,175726.0
17057,zzwo,midwest,2,48510.0
17058,zzwq,south,1,1271.0
17059,zzyb,south,1,9682.0


In [6]:
# load test labels (with placeholders)

df_test_labels = (
    pd
    .read_csv(TEST_FORMAT_FILEPATH)
)

df_test_labels.tail()

Unnamed: 0,uid,region,severity
6505,zzpn,northeast,1
6506,zzrv,west,1
6507,zzsx,south,1
6508,zzvv,west,1
6509,zzzi,midwest,1


## Load features

In [7]:
def get_features(image: np.ndarray) -> list:
    return [
        *image.mean(axis=(1, 2)).astype(float).tolist(),
        *np.median(image, axis=(1, 2)).astype(float).tolist(),
        *np.quantile(image, q=0.01, axis=(1, 2)).astype(float).tolist(),
        *np.quantile(image, q=0.05, axis=(1, 2)).astype(float).tolist(),
        *np.quantile(image, q=0.25, axis=(1, 2)).astype(float).tolist(),
        *np.quantile(image, q=0.75, axis=(1, 2)).astype(float).tolist(),
        *np.quantile(image, q=0.99, axis=(1, 2)).astype(float).tolist(),
        *image.min(axis=(1, 2)).astype(float).tolist(),
        *image.max(axis=(1, 2)).astype(float).tolist(),
        *image.std(axis=(1, 2)).astype(float).tolist(),
    ]

In [8]:
# load images and compute features

features = {
    "train": {},
    "test": {},
}

for split in features.keys():

    for image_file_name in tqdm(
        os.listdir(os.path.join(FEATURES_FOLDER, f"{split}/")),
        total=len(df.loc[df.split==split])
    ):

        if not image_file_name.endswith(".npy"):
            continue

        image_file_path = os.path.join(FEATURES_FOLDER, f"{split}/{image_file_name}")

        with open(image_file_path, "rb") as image_file:
            image = np.load(image_file)

        features[split][image_file_name.split(".")[0]] = get_features(image)

        # break
        
    # break

 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 16900/17060 [00:12<00:00, 1370.01it/s]
 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 6406/6510 [00:04<00:00, 1352.85it/s]


In [9]:
def create_df_from_features(features: dict):
    features_cnt = np.max([
        len(val) for val in features.values()
    ])
    
    col_names = [
        f"feat_{i}" for i in range(1, features_cnt + 1)
    ]
    
    return (
        pd
        .DataFrame(features)
        .T
        .set_axis(col_names, axis=1, copy=False)
        .reset_index(names=["uid"])
    )

In [10]:
# create pandas DataFrames

df_train = create_df_from_features(features.get("train"))
df_test = create_df_from_features(features.get("test"))

print(df_train.shape)
print(df_test.shape)

(16900, 31)
(6406, 31)


In [11]:
# enrich train dataset by labels

df_train = (
    df_train_labels
    .merge(df_train, on="uid", how="inner")
    .reset_index(drop=True)
)

df_train.tail()

Unnamed: 0,uid,region,severity,density,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,feat_11,feat_12,feat_13,feat_14,feat_15,feat_16,feat_17,feat_18,feat_19,feat_20,feat_21,feat_22,feat_23,feat_24,feat_25,feat_26,feat_27,feat_28,feat_29,feat_30
16895,zzsv,south,3,113125.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,0.0,0.0,0.0
16896,zzuq,south,3,175726.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16897,zzwo,midwest,2,48510.0,33.177489,49.532468,30.820346,26.0,43.0,26.0,22.0,35.0,22.61,22.0,37.0,23.0,24.0,40.0,25.0,27.0,46.75,28.0,150.73,147.17,109.34,21.0,34.0,22.0,175.0,162.0,117.0,25.238664,21.997688,15.892911
16898,zzwq,south,1,1271.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16899,zzyb,south,1,9682.0,70.76644,47.14059,28.172336,71.0,47.0,28.0,54.8,36.0,19.0,59.0,39.0,22.0,67.0,43.0,26.0,75.0,51.0,31.0,86.6,61.6,38.0,34.0,21.0,9.0,96.0,70.0,44.0,6.859283,5.558855,4.058781


## Create model

In [12]:
Y_COLUMN = "severity"
X_COLUMNS = [col for col in df_train.columns if col.startswith("feat_")]

In [13]:
# split data to X and y

df_train, df_val = train_test_split(df_train, test_size=0.05, random_state=42)

X_train = df_train.loc[:, X_COLUMNS]
y_train = df_train.loc[:, Y_COLUMN]
X_val = df_val.loc[:, X_COLUMNS]
y_val = df_val.loc[:, Y_COLUMN]

print(f"{X_train.shape=}")
print(f"{y_train.shape=}")
print(f"{X_val.shape=}")
print(f"{y_val.shape=}")

X_train.shape=(16055, 30)
y_train.shape=(16055,)
X_val.shape=(845, 30)
y_val.shape=(845,)


In [14]:
%%time

# define numerical transformers

transformers_numerical = Pipeline(
    steps=[
        ("min_max_scaler", MinMaxScaler())
    ]
)

# define and fit pipeline

transformers = ColumnTransformer(
    transformers=[
        ("numerical", transformers_numerical, X_COLUMNS),
    ],
    remainder="passthrough"
)

pipeline = Pipeline([
    ("transformers", transformers),
    ("model", GradientBoostingClassifier())
])

pipeline.fit(X_train, y_train)

CPU times: user 24.6 s, sys: 64.4 ms, total: 24.7 s
Wall time: 24.7 s


In [15]:
# check performance on validation dataset

region_scores = []
df_val["pred"] = pipeline.predict(X_val)

for region in df_val.region.unique():
    sub = df_val[df_val.region == region]
    region_rmse = mean_squared_error(sub.severity, sub.pred, squared=False)
    print(f"RMSE for {region} (n={len(sub)}): {region_rmse:,.2f}")
    region_scores.append(region_rmse)

overall_rmse = np.mean(region_scores)
print(f"Final score: {overall_rmse:,.2f}")

RMSE for northeast (n=75): 1.19
RMSE for south (n=492): 1.14
RMSE for west (n=179): 1.98
RMSE for midwest (n=99): 1.55
Final score: 1.46


## Generate submission

In [16]:
# create predictions for df_test

df_test["pred"] = pipeline.predict(
    df_test.loc[:, X_COLUMNS]
)

df_test.tail()

Unnamed: 0,uid,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,feat_11,feat_12,feat_13,feat_14,feat_15,feat_16,feat_17,feat_18,feat_19,feat_20,feat_21,feat_22,feat_23,feat_24,feat_25,feat_26,feat_27,feat_28,feat_29,feat_30,pred
6401,caye,197.911255,198.417749,185.352814,193.0,193.0,179.0,123.0,130.61,114.0,138.05,144.05,126.1,174.0,175.0,159.0,226.0,226.0,214.0,255.0,255.0,255.0,114.0,120.0,98.0,255.0,255.0,255.0,35.073311,32.616703,36.697237,1
6402,ixjg,72.538961,87.147186,60.069264,60.0,80.5,50.0,21.61,32.22,25.61,31.0,46.05,31.0,47.0,69.0,42.0,84.75,100.0,71.0,201.56,181.17,145.39,18.0,25.0,19.0,224.0,190.0,157.0,39.627344,31.346064,28.421373,4
6403,ndze,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,0.0,0.0,0.0,1
6404,trgf,88.399093,80.14966,53.006803,88.0,72.0,49.0,25.0,30.4,12.0,30.0,36.0,15.0,42.0,49.0,21.0,131.0,112.0,85.0,160.6,142.6,107.6,22.0,26.0,10.0,164.0,144.0,120.0,45.753697,34.751079,31.238784,4
6405,mddn,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,0.0,0.0,0.0,1


In [17]:
%%time

# merge test labels with predictions
df_submission = (
    df_test_labels
    .merge(
        df_test.loc[:, ["uid", "pred"]], 
        on="uid", 
        how="left"
    )
)

# replace missing predictions by severity
df_submission["pred"] = (
    df_submission
    .pred
    .fillna(df_submission.severity)
    .astype(int)
)

# delete severity column and rename pred column
df_submission = (
    df_submission
    .drop(["severity"], axis=1)
    .rename(columns={"pred": "severity"})
)

assert df_submission.severity.isna().sum() == 0
df_submission.tail()

CPU times: user 5.55 ms, sys: 1.16 ms, total: 6.71 ms
Wall time: 5.7 ms


Unnamed: 0,uid,region,severity
6505,zzpn,northeast,1
6506,zzrv,west,4
6507,zzsx,south,1
6508,zzvv,west,1
6509,zzzi,midwest,1


In [18]:
# save submission

datetime_now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
submission_path = os.path.join(SUBMISSIONS_FOLDER, f"{datetime_now}_sub.csv")

(
    df_submission
    .to_csv(submission_path, index=False)
)

In [19]:
! cat {submission_path} | head -5

uid,region,severity
aabn,west,1
aair,west,4
aajw,northeast,4
aalr,midwest,1


## Results

- Created the model and submission.