In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("whitegrid")
sns.color_palette("husl", 10)
import missingno as msno

from scipy import stats
from scipy.stats import skew
from scipy.special import boxcox1p

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.exceptions import DataConversionWarning
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import warnings
from warnings import filterwarnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from scipy.stats import mode
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

from xgboost import plot_importance
import shap

from fractions import Fraction
from string import ascii_letters

def frac_to_float(frac):
    try:
        if isinstance(frac, float):
            return frac
        if frac == "1 / 200s":
            frac = "1/200"
        return float(sum(Fraction(s.rstrip(ascii_letters)) for s in frac.split()))
    except:
        print(frac)
        
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [4]:
# Load/Preprocess data

sample_submission = pd.read_csv("data/sample_submission.csv")
train = pd.read_csv("data/train.csv")
train_colors = pd.read_csv("data/train_colors.csv")
train_keywords = pd.read_csv("data/train_keywords.csv")
test = pd.read_csv("data/test.csv")
test_colors = pd.read_csv("data/test_colors.csv")
test_keywords = pd.read_csv("data/test_keywords.csv")

In [5]:
# Preprocess train data
cam_make_map = {
 "ASUS": "Asus", "Cannon": "Canon", 
 'Canon, SVR 90mm, 15 images': "Canon",
 'Canon, SVR90, 15 images': "Canon",
 'Canon, SVR90mm, 11 images': "Canon",
 'Canon, WOGT71, 19 images': "Canon",
 'Canon, WOGT71, 26 images': "Canon",
 'Canon, WOGT71, 32 images': "Canon",
 "canon": "Canon",
 'DJI Mavic 2 Pro': "DJI",
 'DJI Mavic Pro': "DJI",
 'DJI Spark': "DJI",
 'FUJIFILM': "Fujifilm",
 'FujiFilm': "Fujifilm",
 'Fujica': "Fujifilm",
 "google": "Google",
 'HASSELBLAD': "Hasselblad",
 "HUAWEI": "Huawei",
 'LEICA CAMERA AG': "LEICA",
 'Leica Camera AG': "LEICA",
 'LG Electronics': "LGE",
 'NIKON': "Nikon",
 'NIKON CORPORATION': "Nikon",
 'OLYMPUS CORPORATION': "Olympus",
 'OLYMPUS IMAGING CORP.': "Olympus",
 'PENTAX Corporation': "Pentax",
 'PENTAX RICOH IMAGING': "Pentax",
 "PENTAX": "Pentax",
 'RICOH IMAGING COMPANY, LTD.': "Ricoh",
 "SAMSUNG": "Samsung",
 "SONY": "Sony",
 "XIAOMI": "Xiaomi",
 'XIAOYI': "Xiaomi",
 'ZWO WOGT71': "ZWO",
 'ZWO w/ WOGT71 21 frames each SHO': "ZWO",
 'ZWO, WOGT71, 15 frames each of H, O, S': "ZWO",
 'motorola': "Motorola",
 "nikon": "Nikon",
 "olympus": "Olympus",
 "samsung": "Samsung",
 "sony": "Sony",
 }

train = train[(train["stats_downloads"] < 400000)]
train["image_category"] = train["image_category"].replace({"Featured": 1, "Not Featured": 0})
train["camera_make"] = train["camera_make"].replace(cam_make_map)
train["camera_make"] = train["camera_make"].apply(lambda x: str(x).lower())
make_counts = train["camera_make"].value_counts()

# Remove instances where camera_make <= 10
train = train[train["camera_make"].isin(make_counts[make_counts > 10].index)]
train["exposure_time"] = train["exposure_time"].apply(frac_to_float)
train["photo_submitted_at"] = pd.to_datetime(train["photo_submitted_at"])

# Fill Missing Values
train["aperture_value"] = train["aperture_value"].fillna(train["aperture_value"].mean())
train["iso"] = train["iso"].fillna(train["iso"].mean())
train["exposure_time"] = train["exposure_time"].fillna(train["exposure_time"].mean())

# Fill missing camera_make data based on camera_model data
make_replace_id = ["eca7e87d03", "bee68f9815", "5e15dcf061", "efd3bf77ec", "63cd2598c2", "759de7f18d", "8b215885c7", "c83069821b", "4bdba7914b", "913cc10182"]
make_replace_str = ["olympus", "nikon", "canon", "nikon", "nikon", "nikon", "canon", "sony", "olympus", "olympus"]
for replace_id, replace_str in zip(make_replace_id, make_replace_str):
    train.loc[train["image_id"] == replace_id, "camera_make"] = replace_str

# Drop unnecessary columns
train.drop(columns=["photo_url", "photo_image_url", "photo_submitted_at", "latitude", "longitude", "country", "city"], inplace=True)

# Drop columns for now
# train.drop(columns=["obs_hour", "obs_min", "obs_sec", "width", "height", "iso", "aperture_value", "focal_length", "exposure_time", "description1", "description2", "camera_model"], inplace=True)

# Day and hour may be important, keep aspect ratio because image dimensions make sense to affect download rates (if image is shaped too tall or wide it could be negative)

train_num = train.select_dtypes(include=np.number)
train

Unnamed: 0,image_id,obs_day,obs_hour,obs_min,obs_sec,image_category,width,height,aspect_ratio,description1,description2,camera_make,camera_model,iso,aperture_value,focal_length,exposure_time,total_days,stats_downloads
0,68105c019b,Thursday,15,13,10,1,2592,1728,1.500,Pristine water lily,rule of thirds photography of pink and white l...,canon,Canon EOS 60D,200.000,14.000,50.000,0.017,2361,167503
1,06d11c4edd,Wednesday,21,59,51,1,4000,6000,0.670,,sand in desert,sony,ILCE-6500,100.000,2.500,30.000,0.001,1255,1839
2,f6599edba1,Tuesday,4,10,11,1,5504,8256,0.670,,green trees near brown mountain during daytime,nikon,NIKON D850,64.000,2.800,48.000,0.001,1138,787
3,0f4bcc2d36,Monday,23,46,21,1,3627,2040,1.780,,gray dock aerial photography,dji,FC220,139.000,2.200,4.700,0.010,1579,2049
4,74db502ed5,Saturday,11,4,20,1,4896,3264,1.500,,people walking on green grass field near lake ...,fujifilm,X-T10,400.000,6.400,22.200,0.002,1050,2982
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12192,18eaca644e,Monday,13,7,25,1,4480,6111,0.730,,calm water during night time,canon,Canon EOS 5D Mark IV,100.000,5.600,300.000,0.003,1265,2353
12193,eb25cd769e,Monday,20,32,28,1,2403,3337,0.720,,snowflakes on ground,canon,Canon EOS 5D Mark III,400.000,5.000,100.000,0.003,1503,3541
12194,73302461d0,Thursday,1,54,16,1,5184,3456,1.500,Inside of the small wave,,canon,Canon EOS M,400.000,4.000,21.000,0.000,2501,14581
12195,f2587b2b08,Thursday,18,2,32,1,6000,4000,1.500,Tropical palm,low angle of banana tree,nikon,NIKON D5300,800.000,1.800,50.000,0.003,2508,9247


In [6]:
# Preprocess test data
test["image_category"] = test["image_category"].replace({"Featured": 1, "Not Featured": 0})
test["camera_make"] = test["camera_make"].replace(cam_make_map)
test["camera_make"] = test["camera_make"].apply(lambda x: str(x).lower())

test["exposure_time"] = test["exposure_time"].apply(frac_to_float)
test["photo_submitted_at"] = pd.to_datetime(test["photo_submitted_at"])

# Fill Missing Values
test["aperture_value"] = test["aperture_value"].fillna(test["aperture_value"].mean())
test["iso"] = test["iso"].fillna(test["iso"].mean())
test["exposure_time"] = test["exposure_time"].fillna(test["exposure_time"].mean())

# Fill missing camera_make data based on camera_model data
# make_replace_id = ["eca7e87d03", "bee68f9815", "5e15dcf061", "efd3bf77ec", "63cd2598c2", "759de7f18d", "8b215885c7", "c83069821b", "4bdba7914b", "913cc10182"]
# make_replace_str = ["olympus", "nikon", "canon", "nikon", "nikon", "nikon", "canon", "sony", "olympus", "olympus"]
# for replace_id, replace_str in zip(make_replace_id, make_replace_str):
#     test.loc[test["image_id"] == replace_id, "camera_make"] = replace_str

# Drop unnecessary columns
test.drop(columns=["photo_url", "photo_image_url", "photo_submitted_at", "latitude", "longitude", "country", "city"], inplace=True)

# Day and hour may be important, keep aspect ratio because image dimensions make sense to affect download rates (if image is shaped too tall or wide it could be negative)

test_num = test.select_dtypes(include=np.number)
test

63 Seconds
30"


Unnamed: 0,image_id,obs_day,obs_hour,obs_min,obs_sec,image_category,width,height,aspect_ratio,description1,description2,camera_make,camera_model,iso,aperture_value,focal_length,exposure_time,total_days
0,2322208d63,Wednesday,14,32,57,1,4016,6016,0.670,,brown leaf,nikon,NIKON D750,100.000,4.500,50.000,0.002,1249
1,847b5fcee1,Saturday,17,15,26,1,6240,4160,1.500,,gray seal,canon,Canon EOS 6D Mark II,100.000,5.600,300.000,0.005,1211
2,0e1a37b065,Tuesday,6,54,42,1,5472,3648,1.500,,,canon,Canon EOS 6D,100.000,7.100,50.000,0.001,1439
3,8c8c9d4355,Tuesday,14,49,46,1,4000,6000,0.670,,landscape photography of desert,sony,ILCE-7M2,100.000,8.000,85.000,0.005,1194
4,8ff19c3747,Friday,1,32,29,1,6000,4000,1.500,,brown grass,sony,ILCE-6000,160.000,5.600,93.000,0.006,1457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6563,a41cfba199,Tuesday,10,13,49,1,3456,4472,0.770,,woman fixing flowers in pot,canon,Canon EOS 650D,400.000,1.800,50.000,0.002,1285
6564,73bbb77964,Monday,10,33,45,1,4850,6791,0.710,Autumn vibes,brown mushroom in ground,sony,ILCE-7RM2,320.000,4.500,69.000,0.040,1258
6565,22028b67b0,Sunday,16,24,57,1,4480,6720,0.670,,,canon,Canon EOS R,320.000,2.800,55.000,0.001,1315
6566,7abf54e623,Monday,19,45,53,1,2918,4377,0.670,"Hat in a car. Saw it in Klagenfurt, Austria",white and black hat,fujifilm,X-T1,250.000,4.500,35.000,0.001,1426


In [23]:
# Determine feature importance ordered (Feature importance derived from XGBoost)

feature_importance = ['total_days',
 'image_category',
 'exposure_time',
 'height',
 'aspect_ratio',
 'width',
 'iso',
 'obs_sec',
 'focal_length',
 'obs_min',
 'aperture_value',
 'obs_hour']

In [15]:
# Split data
data = train.select_dtypes(include=np.number)
X = data.drop(columns="stats_downloads")
y = data["stats_downloads"]

X_subset = X[feature_importance[:7]]

X_train, X_test, y_train, y_test = train_test_split(X_subset, y, test_size=0.2, random_state=8)

In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
# imptr = SimpleImputer(strategy="mean", add_indicator=False)

kf = KFold(n_splits=5)

xgb = XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
                              colsample_bylevel=0.5, colsample_bynode=0.5,
                              colsample_bytree=1.0, early_stopping_rounds=None,
                              enable_categorical=False, eval_metric=None,
                              gamma=0.0, gpu_id=-1, grow_policy='depthwise',
                              importance_type=None, interaction_constraints='',
                              learning_rate=0.06416530723807765, max_bin=256,
                              max_cat_to_onehot=4, max_delta_step=0,
                              max_depth=6, max_leaves=0, min_child_weight=1,
                              missing=np.nan, monotone_constraints='()',
                              n_estimators=100, n_jobs=0, num_parallel_tree=1,
                              predictor='auto', random_state=8, reg_alpha=0.0,
                              reg_lambda=10.0,)

scores = cross_val_score(xgb, X_subset, y, scoring="neg_root_mean_squared_error", cv=kf, n_jobs=1)

print(f"Mean RMSE: {np.mean(scores)}, STD RMSE: {np.std(scores)}")



Mean RMSE: -22123.722021447447, STD RMSE: 850.4901814683802


In [None]:
# Best: Mean RMSE: -22123.722021447447, STD RMSE: 850.4901814683802