In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("whitegrid")
sns.color_palette("husl", 10)
import missingno as msno

from scipy import stats
from scipy.stats import skew
from scipy.special import boxcox1p

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.exceptions import DataConversionWarning
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import warnings
from warnings import filterwarnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

from fractions import Fraction
from string import ascii_letters

def frac_to_float(frac):
    try:
        if isinstance(frac, float):
            return frac
        if frac == "1 / 200s":
            frac = "1/200"
        return float(sum(Fraction(s.rstrip(ascii_letters)) for s in frac.split()))
    except:
        print(frac)

In [52]:
# Load/Preprocess data

train = pd.read_csv("train.csv")
train_colors = pd.read_csv("train_colors.csv")
train_keywords = pd.read_csv("train_keywords.csv")
test = pd.read_csv("test.csv")
test_colors = pd.read_csv("test_colors.csv")
test_keywords = pd.read_csv("test_keywords.csv")

cam_make_map = {
 "ASUS": "Asus", "Cannon": "Canon", 
 'Canon, SVR 90mm, 15 images': "Canon",
 'Canon, SVR90, 15 images': "Canon",
 'Canon, SVR90mm, 11 images': "Canon",
 'Canon, WOGT71, 19 images': "Canon",
 'Canon, WOGT71, 26 images': "Canon",
 'Canon, WOGT71, 32 images': "Canon",
 "canon": "Canon",
 'DJI Mavic 2 Pro': "DJI",
 'DJI Mavic Pro': "DJI",
 'DJI Spark': "DJI",
 'FUJIFILM': "Fujifilm",
 'FujiFilm': "Fujifilm",
 'Fujica': "Fujifilm",
 "google": "Google",
 'HASSELBLAD': "Hasselblad",
 "HUAWEI": "Huawei",
 'LEICA CAMERA AG': "LEICA",
 'Leica Camera AG': "LEICA",
 'LG Electronics': "LGE",
 'NIKON': "Nikon",
 'NIKON CORPORATION': "Nikon",
 'OLYMPUS CORPORATION': "Olympus",
 'OLYMPUS IMAGING CORP.': "Olympus",
 'PENTAX Corporation': "Pentax",
 'PENTAX RICOH IMAGING': "Pentax",
 "PENTAX": "Pentax",
 'RICOH IMAGING COMPANY, LTD.': "Ricoh",
 "SAMSUNG": "Samsung",
 "SONY": "Sony",
 "XIAOMI": "Xiaomi",
 'XIAOYI': "Xiaomi",
 'ZWO WOGT71': "ZWO",
 'ZWO w/ WOGT71 21 frames each SHO': "ZWO",
 'ZWO, WOGT71, 15 frames each of H, O, S': "ZWO",
 'motorola': "Motorola",
 "nikon": "Nikon",
 "olympus": "Olympus",
 "samsung": "Samsung",
 "sony": "Sony",
 }

train = train[(train["stats_downloads"] < 300000)]
train["image_category"] = train["image_category"].replace({"Featured": 1, "Not Featured": 0})
train["camera_make"] = train["camera_make"].replace(cam_make_map)
train["camera_make"] = train["camera_make"].apply(lambda x: str(x).lower())
make_counts = train["camera_make"].value_counts()

# Remove instances where camera_make <= 10
train = train[train["camera_make"].isin(make_counts[make_counts > 10].index)]
train["exposure_time"] = train["exposure_time"].apply(frac_to_float)
train["photo_submitted_at"] = pd.to_datetime(train["photo_submitted_at"])

# Fill Missing Values
train["aperture_value"] = train["aperture_value"].fillna(train["aperture_value"].mean())
train["iso"] = train["iso"].fillna(train["iso"].mean())
train["exposure_time"] = train["exposure_time"].fillna(train["exposure_time"].mean())

# Fill missing camera_make data based on camera_model data
make_replace_id = ["eca7e87d03", "bee68f9815", "5e15dcf061", "efd3bf77ec", "63cd2598c2", "759de7f18d", "8b215885c7", "c83069821b", "4bdba7914b", "913cc10182"]
make_replace_str = ["olympus", "nikon", "canon", "nikon", "nikon", "nikon", "canon", "sony", "olympus", "olympus"]
for replace_id, replace_str in zip(make_replace_id, make_replace_str):
    train.loc[train["image_id"] == replace_id, "camera_make"] = replace_str

# Drop unnecessary columns
train.drop(columns=["photo_url", "photo_image_url", "photo_submitted_at", "obs_min", "obs_sec", "width", "height", "iso", "aperture_value", "focal_length", "exposure_time", "latitude", "longitude", "country", "city"], inplace=True)

# Day and hour may be important, keep aspect ratio because image dimensions make sense to affect download rates (if image is shaped too tall or wide it could be negative)

train_num = train.select_dtypes(include=np.number)
train

Unnamed: 0,image_id,obs_day,obs_hour,image_category,aspect_ratio,description1,description2,camera_make,camera_model,total_days,stats_downloads
0,68105c019b,Thursday,15,1,1.500,Pristine water lily,rule of thirds photography of pink and white l...,canon,Canon EOS 60D,2361,167503
1,06d11c4edd,Wednesday,21,1,0.670,,sand in desert,sony,ILCE-6500,1255,1839
2,f6599edba1,Tuesday,4,1,0.670,,green trees near brown mountain during daytime,nikon,NIKON D850,1138,787
3,0f4bcc2d36,Monday,23,1,1.780,,gray dock aerial photography,dji,FC220,1579,2049
4,74db502ed5,Saturday,11,1,1.500,,people walking on green grass field near lake ...,fujifilm,X-T10,1050,2982
...,...,...,...,...,...,...,...,...,...,...,...
12192,18eaca644e,Monday,13,1,0.730,,calm water during night time,canon,Canon EOS 5D Mark IV,1265,2353
12193,eb25cd769e,Monday,20,1,0.720,,snowflakes on ground,canon,Canon EOS 5D Mark III,1503,3541
12194,73302461d0,Thursday,1,1,1.500,Inside of the small wave,,canon,Canon EOS M,2501,14581
12195,f2587b2b08,Thursday,18,1,1.500,Tropical palm,low angle of banana tree,nikon,NIKON D5300,2508,9247


In [53]:
# test = test[(test["stats_downloads"] < 300000)]
test["image_category"] = test["image_category"].replace({"Featured": 1, "Not Featured": 0})
test["camera_make"] = test["camera_make"].replace(cam_make_map)
test["camera_make"] = test["camera_make"].apply(lambda x: str(x).lower())

test["exposure_time"] = test["exposure_time"].apply(frac_to_float)
test["photo_submitted_at"] = pd.to_datetime(test["photo_submitted_at"])

# Fill Missing Values
test["aperture_value"] = test["aperture_value"].fillna(test["aperture_value"].mean())
test["iso"] = test["iso"].fillna(test["iso"].mean())
test["exposure_time"] = test["exposure_time"].fillna(test["exposure_time"].mean())

# Fill missing camera_make data based on camera_model data
# make_replace_id = ["eca7e87d03", "bee68f9815", "5e15dcf061", "efd3bf77ec", "63cd2598c2", "759de7f18d", "8b215885c7", "c83069821b", "4bdba7914b", "913cc10182"]
# make_replace_str = ["olympus", "nikon", "canon", "nikon", "nikon", "nikon", "canon", "sony", "olympus", "olympus"]
# for replace_id, replace_str in zip(make_replace_id, make_replace_str):
#     test.loc[test["image_id"] == replace_id, "camera_make"] = replace_str

# Drop unnecessary columns
test.drop(columns=["photo_url", "photo_image_url", "photo_submitted_at", "obs_min", "obs_sec", "width", "height", "iso", "aperture_value", "focal_length", "exposure_time", "latitude", "longitude", "country", "city"], inplace=True)

# Day and hour may be important, keep aspect ratio because image dimensions make sense to affect download rates (if image is shaped too tall or wide it could be negative)

test_num = test.select_dtypes(include=np.number)
test

63 Seconds
30"


Unnamed: 0,image_id,obs_day,obs_hour,image_category,aspect_ratio,description1,description2,camera_make,camera_model,total_days
0,2322208d63,Wednesday,14,1,0.670,,brown leaf,nikon,NIKON D750,1249
1,847b5fcee1,Saturday,17,1,1.500,,gray seal,canon,Canon EOS 6D Mark II,1211
2,0e1a37b065,Tuesday,6,1,1.500,,,canon,Canon EOS 6D,1439
3,8c8c9d4355,Tuesday,14,1,0.670,,landscape photography of desert,sony,ILCE-7M2,1194
4,8ff19c3747,Friday,1,1,1.500,,brown grass,sony,ILCE-6000,1457
...,...,...,...,...,...,...,...,...,...,...
6563,a41cfba199,Tuesday,10,1,0.770,,woman fixing flowers in pot,canon,Canon EOS 650D,1285
6564,73bbb77964,Monday,10,1,0.710,Autumn vibes,brown mushroom in ground,sony,ILCE-7RM2,1258
6565,22028b67b0,Sunday,16,1,0.670,,,canon,Canon EOS R,1315
6566,7abf54e623,Monday,19,1,0.670,"Hat in a car. Saw it in Klagenfurt, Austria",white and black hat,fujifilm,X-T1,1426


In [54]:
test_data = test.select_dtypes(include=np.number)
test_data

Unnamed: 0,obs_hour,image_category,aspect_ratio,total_days
0,14,1,0.670,1249
1,17,1,1.500,1211
2,6,1,1.500,1439
3,14,1,0.670,1194
4,1,1,1.500,1457
...,...,...,...,...
6563,10,1,0.770,1285
6564,10,1,0.710,1258
6565,16,1,0.670,1315
6566,19,1,0.670,1426


In [55]:
test.dtypes

image_id           object
obs_day            object
obs_hour            int64
image_category      int64
aspect_ratio      float64
description1       object
description2       object
camera_make        object
camera_model       object
total_days          int64
dtype: object

In [56]:
test.isnull().sum()

image_id             0
obs_day              0
obs_hour             0
image_category       0
aspect_ratio         0
description1      3803
description2       346
camera_make          0
camera_model        29
total_days           0
dtype: int64

In [57]:
# len(train[train["camera_make"] == "nan"])
len(train[train["camera_make"] == "nan"])

30

In [58]:
train.dtypes

image_id            object
obs_day             object
obs_hour             int64
image_category       int64
aspect_ratio       float64
description1        object
description2        object
camera_make         object
camera_model        object
total_days           int64
stats_downloads      int64
dtype: object

In [59]:
train.isnull().sum()

image_id              0
obs_day               0
obs_hour              0
image_category        0
aspect_ratio          0
description1       7019
description2        650
camera_make           0
camera_model         46
total_days            0
stats_downloads       0
dtype: int64

In [60]:
make = []
counts = []
avg_downloads = []
for group, data in train.groupby("camera_make"):
    make.append(group)
    counts.append(len(data))
    avg_downloads.append(data["stats_downloads"].mean())
make_downloads = pd.DataFrame({"make": make, "count": counts, "avg_downloads": avg_downloads})
make_downloads

Unnamed: 0,make,count,avg_downloads
0,apple,318,12371.437
1,canon,4857,9511.06
2,dji,452,9626.885
3,fujifilm,854,10541.172
4,google,30,8813.033
5,gopro,38,7972.711
6,hasselblad,150,7493.133
7,huawei,22,7587.727
8,leica,35,8146.343
9,,30,7874.533


In [61]:
train[train["camera_model"].isnull()].index
train[train["camera_make"] == "nan"][["image_id", "camera_make", "camera_model"]]

make_replace_id = ["eca7e87d03", "bee68f9815", "5e15dcf061", "efd3bf77ec", "63cd2598c2", "759de7f18d", "8b215885c7", "c83069821b", "4bdba7914b", "913cc10182"]
make_replace_str = ["olympus", "nikon", "canon", "nikon", "nikon", "nikon", "canon", "sony", "olympus", "olympus"]

In [62]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12121 entries, 0 to 12196
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   image_id         12121 non-null  object 
 1   obs_day          12121 non-null  object 
 2   obs_hour         12121 non-null  int64  
 3   image_category   12121 non-null  int64  
 4   aspect_ratio     12121 non-null  float64
 5   description1     5102 non-null   object 
 6   description2     11471 non-null  object 
 7   camera_make      12121 non-null  object 
 8   camera_model     12075 non-null  object 
 9   total_days       12121 non-null  int64  
 10  stats_downloads  12121 non-null  int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 1.1+ MB


In [63]:
train["description2"].value_counts()

body of water                                                64
green-leafed plant                                           30
snow covered mountain during daytime                         26
white clouds                                                 21
snow covered mountain                                        21
                                                             ..
photo of snowy mountain                                       1
brown chipmunk on branch of tree                              1
aerial photography of concrete road near trees at daytime     1
Cinque Terre in Italy during daytime                          1
field of assorted-color-petaled flowers                       1
Name: description2, Length: 9828, dtype: int64

In [64]:
train[train["camera_model"].isnull()]

Unnamed: 0,image_id,obs_day,obs_hour,image_category,aspect_ratio,description1,description2,camera_make,camera_model,total_days,stats_downloads
676,9a12bb8fe9,Wednesday,11,1,1.33,Wild Blues 🌊,bird's-eye photography of body of water,dji,,1459,2977
2286,5aaa9a4bde,Monday,19,1,1.5,,brown and green grass field during sunset,,,1419,1372
2612,63c0ba214e,Monday,19,1,0.56,,scenery of forest trees,nikon,,1937,167386
2737,90bd79188c,Monday,20,1,1.5,,brown antelope,canon,,1363,2911
3134,2618e3780c,Sunday,0,1,1.33,,two boat on body of water,,,2519,8984
3333,aba93959bc,Wednesday,3,1,0.67,Pier with water splashing at sunset,brown wooden dock on sea during daytime,,,1144,887
3961,28e7a4e710,Wednesday,9,1,0.8,Blue ambiance in the forest,green trees on foggy weather,,,1025,4876
4018,454bb547b6,Monday,13,1,1.25,A forest in the snow,withered trees surrounded by snow,,,2616,20051
4028,8034cf0628,Sunday,16,1,1.5,Football on Faroe islands #4,women's white shirt,,,1343,2599
4145,629b45abeb,Tuesday,8,1,0.67,Butterfly on Leaf,brown and black butterfly on white flower,,,1096,14598


In [65]:
# Categorical: obs_day, camera_make, camera_model
make_counts = train["camera_make"].value_counts()
make_counts
# make_counts[make_counts > 10].index

canon         4857
nikon         2643
sony          2055
fujifilm       854
dji            452
apple          318
panasonic      264
olympus        168
hasselblad     150
ricoh          113
samsung         51
gopro           38
leica           35
google          30
nan             30
pentax          28
huawei          22
xiaomi          13
Name: camera_make, dtype: int64

In [66]:
train.describe()

Unnamed: 0,obs_hour,image_category,aspect_ratio,total_days,stats_downloads
count,12121.0,12121.0,12121.0,12121.0,12121.0
mean,12.667,0.992,1.14,1572.249,9535.02
std,6.641,0.091,0.413,529.571,22579.971
min,0.0,0.0,0.34,897.0,152.0
25%,7.0,1.0,0.67,1201.0,1502.0
50%,14.0,1.0,1.33,1414.0,2769.0
75%,18.0,1.0,1.5,1588.0,7705.0
max,23.0,1.0,3.95,3562.0,299344.0


In [68]:
corrs = []
for col in train_num.columns[:-1]:
    corr = train[col].corr(train["stats_downloads"])
    corrs.append(corr)
    # print(f"Column {col}: {corr}")

corr_df = pd.DataFrame({"col": train_num.columns[:-1], "corr": corrs})
corr_df.sort_values("corr", ascending=False)

# total_days is strongest predictor

Unnamed: 0,col,corr
3,total_days,0.37
2,aspect_ratio,0.119
1,image_category,0.029
0,obs_hour,0.002


In [70]:
train_num.columns

Index(['obs_hour', 'image_category', 'aspect_ratio', 'total_days',
       'stats_downloads'],
      dtype='object')

In [71]:
# Columns to drop
to_drop = ['obs_hour', 'obs_min', 'obs_sec', 'width', 'height', 'iso', 'aperture_value', 'focal_length', 'latitude',
       'longitude', 'total_days', 'stats_downloads']

In [72]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Initial XGBoost testing
data = train.select_dtypes(include=np.number)
X = data.drop(columns="stats_downloads")
y = data["stats_downloads"]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=8)

In [73]:
X

Unnamed: 0,obs_hour,image_category,aspect_ratio,total_days
0,15,1,1.500,2361
1,21,1,0.670,1255
2,4,1,0.670,1138
3,23,1,1.780,1579
4,11,1,1.500,1050
...,...,...,...,...
12192,13,1,0.730,1265
12193,20,1,0.720,1503
12194,1,1,1.500,2501
12195,18,1,1.500,2508


In [74]:
y

0        167503
1          1839
2           787
3          2049
4          2982
          ...  
12192      2353
12193      3541
12194     14581
12195      9247
12196      1497
Name: stats_downloads, Length: 12121, dtype: int64

In [82]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import regularizers

model = Sequential()

# Input layer
model.add(Dense(256, activation='relu', input_dim=X_train.shape[1]))

# Hidden layers
model.add(Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
model.add(Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
model.add(Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

# Dropout layer to prevent overfitting
model.add(Dropout(0.2))

# Output layer
model.add(Dense(1))

model.compile(optimizer='adam', loss='mse')

model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fa00e518df0>

In [83]:
# Evaluate model on test set
test_loss = model.evaluate(X_test, y_test, verbose=0)
test_rmse = np.sqrt(test_loss)

print('Test RMSE: {:.4f}'.format(test_rmse))


Test RMSE: 21913.0872


In [97]:
from sklearn.preprocessing import StandardScaler

# Initialize scaler
scaler = StandardScaler()

# Fit and transform training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform test data using the same scaler
X_test_scaled = scaler.transform(X_test)

In [98]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import regularizers

model = Sequential()

# Input layer
model.add(Dense(256, activation='relu', input_dim=X_train.shape[1]))

# Hidden layers
model.add(Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
model.add(Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
model.add(Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

# Dropout layer to prevent overfitting
model.add(Dropout(0.2))

# Output layer
model.add(Dense(1))

model.compile(optimizer='adam', loss='mse')

model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f9f82054670>

In [100]:
# Evaluate model on test set
test_loss = model.evaluate(X_test, y_test, verbose=0)
test_rmse = np.sqrt(test_loss)

print('Test RMSE: {:.4f}'.format(test_rmse))


Test RMSE: 21993.5780
