In [31]:
import os
import numpy as np
import pandas as pd
import math

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingRegressor

In [32]:
train_df = pd.read_csv("Data/train.csv")
test_df = pd.read_csv("Data/test.csv")

In [33]:
# Step 1: Drop the duplicate datapoints and after checking again we see that there is no duplicate in our data
train_df.drop_duplicates()

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,28
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,15
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9907,ffbfa0383c34dc513c95560d6e1fdb57,0,0,0,1,0,0,0,0,0,0,0,1,15
9908,ffcc8532d76436fc79e50eb2e5238e45,0,1,1,1,0,0,0,0,0,0,0,0,70
9909,ffdf2e8673a1da6fb80342fa3b119a20,0,1,1,1,0,0,0,0,1,1,0,0,20
9910,fff19e2ce11718548fa1c5d039a5192a,0,1,1,1,0,0,0,0,1,0,0,0,20


In [34]:
# Step 2: Drop missing data, we can see that there are no missing data
train_df.dropna()

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,28
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,15
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9907,ffbfa0383c34dc513c95560d6e1fdb57,0,0,0,1,0,0,0,0,0,0,0,1,15
9908,ffcc8532d76436fc79e50eb2e5238e45,0,1,1,1,0,0,0,0,0,0,0,0,70
9909,ffdf2e8673a1da6fb80342fa3b119a20,0,1,1,1,0,0,0,0,1,1,0,0,20
9910,fff19e2ce11718548fa1c5d039a5192a,0,1,1,1,0,0,0,0,1,0,0,0,20


In [35]:
# Step 3: Handling invalid data, we can see that there is no invalid data
columns = train_df.columns

for c in columns[1:]:
    if c == "Pawpularity":
        train_df = train_df[(train_df[c] >= 0) & (train_df[c] <= 100)]
    else:
        train_df = train_df[(train_df[c] == 0) | (train_df[c] == 1)]

train_df

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,28
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,15
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9907,ffbfa0383c34dc513c95560d6e1fdb57,0,0,0,1,0,0,0,0,0,0,0,1,15
9908,ffcc8532d76436fc79e50eb2e5238e45,0,1,1,1,0,0,0,0,0,0,0,0,70
9909,ffdf2e8673a1da6fb80342fa3b119a20,0,1,1,1,0,0,0,0,1,1,0,0,20
9910,fff19e2ce11718548fa1c5d039a5192a,0,1,1,1,0,0,0,0,1,0,0,0,20


In [36]:
def rmse(model, X, y):
    y_pred = model.predict(X)
    rmse = math.sqrt(mean_squared_error(y, y_pred))
    return rmse

In [37]:
# select the columns for X and y
X = train_df.iloc[:, 1:-1]
y = train_df['Pawpularity']

x_test = test_df.iloc[:, 1:]
df_test = pd.read_csv("Data/sample_submission.csv")
y_test = df_test['Pawpularity']

# split the data into training set and test set
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=12)
train_x.head(1)


Unnamed: 0,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur
4181,0,0,0,1,0,0,0,0,0,0,0,0


In [38]:
# Linear Model
model = Lasso()
model.fit(train_x, train_y)
rmse_score = rmse(model, test_x,test_y)
print('Lasso rmse score: ', rmse_score)

model = Ridge()
model.fit(train_x, train_y)
rmse_score = rmse(model, test_x,test_y)

print('Ridge rmse score: ', rmse_score)

Lasso rmse score:  20.199854013373898
Ridge rmse score:  20.211953681716945


In [39]:
# Ensemble with RandomForestRegressor
rf = RandomForestRegressor(n_estimators=150,max_depth=3)
rf.fit(train_x, train_y)
rmse_score = rmse(rf, test_x, test_y)
print(rmse_score)

20.19619973475943


In [40]:
# Ensemble with VotingRegressor
knn = KNeighborsRegressor()
dt = DecisionTreeRegressor()
svm = SVR()
er = VotingRegressor([('knn', knn), ('dt', dt), ('svm', svm)])
er.fit(X, y)
rmse_score = rmse(er, test_x, test_y)
print(rmse_score)

20.554692879205167
