- Goal: Accurately predict a pet photo’s appeal (its Pawpularity score) and suggest improvements that give these rescue animals a higher chance of finding loving homes.
- https://www.kaggle.com/c/petfinder-pawpularity-score/overview

In [1]:
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

plt.style.use('ggplot')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [2]:
# Read the competition CSVs from the working directory:
# `df` is the training table, `test` is the test table.
df, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
# Remove the 'Id' identifier column from both tables.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after 'Id' has already been removed no
# longer raises KeyError.
df = df.drop(columns='Id', errors='ignore')
test = test.drop(columns='Id', errors='ignore')

In [4]:
# Preview the first two rows of the training table.
df.head(2)

Unnamed: 0,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity
0,0,1,1,1,0,0,1,0,0,0,0,0,63
1,0,1,1,0,0,0,0,0,0,0,0,0,42


In [5]:
from sklearn.model_selection import train_test_split

# Target is the 'Pawpularity' column; features are every remaining column.
y = df['Pawpularity'].copy()
X = df.drop(columns='Pawpularity')

# Hold out 20% of the labelled rows for evaluation; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)

#### Regression model

#### Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_score

# Baseline: L2-penalised logistic regression (liblinear solver) fitted on the
# training split; the value displayed is its score on the held-out split.
# NOTE(review): LogisticRegression is a classifier, so every distinct
# Pawpularity value is treated as its own class -- confirm this is intended
# for a numeric target.
logreg = LogisticRegression(solver='liblinear', penalty='l2').fit(X_train, y_train)
logreg.score(X_test, y_test)

0.02773575390821987

In [17]:
# Logistic regression with built-in cross-validation: 5 folds over 20
# log-spaced values of C between 1e-4 and 1e4, then scored on the held-out split.
model_cv = LogisticRegressionCV(
    penalty='l2',
    solver='liblinear',
    cv=5,
    Cs=np.logspace(-4, 4, 20),
).fit(X_train, y_train)
model_cv.score(X_test, y_test)



0.024205748865355523

In [27]:
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error as MSE

# Held-out predictions from the baseline logistic regression.
pred = logreg.predict(X_test)

# Root-mean-squared error between the true and predicted values.
rmse = np.sqrt(MSE(y_test, pred))
print("RMSE : % f" % rmse)

RMSE :  22.539012


### Model Evaluation

In [38]:
# Classification metrics for `pred` against the held-out labels.
# Note: with average='micro' on single-label multiclass targets, precision,
# recall and F1 all reduce to plain accuracy, so these four lines report the
# same number.
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
print('Accuracy Score : ', accuracy_score(y_test, pred), sep='')
print('Precision Score : ', precision_score(y_test, pred, average='micro'), sep='')
print('Recall Score : ', recall_score(y_test, pred, average='micro'), sep='')
print('F1 Score : ', f1_score(y_test, pred, average='micro'), sep='')

# #Dummy Classifier Confusion matrix
# from sklearn.metrics import confusion_matrix
# print('Confusion Matrix : \n' + str(confusion_matrix(y_test,pred)))

Accuracy Score : 0.02773575390821987
Precision Score : 0.02773575390821987
Recall Score : 0.02773575390821987
F1 Score : 0.02773575390821987


### GridSearch

In [21]:
from sklearn.model_selection import GridSearchCV
# Show the hyperparameter names the estimator exposes (candidates for the grid search).
logreg.get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])

In [43]:
# Hyperparameter grid: L1 and L2 penalties with the liblinear solver and
# 50 log-spaced values of C from 1e-3 to 1e0 (2 x 1 x 50 = 100 candidates).
gs_params = dict(
    penalty=['l1', 'l2'],
    solver=['liblinear'],
    C=np.logspace(-3, 0, 50),
)

# 5-fold cross-validated search over the grid, run with two parallel jobs.
lr_gridsearch = GridSearchCV(
    estimator=logreg,
    param_grid=gs_params,
    cv=5,
    n_jobs=2,
    verbose=1,
)

# Run the search on the training split only.
lr_gridsearch.fit(X_train, y_train)

# Best hyperparameter combination found by the search.
lr_gridsearch.best_params_

Fitting 5 folds for each of 100 candidates, totalling 500 fits




{'C': 0.001, 'penalty': 'l1', 'solver': 'liblinear'}

In [44]:
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error as MSE

# Held-out predictions from the tuned grid-search model.
# This rebinds `pred`, so it now holds the grid-search predictions.
pred = lr_gridsearch.predict(X_test)

# Root-mean-squared error between the true and predicted values.
rmse = np.sqrt(MSE(y_test, pred))
print("RMSE : % f" % rmse)

RMSE :  21.690425
