In [182]:
import pandas as pd
import numpy as np

In [183]:
# Input files are resolved relative to the notebook's working directory.
TRAIN_CSV = 'train_mod.csv'
TEST_CSV = 'test_mod.csv'

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [184]:
# Show up to 15 columns when a frame is rendered, then list the training columns.
pd.options.display.max_columns = 15
train.columns

Index(['pet_id', 'issue_date', 'listing_date', 'condition', 'color_type',
       'length(m)', 'height(cm)', 'X1', 'X2', 'breed_category', 'pet_category',
       'days_stayed', 'total_days_stayed', 'total_hours_stayed',
       'condition_is_missing'],
      dtype='object')

In [185]:
## removed columns and rows not to be used for training

# Rows flagged with condition_is_missing == 1 are excluded from training.
filt = train['condition_is_missing'] != 1

# Columns that are not used as model inputs.
unused_columns = ['pet_id', 'issue_date', 'days_stayed', 'condition_is_missing']

# Build the reduced frame in one expression and take an explicit copy.
# The previous form dropped columns in place on a boolean-filtered slice,
# which is the pattern that can raise pandas' SettingWithCopyWarning here and
# on the column assignments made in later cells.
train = train.loc[filt].drop(columns=unused_columns).copy()

In [186]:
## removed times from listing dates and set it as index

# Keep only the text before the first space of each listing_date string
# (the date part), then use that column as the row index.
date_only = train['listing_date'].map(lambda stamp: stamp.partition(" ")[0])
train['listing_date'] = date_only
train = train.set_index('listing_date')

In [187]:
## looking for sig correlations between days stayed and other/newly made columns

# Ratio features; a zero divisor yields inf or NaN for that row.
train['l/h'] = train['length(m)'] / train['height(cm)']
train['X1/X2'] = train['X1'] / train['X2']

# color_type is a text column. Older pandas silently skipped non-numeric
# columns in corr(), but pandas >= 2.0 raises instead. Selecting numeric/bool
# columns explicitly reproduces the old result on every pandas version.
train.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,condition,length(m),height(cm),X1,X2,breed_category,pet_category,total_days_stayed,total_hours_stayed,l/h,X1/X2
condition,1.0,-0.011219,-0.010793,0.338843,0.381696,-0.483503,-0.04166,-0.101837,-0.115996,0.003851,0.020888
length(m),-0.011219,1.0,-0.001461,-0.004543,-0.012465,0.006317,-0.007712,0.002831,-0.003292,0.563482,0.005378
height(cm),-0.010793,-0.001461,1.0,-0.006197,-0.007746,0.012808,-0.005288,-0.00085,0.008358,-0.603322,0.002912
X1,0.338843,-0.004543,-0.006197,1.0,0.608291,0.024412,-0.249709,-0.063172,-0.356665,0.004289,0.506579
X2,0.381696,-0.012465,-0.007746,0.608291,1.0,-0.022245,0.008162,0.14693,-0.293872,-0.001791,-0.068663
breed_category,-0.483503,0.006317,0.012808,0.024412,-0.022245,1.0,-0.04452,-0.022957,0.004687,-0.004144,0.032495
pet_category,-0.04166,-0.007712,-0.005288,-0.249709,0.008162,-0.04452,1.0,0.172713,0.067444,0.001016,-0.213931
total_days_stayed,-0.101837,0.002831,-0.00085,-0.063172,0.14693,-0.022957,0.172713,1.0,-0.048918,0.005256,-0.074332
total_hours_stayed,-0.115996,-0.003292,0.008358,-0.356665,-0.293872,0.004687,0.067444,-0.048918,1.0,-0.015585,-0.190352
l/h,0.003851,0.563482,-0.603322,0.004289,-0.001791,-0.004144,0.001016,0.005256,-0.015585,1.0,0.009751


In [188]:
# remove null values produced from newly made columns
# (dropna() without arguments removes a row when ANY of its columns is null),
# then drop columns with less than 0.01 correlation to pet category
low_corr_columns = ['l/h', 'length(m)', 'height(cm)', 'X2']
train = train.dropna().drop(columns=low_corr_columns)

In [189]:
## separate y column from training set
# pop() hands back the column and removes it from the frame in one step
train_output = train.pop('pet_category')


# separate the names of x column(s) in training df that are numerical from categorical and num with low correlations
cat_attribs = ['color_type']

# everything that is not categorical is treated as numeric
train_num = train.drop(columns=cat_attribs)
num_attribs = train_num.columns.tolist()

In [196]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: one-hot encode. handle_unknown='ignore' makes a category
# that was not present at fit time encode as all zeros instead of raising, so
# the fitted transformer can also be applied to data other than `train`.
# The encoding of the data it is fitted on is unchanged.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# transforms all columns appropriately
transformer = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_attribs),
    ('cat', categorical_transformer, cat_attribs),
])
train_prepared = transformer.fit_transform(train)

train_prepared

<17353x61 sparse matrix of type '<class 'numpy.float64'>'
	with 121471 stored elements in Compressed Sparse Row format>

In [197]:
## training and evaluating on training set (used linear reg but can try other models)

from sklearn.linear_model import LinearRegression

# Feature matrix and target used for fitting.
X, y = train_prepared, train_output

# Fit an ordinary least-squares model; the value returned by fit() is kept
# under its own name as well.
lin_reg = LinearRegression()
fitted_training = lin_reg.fit(X, y)

In [198]:
## measure RMSE to see model's level of performance

from sklearn.metrics import mean_squared_error

# Score the fitted model on the same rows it was trained on.
train_predictions = lin_reg.predict(train_prepared)
lin_mse = mean_squared_error(y_true=train_output, y_pred=train_predictions)

# The square root puts the error back in the units of the target.
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.3712236047807087

In [199]:
## better evaluation via cross-validation

from sklearn.model_selection import cross_val_score

# 10-fold cross-validation. The scorer reports negated MSE, so the sign is
# flipped before taking the square root to obtain a per-fold RMSE.
# NOTE(review): train_prepared was produced by fit_transform on the full
# training frame, so the preprocessing statistics are shared across folds.
scores = cross_val_score(
    estimator=lin_reg,
    X=train_prepared,
    y=train_output,
    scoring='neg_mean_squared_error',
    cv=10,
)
lin_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    """Print the scores, then their mean, then their standard deviation.

    `scores` must provide `.mean()` and `.std()` (e.g. a NumPy array).
    One line is printed per item, in the form "<label> <value>".
    """
    # Each value is computed lazily, right before its line is printed.
    report = (
        ("Scores:", lambda s: s),
        ("Mean:", lambda s: s.mean()),
        ("SD:", lambda s: s.std()),
    )
    for label, compute in report:
        print(label, compute(scores))

# Summarise the per-fold RMSE values.
display_scores(scores=lin_rmse_scores)

Scores: [0.36871292 0.3566106  0.37059711 0.34582686 0.37743271 0.39247142
 0.38456292 0.36421803 0.36410885 0.41112263]
Mean: 0.3735664065118961
SD: 0.017804307355380886


In [200]:
## find best hyperparameters via grid search

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

# Ridge model fitted once on the full training data; the same object is then
# handed to the grid search as the estimator to tune.
ridge = Ridge(alpha=1)
ridge.fit(train_prepared, train_output)

# Candidate regularisation strengths, one decade apart.
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = {'alpha': alpha}

# Candidates are ranked by R^2; refit=True is requested so the best setting is
# refitted after the search.
grid = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    scoring='r2',
    refit=True,
    n_jobs=-1,
    verbose=1,
)
grid_result = grid.fit(train_prepared, train_output)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 out of  35 | elapsed:    0.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:    0.5s finished


In [201]:
# Report the best score found by the grid search and the parameters that produced it.
print('Best Score: ', grid_result.best_score_) # R squared score (continuous, not binary; at most 1, closer to 1 = better)
print('Best Params: ', grid_result.best_params_)

Best Score:  0.5228494263749333
Best Params:  {'alpha': 1}
