In [65]:
import pandas as pd
import numpy as np

In [66]:
# Load the pre-processed training and test tables from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train_mod.csv', 'test_mod.csv'))

In [67]:
# Render at most 15 columns when a DataFrame is displayed, then list the
# column labels of the training frame.
pd.options.display.max_columns = 15
train.columns

Index(['pet_id', 'issue_date', 'listing_date', 'condition', 'color_type',
       'length(m)', 'height(cm)', 'X1', 'X2', 'breed_category', 'pet_category',
       'days_stayed', 'total_days_stayed', 'total_hours_stayed',
       'condition_is_missing'],
      dtype='object')

In [68]:
## removed columns and rows not to be used for training

# Keep only the rows where condition_is_missing is not 1, then remove the
# columns that are not used for training. Filtering and dropping in a single
# expression returns a fresh frame, so nothing is mutated in place on a
# boolean-filtered slice (the pattern behind pandas' SettingWithCopyWarning).
filt = train['condition_is_missing'] != 1
train = train.loc[filt].drop(
    columns=['pet_id', 'issue_date', 'days_stayed', 'condition_is_missing']
)

In [69]:
## removed times from listing dates and set it as index

# Keep only the text before the first space of each listing_date value (the
# date part). The vectorised .str accessor passes missing values through as
# NaN instead of raising the way a per-row lambda calling .split() would, and
# assign()/set_index() build a new frame rather than writing into the
# existing one in place.
listing_day = train['listing_date'].str.split(" ", n=1).str[0]
train = train.assign(listing_date=listing_day).set_index('listing_date')

In [70]:
## looking for sig correlations between days stayed and other columns; made new columns

# Ratio features, added through assign() so a new frame is produced instead of
# writing columns into the existing one.
train = train.assign(**{
    'l/h': train['length(m)'] / train['height(cm)'],
    'X1/X2': train['X1'] / train['X2'],
})

# A zero denominator yields NaN (0/0) or +/-inf (x/0). dropna() only removes
# NaN, so infinities are folded into NaN first; afterwards every row with a
# missing value in any column is removed.
train = train.replace([np.inf, -np.inf], np.nan).dropna()

# Only the last expression of a cell is displayed, so the correlation matrix
# goes at the end. Restricting it to numeric columns keeps corr() working even
# if the frame holds non-numeric columns.
train.select_dtypes(include='number').corr()

In [71]:
## separate y column from training set
# pop() removes 'breed_category' from the feature frame and returns it.
train_output = train.pop('breed_category')


# names of the remaining feature columns, split into categorical and numerical
# NOTE(review): every column other than color_type is treated as numerical,
# including 'condition' and 'pet_category', which look like category codes -
# confirm this is intended.
cat_attribs = ['color_type']

train_num = train.drop(columns=cat_attribs)
num_attribs = train_num.columns.tolist()

['condition',
 'color_type',
 'length(m)',
 'height(cm)',
 'X1',
 'X2',
 'pet_category',
 'total_days_stayed',
 'total_hours_stayed',
 'l/h',
 'X1/X2']

In [72]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Transform all columns appropriately: standardise the numerical columns and
# one-hot encode the categorical ones.
# handle_unknown='ignore' makes the encoder emit an all-zero block for a
# category it did not see during fit instead of raising, so the fitted
# transformer can later be applied to data containing a new color_type.
# The encoding of the training data itself is unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_attribs),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs),
    ]
)
train_prepared = transformer.fit_transform(train)

train_prepared

<17353x65 sparse matrix of type '<class 'numpy.float64'>'
	with 190883 stored elements in Compressed Sparse Row format>

In [73]:
## training and evaluating on training set (used linear reg but can try other models)

from sklearn.linear_model import LinearRegression

# Conventional short names for the prepared feature matrix and the target.
X, y = train_prepared, train_output

# NOTE(review): the target is 'breed_category', which looks like a class label;
# a classifier may be a better fit than a regressor - confirm.
# fit() hands back the estimator itself, so both names below refer to the same
# fitted model.
lin_reg = LinearRegression().fit(X, y)
fitted_training = lin_reg

In [74]:
## measure RMSE to see model's level of performance

from sklearn.metrics import mean_squared_error

# Predictions are made on the same rows the model was fitted on, so this is an
# in-sample error and will tend to look optimistic.
train_predictions = lin_reg.predict(train_prepared)
lin_mse = mean_squared_error(y_true=train_output, y_pred=train_predictions)

# Root of the mean squared error, i.e. the error in units of the target.
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.40929661972913733

In [75]:
## better evaluation via cross-validation

from sklearn.model_selection import cross_val_score

# 10-fold cross-validation. The scorer reports negated MSE (higher is better),
# so the sign is flipped back before taking the square root to get RMSE.
scores = cross_val_score(
    lin_reg,
    train_prepared,
    train_output,
    scoring='neg_mean_squared_error',
    cv=10,
)
lin_rmse_scores = np.sqrt(np.negative(scores))

def display_scores(scores):
    """Print a set of scores followed by their mean and standard deviation.

    Parameters
    ----------
    scores : object exposing ``mean()`` and ``std()`` (e.g. a NumPy array)
        The values to summarise.

    Returns
    -------
    None
        The three lines are written to standard output.
    """
    # Each entry pairs a label with the statistic to compute; the statistic is
    # evaluated only when its line is printed.
    report = (
        ("Scores:", lambda values: values),
        ("Mean:", lambda values: values.mean()),
        ("SD:", lambda values: values.std()),
    )
    for label, statistic in report:
        print(label, statistic(scores))

# Print the per-fold RMSE values together with their mean and standard deviation.
display_scores(lin_rmse_scores)

Scores: [0.41075554 0.41835256 0.40659299 0.40397401 0.40948521 0.39686723
 0.41383624 0.40526686 0.4179961  0.42657302]
Mean: 0.4109699772810049
SD: 0.00810743198966674


In [76]:
## find best hyperparameters via grid search

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

# Candidate regularisation strengths, spanning several orders of magnitude.
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = {'alpha': alpha}

# Baseline ridge model, fitted on the full training data.
ridge = Ridge(alpha=1)
ridge.fit(train_prepared, train_output)

# Search over alpha, scoring each candidate by R^2 and using all available
# cores; refit=True retrains the best candidate on the whole training set.
grid = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    scoring='r2',
    n_jobs=-1,
    refit=True,
    verbose=1,
)
grid_result = grid.fit(train_prepared, train_output)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 out of  35 | elapsed:    0.5s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:    0.6s finished


In [77]:
# The score is R squared: it is a continuous measure (not binary) that is at
# most 1, and values closer to 1 are better.
for label, value in (
    ('Best Score: ', grid_result.best_score_),
    ('Best Params: ', grid_result.best_params_),
):
    print(label, value)

Best Score:  0.3231885134960345
Best Params:  {'alpha': 10}
