In [444]:
import pandas as pd
import numpy as np

In [445]:
train = pd.read_csv('train_mod.csv')
test = pd.read_csv('test_mod.csv')

In [446]:
pd.set_option('display.max_columns', 15)
train.columns

Index(['pet_id', 'issue_date', 'listing_date', 'condition', 'color_type',
       'length(m)', 'height(cm)', 'X1', 'X2', 'breed_category', 'pet_category',
       'days_stayed', 'total_days_stayed', 'total_hours_stayed',
       'condition_is_missing'],
      dtype='object')

In [447]:
## removed columns and rows not to be used for training

filt = train['condition_is_missing'] != 1
train = train[filt]
train.drop(columns=['pet_id', 'issue_date', 'days_stayed', 'condition_is_missing'], inplace=True)

In [448]:
## removed times from listing date and set as index

train['listing_date'] = train['listing_date'].apply(lambda x: x.split(" ")[0])
train.set_index('listing_date', inplace=True)

In [449]:
## looking for sig correlations between days stayed and other columns; made new columns

train['days_and_hours'] = train['total_days_stayed'] + train['total_hours_stayed']
train['l/h'] = train['length(m)']/train['height(cm)']
train['X1/X2'] = train['X1']/train['X2']
train.corr()

train = train.dropna()

In [450]:
## turns color types to numerical values but goes by order of array

#train_color = train[['color_type']]

# from sklearn.preprocessing import OrdinalEncoder
# ordinal_encoder = OrdinalEncoder()
# color_encoded = ordinal_encoder.fit_transform(train_color)
# color_encoded[:10]

In [451]:
## turns (categorical) color types to (numerical) binary values

# from sklearn.preprocessing import OneHotEncoder

# train_cat = train[['color_type']]

# cat_encoder = OneHotEncoder()
# train_cat_1hot = cat_encoder.fit_transform(train_cat)
# train_cat_1hot

In [452]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

## separate x and y columns of training set
train_output = train['days_and_hours']
train.drop(columns='days_and_hours', inplace=True)


## data in training df that is numerical
train_num = train.drop('color_type', axis=1)

# num and cat attribute column names as separate lists
num_attribs = list(train_num)
cat_attribs = ['color_type']

## transforms all columns appropriately
prepped_columns = ColumnTransformer([('num', StandardScaler(), num_attribs), ('cat', OneHotEncoder(), cat_attribs),])
train_prepared = prepped_columns.fit_transform(train)

train_prepared

<17353x66 sparse matrix of type '<class 'numpy.float64'>'
	with 208236 stored elements in Compressed Sparse Row format>

In [453]:
## training and evaluating on training set

from sklearn.linear_model import LinearRegression

X = train_prepared
y = train_output

lin_reg = LinearRegression()
lin_reg.fit(X, y)

LinearRegression()

In [455]:
## measure RMSE to see model's level of performance

from sklearn.metrics import mean_squared_error

train_predictions = lin_reg.predict(train_prepared)
lin_mse = mean_squared_error(train_output, train_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse



9.841680036472838e-05

In [None]:
## better evaluation via cross-validation