Imports first.

In [75]:
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

Next, we will load the challenge1 csv into a dataframe.
TODO: save the df to postgresql

In [2]:
df = pd.read_csv("challenge1.csv")
df

Unnamed: 0,loc1,loc2,para1,dow,para2,para3,para4,price
0,0,01,1,Mon,662,3000.0,3.8,73.49
1,9,99,1,Thu,340,2760.0,9.2,300.00
2,0,04,0,Mon,16,2700.0,3.0,130.00
3,4,40,1,Mon,17,12320.0,6.4,365.00
4,5,50,1,Thu,610,2117.0,10.8,357.50
...,...,...,...,...,...,...,...,...
9995,9,98,3,Fri,386,5000.0,12.0,460.00
9996,7,74,1,Thu,386,3250.0,8.0,325.00
9997,0,06,0,Tue,190,8856.0,5.6,133.33
9998,7,74,3,Fri,717,5000.0,13.6,820.00


I could just be missing it, but I don't know an easy way to put a numeric value on the categorical variable for the day of the week. In case there's a seasonality I'll appreciate the numeric data.

In [11]:
dow_numeric = []
for value in df['dow']:
    if value == 'Mon':
        dow_numeric.append(0)
    elif value == 'Tue':
        dow_numeric.append(1)
    elif value == 'Wed':
        dow_numeric.append(2)
    elif value == 'Thu':
        dow_numeric.append(4)
    elif value == 'Fri':
        dow_numeric.append(5)
    elif value == 'Sat':
        dow_numeric.append(6)
    elif value == 'Sun':
        dow_numeric.append(7)
    else:
        dow_numeric.append(pd.NA)
df.insert(4, 'dow_numeric', dow_numeric)
df

Unnamed: 0,loc1,loc2,para1,dow,dow_numeric,para2,para3,para4,price
0,0,01,1,Mon,0,662,3000.0,3.8,73.49
1,9,99,1,Thu,4,340,2760.0,9.2,300.00
2,0,04,0,Mon,0,16,2700.0,3.0,130.00
3,4,40,1,Mon,0,17,12320.0,6.4,365.00
4,5,50,1,Thu,4,610,2117.0,10.8,357.50
...,...,...,...,...,...,...,...,...,...
9995,9,98,3,Fri,5,386,5000.0,12.0,460.00
9996,7,74,1,Thu,4,386,3250.0,8.0,325.00
9997,0,06,0,Tue,1,190,8856.0,5.6,133.33
9998,7,74,3,Fri,5,717,5000.0,13.6,820.00


Next I just `describe()` the data.

In [12]:
df.describe()

Unnamed: 0,para1,dow_numeric,para2,para3,para4,price
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,1.3808,2.3831,447.384,9547.98992,8.459522,433.733056
std,3.500831,1.828953,221.005861,8022.814037,4.613526,277.435947
min,0.0,0.0,16.0,200.0,1.0,50.73
25%,1.0,1.0,301.0,2898.0,4.0,250.0
50%,1.0,2.0,434.0,6447.0,7.2,370.0
75%,1.0,4.0,582.0,15000.0,13.6,550.0
max,337.0,7.0,2554.0,34782.0,27.2,5700.0


I noticed that the mean of the dow_numeric column looked 'low' to me, so I wanted to explore that a little deeper.

In [32]:
df.groupby(by=['dow']).size()

dow
Fri    1931
Mon    1918
Sat      10
Sun       3
Thu    1908
Tue    1997
Wed    2233
dtype: int64

Never mind that, Wednesday is a little overrepresented but the main reason for the low mean is there was barely any data on Saturday and Sunday specifically, which makes sense because those aren't usually business days. Doesn't look like I need to care about seasonality.

Instead, I'll go ahead and build a linear regression model from the four parameters to predict the price. Using train_test_split so I don't have to initialize the four sets individually and don't have to worry about calculating how many records I should use. Does everything I need in just one line!

In [43]:
# split the data set into the features and regressand

df_regressors = df[['para1', 'para2', 'para3', 'para4']]
df_regressand = df['price']

# divide the data sets into train and test sets
df_regressors_train, df_regressors_test, df_regressand_train, df_regressand_test = train_test_split(df_regressors, df_regressand, test_size=0.2)

In [64]:
reg = linear_model.LinearRegression()
reg.fit(df_regressors_train, df_regressand_train)

df_regressand_prediction = reg.predict(df_regressors_test)


In [73]:
r2_error = r2_score(df_regressand_test, df_regressand_prediction)

print(f"Coefficients: {reg.coef_}")
print(f"Mean Squared Error: {mean_squared_error(df_regressand_test, df_regressand_prediction):.2f}")
print(f"R Squared Error: {r2_error*100:.2f}%")
print(f"Adjusted R Squared Error: {(1-(1-r2_error)*((2000-1)/(2000-4-1)))*100:.2f}%")

Coefficients: [ 2.65417073e+00  6.86104929e-01 -1.53541046e-04  3.08434234e+01]
Mean Squared Error: 26010.79
R Squared Error: 58.05%
Adjusted R Squared Error: 57.97%
