In [40]:
import numpy as np
import pandas as pd

# make data

In [123]:
%%writefile gen_data.py
"""Generate a synthetic, deliberately "dirty" regression dataset as data.csv.

The target is a fixed linear combination of four standard-normal features
plus a small amount of Gaussian noise. Every value is written as a
single-quoted string so that a later cleaning step has something to undo.

Usage:
    python gen_data.py [--data N] [--seed S]
"""
import argparse

import numpy as np
import pandas as pd

DEFAULT_ROWS = 1000
FEATURE_NAMES = ['x1', 'x2', 'x3', 'x4']
# True coefficients of the linear relationship between features and target.
WEIGHTS = [1, 2, 3, 4]


def build_dataset(n, seed=None):
    """Return a DataFrame of ``n`` rows with quoted-string feature/target cells.

    Parameters
    ----------
    n : int
        Number of rows to generate; must be >= 0.
    seed : int or None
        When given, seeds NumPy's global random state first so the same
        ``(n, seed)`` pair always yields the same data. ``None`` leaves the
        random state untouched (non-reproducible output).

    Returns
    -------
    pandas.DataFrame
        Columns ``x1..x4`` and ``target``; every cell is a string of the form
        ``"'<float>'"``.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f'number of rows must be non-negative, got {n}')
    if seed is not None:
        np.random.seed(seed)

    X = np.random.randn(n, len(FEATURE_NAMES))
    noise = np.random.randn(n) / 20
    y = np.sum(X * WEIGHTS, axis=1) + noise

    df = pd.DataFrame(X, columns=FEATURE_NAMES)
    df['target'] = y
    # Wrap each value in single quotes on purpose: this is the "dirt" that
    # the cleaning script is expected to strip.
    for col in df.columns:
        df[col] = "'" + df[col].astype(str) + "'"
    return df


def parse_args(argv=None):
    """Parse command-line options; ``argv=None`` means use ``sys.argv``."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', '-d', type=int, default=DEFAULT_ROWS,
                        help='specify data amount')
    parser.add_argument('--seed', '-s', type=int, default=None,
                        help='random seed for reproducible output')
    return parser.parse_args(argv)


def main(argv=None):
    """Generate the dataset and write it to data.csv in the working directory."""
    args = parse_args(argv)
    df = build_dataset(args.data, seed=args.seed)
    df.to_csv('data.csv')


if __name__ == '__main__':
    main()

Overwriting gen_data.py


# clean data

In [120]:
%%writefile clean_data.py
"""Strip the single quotes from data.csv and write numeric cleaned_data.csv.

Usage:
    python clean_data.py
"""
import pandas as pd


def clean_frame(raw):
    """Return a copy of ``raw`` with quotes removed and every column as float.

    Each column is first cast to ``str`` so that columns which are already
    numeric pass through unchanged instead of failing on the ``.str``
    accessor; this makes the cleaning step safe to re-run on clean data.
    The input frame is not modified.

    Raises
    ------
    ValueError
        If a cell is not a valid float once the quotes are removed.
    """
    cleaned = raw.copy()
    for col in cleaned.columns:
        unquoted = cleaned[col].astype(str).str.replace("'", "", regex=False)
        cleaned[col] = unquoted.astype(float)
    return cleaned


def main():
    """Read data.csv, show before/after, and write cleaned_data.csv."""
    df = pd.read_csv('data.csv', index_col=0)
    print('Original Data format (note data is strings):')
    print(df)
    # The conversion target is float, not int; the message says so.
    print('Turn strings into floats and write out cleaned_data.csv')
    df = clean_frame(df)
    print(df)
    df.to_csv('cleaned_data.csv')


if __name__ == '__main__':
    main()

Overwriting clean_data.py


# model

In [121]:
%%writefile train.py
# Fit a linear regression on cleaned_data.csv, report its validation MSE,
# then persist both the fitted model and the metric.
import json

import pandas as pd
from joblib import dump
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

df = pd.read_csv('cleaned_data.csv', index_col=0)

# Separate predictors from the label first, then slice rows: everything but
# the final two rows is used for fitting, the final two are held out.
features = df.drop('target', axis=1)
labels = df['target']
X_train, y_train = features[:-2], labels[:-2]
X_valid, y_valid = features[-2:], labels[-2:]

reg = LinearRegression()
reg.fit(X_train, y_train)
mse = mean_squared_error(y_valid, reg.predict(X_valid))
print(f'Mean Squared Error: {mse}')
print(f'Coefficients: {reg.coef_}')

# Persist the fitted estimator.
dump(reg, 'linear_regressor.joblib')

# Persist the validation metric as JSON.
with open('mse.json', 'w+') as f:
    json.dump({'mse': mse}, f)

Overwriting train.py


In [107]:
# NOTE(review): `reg` is not assigned by any notebook cell visible here --
# train.py is only written to disk by %%writefile, not executed in the kernel.
# The recorded coefficients (~[1, 1, 1, 1]) also differ from the [1, 2, 3, 4]
# weights used in gen_data.py, so this output may be stale -- confirm on a
# fresh Restart & Run All.
reg.coef_

array([0.99844104, 1.00035375, 0.99836077, 0.99975128])

In [108]:
# NOTE(review): `df` is not assigned by any notebook cell visible here (the
# %%writefile cells only write scripts to disk). Presumably this is the
# contents of cleaned_data.csv left over from an earlier session -- confirm.
df

Unnamed: 0,x1,x2,x3,x4,target
0,-1.705263,0.392149,1.400464,-0.136336,-0.089926
1,-0.574122,1.483284,-0.573647,-1.504295,-1.244547
2,-1.342138,-2.760512,1.755665,0.904369,-1.436583
3,1.237557,1.488433,1.890893,0.000345,4.546877
4,0.327471,0.899484,-0.697811,1.922400,2.509720
...,...,...,...,...,...
995,0.982858,2.208287,0.288221,0.324280,3.757028
996,-2.385402,1.086120,-1.535742,1.892268,-0.952748
997,0.795285,0.311871,1.475341,0.668045,3.233518
998,0.224303,-0.226151,-0.021580,-0.458110,-0.441979


In [19]:
# NOTE(review): `train` is not assigned by any notebook cell visible here.
# The recorded output has columns A-D plus target with a Date index, which
# does not match the x1-x4 schema written by gen_data.py -- likely leftover
# kernel state from a different dataset; confirm or remove.
train

Unnamed: 0_level_0,A,B,C,D,target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1/1/2020,1,2,3,4,10
1/2/2020,10,4,20,5,40
1/3/2020,5,100,0,0,103
1/4/2020,5,15,5,200,227


In [16]:
# NOTE(review): `X` is not assigned by any notebook cell visible here.
# The recorded output has columns A-D with a Date index, which does not
# match the x1-x4 schema written by gen_data.py -- likely leftover kernel
# state from a different dataset; confirm or remove.
X

Unnamed: 0_level_0,A,B,C,D
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1/1/2020,1,2,3,4
1/2/2020,10,4,20,5
1/3/2020,5,100,0,0
1/4/2020,5,15,5,200
1/5/2020,300,200,10,10
1/6/2020,18,12,17,2
