In [40]:
import numpy as np
import pandas as pd

# make data

In [129]:
%%writefile gen_data.py
import numpy as np
import pandas as pd
import argparse

N_FEATURES = 4  # number of feature columns (x1..x4) in the generated data set


def main(argv=None):
    """Generate a synthetic linear-regression data set and write it to CSV.

    Each row holds N_FEATURES standard-normal features plus a ``target``
    column equal to the coefficient-weighted sum of the features with a small
    amount of Gaussian noise (standard normal / 20) added.  Every value is
    deliberately written as a single-quoted string (e.g. ``'0.123'``) so the
    file needs a cleaning step before it can be used.

    Args:
        argv: list of command-line tokens; ``None`` means use ``sys.argv[1:]``.

    Command-line options:
        --data / -d  number of rows (default 1000)
        --coef / -c  comma-separated coefficients, either one value (applied
                     to every feature) or exactly N_FEATURES values
                     (default 1,2,3,4)
        --seed / -s  optional integer seed for reproducible output
        --out  / -o  output CSV path (default data.csv)

    Returns:
        The generated DataFrame (with the quoted string values).

    Raises:
        SystemExit: via argparse when an option is malformed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', '-d', type=int, default=1000,
                        help='number of rows to generate (default: 1000)')
    parser.add_argument('--coef', '-c',
                        help='comma-separated true coefficients, e.g. 1,2,3,4 '
                             '(default: 1,2,3,4)')
    parser.add_argument('--seed', '-s', type=int, default=None,
                        help='random seed for reproducible data (default: unseeded)')
    parser.add_argument('--out', '-o', default='data.csv',
                        help='path of the CSV file to write (default: data.csv)')
    args = parser.parse_args(argv)

    n = args.data
    if n < 0:
        parser.error(f'--data must be non-negative, got {n}')

    # An empty/missing --coef falls back to the default coefficients.
    if args.coef:
        try:
            # float() rather than int() so fractional coefficients are accepted.
            coef = [float(tok) for tok in args.coef.split(',')]
        except ValueError:
            parser.error(f'--coef must be comma-separated numbers, got {args.coef!r}')
    else:
        coef = [1, 2, 3, 4]
    # A single coefficient broadcasts across all features; any other count
    # would otherwise fail later with an opaque numpy broadcasting error.
    if len(coef) not in (1, N_FEATURES):
        parser.error(f'--coef needs 1 or {N_FEATURES} values, got {len(coef)}')

    # A private generator keeps seeding local; seed=None draws fresh entropy.
    rng = np.random.RandomState(args.seed)
    X = rng.randn(n, N_FEATURES)
    noise = rng.randn(n) / 20
    y = np.sum(X * coef, axis=1) + noise

    df = pd.DataFrame(X, columns=[f'x{i}' for i in range(1, N_FEATURES + 1)])
    df['target'] = y
    # Wrap every value in single quotes to simulate "dirty" string data.
    for col in df.columns:
        df[col] = "'" + df[col].astype(str) + "'"

    df.to_csv(args.out)
    return df


if __name__ == '__main__':
    main()

Overwriting gen_data.py


# clean data

In [120]:
%%writefile clean_data.py
# %load clean_data.py
import pandas as pd

def main(src='data.csv', dst='cleaned_data.csv'):
    """Strip the single quotes from every value in a CSV and store floats.

    Reads ``src`` (first column is used as the index), prints the raw frame,
    removes all ``'`` characters from every column, casts each column to
    float, prints the result and writes it to ``dst``.

    Args:
        src: path of the quoted-string CSV to read.
        dst: path of the cleaned CSV to write.

    Returns:
        The cleaned DataFrame.

    Raises:
        ValueError: if a value is still not numeric after the quotes are removed.
    """
    df = pd.read_csv(src, index_col=0)
    print('Original Data format (note data is strings):')
    print(df)
    # The cast below is to float, so the message says floats (it used to say ints).
    print(f'Turn strings into floats and write out {dst}')
    for col in df.columns:
        # astype(str) first so columns that were already parsed as numbers
        # (e.g. when re-running on cleaned data) do not break the .str accessor;
        # regex=False makes the quote removal a plain literal replacement.
        df[col] = df[col].astype(str).str.replace("'", "", regex=False).astype(float)
    print(df)
    df.to_csv(dst)
    return df


if __name__ == '__main__':
    main()

Overwriting clean_data.py


# model

In [137]:
%%writefile train.py
# %load model.py
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from joblib import dump
import json

# Load the cleaned data set; the first CSV column is the row index.
df = pd.read_csv('cleaned_data.csv', index_col=0)

# Hold out the last two rows for validation and fit on everything before them.
train, valid = df[:-2], df[-2:]
X_train, y_train = train.drop(columns='target'), train['target']
X_valid, y_valid = valid.drop(columns='target'), valid['target']

# Fit ordinary least squares and score it on the held-out rows.
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_valid)
mse = mean_squared_error(y_valid, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'Coefficients: {reg.coef_}')

# Persist the fitted model.
dump(reg, 'linear_regressor.joblib')

# Write each metric to its own JSON file.
for path, payload in (
    ('mse.json', {'mse': mse}),
    ('coefs.json', {'coefs': reg.coef_.tolist()}),
):
    with open(path, 'w+') as f:
        json.dump(payload, f)

Overwriting train.py


# check model

In [131]:
%%writefile check_coeffs.py
from joblib import load

# Restore the persisted regressor from disk and show its fitted coefficients.
# joblib files are pickle-based, so only load files from a trusted source.
MODEL_PATH = 'linear_regressor.joblib'
reg = load(MODEL_PATH)
print(reg.coef_)

Overwriting check_coeffs.py


In [136]:
# Add Gaussian noise (4 standard-normal draws scaled by 1/20) to the model's
# coefficients in place.
# NOTE(review): `reg` is not defined by any notebook cell visible above (the
# %%writefile cells only write scripts) — presumably it leaked from an earlier
# interactive run; confirm, or load it from 'linear_regressor.joblib' first.
# NOTE(review): `+=` is cumulative, so re-running this cell adds noise again.
reg.coef_ += np.random.randn(4)/20