In [31]:
import numpy as np
import pandas as pd

In [32]:
# load data 
train = pd.read_csv('../data/train.csv', index_col=0)

In [33]:
# display data 
train.head()

Unnamed: 0,X1,X2,X3,X4,Y
0,,1.056681,-0.702197,1.197854,1.293048
1,0.119143,0.238021,-1.155493,0.855867,2.570588
2,-0.100293,0.192111,-1.56852,,2.183867
3,1.047916,-0.615148,0.589949,-1.428586,1.522509
4,0.406713,-0.897186,-1.246735,-0.08767,1.839782


In [34]:
# features, target 
features = train.drop('Y', axis=1)
target = train.Y

In [35]:
# process pipeline ... imputer strategy with scaling data processing
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
pipeline = Pipeline([
 ('imputer', SimpleImputer(strategy="median")),
 ('std_scaler', StandardScaler()),
 ])

transformed_features = pipeline.fit_transform(features)

In [36]:
transformed_features

array([[ 0.0708254 ,  1.61877069, -0.66105539,  1.81250389],
       [ 0.3885856 ,  0.54784565, -1.25085433,  1.37830771],
       [ 0.10128381,  0.48778873, -1.78825725,  0.07682661],
       ...,
       [ 0.0708254 ,  1.46760266,  0.88217752,  0.04431769],
       [-0.15390253, -1.69486135,  0.0136262 ,  0.57229673],
       [ 0.15062826,  0.21474924,  0.8717424 ,  0.87426099]])

In [37]:
# train model
from sklearn.linear_model import LinearRegression 
lin_reg = LinearRegression()
lin_reg.fit(transformed_features, target)

LinearRegression()

In [38]:
# dump model 
import joblib
joblib.dump(lin_reg, '../ml-package/models/model.pkl')

['../ml-package/models/model.pkl']

In [39]:
# predict on real dataset
test = pd.read_csv('../data/test.csv', index_col=0)

In [40]:
test.head()

Unnamed: 0,X1,X2,X3,X4,Y
800,-0.886586,0.020074,0.803749,-0.227414,1.701998
801,0.08417,-1.28032,-2.337387,-0.305632,1.447935
802,0.637529,0.523434,0.465955,-1.598957,4.519572
803,0.156267,0.002001,-0.446239,0.084282,1.723813
804,0.631777,0.792945,-1.359653,-0.089434,3.944611


In [41]:
features = test.drop('Y', axis=1)
target = test.Y

In [42]:
# process pipeline
transformed_features = pipeline.fit_transform(features)

In [43]:
# load model and make predictions
model = joblib.load('../ml-package/models/model.pkl')
predictions = model.predict(transformed_features)

In [44]:
from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(target, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

1.7936184215760074

Great !! We have our model, we can push forward to next level