In [380]:
import numpy as np
import pandas as pd

*** DATA PREPROCESSING ***

In [398]:
# Load both CSV exports and stack them into one training frame.
# Forward slashes resolve on Windows as well as POSIX systems, and avoid the
# invalid '\d' escape sequence that the backslash-separated paths relied on.
train_a = pd.read_csv('Data/data.csv')
train_b = pd.read_csv('Data/data2.csv')

merged = [train_a, train_b]
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each file's own row labels.
train = pd.concat(merged, ignore_index=True)
train.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [399]:
from sklearn.preprocessing import LabelEncoder

def _bucket_years(years, buckets):
    """Return a copy of ``years`` with each bucketed value replaced by its code.

    ``buckets`` is a sequence of ``(low, high)`` pairs.  A value ``v`` with
    ``low <= v < high`` is replaced by the 1-based position of that pair;
    ``high=None`` leaves the bucket open-ended.  Values that fall in no bucket
    (including NaN) are returned unchanged.
    """
    coded = years.copy()
    for code, (low, high) in enumerate(buckets, start=1):
        # Masks are computed on the untouched input, so a code written for one
        # bucket can never be matched again by a later bucket.
        in_bucket = years >= low
        if high is not None:
            in_bucket &= years < high
        coded[in_bucket] = code
    return coded


def data_preprocessing(data):
    """Drop unused columns and bucket the year columns into ordinal codes.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'date', 'statezip', 'country', 'street', 'city',
        'waterfront', 'yr_built' and 'yr_renovated'.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped columns, where
        ``yr_built`` is coded 1: 1900-1949, 2: 1950-1969, 3: 1970-1989,
        4: 1990-2009, 5: 2010 and later, and
        ``yr_renovated`` is coded 1: 1900-1949, 2: 1950-1969, 3: 1970-1989,
        4: 1990-2014.
        Values outside those ranges are kept as they are.  ``data`` itself is
        not modified, so the call can be repeated on the same raw frame.

    Raises
    ------
    KeyError
        If any of the expected columns is missing from ``data``.
    """
    # Drop irrelevant columns
    unused_columns = ['date', 'statezip', 'country', 'street', 'city', 'waterfront']
    # drop() without inplace returns a new frame and leaves the input intact.
    kept = data.drop(columns=unused_columns)

    return kept.assign(
        yr_built=_bucket_years(
            kept['yr_built'],
            [(1900, 1950), (1950, 1970), (1970, 1990), (1990, 2010), (2010, None)],
        ),
        # The last bucket ends at 2015 (exclusive), i.e. 2014 inclusive for
        # whole-number years.  A value of 0 falls in no bucket and stays 0
        # (presumably the marker for "never renovated" -- TODO confirm).
        yr_renovated=_bucket_years(
            kept['yr_renovated'],
            [(1900, 1950), (1950, 1970), (1970, 1990), (1990, 2015)],
        ),
    )
    

*** Prepare Data ***

In [400]:
# Rebinds `train` to the preprocessed frame, so the raw concatenated data is no
# longer reachable under this name.  Re-run the loading cell before re-running
# this one: data_preprocessing expects the original columns to be present.
train = data_preprocessing(train)


In [401]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



# Separate the regression target from the predictor columns.
y = train['price']
X = train.drop(columns=['price'])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to the held-out rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert the targets to plain NumPy arrays.
y_train, y_test = np.array(y_train), np.array(y_test)




*** MODELING ***

In [402]:
class LinearRegression:
    """Linear regression fitted by full-batch gradient descent on the mean squared error.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size applied to every gradient update.
    iterations : int, default 500
        Number of gradient-descent updates performed by ``fit``.

    Attributes
    ----------
    weight : numpy.ndarray of shape (n_features,), or None before ``fit``
    bias : float, or None before ``fit``
    """

    def __init__(self, learning_rate=0.01, iterations=500):
        self.weight = None
        self.bias = None
        self.learning_rate = learning_rate
        self.iterations = iterations

    def fit(self, X, y):
        """Learn ``weight`` and ``bias`` from the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like with n_samples values; a column vector of shape
            (n_samples, 1) is accepted and flattened.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, has no rows, or ``y`` does not hold exactly
            one value per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        # Flatten the target: a (m, 1) column would otherwise broadcast against
        # the (m,) predictions into an (m, m) matrix and corrupt the gradients.
        y = np.asarray(y, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        m, n = X.shape
        if m == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != m:
            raise ValueError(f"X has {m} samples but y has {y.shape[0]} values")

        self.weight = np.zeros(n)
        self.bias = 0.0

        for _ in range(self.iterations):
            # The residual is shared by both gradients, so compute it once.
            residual = np.dot(X, self.weight) + self.bias - y

            dw = np.dot(X.T, residual) / m
            db = np.sum(residual) / m

            self.weight = self.weight - (self.learning_rate * dw)
            self.bias = self.bias - (self.learning_rate * db)

    def predict(self, X):
        """Return ``X @ weight + bias`` for each row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.weight is None or self.bias is None:
            raise RuntimeError("LinearRegression.predict() called before fit()")
        y_pred = np.dot(np.asarray(X, dtype=float), self.weight) + self.bias
        return y_pred

    def score(self, y_pred, y):
        """Return the mean squared error between ``y_pred`` and ``y`` (lower is better).

        Both inputs are flattened first, so row and column vectors can be mixed
        without the difference broadcasting into a matrix.
        """
        y_pred = np.asarray(y_pred, dtype=float).ravel()
        y = np.asarray(y, dtype=float).ravel()
        loss = np.mean((y_pred - y) ** 2)
        return loss
    

In [403]:
# Fit the gradient-descent regressor on the standardized training features
model = LinearRegression(iterations=1000, learning_rate=0.1)
model.fit(X=X_train_scaled, y=y_train)

In [404]:
# Predict prices for both the training split and the held-out test split
y_preds_test = model.predict(X=X_test_scaled)
y_preds_train = model.predict(X=X_train_scaled)


In [408]:
# Review the model performance

# `score` returns the mean squared error of each split, so lower is better.
test_score = model.score(y_pred=y_preds_test, y=y_test)
training_score = model.score(y_pred=y_preds_train, y=y_train)

print(f'Training score: {training_score}')
print(f"Test score:     {test_score}")


Training score: 86583384836.59053
Test score:     632885196254.1545


In [416]:
# Spot-check one held-out row: predicted price for test row 3, shown to 3 decimals
prediction = model.predict(X_test_scaled)[3]
formatted_prediction = format(prediction, ".3f")
print(formatted_prediction)


918884.474


In [417]:
# Actual price of the same test row, for comparison with the prediction printed above
y_test[3]

736500.0

In [411]:
import joblib

# Persist the fitted model to a file in the current working directory.
# NOTE(review): LinearRegression is defined inside this notebook; pickle-based
# serialization stores classes by reference, so presumably the same class
# definition must be importable wherever this file is loaded -- confirm before reuse.
joblib.dump(model, 'linear_regression_model.pkl')

['linear_regression_model.pkl']