In [1]:
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import chart_studio.plotly as py
from plotly.offline import iplot, init_notebook_mode

init_notebook_mode(connected=True)

ModuleNotFoundError: No module named 'chart_studio.plotly'

In [None]:
def DrawMissing(df):
    total = df.isnull().sum().sort_values(ascending=False)
    percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return missing_data

In [None]:
# y = mx + b
class gradient:
    def __init__(self, m, b):
        self.m = m
        self.b = b
    def value(self, x):
        return self.m * x + self.b
    
def cost_function(x, y, gradient):
    sum = 0
    total_sample = x.shape[0]
    for i in range(total_sample):
        real = y[i]
        hypothesis = gradient.value(x[i])
        sum += (hypothesis - real)**2
    return np.sum(sum / (2 * total_sample))

def JmDerivative(x, y, gradient):
    sum = 0
    total_sample = x.shape[0]
    for i in range(total_sample):
        real = y[i]
        hypothesis = gradient.value(x[i])
        sum += (hypothesis - real) * x[i]
    return np.sum(sum / total_sample)

def JbDerivative(x, y, gradient):
    sum = 0
    total_sample = x.shape[0]
    for i in range(total_sample):
        real = y[i]
        hypothesis = gradient.value(x[i])
        sum += hypothesis - real
    return np.sum(sum / total_sample)

In [None]:
def LinearRegression(x, y, m=0, mStep=0.1, mTry=1000, b=0, bStep=0.1, bTry=1000): 
    costBest = sys.maxsize
    gradientBest = None
    for i in range(mTry):
        for j in range(bTry):
            mTest = m + mStep * i
            bTest = b + bStep * j
            result = cost_function(x, y, gradient(mTest, bTest))
            if (result < costBest):
                costBest = result
                gradientBest = gradient(mTest, bTest)
                # 0 is the best cost we can get, so return anyway
                if (result == 0): return gradientBest
    return gradientBest

def BatchGD(x, y, m, b, learningRate, iter):
    mList = []
    bList = []
    cList = []
    for i in range(iter):
        derivative_cost_m = JmDerivative(x, y, gradient(m, b))
        derivative_cost_b = JbDerivative(x, y, gradient(m, b))
        m = m - learningRate * derivative_cost_m
        b = b - learningRate * derivative_cost_b
        c = cost_function(x, y, gradient(m, b))
        mList.append(m)
        bList.append(b)
        cList.append(c)
    return mList, bList, cList

def StochasticGD(x, y, m, b, learningRate):
    
    def CostDerivative(x1, y1, m, b, biasDerivative=False):
        hypothesis = gradient(m, b).value(x1)
        if (biasDerivative):
            return np.sum(hypothesis - y1)
        return np.sum((hypothesis - y1) * x1)
    
    mList = []
    bList = []
    cList = []
    total_sample = x.shape[0]
    for i in range(total_sample):
        derivative_m = CostDerivative(x[i], y[i], m, b, False)
        derivative_b = CostDerivative(x[i], y[i], m, b, True)
        m = m - learningRate * derivative_m
        b = b - learningRate * derivative_b
        c = cost_function(x, y, gradient(m, b))
        mList.append(m)
        bList.append(b)
        cList.append(c)
    return mList, bList, cList

In [None]:
datacsv  = pd.read_csv('input/house/train.csv')
dataset  = pd.DataFrame(datacsv, columns = ['OverallCond'])
targets  = pd.DataFrame(datacsv, columns = ['SalePrice'])

missing_dataset = DrawMissing(dataset)
display(missing_dataset)

In [None]:
print(dataset.head())
print(targets.head())

targets = targets.to_numpy()
dataset = dataset.to_numpy()

### Regresi Metode Statistika 

#### Paramaters for Guessing Gradient

In [None]:
mStart = 10
mStep  = 0.5
mIter  = 100
bStart = 50000
bStep  = 10000
bIter  = 100

In [None]:
best = LinearRegression(dataset, targets, mStart, mStep, mIter, bStart, bStep, bIter)

minIndex = np.argmin(dataset)
maxIndex = np.argmax(dataset)

xGrad = [dataset[minIndex], dataset[maxIndex]]
yGrad = [best.value(dataset[minIndex]), best.value(dataset[maxIndex])]

print("Cost: ", cost_function(dataset, targets, best))
print("m: ", best.m)
print("b: ", best.b)

plt.scatter(dataset, targets, label='Data')
plt.plot(xGrad, yGrad, color='Red', label='y = mx + b')
plt.xlabel("X (Input)")
plt.ylabel("Y (Output)")
plt.legend()
plt.show()

#### Parameters for Using Gradient Descent

In [None]:
learningRate = 0.01
iteration = 100
mStart = 10
bStart = 50000

### Regresi Metode Batch Gradient Descent

In [None]:
m, b, cost = BatchGD(dataset, targets, mStart, bStart, learningRate, iteration)

minIndex = np.argmin(cost)
print("Minimum Cost: ", cost[minIndex])
print("m: ", m[minIndex])
print("b: ", b[minIndex])

ax = plt.axes(projection='3d')
ax.plot3D(m, b, cost, 'Red')
ax.set_xlabel('Weight (m)')
ax.set_ylabel('Bias (b)')
ax.set_zlabel('Cost (J)')
ax.set_title('Batch Gradient Descent')
plt.show()

### Regresi Stokastik Gradient Descent

In [None]:
m, b, cost = StochasticGD(dataset, targets, mStart, bStart, learningRate)

minIndex = np.argmin(cost)
print("Minimum Cost: ", cost[minIndex])
print("m: ", m[minIndex])
print("b: ", b[minIndex])

ax = plt.axes(projection='3d')
ax.plot3D(m, b, cost, 'Red')
ax.set_xlabel('Weight (m)')
ax.set_ylabel('Bias (b)')
ax.set_zlabel('Cost (J)')
ax.set_title('Batch Gradient Descent')
plt.show()