In [34]:
import numpy as np
from sklearn.preprocessing import StandardScaler

In [35]:
train_data = np.genfromtxt("train.csv", delimiter=",")

In [36]:
test_data = np.genfromtxt("test.csv", delimiter=",")

In [37]:
train_data.shape

(379, 14)

In [38]:
test_data.shape

(127, 13)

In [39]:
X = train_data[:, :13]
Y = train_data[:, 13]

In [40]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [41]:
train_data = np.column_stack((X_scaled, Y))

In [42]:
X.shape

(379, 13)

In [43]:
Y.shape

(379,)

In [44]:
def scale_test_data(test_data, scaler):
    return scaler.transform(test_data)

In [45]:
scaled_X = scale_test_data(X, scaler)

In [46]:
scaled_test_data = scale_test_data(test_data, scaler)

In [47]:
# This function finds the new gradient at each step for multiple features
def step_gradient(points, learning_rate, m, c):
    m_slope = np.zeros_like(m)
    c_slope = 0
    M = len(points)

    for i in range(M):
        x = points[i, :-1]  # Exclude the last column which is the target variable
        y = points[i, -1]   # Target variable

        m_slope += (-2/M) * (y - np.dot(m, x) - c) * x
        c_slope += (-2/M) * (y - np.dot(m, x) - c)

    new_m = m - learning_rate * m_slope
    new_c = c - learning_rate * c_slope
    return new_m, new_c

In [48]:
# The Gradient Descent Function for multiple features
def gd(points, learning_rate, num_iterations):
    m = np.zeros(points.shape[1] - 1)  # Initialize weights for each feature
    c = 0
    for i in range(num_iterations):
        m, c = step_gradient(points, learning_rate, m, c)
        print(i, "Cost : ", cost(points, m, c))
    return m, c

In [49]:
# This function finds the new cost after each optimization for multiple features
def cost(points, m, c):
    total_cost = 0
    M = len(points)

    for i in range(M):
        x = points[i, :-1]  # Exclude the last column which is the target variable
        y = points[i, -1]   # Target variable

        total_cost += (1/M) * ((y - np.dot(m, x) - c)**2)
    return total_cost

In [50]:
def run():
    data = scaled_X
    learning_rate = 0.01
    num_iterations = 1000
    m, c = gd(data, learning_rate, num_iterations)
    print("Weights (m):", m)
    print("Intercept (c):", c)
    return m, c

In [51]:
run()

0 Cost :  0.8951209071565263
1 Cost :  0.8112681127370357
2 Cost :  0.7440751114141769
3 Cost :  0.6900875225436258
4 Cost :  0.6465723566910327
5 Cost :  0.6113671741206748
6 Cost :  0.5827607914997678
7 Cost :  0.5593989382530421
8 Cost :  0.5402096441595582
9 Cost :  0.5243442312664739
10 Cost :  0.5111306463819749
11 Cost :  0.5000365530522607
12 Cost :  0.4906401417881894
13 Cost :  0.4826070442503708
14 Cost :  0.4756720747450377
15 Cost :  0.4696247894048884
16 Cost :  0.46429806460061
17 Cost :  0.4595590631315911
18 Cost :  0.45530208881683976
19 Cost :  0.4514429345553256
20 Cost :  0.44791441152682826
21 Cost :  0.44466281252923856
22 Cost :  0.44164511410984764
23 Cost :  0.4388267630042405
24 Cost :  0.43617992470709693
25 Cost :  0.43368209755184445
26 Cost :  0.4313150158842141
27 Cost :  0.42906378189619776
28 Cost :  0.4269161783257121
29 Cost :  0.424862124222646
30 Cost :  0.4228932438867101
31 Cost :  0.4210025253340127
32 Cost :  0.41918404959322664
33 Cost :  0.41

(array([ 0.11934211,  0.02198045,  0.09153026, -0.05528189,  0.16497196,
        -0.40897173,  0.30528339,  0.06771261,  0.00333266, -0.01564392,
         0.02671131, -0.13386579]),
 -4.091345318091297e-17)

In [52]:
def predict(data, m, c):
    predictions = []
    for i in range(len(data)):
        x_test = data[i, :-1]  
        y_pred = np.dot(m, x_test) + c
        predictions.append(y_pred)
    return np.array(predictions)

In [53]:
m = [ 0.11934211,  0.02198045,  0.09153026, -0.05528189,  0.16497196,
        -0.40897173,  0.30528339,  0.06771261,  0.00333266, -0.01564392,
         0.02671131, -0.13386579]
c = -4.091345318091297e-17

predictions = predict(scaled_X, m, c)
print(predictions)

[-5.86660699e-01 -4.72061201e-01  1.18509110e+00  6.29557004e-01
 -4.16363275e-01 -1.05848887e+00  2.58540349e-02  8.03871048e-01
 -4.25430783e-03 -1.29862455e+00 -4.09799631e-01 -1.42685418e-01
 -1.36219494e+00  6.18255315e-01  3.61551716e-01  4.34066212e-01
 -2.85810118e-01  2.21907551e-02  7.06178382e-02 -1.21139881e+00
 -3.14724183e-01 -8.46556611e-01 -1.22380660e+00 -4.67208940e-01
  8.56758069e-01  2.03819588e-01  8.99315713e-01 -8.12693654e-01
 -1.24084373e+00 -5.74741365e-01  3.30245585e-01  4.03939432e-01
 -1.14516072e+00  3.37073877e-01 -9.45886410e-01  2.14573083e+00
  1.32025742e+00  1.02221268e+00 -1.04146994e+00 -3.78126338e-01
  2.61211022e-01  5.14229513e-01 -1.81548777e-01  2.33972194e+00
  5.66134978e-01  6.87261996e-01 -2.87158144e-01 -3.23210898e-01
  1.46015509e+00  8.43580270e-01  7.92130346e-01  9.09773950e-02
  1.00410283e-01 -1.30678067e+00 -1.21825667e+00  1.01715299e+00
 -8.46532578e-01  4.70828886e-01 -9.31100246e-01 -9.63039428e-01
  1.12478403e+00  3.48895

In [54]:
def run():
    data = scaled_test_data
    learning_rate = 0.01
    num_iterations = 1000
    m, c = gd(data, learning_rate, num_iterations)
    print("Weights (m):", m)
    print("Intercept (c):", c)
    return m, c

In [55]:
run()

0 Cost :  0.7868969507434438
1 Cost :  0.7021910361791839
2 Cost :  0.6336562225405072
3 Cost :  0.5780895574292135
4 Cost :  0.5329262687600278
5 Cost :  0.4961124596689048
6 Cost :  0.46600320506782666
7 Cost :  0.44128098104639585
8 Cost :  0.42089036979545325
9 Cost :  0.4039857923631133
10 Cost :  0.3898896696260817
11 Cost :  0.3780589306088453
12 Cost :  0.36805820251660243
13 Cost :  0.35953834922143
14 Cost :  0.35221929098974497
15 Cost :  0.3458762511978499
16 Cost :  0.340328746245274
17 Cost :  0.3354317713228235
18 Cost :  0.33106874391154556
19 Cost :  0.3271458543135248
20 Cost :  0.3235875424946698
21 Cost :  0.32033287653467013
22 Cost :  0.3173326528166346
23 Cost :  0.3145470739790982
24 Cost :  0.31194388938148304
25 Cost :  0.30949690582984873
26 Cost :  0.30718479471685
27 Cost :  0.3049901364638212
28 Cost :  0.3028986549468934
29 Cost :  0.30089860402958507
30 Cost :  0.2989802758811116
31 Cost :  0.29713560680856166
32 Cost :  0.29535786117302587
33 Cost :  0.

(array([ 1.08034750e-01,  1.29765030e-01,  8.19783746e-02,  1.54370555e-02,
        -1.65073647e-01, -4.72426575e-01,  4.57389608e-01,  1.13076135e-04,
         1.56187014e-01,  2.72133324e-02,  7.80576700e-02, -3.70242727e-02]),
 -0.018832917771753385)

In [56]:
def hyperparameter_tuning(train_data, test_data, learning_rates, num_iterations):
    best_m = None
    best_c = None
    best_cost = float('inf')

    for lr in learning_rates:
        for num_iter in num_iterations:
            m, c = gd(train_data, lr, num_iter)
            current_cost = cost(train_data, m, c)

            print("Learning Rate:", lr, "Num Iterations:", num_iter)
            print("Weights (m):", m)
            print("Intercept (c):", c)
            print("Training Cost:", current_cost)

            # Evaluate on test data
            test_cost = cost(test_data, m, c)
            print("Testing Cost:", test_cost)

            if test_cost < best_cost:
                best_cost = test_cost
                best_m = m
                best_c = c

    return best_m, best_c

# Example of trying different combinations
learning_rates_to_try = [0.01, 0.1, 0.5]
num_iterations_to_try = [500, 1000, 1500]

best_m, best_c = hyperparameter_tuning(scaled_X, scaled_test_data, learning_rates_to_try, num_iterations_to_try)

# Example of predicting with the best combination
predictions = predict(scaled_test_data, best_m, best_c)
print(predictions)


0 Cost :  0.8951209071565263
1 Cost :  0.8112681127370357
2 Cost :  0.7440751114141769
3 Cost :  0.6900875225436258
4 Cost :  0.6465723566910327
5 Cost :  0.6113671741206748
6 Cost :  0.5827607914997678
7 Cost :  0.5593989382530421
8 Cost :  0.5402096441595582
9 Cost :  0.5243442312664739
10 Cost :  0.5111306463819749
11 Cost :  0.5000365530522607
12 Cost :  0.4906401417881894
13 Cost :  0.4826070442503708
14 Cost :  0.4756720747450377
15 Cost :  0.4696247894048884
16 Cost :  0.46429806460061
17 Cost :  0.4595590631315911
18 Cost :  0.45530208881683976
19 Cost :  0.4514429345553256
20 Cost :  0.44791441152682826
21 Cost :  0.44466281252923856
22 Cost :  0.44164511410984764
23 Cost :  0.4388267630042405
24 Cost :  0.43617992470709693
25 Cost :  0.43368209755184445
26 Cost :  0.4313150158842141
27 Cost :  0.42906378189619776
28 Cost :  0.4269161783257121
29 Cost :  0.424862124222646
30 Cost :  0.4228932438867101
31 Cost :  0.4210025253340127
32 Cost :  0.41918404959322664
33 Cost :  0.41

  total_cost += (1/M) * ((y - np.dot(m, x) - c)**2)


248 Cost :  inf
249 Cost :  inf
250 Cost :  inf
251 Cost :  inf
252 Cost :  inf
253 Cost :  inf
254 Cost :  inf
255 Cost :  inf
256 Cost :  inf
257 Cost :  inf
258 Cost :  inf
259 Cost :  inf
260 Cost :  inf
261 Cost :  inf
262 Cost :  inf
263 Cost :  inf
264 Cost :  inf
265 Cost :  inf
266 Cost :  inf
267 Cost :  inf
268 Cost :  inf
269 Cost :  inf
270 Cost :  inf
271 Cost :  inf
272 Cost :  inf
273 Cost :  inf
274 Cost :  inf
275 Cost :  inf
276 Cost :  inf
277 Cost :  inf
278 Cost :  inf
279 Cost :  inf
280 Cost :  inf
281 Cost :  inf
282 Cost :  inf
283 Cost :  inf
284 Cost :  inf
285 Cost :  inf
286 Cost :  inf
287 Cost :  inf
288 Cost :  inf
289 Cost :  inf
290 Cost :  inf
291 Cost :  inf
292 Cost :  inf
293 Cost :  inf
294 Cost :  inf
295 Cost :  inf
296 Cost :  inf
297 Cost :  inf
298 Cost :  inf
299 Cost :  inf
300 Cost :  inf
301 Cost :  inf
302 Cost :  inf
303 Cost :  inf
304 Cost :  inf
305 Cost :  inf
306 Cost :  inf
307 Cost :  inf
308 Cost :  inf
309 Cost :  inf
310 Cost

  m_slope += (-2/M) * (y - np.dot(m, x) - c) * x
  c_slope += (-2/M) * (y - np.dot(m, x) - c)


495 Cost :  nan
496 Cost :  nan
497 Cost :  nan
498 Cost :  nan
499 Cost :  nan
Learning Rate: 0.5 Num Iterations: 500
Weights (m): [nan nan nan nan nan nan nan nan nan nan nan nan]
Intercept (c): nan
Training Cost: nan
Testing Cost: nan
0 Cost :  10.176840230795724
1 Cost :  201.95389453625134
2 Cost :  4147.114888963956
3 Cost :  85299.64500306793
4 Cost :  1754617.5211646105
5 Cost :  36092698.21509193
6 Cost :  742431378.314562
7 Cost :  15271907718.219557
8 Cost :  314145080970.0086
9 Cost :  6462004205408.178
10 Cost :  132924247044767.27
11 Cost :  2734268640313140.5
12 Cost :  5.6244253126233464e+16
13 Cost :  1.1569514286516992e+18
14 Cost :  2.3798637796025537e+19
15 Cost :  4.895410013940371e+20
16 Cost :  1.00699205601549e+22
17 Cost :  2.0713954459191393e+23
18 Cost :  4.260886734650221e+24
19 Cost :  8.764698117535054e+25
20 Cost :  1.802909532112426e+27
21 Cost :  3.708607800739634e+28
22 Cost :  7.628653337692371e+29
23 Cost :  1.5692236783592596e+31
24 Cost :  3.227913

In [60]:
np.savetxt("predictions.csv", predictions, delimiter=",", fmt='%f')
