The Combined Cycle Power Plant dataset contains 9568 data points collected from a combined cycle power plant over 6 years (2006-2011), while the plant was set to work at full load. Features consist of the hourly average ambient variables Temperature (T), Ambient Pressure (AP), Relative Humidity (RH) and Exhaust Vacuum (V), which are used to predict the net hourly electrical energy output (EP) of the plant.

Your task is to:
1. Code Gradient Descent for N features and come up with predictions.
2. Try and test with various combinations of learning rates and number of iterations.
3. Try using Feature Scaling, and see if it helps you in getting better results. 

Read Instructions carefully -
1. Use Gradient Descent as a training algorithm and submit results predicted.
2. Files are in csv format, you can use genfromtxt function in numpy to load data from csv file. Similarly you can use savetxt function to save data into a file.
3. Submit a csv file with only predictions for X test data. File should not have any headers and should only have one column i.e. predictions. Also predictions shouldn't be in exponential form.
4. Your score is based on the coefficient of determination, so it is possible that nobody gets a full score.

You are given:
    1. A Readme file for more details on dataset. 
    2. A Training dataset csv file with X train and Y train data
    3. An X test file; you have to predict and submit predictions for this file.
    


# Import Data

In [10]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [52]:
# Load the comma-separated training file (features plus target) and the
# evaluation file (features only) into NumPy arrays.
# NOTE(review): both paths are absolute and specific to one Windows machine;
# they must be edited before this cell can run anywhere else.
train_data = np.loadtxt(r'C:\Users\DELL\Desktop\CodingNinjas\Gradient Descent TASK-2\training_ccpp_x_y_train.csv',delimiter=',')
eval_data  = np.loadtxt(r'C:\Users\DELL\Desktop\CodingNinjas\Gradient Descent TASK-2\test_ccpp_x_test.csv',delimiter=',')

In [53]:
train_data.shape

(7176, 5)

In [54]:
eval_data.shape

(2392, 4)

# Split train_data into input and output values

In [55]:
# The first four columns are the input features; the fifth column is the target.
x, y = train_data[:, :4], train_data[:, 4]

In [56]:
print(x.shape)
print(y.shape)

(7176, 4)
(7176,)


# Multivariate feature inclusion

In [57]:
# The raw features are not normalized, so standardise them. The scaler is
# fitted on the training features only and then reused for the evaluation set.
scaler = preprocessing.StandardScaler().fit(x)
x, eval_data = scaler.transform(x), scaler.transform(eval_data)

In [58]:
# Add one squared feature per existing column (element-wise self
# multiplication), placed after the original columns and in the same order.
x = np.hstack((x, x * x))
eval_data = np.hstack((eval_data, eval_data * eval_data))

# Append a trailing column of ones (the constant term) so gradient descent
# can learn the intercept as just another coefficient.
x_ones = np.ones((x.shape[0], 1))
x = np.hstack((x, x_ones))

eval_ones = np.ones((eval_data.shape[0], 1))
eval_data = np.hstack((eval_data, eval_ones))

In [59]:
print(x.shape)
print(y.shape)
print(eval_data.shape)
print(x_ones.shape)
print(eval_ones.shape)

(7176, 9)
(7176,)
(2392, 9)
(7176, 1)
(2392, 1)


In [60]:
# Hold out part of the labelled data so the model can be validated locally.
# NOTE(review): no test_size or random_state is passed, so the split uses the
# library defaults and presumably differs between runs -- confirm against the
# scikit-learn documentation if reproducible validation scores are needed.
from sklearn import model_selection
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y)

In [61]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(5382, 9)
(1794, 9)
(5382,)
(1794,)


# Create dataframes

In [62]:
df = pd.DataFrame(x)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-1.478275,-1.247642,1.302211,0.758625,2.185296,1.55661,1.695753,0.575511,1.0
1,0.289012,0.306798,0.661749,-0.446922,0.083528,0.094125,0.437912,0.199739,1.0
2,-0.399976,-0.421013,-0.287207,0.375011,0.15998,0.177252,0.082488,0.140633,1.0
3,1.572001,1.33577,-0.686238,-0.879086,2.471186,1.784283,0.470923,0.772792,1.0
4,-1.395329,-1.118236,0.634923,1.311822,1.946942,1.250451,0.403128,1.720877,1.0


In [63]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0
mean,-1.942302e-15,-2.070379e-14,-1.89569e-13,-1.273294e-14,1.0,1.0,1.0,1.0,1.0
std,1.00007,1.00007,1.00007,1.00007,0.9735049,0.7471808,1.431425,1.231853,0.0
min,-2.383992,-2.268772,-3.415745,-3.262827,1.489307e-09,8.632078e-07,2.584714e-07,1.582746e-07,1.0
25%,-0.8240709,-0.9841242,-0.7130639,-0.6888175,0.2142339,0.4358869,0.1072206,0.1325467,1.0
50%,0.09168063,-0.1755335,-0.05918944,0.1107203,0.6726207,0.9607976,0.4824987,0.5668106,1.0
75%,0.8147832,0.9608855,0.6772576,0.7965757,1.564605,1.373247,1.29714,1.401805,1.0
max,2.159312,2.138871,3.3594,1.838351,5.68342,5.147325,11.66731,10.64604,1.0


In [64]:
df.isnull().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
dtype: int64

# Gradient Descent for n features

In [65]:
def step_gradient(X_train, Y_train, learning_rate, m, j):
    """Take one gradient-descent step on coefficient ``j`` of a linear model.

    The model is ``y_hat = X_train @ m`` and the loss is the mean squared
    error, so the partial derivative with respect to ``m[j]`` is
    ``(-2 / n) * sum((y - y_hat) * X_train[:, j])``.

    Args:
        X_train: 2-D NumPy array with one row per sample. Only the first
            ``len(m)`` columns take part in the prediction.
        Y_train: Target values, one per row of ``X_train``.
        learning_rate: Step size applied to the partial derivative.
        m: Mutable sequence of coefficients; ``m[j]`` is overwritten in place.
        j: Index of the coefficient to update.

    Returns:
        The updated value of ``m[j]``.
    """
    n_data_pts = X_train.shape[0]
    if n_data_pts == 0:
        # No samples means no gradient signal; leave the coefficient alone
        # instead of dividing by zero.
        return m[j]

    coeffs = np.asarray(m, dtype=float)
    targets = np.asarray(Y_train, dtype=float).reshape(-1)

    # One matrix-vector product replaces the per-sample, per-feature loops.
    residuals = targets - X_train[:, :coeffs.shape[0]] @ coeffs
    slope = (-2 / n_data_pts) * np.dot(residuals, X_train[:, j])

    m[j] = m[j] - learning_rate * slope
    return m[j]

In [66]:
def gd(x_train, y_train, learning_rate, num_iterations):
    """Fit linear-model coefficients by repeated per-coefficient gradient steps.

    Args:
        x_train: 2-D array with one row per sample and one column per
            coefficient.
        y_train: Target values, one per row of ``x_train``.
        learning_rate: Step size forwarded to ``step_gradient``.
        num_iterations: Number of full passes over all coefficients.

    Returns:
        The list of fitted coefficients, one per column of ``x_train``.
    """
    n_coeffs = x_train.shape[1]

    # Every coefficient starts at 0 except the last one (the constant ``c``),
    # which starts at 1.
    m = [0 for _ in range(n_coeffs)]
    m[-1] = 1

    for epoch in range(num_iterations):
        # Coefficients are refreshed one at a time, so each update already
        # sees the new values of the coefficients before it.
        for idx in range(n_coeffs):
            m[idx] = step_gradient(x_train, y_train, learning_rate, m, idx)
        epoch_cost = cost(x_train, y_train, m)
        print(epoch, 'Cost : ', epoch_cost)

    return m

In [67]:
def cost(X_train, Y_train, m):
    """Return the mean squared error of the linear model ``X_train @ m``.

    Args:
        X_train: 2-D NumPy array with one row per sample. Only the first
            ``len(m)`` columns take part in the prediction.
        Y_train: Target values, one per row of ``X_train``.
        m: Sequence of model coefficients.

    Returns:
        The mean of the squared residuals, or 0 when there are no samples.
    """
    n_data_pts = len(X_train)
    if n_data_pts == 0:
        # An empty data set has no error to average; avoid dividing by zero.
        return 0

    coeffs = np.asarray(m, dtype=float)
    targets = np.asarray(Y_train, dtype=float).reshape(-1)

    # One matrix-vector product replaces the per-sample, per-feature loops.
    residuals = targets - X_train[:, :coeffs.shape[0]] @ coeffs
    return np.dot(residuals, residuals) / n_data_pts

In [68]:
def pred(x_test, m):
    """Predict targets for ``x_test`` with the coefficient vector ``m``.

    Args:
        x_test: 2-D array with one row per sample and ``len(m)`` columns.
        m: Sequence of model coefficients.

    Returns:
        A column vector of shape ``(n_samples, 1)`` holding the predictions.
    """
    # Size the column vector from ``m`` itself. The previous version read the
    # global ``x_train``, so it failed whenever that global was missing.
    coeffs = np.asarray(m).reshape(-1, 1)
    return np.dot(x_test, coeffs)

In [69]:
def score(y_truth, y_pred):
    """Return the coefficient of determination (R^2) of the predictions.

    Both inputs are flattened to 1-D first. Without that, a ``(n,)`` truth
    vector and a ``(n, 1)`` prediction column broadcast to an ``(n, n)``
    matrix and give a wrong score.

    Args:
        y_truth: True target values.
        y_pred: Predicted target values, in the same order as ``y_truth``.

    Returns:
        ``1 - u / v``, where ``u`` is the residual sum of squares and ``v`` is
        the total sum of squares of ``y_truth`` around its mean. The result
        is not finite when ``y_truth`` is constant (``v == 0``).
    """
    truth = np.asarray(y_truth, dtype=float).reshape(-1)
    predicted = np.asarray(y_pred, dtype=float).reshape(-1)
    u = ((truth - predicted) ** 2).sum()
    v = ((truth - truth.mean()) ** 2).sum()
    return 1 - u / v

In [70]:
def run(learning_rate=0.1, num_iterations=1000,
        output_path='gd_ccpp_output.csv'):
    """Fit on the full training set and save predictions for the evaluation set.

    Reads the module-level arrays ``x``, ``y`` and ``eval_data``. Prints the
    fitted coefficients and the predictions, then writes the predictions to
    ``output_path`` as a single unlabelled column.

    Args:
        learning_rate: Step size passed to ``gd``.
        num_iterations: Number of gradient-descent passes passed to ``gd``.
        output_path: Destination of the predictions file.
    """
    # NOTE: to estimate generalisation before submitting, fit on
    # x_train/y_train instead and compare pred(x_test, m).ravel() with y_test
    # using score().
    m = gd(x, y, learning_rate, num_iterations)
    print('M : ', m)

    y_output = pred(eval_data, m)
    print(y_output)

    # '%.5f' writes fixed-point numbers, so no value appears in exponential
    # form in the submission file.
    np.savetxt(output_path, y_output, fmt='%.5f', delimiter=',')

In [71]:
run()

0 Cost :  52338.033081515605
1 Cost :  43197.62394956025
2 Cost :  37629.56496380207
3 Cost :  32891.450592174486
4 Cost :  28904.878231257637
5 Cost :  25534.57048431052
6 Cost :  22660.234412378013
7 Cost :  20188.728414659097
8 Cost :  18048.507092135536
9 Cost :  16183.822720810611
10 Cost :  14550.541696952232
11 Cost :  13113.234344242444
12 Cost :  11843.119197725578
13 Cost :  10716.576491243623
14 Cost :  9714.04928268471
15 Cost :  8819.215683460148
16 Cost :  8018.355456999344
17 Cost :  7299.858992257581
18 Cost :  6653.842455807589
19 Cost :  6071.8433137076445
20 Cost :  5546.577442356551
21 Cost :  5071.743927574031
22 Cost :  4641.867117213207
23 Cost :  4252.168002133725
24 Cost :  3898.458846735637
25 Cost :  3577.056367016418
26 Cost :  3284.70979226139
27 Cost :  3018.5409367305824
28 Cost :  2775.9940141479246
29 Cost :  2554.793396479213
30 Cost :  2352.9078829661426
31 Cost :  2168.520330433912
32 Cost :  2000.0017199697359
33 Cost :  1845.8889120698645
34 Cost :

273 Cost :  18.595934745195557
274 Cost :  18.59371113993732
275 Cost :  18.591567453661316
276 Cost :  18.58950081392652
277 Cost :  18.587508451538884
278 Cost :  18.58558769683899
279 Cost :  18.583735976124938
280 Cost :  18.58195080820186
281 Cost :  18.58022980105717
282 Cost :  18.578570648654352
283 Cost :  18.576971127842608
284 Cost :  18.57542909537725
285 Cost :  18.573942485047283
286 Cost :  18.572509304906955
287 Cost :  18.571127634605546
288 Cost :  18.569795622814045
289 Cost :  18.56851148474445
290 Cost :  18.567273499757643
291 Cost :  18.566080009058037
292 Cost :  18.564929413470168
293 Cost :  18.56382017129574
294 Cost :  18.56275079624806
295 Cost :  18.56171985546015
296 Cost :  18.56072596756475
297 Cost :  18.559767800842682
298 Cost :  18.55884407143905
299 Cost :  18.55795354164293
300 Cost :  18.557095018227844
301 Cost :  18.55626735085426
302 Cost :  18.555469430526745
303 Cost :  18.55470018810904
304 Cost :  18.553958592890808
305 Cost :  18.55324365

541 Cost :  18.53406978568604
542 Cost :  18.534069663549065
543 Cost :  18.53406954580183
544 Cost :  18.53406943228652
545 Cost :  18.534069322851007
546 Cost :  18.5340692173488
547 Cost :  18.534069115638296
548 Cost :  18.534069017583462
549 Cost :  18.53406892305275
550 Cost :  18.534068831919548
551 Cost :  18.534068744061745
552 Cost :  18.534068659361637
553 Cost :  18.534068577705707
554 Cost :  18.53406849898455
555 Cost :  18.534068423092734
556 Cost :  18.534068349928514
557 Cost :  18.53406827939383
558 Cost :  18.534068211394185
559 Cost :  18.53406814583866
560 Cost :  18.534068082639223
561 Cost :  18.534068021711114
562 Cost :  18.534067962972944
563 Cost :  18.53406790634585
564 Cost :  18.534067851753825
565 Cost :  18.53406779912397
566 Cost :  18.53406774838571
567 Cost :  18.534067699471134
568 Cost :  18.534067652314384
569 Cost :  18.534067606852627
570 Cost :  18.53406756302474
571 Cost :  18.534067520772094
572 Cost :  18.534067480037937
573 Cost :  18.534067

809 Cost :  18.534066387593597
810 Cost :  18.53406638758689
811 Cost :  18.53406638758041
812 Cost :  18.534066387574228
813 Cost :  18.534066387568213
814 Cost :  18.53406638756238
815 Cost :  18.534066387556756
816 Cost :  18.534066387551384
817 Cost :  18.53406638754622
818 Cost :  18.53406638754126
819 Cost :  18.534066387536445
820 Cost :  18.534066387531766
821 Cost :  18.534066387527297
822 Cost :  18.534066387522927
823 Cost :  18.534066387518806
824 Cost :  18.53406638751479
825 Cost :  18.534066387510897
826 Cost :  18.53406638750708
827 Cost :  18.534066387503504
828 Cost :  18.534066387500122
829 Cost :  18.53406638749671
830 Cost :  18.534066387493375
831 Cost :  18.534066387490267
832 Cost :  18.534066387487396
833 Cost :  18.53406638748452
834 Cost :  18.53406638748165
835 Cost :  18.534066387478994
836 Cost :  18.534066387476436
837 Cost :  18.534066387473892
838 Cost :  18.534066387471494
839 Cost :  18.534066387469153
840 Cost :  18.534066387466925
841 Cost :  18.534