In [1]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from scipy.stats import bernoulli

In [2]:

np.random.seed(123)
#Training
training_x1 = np.random.normal(3,1,10000)
training_x2 = np.random.normal(-2,1,10000)
training_x3 = training_x1 + 2*training_x2
training_x4 = (training_x2 + 2)**2
training_x5 = bernoulli(0.8).rvs(10000)
training_xd = np.random.normal(0,0.1,10000)
#Testing
testing_x1 = np.random.normal(3,1,1000)
testing_x2 = np.random.normal(-2,1,1000)
testing_x3 = testing_x1 + 2*testing_x2
testing_x4 = (testing_x2 + 2)**2
testing_x5 = bernoulli(0.8).rvs(1000)
testing_xd = np.random.normal(0,0.1,1000)
# Y
training_y = 4 - (3 * (training_x1 ** 2)) + training_x3 - (0.01 * training_x4) + training_x2 * training_x5 + training_xd
testing_y =  4 - (3 * (testing_x1 ** 2)) + testing_x3 - (0.01 * testing_x4) + testing_x2 * testing_x5 + testing_xd



In [3]:

training = np.empty((10000,11))
testing = np.empty((1000,11))
training[:,0] = training_x1
training[:,1] = training_x2
training[:,2] = training_x3
training[:,3] = training_x4
training[:,4] = training_x5
testing[:,0] = testing_x1
testing[:,1] = testing_x2
testing[:,2] = testing_x3
testing[:,3] = testing_x4
testing[:,4] = testing_x5
for i in range(5,10):
    training[:,i] = np.random.normal(0,1,10000)
    testing[:,i] = np.random.normal(0,1,1000)
training[:,10] = training_y
testing[:,10] = testing_y

training_df = pd.DataFrame(training)
testing_df = pd.DataFrame(testing)

In [4]:


def find_error(data,c,d):
    #Error:
    err = 0
    for i in range(len(data)):
        err += (data.iloc[i,d] - c)**2
    err = err/len(data)
    return err

def find_var_to_split(data,d):
    max_corr = 0
    max_col = 0
    
    # Selecting the variable to split on based on pearson correlation coefficient:
    for i in range(d):
        corr = pearsonr(data.loc[:,i],data.loc[:,d])
        if corr[0] > max_corr:
            max_corr = corr[0]
            max_col = i
    return max_col

# Splitting recursively on Xi and forming the decision tree:
def build_decision_tree(data, d , depth,max_depth,depth_and_error_dict):
    if depth >= max_depth:
        return
    
    
    #Find variable to split on:
    i = find_var_to_split(data,d)
    
    #Find the threshold:
    sorted_xi = sorted(data.loc[:,i])
    threshold_index = len(sorted_xi) // 2
    threshold = sorted_xi[threshold_index]
    
    #Split by threshold:
    left_branch = data[data.loc[:,i] < threshold]
    right_branch = data[data.loc[:,i] >= threshold]
    
    # Find avg and error:
    left_y_avg = left_branch.loc[:,d].mean()
    right_y_avg = right_branch.loc[:,d].mean()
    
    left_error = find_error(left_branch,left_y_avg,d)
    right_error = find_error(right_branch,right_y_avg,d)
    total_error = ((len(left_branch)/len(data))*left_error) + ((len(right_branch)/len(data))*right_error)
    
    depth += 1
    
    #Store error in the dictinary:
    depth_and_error_dict[depth].append(total_error)
    
    #print(f'Depth:{depth}')
    #print(f'Errors:\nLeft: {left_error}\nRight: {right_error}\nTotal: {total_error}')
    #print(f'Length:\nLeft: {len(left_branch)}\nRight: {len(right_branch)}')
    #print()
    
    #Recursively build trees for left and right branches:
    build_decision_tree(left_branch,d,depth,max_depth,depth_and_error_dict)
    build_decision_tree(right_branch,d,depth,max_depth,depth_and_error_dict)

In [5]:
from collections import defaultdict
# Builds tree based on data with d columns and max_depth:
def build_by_depth(d,max_depth):
    training_error = defaultdict(list)
    testing_error = defaultdict(list)
    build_decision_tree(pd.DataFrame(training),d,0,max_depth,training_error)
    build_decision_tree(pd.DataFrame(testing),d,0,max_depth,testing_error)
    return training_error,testing_error

In [6]:
#Building trees from depth 1 to 7 and looking at the results:
for k in range(1,8):
    print(f'Depth = {k}')
    training_error,testing_error = build_by_depth(10,k)
    print("Depth and Avg Error")
    print()
    print("Training Data")
    for i in range (11):
        if(len(training_error[i]) > 0):
            print(f'{i} : {sum(training_error[i])/len(training_error[i])}')
    print()
    print("Testing Data")
    for i in range (11):
        if(len(testing_error[i]) > 0):
            print(f'{i} : {sum(testing_error[i])/len(testing_error[i])}')  
    print()

Depth = 1
Depth and Avg Error

Training Data
1 : 311.0150916593469

Testing Data
1 : 327.6694689202362

Depth = 2
Depth and Avg Error

Training Data
1 : 311.0150916593469
2 : 309.4181424829448

Testing Data
1 : 327.6694689202362
2 : 324.6164397005692

Depth = 3
Depth and Avg Error

Training Data
1 : 311.0150916593469
2 : 309.4181424829448
3 : 308.5914450633599

Testing Data
1 : 327.6694689202362
2 : 324.6164397005692
3 : 321.5797463083014

Depth = 4




Depth and Avg Error

Training Data
1 : 311.0150916593469
2 : 309.4181424829448
3 : 308.5914450633599
4 : 308.0530120129407

Testing Data
1 : 327.6694689202362
2 : 324.6164397005692
3 : 321.5797463083014
4 : 336.0986924094814

Depth = 5
Depth and Avg Error

Training Data
1 : 311.0150916593469
2 : 309.4181424829448
3 : 308.5914450633599
4 : 308.0530120129407
5 : 307.2186595385773

Testing Data
1 : 327.6694689202362
2 : 324.6164397005692
3 : 321.5797463083014
4 : 336.0986924094814
5 : 329.94143854029755

Depth = 6
Depth and Avg Error

Training Data
1 : 311.0150916593469
2 : 309.4181424829448
3 : 308.5914450633599
4 : 308.0530120129407
5 : 307.2186595385773
6 : 305.6244640020033

Testing Data
1 : 327.6694689202362
2 : 324.6164397005692
3 : 321.5797463083014
4 : 336.0986924094814
5 : 329.94143854029755
6 : 300.7244714658371

Depth = 7
Depth and Avg Error

Training Data
1 : 311.0150916593469
2 : 309.4181424829448
3 : 308.5914450633599
4 : 308.0530120129407
5 : 307.2186595385773
6 : 305.62446

In [7]:
#Using the same code with slight modifications:
# Splitting recursively on Xi and forming the decision tree:
def build_decision_tree_by_size(data, d , min_sample_size,size_and_error_dict):
    if len(data) < min_sample_size:
        return
    
    
    #Find variable to split on:
    i = find_var_to_split(data,d)
    
    #Find the threshold:
    sorted_xi = sorted(data.loc[:,i])
    threshold_index = len(sorted_xi) // 2
    threshold = sorted_xi[threshold_index]
    
    #Split by threshold:
    left_branch = data[data.loc[:,i] < threshold]
    right_branch = data[data.loc[:,i] >= threshold]
    
    # Find avg and error:
    left_y_avg = left_branch.loc[:,d].mean()
    right_y_avg = right_branch.loc[:,d].mean()
    
    left_error = find_error(left_branch,left_y_avg,d)
    right_error = find_error(right_branch,right_y_avg,d)
    total_error = ((len(left_branch)/len(data))*left_error) + ((len(right_branch)/len(data))*right_error)
    
    
    
    #Store error in the dictinary:
    size_and_error_dict[len(data)].append(total_error)
    
    print(f'Errors:\nLeft: {left_error}\nRight: {right_error}\nTotal: {total_error}')
    #print(f'Length:\nLeft: {len(left_branch)}\nRight: {len(right_branch)}')
    print()
    
    #Recursively build trees for left and right branches:
    build_decision_tree_by_size(left_branch,d,min_sample_size,size_and_error_dict)
    build_decision_tree_by_size(right_branch,d,min_sample_size,size_and_error_dict)

In [8]:
training_by_size_error = defaultdict(list)
testing_by_size_error = defaultdict(list)
build_decision_tree_by_size(pd.DataFrame(training),10,20,training_by_size_error)
build_decision_tree_by_size(pd.DataFrame(training),10,20,testing_by_size_error)

Errors:
Left: 311.45842868069826
Right: 310.57175463799564
Total: 311.0150916593469

Errors:
Left: 323.5516823723265
Right: 295.06029584279077
Total: 309.3059891075586

Errors:
Left: 324.15163999222847
Right: 321.0057008248442
Total: 322.57867040853637

Errors:
Left: 337.4417720768104
Right: 308.7149518379009
Total: 323.0783619573557

Errors:
Left: 407.15232608187824
Right: 266.2905401626277
Total: 336.60874369351757

Errors:
Left: 458.51928251589413
Right: 347.6801094704903
Total: 403.0996959931922

Errors:
Left: 560.8431237731281
Right: 353.61514904486506
Total: 457.2291364089966

Errors:
Left: 650.7417211261326
Right: 451.097747208053
Total: 550.9197341670928

Errors:
Left: 675.0954398844012
Right: 521.0425780145131
Total: 596.0939722588175

Errors:
Left: 729.1217897644532
Right: 312.20271130092874
Total: 520.6622505326909

Errors:
Left: 519.4863540998218
Right: 373.5106355513217
Total: 444.6270112544371

Errors:
Left: 346.37123840881037
Right: 400.6260317324102
Total: 373.498635070

Total: 222.34776677460036

Errors:
Left: 327.25033205321785
Right: 473.7695317956261
Total: 400.50993192442195

Errors:
Left: 345.740380062907
Right: 297.71100046721847
Total: 321.1099289881949

Errors:
Left: 352.10969295179086
Right: 207.5441066950138
Total: 279.8268998234023

Errors:
Left: 305.94848728855095
Right: 626.323403066271
Total: 470.24331589250994

Errors:
Left: 937.8370199430162
Right: 163.76029756741613
Total: 550.7986587552161

Errors:
Left: 475.45879197518286
Right: 333.48842489448094
Total: 404.47360843483193

Errors:
Left: 509.57094361783794
Right: 433.07762107844525
Total: 471.3242823481416

Errors:
Left: 545.6333401559135
Right: 461.7849896930164
Total: 502.63418607237656

Errors:
Left: 552.6492710210741
Right: 301.726259730289
Total: 427.1877653756816

Errors:
Left: 545.2520936869497
Right: 280.0286721249552
Total: 409.2400826295166

Errors:
Left: 409.26139106663135
Right: 109.27467409949232
Total: 259.2680325830618

Errors:
Left: 360.4406951911459
Right: 305.02284

Total: 249.12071847241774

Errors:
Left: 77.51004107442068
Right: 131.2684589442094
Total: 117.82885447676222

Errors:
Left: 305.8580123459291
Right: 321.1190414879605
Total: 313.4885269169448

Errors:
Left: 204.22232822795263
Right: 401.3780348647388
Total: 305.3278188109199

Errors:
Left: 570.7952275128465
Right: 160.7295347760218
Total: 365.76238114443413

Errors:
Left: 452.27321553726466
Right: 189.4658125104134
Total: 317.5001883440076

Errors:
Left: 280.1582497423177
Right: 98.56704661360723
Total: 189.36264817796246

Errors:
Left: 336.4401025416195
Right: 215.74052808362504
Total: 276.0903153126223

Errors:
Left: 348.3683275727903
Right: 329.3663397223259
Total: 333.0205681551075

Errors:
Left: 337.1300116594514
Right: 311.00644039920275
Total: 323.86089609869015

Errors:
Left: 188.73713258711115
Right: 464.44100770074226
Total: 331.0359068393078

Errors:
Left: 247.3541920563012
Right: 364.01881170361247
Total: 305.68650187995684

Errors:
Left: 278.86160883256736
Right: 152.0139

Total: 240.6373165839208

Errors:
Left: 215.1274546775628
Right: 254.29714818696283
Total: 235.02317201567075

Errors:
Left: 249.88162671315447
Right: 174.0209024585086
Total: 210.72770451720822

Errors:
Left: 287.53143611226994
Right: 127.7692147149216
Total: 207.65032541359577

Errors:
Left: 342.4747317670125
Right: 148.0258302485597
Total: 243.70703575763963

Errors:
Left: 426.00926087962404
Right: 261.41152573525346
Total: 341.05559112769083

Errors:
Left: 132.31733188012896
Right: 161.0007631303324
Total: 146.65904750523066

Errors:
Left: 287.1479592289364
Right: 269.41696096007746
Total: 278.25413581612213

Errors:
Left: 316.5587565772456
Right: 247.6029361062633
Total: 282.08084634175447

Errors:
Left: 370.2700032056119
Right: 261.5557379540553
Total: 315.91287057983357

Errors:
Left: 464.5026839099158
Right: 279.98638647218115
Total: 369.87894163415444

Errors:
Left: 266.8839445343875
Right: 284.1545174543908
Total: 275.5192309943892

Errors:
Left: 337.38121367954216
Right: 130

Total: 286.877190475521

Errors:
Left: 231.03191987550335
Right: 141.0769556601977
Total: 186.05443776785052

Errors:
Left: 347.65002086192237
Right: 236.81349385841037
Total: 290.81077624473676

Errors:
Left: 262.0242103426801
Right: 210.70505345519013
Total: 236.3646318989351

Errors:
Left: 298.0123856257829
Right: 159.53876891073665
Total: 227.89916197259492

Errors:
Left: 191.4684777756409
Right: 388.5651005072275
Total: 292.5436689200443

Errors:
Left: 166.24570776015696
Right: 605.4341012224154
Total: 385.8399044912862

Errors:
Left: 214.7797645232224
Right: 65.42184320102378
Total: 140.1008038621231

Errors:
Left: 269.8651009802596
Right: 129.28959412239655
Total: 199.5773475513281

Errors:
Left: 103.84900938815018
Right: 50.29497068537471
Total: 61.005778425929805

Errors:
Left: 297.1252055467722
Right: 323.1459436672444
Total: 310.13557460700827

Errors:
Left: 291.50006099561625
Right: 302.69534326559545
Total: 297.1066583564218

Errors:
Left: 276.13635158802066
Right: 305.419

Total: 222.93445787248402

Errors:
Left: 302.05878518137956
Right: 82.3907043493596
Total: 192.22474476536956

Errors:
Left: 150.1111555656676
Right: 230.54296449942277
Total: 190.32706003254518

Errors:
Left: 308.3072025047926
Right: 313.62826138910907
Total: 310.96773194695083

Errors:
Left: 301.9373560598267
Right: 313.4928259055218
Total: 307.71509098267427

Errors:
Left: 312.0999393658663
Right: 291.6464889428428
Total: 301.8568513940161

Errors:
Left: 337.4040159264802
Right: 286.76109563588903
Total: 312.0825557811846

Errors:
Left: 411.6845798012308
Right: 257.8427802667194
Total: 334.7636800339751

Errors:
Left: 499.50324692821783
Right: 323.8631413403394
Total: 411.6831941342786

Errors:
Left: 800.96163266954
Right: 399.0829382092412
Total: 491.82417539238713

Errors:
Left: 386.95480177252614
Right: 374.01060988113505
Total: 380.4827058268306

Errors:
Left: 433.68844262692687
Right: 193.75590245219874
Total: 310.6461143321945

Errors:
Left: 294.1846076943
Right: 83.6248095271

Errors:
Left: 282.45349259098333
Right: 330.0019613576995
Total: 306.3036829947355

Errors:
Left: 268.2795591551183
Right: 295.23125879671517
Total: 281.75540897591674

Errors:
Left: 286.4917940001295
Right: 248.43818907015657
Total: 267.46499153514304

Errors:
Left: 285.43297475974623
Right: 239.73457693941904
Total: 261.9978989544502

Errors:
Left: 239.0765520631494
Right: 137.69180016147922
Total: 188.38417611231432

Errors:
Left: 218.30061154334678
Right: 273.9416885959473
Total: 246.83449721134704

Errors:
Left: 282.66641167089153
Right: 94.44292026460295
Total: 188.55466596774724

Errors:
Left: 376.84377818794445
Right: 191.26054425516608
Total: 284.05216122155525

Errors:
Left: 392.84628326970613
Right: 325.74520985666277
Total: 358.4354763912223

Errors:
Left: 307.7886973139281
Right: 215.6166900622994
Total: 261.70269368811375

Errors:
Left: 183.23055510472994
Right: 190.22326140266117
Total: 186.81655833443824

Errors:
Left: 137.74026878147214
Right: 238.0841716564484
Total: 

Total: 397.60245653015613

Errors:
Left: 226.53139428856215
Right: 349.14212139582105
Total: 287.8367578421916

Errors:
Left: 201.05157492747102
Right: 141.4290009594396
Total: 170.47589596950618

Errors:
Left: 90.0312071366751
Right: 190.59018011115722
Total: 140.31069362391617

Errors:
Left: 332.39489715481125
Right: 256.3409475376007
Total: 294.367922346206

Errors:
Left: 321.3288942091685
Right: 324.3692758125298
Total: 322.8880642621742

Errors:
Left: 295.5064608297409
Right: 353.0014433516611
Total: 324.253952090701

Errors:
Left: 308.20392020226683
Right: 194.24780283510287
Total: 249.76488565500324

Errors:
Left: 207.02260735887506
Right: 153.81505677234085
Total: 180.41883206560794

Errors:
Left: 261.01663429218155
Right: 280.38766114054033
Total: 270.70214771636097

Errors:
Left: 209.64929537822152
Right: 309.7371762019444
Total: 259.69323579008295

Errors:
Left: 157.88842294243815
Right: 256.77010025918446
Total: 208.59697541256446

Errors:
Left: 231.72847282084308
Right: 20

Total: 240.8236258372558

Errors:
Left: 215.70884417095704
Right: 129.4846895705902
Total: 172.59676687077362

Errors:
Left: 244.0299615109925
Right: 320.84849677684656
Total: 283.42408216014843

Errors:
Left: 364.75503170896076
Right: 202.08452952735053
Total: 283.41978061815564

Errors:
Left: 249.26896132079068
Right: 290.3921897041608
Total: 269.8305755124757

Errors:
Left: 275.0441992181541
Right: 198.4030562466068
Total: 236.72362773238046

Errors:
Left: 320.9482048371108
Right: 206.8981799742367
Total: 262.46101259973943

Errors:
Left: 218.14203484946907
Right: 175.20251326609224
Total: 196.67227405778067

Errors:
Left: 186.9761373462571
Right: 174.5131783596257
Total: 176.4305566652613

Errors:
Left: 140.56903434903492
Right: 198.50360787194487
Total: 170.41411767901883

Errors:
Left: 402.7500957192261
Right: 171.11427408146338
Total: 286.93218490034474

Errors:
Left: 539.4769441685751
Right: 254.52504861153574
Total: 393.3477669598369

Errors:
Left: 330.86270057546756
Right: 10

Errors:
Left: 302.44805483761144
Right: 299.51163206218615
Total: 300.9798434498988

Errors:
Left: 526.4833201938374
Right: 256.657576670471
Total: 301.62853392436534

Errors:
Left: 707.0868474055286
Right: 286.52743208602845
Total: 496.8071397457785

Errors:
Left: 257.8728864932649
Right: 253.17291342234464
Total: 255.52289995780478

Errors:
Left: 231.20740876181668
Right: 263.9989136976219
Total: 247.85540357537934

Errors:
Left: 258.8221493158292
Right: 182.73636587158796
Total: 220.7792575937086

Errors:
Left: 276.95203004263766
Right: 246.42510814430196
Total: 261.22603997379804

Errors:
Left: 304.4977581325475
Right: 191.22382101303737
Total: 246.98945159495005

Errors:
Left: 214.24272959541665
Right: 391.8295673513701
Total: 303.0361484733934

Errors:
Left: 195.84765590977037
Right: 186.8709464742651
Total: 191.22329044299494

Errors:
Left: 322.1279257170135
Right: 255.052799732119
Total: 288.59036272456626

Errors:
Left: 200.56550361395475
Right: 442.4475958300322
Total: 321.50

Total: 301.04369453909123

Errors:
Left: 413.75308291318356
Right: 154.29712415904893
Total: 280.6987450905504

Errors:
Left: 157.01857467037365
Right: 113.72563446092371
Total: 135.37210456564867

Errors:
Left: 359.66332578960964
Right: 241.33830113564932
Total: 298.98382596706585

Errors:
Left: 227.21675518112002
Right: 254.48044938923107
Total: 240.84860228517556

Errors:
Left: 227.98455891779668
Right: 304.54063558931773
Total: 266.2625972535572

Errors:
Left: 208.8159772515488
Right: 239.52206201525513
Total: 224.56268738678278

Errors:
Left: 223.52846483579907
Right: 207.12887130646732
Total: 215.3286680711332

Errors:
Left: 287.48739551342624
Right: 311.261172823056
Total: 299.6790761850312

Errors:
Left: 321.4024997217011
Right: 292.58031058343164
Total: 296.90363895417204

Errors:
Left: 257.3070077491038
Right: 281.9138687480145
Total: 269.6888040479187

Errors:
Left: 278.5414282197858
Right: 234.74488531925468
Total: 256.64315676952026

Errors:
Left: 310.39790533859644
Right:

Total: 273.97604427987505

Errors:
Left: 256.7797482759265
Right: 198.80524554843862
Total: 227.79249691218257

Errors:
Left: 248.7386800154965
Right: 176.51007256394783
Total: 211.69836850188176

Errors:
Left: 245.18075519439304
Right: 62.97599704362458
Total: 154.07837611900882

Errors:
Left: 26.69810428052752
Right: 167.9751331946306
Total: 97.78654558125454

Errors:
Left: 20.866473410070334
Right: 30.02956166819616
Total: 25.448017539133247

Errors:
Left: 19.69115802176021
Right: 20.959134120653633
Total: 20.634012044014295

Errors:
Left: 19.167949553155903
Right: 22.614540764583662
Total: 20.95066914527371

Errors:
Left: 29.49764258058551
Right: 30.437146621417412
Total: 29.97943952460187

Errors:
Left: 28.700120263420377
Right: 19.30059582358969
Total: 24.00035804350503

Errors:
Left: 168.63708006262533
Right: 167.02745733908864
Total: 167.82208121526497

Errors:
Left: 240.0189776895034
Right: 99.9695567387558
Total: 168.19876181732513

Errors:
Left: 151.49472088283878
Right: 31.

Total: 340.5336621585934

Errors:
Left: 176.11368985262976
Right: 230.39382721972007
Total: 203.25375853617493

Errors:
Left: 218.69819782356663
Right: 281.17837325420066
Total: 250.73931342901994

Errors:
Left: 376.73643960995804
Right: 137.34740819315715
Total: 257.0419239015576

Errors:
Left: 313.92545472354817
Right: 227.38645973669773
Total: 270.65595723012297

Errors:
Left: 495.49704765849225
Right: 129.9687174843392
Total: 308.0466219281573

Errors:
Left: 64.49517745169655
Right: 195.25864323473596
Total: 129.87691034321625

Errors:
Left: 35.732942422259086
Right: 147.2689722100219
Total: 92.9309064159836

Errors:
Left: 121.86613458730999
Right: 165.99580246925046
Total: 143.93096852828023

Errors:
Left: 252.8545148004468
Right: 274.50177929011085
Total: 263.7470873780485

Errors:
Left: 312.2581262819863
Right: 177.68311863034893
Total: 244.9706224561676

Errors:
Left: 244.25604895272463
Right: 340.09991063509455
Total: 293.40674725137586

Errors:
Left: 345.7058517197367
Right: 

Total: 89.5738446846537

Errors:
Left: 251.14417379410588
Right: 198.73286169113132
Total: 224.2665778438625

Errors:
Left: 287.04929406071665
Right: 96.24952591221185
Total: 191.64940998646426

Errors:
Left: 340.4971384177561
Right: 257.7614871998333
Total: 298.8658234737058

Errors:
Left: 340.0672653408129
Right: 329.24969439318517
Total: 334.6584798669991

Errors:
Left: 484.1326194299977
Right: 150.6120734848617
Total: 313.09644202223564

Errors:
Left: 137.88477250854683
Right: 135.87309164529813
Total: 136.87893207692247

Errors:
Left: 260.2240112409254
Right: 390.8386479921809
Total: 327.20587624156923

Errors:
Left: 517.341525910321
Right: 61.08886746883651
Total: 289.21519668957876

Errors:
Left: 348.1668843758799
Right: 137.35571361330554
Total: 241.4270510783739

Errors:
Left: 146.90082780209968
Right: 400.2848537448927
Total: 328.81756437641263

Errors:
Left: 452.04198865565155
Right: 343.61298025907394
Total: 397.82748445736274

Errors:
Left: 109.616874804361
Right: 162.8973

In [9]:
print("Sample size and Avg Error")
print("Training Data")
for key in sorted(training_by_size_error.keys()):
        print(f'{key} : {sum(training_by_size_error[key])/len(training_by_size_error[key])}')
print("Testing Data")
for key in sorted(testing_by_size_error.keys()):
        print(f'{key} : {sum(testing_by_size_error[key])/len(testing_by_size_error[key])}')

Sample size and Avg Error
Training Data
20 : 220.4070908543569
26 : 279.38742587043606
27 : 435.4824670769891
28 : 403.59819414019114
29 : 129.13074410437866
30 : 361.7989763571112
31 : 284.54626751840345
32 : 241.4847257572687
33 : 262.71662582104733
34 : 271.5038729927662
35 : 245.30789038008078
39 : 286.2990484534912
40 : 192.11546445530016
61 : 340.901254755945
62 : 257.56003602796767
63 : 276.71866591212137
64 : 280.76162735929825
65 : 275.0376151225918
68 : 344.12592826723534
78 : 301.7435821519905
79 : 234.42686123387625
125 : 295.4093536251774
126 : 240.6373165839208
130 : 277.7822233111745
156 : 312.3701761778913
157 : 266.1426252927054
251 : 282.62163451813285
312 : 321.228977105194
313 : 290.0199508988125
625 : 307.2186595385773
1250 : 308.0530120129407
2500 : 308.5914450633599
5000 : 309.4181424829448
10000 : 311.0150916593469
Testing Data
20 : 220.4070908543569
26 : 279.38742587043606
27 : 435.4824670769891
28 : 403.59819414019114
29 : 129.13074410437866
30 : 361.798976357

In [10]:
#Fitting a model to training data
training_df = pd.DataFrame(training)
X = training_df.loc[:,:9]
y = training_df.loc[:,10]
from sklearn import linear_model
training_regr = linear_model.LinearRegression()
training_regr.fit(X,y)
print(training_regr.coef_)

[-7.40536206e+11 -1.48107241e+12  7.40536206e+11 -7.85703490e-04
 -2.09730562e+00  1.09244909e-02  5.73918820e-02  6.87451644e-02
 -6.75927285e-02  8.54416001e-03]


In [11]:
#Error
prediction = training_regr.predict(pd.DataFrame(testing[:,:10]))
err = 0
for i in range (len(prediction)):
    err += (testing[i][10]-prediction[i])**2
err /= len(prediction)    
print(err)

19.287457243729015
