In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression

In [80]:
# Load the real-estate CSV into the notebook-wide frame `df` and preview it.
df = pd.read_csv(filepath_or_buffer="../Oblig1/realestate.csv")
df.head()

# df.info()  # uncomment to inspect column dtypes and non-null counts

Unnamed: 0.1,Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,0,1,2012.916667,32.0,84.87882,10,24.98298,121.54024,37.9
1,1,2,2012.916667,19.5,306.5947,9,24.98034,121.53951,42.2
2,2,3,2013.583333,13.3,561.9845,5,24.98746,121.54391,47.3
3,3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,4,5,2012.833333,5.0,390.5684,5,24.97937,121.54245,43.1


In [81]:

# Expose the price column under the name `target` and discard the bookkeeping
# columns. The guard makes the cell safe to re-run: once `target` exists the
# source columns are already gone, and repeating the rename/drop would raise
# a KeyError.
if "target" not in df.columns:
    df = (
        # errors="raise" keeps the first run strict: a missing price column
        # fails loudly instead of silently producing a frame without `target`.
        df.rename(columns={"Y house price of unit area": "target"}, errors="raise")
        .drop(columns=["Unnamed: 0", "No"])
    )
df.head()

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,target
0,2012.916667,32.0,84.87882,10,24.98298,121.54024,37.9
1,2012.916667,19.5,306.5947,9,24.98034,121.53951,42.2
2,2013.583333,13.3,561.9845,5,24.98746,121.54391,47.3
3,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,2012.833333,5.0,390.5684,5,24.97937,121.54245,43.1


In [None]:
# The original price column is renamed to `target` by the preparation cell,
# so it has to be inspected under that name.
df['target']

In [82]:
def split_dataset(data, random_state=None):
    """Randomly split ``data`` into train, validation and test subsets.

    80 % of the rows are sampled first and the remaining 20 % become the test
    set. The sampled 80 % is then split 75/25 into train and validation, i.e.
    roughly 60 % / 20 % / 20 % of the original rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split. Rows are removed by index label, so duplicated index
        labels would remove more rows than were sampled.
    random_state : optional
        Forwarded to both ``DataFrame.sample`` calls. The default ``None``
        keeps the previous behaviour (a different split on every call); pass
        an integer to get a reproducible split.

    Returns
    -------
    list
        ``[train, validation, test]``.
    """
    train_val = data.sample(frac=0.8, random_state=random_state)
    test = data.drop(train_val.index)
    train = train_val.sample(frac=0.75, random_state=random_state)
    validation = train_val.drop(train.index)
    return [train, validation, test]

# Function for splitting a frame into targets and features
def split_targets_and_features(data):
    """Separate the ``target`` column from the remaining columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains a column named ``target``.

    Returns
    -------
    list
        ``[targets, features]``: the ``target`` column, and a copy of ``data``
        without that column. ``data`` itself is not modified.
    """
    labels = data.target
    predictors = data.drop(columns="target")
    return [labels, predictors]

# Helper function for splitting each subset into features and targets
def create_set_of_targets_and_features(data):
    """Split each of the first three subsets in ``data`` into targets and features.

    Parameters
    ----------
    data : sequence
        Indexed at positions 0, 1 and 2, which are treated as the train,
        validation and test subsets respectively.

    Returns
    -------
    list
        ``[train, validation, test]``, where every entry is the
        ``[targets, features]`` pair returned by ``split_targets_and_features``.
    """
    return [split_targets_and_features(data[position]) for position in range(3)]

# Function for splitting the dataset
def create_full_dataset(input):
    """Split ``input`` into subsets and separate targets from features in each.

    Chains ``split_dataset`` and ``create_set_of_targets_and_features``.

    Returns
    -------
    list
        ``[train, validation, test]``, each a ``[targets, features]`` pair, so
        ``result[0][1]`` is the training features and ``result[0][0]`` the
        training targets.
    """
    return create_set_of_targets_and_features(split_dataset(input))


In [None]:
# Helper function for evaluating the models
def evaluate_model(classifier, features, targets):
    """Score an already fitted model on one set of features and targets.

    Parameters
    ----------
    classifier : object
        Fitted estimator exposing ``predict(features)``.
    features : array-like
        Inputs passed to ``classifier.predict``.
    targets : array-like
        True values the predictions are compared against.

    Returns
    -------
    list
        ``[mean_absolute_error, mean_squared_error, r2_score]``, in that order.
    """
    predicted = classifier.predict(features)
    return [
        metric(targets, predicted)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    ]

# Helper function for appending values to the result lists in the validate and test functions
def append_values(results, absolute, squared, r2):
    """Append one run's metrics to the running per-metric lists (in place).

    Parameters
    ----------
    results : sequence
        Indexed at 0, 1 and 2; the values go to ``absolute``, ``squared`` and
        ``r2`` respectively.
    absolute, squared, r2 : list
        Accumulators that each receive one value.
    """
    for position, bucket in enumerate((absolute, squared, r2)):
        bucket.append(results[position])

# Helper function for printing the results
def results_print(absolute, squared, r2):
    """Print mean and standard deviation of each metric, to two decimals.

    Parameters
    ----------
    absolute, squared, r2 : sequence of numbers
        Per-run values of the mean absolute error, mean squared error and
        R2 score. One line is printed for each, in that order.
    """
    labelled_series = (
        ('Mean absolute error', absolute),
        ('Mean squared error', squared),
        ('R2 score', r2),
    )
    for label, values in labelled_series:
        print(f'{label}: {np.mean(values):.2f} +- {np.std(values):.2f}')

# Function for training and hyperparameter tuning of the model, with optional internal looping in the function.
def validate(classifier, runs):
    """Fit ``classifier`` on repeated random splits and report train/validation scores.

    Every round draws a new split of the module-level frame ``df``, refits
    ``classifier`` on the training part and scores it on both the training and
    the validation part. The mean and standard deviation of each metric over
    all rounds are printed at the end.

    Parameters
    ----------
    classifier : object
        Estimator exposing ``fit`` and ``predict``; it is refitted in place
        on every round.
    runs : int
        Number of rounds. Values below 1 are treated as 1.
    """
    # One (absolute, squared, r2) triple of accumulators per evaluated subset.
    train_scores = ([], [], [])
    validation_scores = ([], [], [])

    # Always perform at least one round.
    runs = max(runs, 1)

    for _ in range(runs):
        splits = create_full_dataset(df)
        train_targets, train_features = splits[0]
        validation_targets, validation_features = splits[1]

        classifier.fit(train_features, train_targets)

        on_train = evaluate_model(classifier, train_features, train_targets)
        append_values(on_train, *train_scores)

        on_validation = evaluate_model(classifier, validation_features, validation_targets)
        append_values(on_validation, *validation_scores)

    print('Training results:')
    results_print(*train_scores)
    print('************************')
    print('Validation results:')
    results_print(*validation_scores)

# Function for final testing of the models, with optional internal looping in the function.
def test(classifier, runs):
    """Fit ``classifier`` on repeated random splits and report test-set scores.

    Every round draws a new split of the module-level frame ``df``, refits
    ``classifier`` on the training part only (the validation part is not used
    here) and scores it on the test part. The mean and standard deviation of
    each metric over all rounds are printed at the end.

    Parameters
    ----------
    classifier : object
        Estimator exposing ``fit`` and ``predict``; it is refitted in place
        on every round.
    runs : int
        Number of rounds. Values below 1 are treated as 1.
    """
    # Accumulators in (absolute, squared, r2) order.
    test_scores = ([], [], [])

    # Always perform at least one round.
    runs = max(runs, 1)

    for _ in range(runs):
        splits = create_full_dataset(df)
        train_targets, train_features = splits[0]
        test_targets, test_features = splits[2]

        classifier.fit(train_features, train_targets)

        on_test = evaluate_model(classifier, test_features, test_targets)
        append_values(on_test, *test_scores)

    print('************************')
    print('Test results:')
    results_print(*test_scores)


In [92]:
# Decision tree with default hyper-parameters, scored over 20 random splits.
dt1 = DecisionTreeRegressor()
validate(classifier=dt1, runs=20)

Training results:
Mean absolute error: 0.09 +- 0.04
Mean squared error: 0.31 +- 0.19
R2 score: 1.00 +- 0.00
************************
Validation results:
Mean absolute error: 6.46 +- 0.75
Mean squared error: 102.96 +- 41.68
R2 score: 0.37 +- 0.27


In [94]:
# Ordinary linear regression, scored over 20 random splits for comparison.
lr1 = LinearRegression()
validate(classifier=lr1, runs=20)

Training results:
Mean absolute error: 6.12 +- 0.35
Mean squared error: 78.26 +- 12.12
R2 score: 0.58 +- 0.05
************************
Validation results:
Mean absolute error: 6.19 +- 0.77
Mean squared error: 71.32 +- 27.13
R2 score: 0.59 +- 0.10
