In [1]:
print(__doc__)

# Author: Rodolfo Wottrich
# for CMPUT 551
# Code source based on Jaques Grobler's
# License: BSD 3 clause


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model, cross_validation, metrics
from sklearn.cross_validation import KFold
from sklearn import preprocessing

pd.set_option('display.max_rows', 1000)

# Load realestate dataset

realestate = pd.read_csv('../../data/realestate/realestate.csv')

X = realestate.ix[:,0:109]
y = realestate.ix[:,110:111]

# Create linear regression object
regr = linear_model.LinearRegression()

kf = KFold(len(X), n_folds=5, shuffle=True)
error = list()
for train, test in kf:
    X_train = X.ix[train]
    X_test = X.ix[test]
    y_train = y.ix[train]
    y_test = y.ix[test]

    # Train the model using the training sets
    y_ = regr.fit(X_train, y_train).predict(X_test)

    error.append(np.mean( np.abs(y_ - y_test)/ y_test))

print 'Average error, non-normalized data: ', np.mean(error)

X_original = X.values #returns a numpy array
min_max_scaler_x = preprocessing.MinMaxScaler()
X_scaled = min_max_scaler_x.fit_transform(X_original)
X_normalized = pd.DataFrame(X_scaled)

y_original = y.values #returns a numpy array
min_max_scaler_y = preprocessing.MinMaxScaler()
y_scaled = min_max_scaler_y.fit_transform(y_original)
y_normalized = pd.DataFrame(y_scaled)

error = list()
for train, test in kf:
    X_train = X_normalized.ix[train]
    X_test = X_normalized.ix[test]
    y_train = y_normalized.ix[train]
    y_test = y_normalized.ix[test]

    # Train the model using the training sets
    y_ = regr.fit(X_train, y_train).predict(X_test)
    
    individual_errors = np.abs(y_ - y_test)/ y_test
    error.append(np.mean(individual_errors))

print 'Average error, normalized data: ', np.mean(error)

y_ = regr.fit(X, y)

## Partial results:
##     18.7% mean relative error for non-normalized data;
##     For normalized data, weird predictions overflow error calculation

Automatically created module for IPython interactive environment
Average error, non-normalized data:  0.178070746577
Average error, normalized data:  inf




In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model, cross_validation, metrics
from sklearn.cross_validation import KFold
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

pd.set_option('display.max_rows', 1000)

# Load realestate dataset

realestate = pd.read_csv('../../data/realestate/realestate.csv')

#sel = VarianceThreshold(threshold=(.7 * (1 - .7)))
#nrm_X = sel.fit_transform(nrm_X)
#retained = sel.get_support()
#print retained
X = realestate.ix[:,0:109]
y = realestate.ix[:,110:111]
kbest = SelectKBest(f_regression, k=7)
X = kbest.fit_transform(X, y)
retained2 = kbest.get_support()
print retained2
X = pd.DataFrame(X)

# Create linear regression object
regr = linear_model.LinearRegression()

kf = KFold(len(X), n_folds=5, shuffle=True)

error = list()
for train, test in kf:
    X_train = X.ix[train]
    X_test = X.ix[test]
    y_train = y.ix[train]
    y_test = y.ix[test]

    # Train the model using the training sets
    y_ = regr.fit(X_train, y_train).predict(X_test)
    
    individual_errors = np.abs(y_ - y_test)/ y_test
    error.append(np.mean(individual_errors))

print 'Average error, feature selection: ', np.mean(error)

[ True False False False  True  True False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False  True False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False  True False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False  True  True False False False False False False False
 False]
Average error, feature selection:  0.168150913843


  y = column_or_1d(y, warn=True)


In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model, cross_validation, metrics
from sklearn.cross_validation import KFold
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

pd.set_option('display.max_rows', 1000)

# Load realestate dataset

realestate = pd.read_csv('../../data/realestate/realestate.csv')

#sel = VarianceThreshold(threshold=(.7 * (1 - .7)))
#nrm_X = sel.fit_transform(nrm_X)
#retained = sel.get_support()
#print retained
X = realestate.ix[:,0:109]
y = realestate.ix[:,110:111]
kbest = SelectKBest(f_regression, k=7)
X = kbest.fit_transform(X, y)
retained2 = kbest.get_support()
X = pd.DataFrame(X)

X_original = X.values #returns a numpy array
min_max_scaler_x = preprocessing.MinMaxScaler()
X_scaled = min_max_scaler_x.fit_transform(X_original)
X_normalized = pd.DataFrame(X_scaled)

y_original = y.values #returns a numpy array
min_max_scaler_y = preprocessing.MinMaxScaler()
y_scaled = min_max_scaler_y.fit_transform(y_original)
y_normalized = pd.DataFrame(y_scaled)

# Create linear regression object
regr = linear_model.LinearRegression()

kf = KFold(len(X), n_folds=5, shuffle=True)

error = list()
for train, test in kf:
    X_train = X_normalized.ix[train]
    X_test = X_normalized.ix[test]
    y_train = y_normalized.ix[train]
    y_test = y_normalized.ix[test]

    # Train the model using the training sets
    y_ = regr.fit(X_train, y_train).predict(X_test)
    
    individual_errors = np.abs(y_ - y_test)/ y_test
    error.append(np.mean(individual_errors))

print 'Average error, feature selection: ', np.mean(error)

Average error, feature selection:  inf


  y = column_or_1d(y, warn=True)


In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model, cross_validation, metrics
from sklearn.cross_validation import KFold
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

pd.set_option('display.max_rows', 1000)

# Load realestate dataset

realestate = pd.read_csv('../../data/realestate/realestate.csv')

X = realestate.ix[:,1:3]
y = realestate.ix[:,110:111]

# Create linear regression object
regr = linear_model.LinearRegression()

kf = KFold(len(X), n_folds=5, shuffle=True)

error = list()
for train, test in kf:
    X_train = X.ix[train]
    X_test = X.ix[test]
    y_train = y.ix[train]
    y_test = y.ix[test]

    # Train the model using the training sets
    y_ = regr.fit(X_train, y_train).predict(X_test)
    
    individual_errors = np.abs(y_ - y_test)/ y_test
    error.append(np.mean(individual_errors))

print 'Average error, lat/long only: ', np.mean(error)

Average error, lat/long only:  0.476007524248


In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model, cross_validation, metrics
from sklearn.cross_validation import KFold
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

pd.set_option('display.max_rows', 1000)

# Load realestate dataset

realestate = pd.read_csv('../../data/realestate/realestate.csv')

X = realestate.ix[:,0:1]
y = realestate.ix[:,110:111]

# Create linear regression object
regr = linear_model.LinearRegression()

kf = KFold(len(X), n_folds=5, shuffle=True)

error = list()
for train, test in kf:
    X_train = X.ix[train]
    X_test = X.ix[test]
    y_train = y.ix[train]
    y_test = y.ix[test]

    # Train the model using the training sets
    y_ = regr.fit(X_train, y_train).predict(X_test)
    
    individual_errors = np.abs(y_ - y_test)/ y_test
    error.append(np.mean(individual_errors))

print 'Average error, assessed values only: ', np.mean(error)

Average error, assessed values only:  0.247622994529


In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model, cross_validation, metrics
from sklearn.cross_validation import KFold
from sklearn import preprocessing

pd.set_option('display.max_rows', 1000)

# Load realestate dataset

realestate = pd.read_csv('../../data/realestate/realestate_condo.csv')

X = realestate.ix[:,0:105]
y = realestate.ix[:,105:106]

# Create linear regression object
regr = linear_model.LinearRegression()

kf = KFold(len(X), n_folds=5, shuffle=True)
error = list()
for train, test in kf:
    X_train = X.ix[train]
    X_test = X.ix[test]
    y_train = y.ix[train]
    y_test = y.ix[test]

    # Train the model using the training sets
    y_ = regr.fit(X_train, y_train).predict(X_test)

    error.append(np.mean( np.abs(y_ - y_test)/ y_test))

print 'Average error, non-normalized data, condo dataset: ', np.mean(error)

Average error, non-normalized data, condo dataset:  0.187490341788


In [8]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model, cross_validation, metrics
from sklearn.cross_validation import KFold
from sklearn import preprocessing

pd.set_option('display.max_rows', 1000)

# Load realestate dataset

realestate = pd.read_csv('../../data/realestate/realestate_residential.csv')

X = realestate.ix[:,0:82]
y = realestate.ix[:,82:83]

# Create linear regression object
regr = linear_model.LinearRegression()

kf = KFold(len(X), n_folds=5, shuffle=True)
error = list()
for train, test in kf:
    X_train = X.ix[train]
    X_test = X.ix[test]
    y_train = y.ix[train]
    y_test = y.ix[test]

    # Train the model using the training sets
    y_ = regr.fit(X_train, y_train).predict(X_test)

    error.append(np.mean( np.abs(y_ - y_test)/ y_test))

print 'Average error, non-normalized data, residential dataset: ', np.mean(error)

Average error, non-normalized data, residential dataset:  0.159796206428


In [9]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model, cross_validation, metrics
from sklearn.cross_validation import KFold
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

pd.set_option('display.max_rows', 1000)

# Load realestate dataset

realestate = pd.read_csv('../../data/realestate/realestate_condo.csv')

X = realestate.ix[:,1:3]
y = realestate.ix[:,105:106]

# Create linear regression object
regr = linear_model.LinearRegression()

kf = KFold(len(X), n_folds=5, shuffle=True)

error = list()
for train, test in kf:
    X_train = X.ix[train]
    X_test = X.ix[test]
    y_train = y.ix[train]
    y_test = y.ix[test]

    # Train the model using the training sets
    y_ = regr.fit(X_train, y_train).predict(X_test)
    
    individual_errors = np.abs(y_ - y_test)/ y_test
    error.append(np.mean(individual_errors))

print 'Average error, lat/long only, condo dataset: ', np.mean(error)

Average error, lat/long only, condo dataset:  0.389881454078


In [10]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model, cross_validation, metrics
from sklearn.cross_validation import KFold
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

pd.set_option('display.max_rows', 1000)

# Load realestate dataset

realestate = pd.read_csv('../../data/realestate/realestate_residential.csv')

X = realestate.ix[:,1:3]
y = realestate.ix[:,82:83]

# Create linear regression object
regr = linear_model.LinearRegression()

kf = KFold(len(X), n_folds=5, shuffle=True)

error = list()
for train, test in kf:
    X_train = X.ix[train]
    X_test = X.ix[test]
    y_train = y.ix[train]
    y_test = y.ix[test]

    # Train the model using the training sets
    y_ = regr.fit(X_train, y_train).predict(X_test)
    
    individual_errors = np.abs(y_ - y_test)/ y_test
    error.append(np.mean(individual_errors))

print 'Average error, lat/long only: ', np.mean(error)

Average error, lat/long only:  0.321407925808


In [12]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model, cross_validation, metrics
from sklearn.cross_validation import KFold
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

pd.set_option('display.max_rows', 1000)

# Load realestate dataset

realestate = pd.read_csv('../../data/realestate/realestate_condo.csv')

X = realestate.ix[:,0:1]
y = realestate.ix[:,105:106]

# Create linear regression object
regr = linear_model.LinearRegression()

kf = KFold(len(X), n_folds=5, shuffle=True)

error = list()
for train, test in kf:
    X_train = X.ix[train]
    X_test = X.ix[test]
    y_train = y.ix[train]
    y_test = y.ix[test]

    # Train the model using the training sets
    y_ = regr.fit(X_train, y_train).predict(X_test)
    
    individual_errors = np.abs(y_ - y_test)/ y_test
    error.append(np.mean(individual_errors))

print 'Average error, assessed values only, condo dataset: ', np.mean(error)

Average error, assessed values only, condo dataset:  0.264715158048


In [13]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model, cross_validation, metrics
from sklearn.cross_validation import KFold
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

pd.set_option('display.max_rows', 1000)

# Load realestate dataset

realestate = pd.read_csv('../../data/realestate/realestate_residential.csv')

X = realestate.ix[:,0:1]
y = realestate.ix[:,82:83]

# Create linear regression object
regr = linear_model.LinearRegression()

kf = KFold(len(X), n_folds=5, shuffle=True)

error = list()
for train, test in kf:
    X_train = X.ix[train]
    X_test = X.ix[test]
    y_train = y.ix[train]
    y_test = y.ix[test]

    # Train the model using the training sets
    y_ = regr.fit(X_train, y_train).predict(X_test)
    
    individual_errors = np.abs(y_ - y_test)/ y_test
    error.append(np.mean(individual_errors))

print 'Average error, assessed values only, residential dataset: ', np.mean(error)

Average error, assessed values only, residential dataset:  0.225016019177


In [14]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model, cross_validation, metrics
from sklearn.cross_validation import KFold
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

pd.set_option('display.max_rows', 1000)

# Load realestate dataset

realestate = pd.read_csv('../../data/realestate/realestate_condo.csv')

#sel = VarianceThreshold(threshold=(.7 * (1 - .7)))
#nrm_X = sel.fit_transform(nrm_X)
#retained = sel.get_support()
#print retained
X = realestate.ix[:,0:105]
y = realestate.ix[:,105:106]
kbest = SelectKBest(f_regression, k=7)
X = kbest.fit_transform(X, y)
retained2 = kbest.get_support()
X = pd.DataFrame(X)

# Create linear regression object
regr = linear_model.LinearRegression()

kf = KFold(len(X), n_folds=5, shuffle=True)

error = list()
for train, test in kf:
    X_train = X.ix[train]
    X_test = X.ix[test]
    y_train = y.ix[train]
    y_test = y.ix[test]

    # Train the model using the training sets
    y_ = regr.fit(X_train, y_train).predict(X_test)
    
    individual_errors = np.abs(y_ - y_test)/ y_test
    error.append(np.mean(individual_errors))

print 'Average error, feature selection, condo dataset: ', np.mean(error)

Average error, feature selection, condo dataset:  0.195096943044


  y = column_or_1d(y, warn=True)


In [15]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model, cross_validation, metrics
from sklearn.cross_validation import KFold
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

pd.set_option('display.max_rows', 1000)

# Load realestate dataset

realestate = pd.read_csv('../../data/realestate/realestate_residential.csv')

#sel = VarianceThreshold(threshold=(.7 * (1 - .7)))
#nrm_X = sel.fit_transform(nrm_X)
#retained = sel.get_support()
#print retained
X = realestate.ix[:,0:82]
y = realestate.ix[:,82:83]
kbest = SelectKBest(f_regression, k=7)
X = kbest.fit_transform(X, y)
retained2 = kbest.get_support()
X = pd.DataFrame(X)

# Create linear regression object
regr = linear_model.LinearRegression()

kf = KFold(len(X), n_folds=5, shuffle=True)

error = list()
for train, test in kf:
    X_train = X.ix[train]
    X_test = X.ix[test]
    y_train = y.ix[train]
    y_test = y.ix[test]

    # Train the model using the training sets
    y_ = regr.fit(X_train, y_train).predict(X_test)
    
    individual_errors = np.abs(y_ - y_test)/ y_test
    error.append(np.mean(individual_errors))

print 'Average error, feature selection, residential dataset: ', np.mean(error)

Average error, feature selection, residential dataset:  0.164841899523


  y = column_or_1d(y, warn=True)
