In [1]:
# user inputs

fileName = "trainHousePrices.csv"
targetVariable = "SalePrice"
numericalFeatures = ["LotArea", "YrSold", "YearBuilt",  "OverallQual", "Fireplaces", "GarageArea"]
categoricalFeatures = ["Neighborhood", "SaleType", "MSSubClass"]

In [2]:
# imports

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
%matplotlib inline

In [3]:
# preprocessing functions

def extendDataframeWithOneHotEncoding(columnName, dataframe, features):
    columnValuesDataframe = dataframe[[columnName]]
    labelEncoder = preprocessing.LabelEncoder()
    labelEncoder.fit(columnValuesDataframe)
    columnValuesEnumeratedList = labelEncoder.classes_
    
    extendTestdataWithOneHotEncoding(columnValuesEnumeratedList, columnName, dataframe)
    features.extend(columnValuesEnumeratedList)
    
def extendTestdataWithOneHotEncoding(newColumns, columnName, dataframe):
    
    columnValuesList = dataframe[columnName]
    for newColumnTitle in newColumns:
        newColumnValues = [1 if x == newColumnTitle else 0 for x in columnValuesList]
        dataframe[newColumnTitle] = newColumnValues
    

In [4]:
# machine learning algorithms

from sklearn import cross_validation
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor

def runLinearModel(X, y):
    linearModelUnderTest = linear_model.LinearRegression()
    cross_validation_scores = cross_validation.cross_val_score(linearModelUnderTest, X, y, cv = 5)
    # accuracy. the higher the better, precision. lower the less variation therefore better 
    return ( np.mean(cross_validation_scores), np.std(cross_validation_scores))

def runRegressionTreeModel(X, y):
    elbowX = range(1,45)
    elbowY = []
    elbowSD = []
    for depth in elbowX:
        regressionDecisionTree = DecisionTreeRegressor(random_state=1, max_depth = depth)
        cross_validation_scores = cross_validation.cross_val_score(regressionDecisionTree, X, y, cv = 5)
        elbowY.append( np.mean(cross_validation_scores) )
        elbowSD.append( np.std(cross_validation_scores) )
    
    maxValue = max(elbowY)
    associatedSd = elbowSD[ elbowY.index(maxValue) ]
    # plt.plot(elbowX, elbowY)
    return (maxValue, associatedSd)

In [5]:
def evaluateInputAgainstModels(train, features, y):
    X = train[features]
    print "Evaluating features: " + str(features)
    
    linearModelResult = runLinearModel(X, y)
    print "Linear Model: " + str(linearModelResult)
    
    regressionTreeResult = runRegressionTreeModel(X, y)
    print "Regression Tree Model: " + str(regressionTreeResult)
    


In [18]:
def evaluateUsingAll(fileName, targetVariable, numericalFeatures, categoricalFeatures):
    train = pd.read_csv(fileName) 
    features = numericalFeatures
    for categoricalFeature in categoricalFeatures:
        extendDataframeWithOneHotEncoding(categoricalFeature, train, features)
        
    y = train[targetVariable]
    evaluateInputAgainstModels(train, features, y)
    return train[['LotArea', 'YrSold','CollgCr', 'Veenker', 'Crawfor', 'NoRidge', 'WD', 'COD',20, 30,40, 45, 50]]
#    return train[['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'NAmes', 'NPkVill', 'NWAmes', 'NoRidge', 'NridgHt', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber', 'Veenker', 'COD', 'CWD', 'Con', 'ConLD', 'ConLI', 'ConLw', 'New', 'Oth', 'WD', 20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 160, 180, 190, 'Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'NAmes', 'NPkVill', 'NWAmes', 'NoRidge', 'NridgHt', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber', 'Veenker', 'COD', 'CWD', 'Con', 'ConLD', 'ConLI', 'ConLw', 'New', 'Oth', 'WD', 20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 160, 180, 190, 'Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'NAmes', 'NPkVill', 'NWAmes', 'NoRidge', 'NridgHt', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber', 'Veenker', 'COD', 'CWD', 'Con', 'ConLD', 'ConLI', 'ConLw', 'New', 'Oth', 'WD', 20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 160, 180, 190]]]
    
evaluateUsingAll(fileName, targetVariable, numericalFeatures, categoricalFeatures)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Evaluating features: ['LotArea', 'YrSold', 'YearBuilt', 'OverallQual', 'Fireplaces', 'GarageArea', 'Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'NAmes', 'NPkVill', 'NWAmes', 'NoRidge', 'NridgHt', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber', 'Veenker', 'COD', 'CWD', 'Con', 'ConLD', 'ConLI', 'ConLw', 'New', 'Oth', 'WD', 20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 160, 180, 190, 'Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'NAmes', 'NPkVill', 'NWAmes', 'NoRidge', 'NridgHt', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber', 'Veenker', 'COD', 'CWD', 'Con', 'ConLD', 'ConLI', 'ConLw', 'New', 'Oth', 'WD', 20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 160, 180, 190, 'Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR

Unnamed: 0,LotArea,YrSold,CollgCr,Veenker,Crawfor,NoRidge,WD,COD,20,30,40,45,50
0,8450,2008,1,0,0,0,1,0,0,0,0,0,0
1,9600,2007,0,1,0,0,1,0,1,0,0,0,0
2,11250,2008,1,0,0,0,1,0,0,0,0,0,0
3,9550,2006,0,0,1,0,1,0,0,0,0,0,0
4,14260,2008,0,0,0,1,1,0,0,0,0,0,0
5,14115,2009,0,0,0,0,1,0,0,0,0,0,1
6,10084,2007,0,0,0,0,1,0,1,0,0,0,0
7,10382,2009,0,0,0,0,1,0,0,0,0,0,0
8,6120,2008,0,0,0,0,1,0,0,0,0,0,1
9,7420,2008,0,0,0,0,1,0,0,0,0,0,0


In [12]:
train = pd.read_csv(fileName)
train[:5][ ['LotArea',"Neighborhood", "SaleType", "MSSubClass" ,'YrSold', 'YearBuilt', 'OverallQual', 'Fireplaces', 'GarageArea']]

Unnamed: 0,LotArea,Neighborhood,SaleType,MSSubClass,YrSold,YearBuilt,OverallQual,Fireplaces,GarageArea
0,8450,CollgCr,WD,60,2008,2003,7,0,548
1,9600,Veenker,WD,20,2007,1976,6,1,460
2,11250,CollgCr,WD,60,2008,2001,7,1,608
3,9550,Crawfor,WD,70,2006,1915,7,1,642
4,14260,NoRidge,WD,60,2008,2000,8,1,836
