In [93]:
import pandas as pd
import matplotlib.pylab as plt
import numpy as np
%matplotlib inline

from sklearn.preprocessing import MinMaxScaler
# Load the training and test tables, both indexed by their "Id" column.
train, test = (
    pd.read_csv(csv_name, index_col="Id")
    for csv_name in ("train.csv", "test.csv")
)

def print_full(x):
    """Print ``x`` without pandas truncating its rows or columns.

    For the duration of the call ``display.max_columns`` is raised to 999 and
    ``display.max_rows`` to ``len(x)``.  ``pd.option_context`` puts back the
    values that were active before the call (not the pandas defaults), and
    does so even if printing raises, so display limits configured elsewhere
    in the notebook survive.

    Parameters
    ----------
    x : pandas.DataFrame or pandas.Series
        Object to print; must support ``len()``.
    """
    with pd.option_context('display.max_columns', 999,
                           'display.max_rows', len(x)):
        print(x)

def mape(actual, pred):
    """Mean absolute percentage error of ``pred`` against ``actual``, in percent.

    Each error is taken relative to the corresponding ``actual`` value, so
    ``actual`` entries equal to zero produce a division by zero.
    """
    relative_error = (actual - pred) / actual
    mean_abs_fraction = np.mean(np.abs(relative_error))
    return mean_abs_fraction * 100.0

def banding(series, n_bands):
    """
    Map every value of a numeric column to the label of its nearest band centre.

    ``n_bands`` centres are placed evenly from ``series.min()`` to
    ``series.max()`` (both ends included) and labelled 1..n_bands.  Each value
    receives the label of the closest centre; when a value is exactly halfway
    between two centres the lower label wins.  If all values are equal the
    centres coincide and every value gets label 1.

    For example ``banding(df["YearBuilt"], 3)`` yields 1, 2 or 3 depending on
    whether a year is nearest the minimum, the midpoint or the maximum.

    Parameters
    ----------
    series : pandas.Series
        Column of int/float values.
    n_bands : int
        Number of band centres, at least 1.

    Returns
    -------
    pandas.Series
        int64 labels carrying the index and name of ``series``.

    Raises
    ------
    ValueError
        If ``n_bands`` is smaller than 1.
    """
    if n_bands < 1:
        raise ValueError("n_bands must be at least 1, got %r" % (n_bands,))

    centres = np.linspace(series.min(), series.max(), n_bands)
    values = np.asarray(series, dtype=float)

    # Rows are values, columns are centres.  argmin returns the first (lowest)
    # centre on a tie, matching a left-to-right scan over the centres.
    distances = np.abs(values[:, np.newaxis] - centres[np.newaxis, :])
    labels = distances.argmin(axis=1) + 1

    return pd.Series(labels, index=series.index, name=series.name).astype("int64")

def _minmax_column(column):
    """Rescale one numeric Series to [0, 1] with MinMaxScaler; returns a flat 1-D array."""
    # MinMaxScaler wants a 2-D (n_samples, 1) array.  Series.reshape is gone
    # from current pandas, so reshape the underlying ndarray instead, and
    # flatten the result so it assigns cleanly to a single DataFrame column.
    as_2d = column.astype(float).values.reshape(-1, 1)
    return MinMaxScaler().fit_transform(as_2d).ravel()


def scaling(df):
    """Return a model-ready copy of ``df``: numerics rescaled, categoricals one-hot encoded.

    Steps, in order:

    1. Every missing value (numeric or categorical) is replaced by 0.
    2. Numeric predictors are min-max scaled to [0, 1]; ``LotArea`` and the
       non-zero part of ``TotalBsmtSF`` are log-transformed first.
    3. Each column named in the module-level ``categoricals`` list is replaced
       by its ``pd.get_dummies`` indicator columns.

    The scalers are fitted on whatever rows ``df`` contains, so the result
    depends on the full frame passed in.  The input frame itself is not
    modified (``fillna`` returns a new frame).  The name of each categorical
    is printed as it is encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the numeric columns handled below plus every column
        listed in ``categoricals``.

    Returns
    -------
    pandas.DataFrame
        Same rows and index as ``df``; categorical columns replaced by their
        dummy columns.
    """
    df = df.fillna(value=0)

    # ---- Plain min-max scaling -------------------------------------------
    # (banding() was tried on some of these columns as an alternative and is
    # currently disabled.)
    for col in ("OverallQual", "OverallCond", "YearBuilt", "TotRmsAbvGrd",
                "GarageArea", "GrLivArea", "1stFlrSF"):
        df[col] = _minmax_column(df[col])

    # ---- Float > Log > Normalise -----------------------------------------
    # NOTE(review): a LotArea of 0 (including a missing value filled above)
    # becomes -inf under the log -- confirm the data never contains one.
    df["LotArea"] = _minmax_column(np.log(df["LotArea"]))

    # Rescale only the non-zero basement areas; zeros stay 0.
    # NOTE(review): the smallest non-zero area also maps to 0 after scaling,
    # so it becomes indistinguishable from "no basement" -- confirm intended.
    has_basement = df["TotalBsmtSF"] != 0
    # Cast first so writing fractional values never hits an integer column.
    df["TotalBsmtSF"] = df["TotalBsmtSF"].astype(float)
    if has_basement.any():
        df.loc[has_basement, "TotalBsmtSF"] = _minmax_column(
            np.log(df.loc[has_basement, "TotalBsmtSF"]))

    # ---- Zero bumped to 1, then min-max ----------------------------------
    # NOTE(review): zeros are replaced by 1 before scaling although no log is
    # applied here; presumably a leftover from a log transform.  Check the
    # histograms and reconsider whether this scaling is appropriate.
    for col in ("PoolArea", "2ndFlrSF"):
        df[col] = _minmax_column(df[col].mask(df[col] == 0, 1))

    # ---- One-hot encode the categoricals ---------------------------------
    # Suffix Condition2's labels so its dummy columns cannot clash with
    # another categorical's on join (the original "join bug"; presumably
    # Condition1 and Condition2 share the same label set).
    df["Condition2"] = df["Condition2"].apply(str) + "c2"
    for cat in categoricals:
        print(cat)
        dummies = pd.get_dummies(df[cat])
        df = df.drop(cat, axis=1).join(dummies)
    return df


# Columns fed to the models (numeric and categorical alike).
predictors = [
    "LotArea", "MSSubClass", "OverallQual", "OverallCond", "YearBuilt",
    "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "TotRmsAbvGrd", "GarageArea",
    "PoolArea", "YrSold", "SaleCondition", "SaleType", "Condition1",
    "Condition2", "Neighborhood", "Street", "GrLivArea",
]

# Subset of the predictors that scaling() one-hot encodes.
categoricals = [
    "MSSubClass", "YrSold", "SaleType", "Condition1", "Condition2",
    "Neighborhood", "Street", "SaleCondition",
]

# Split off the target, then keep only the chosen predictors in both tables.
target = train["SalePrice"]
train = train.drop("SalePrice", axis=1)[predictors]
test = test[predictors]

# Transform train and test together so both share the same scaling and the
# same set of dummy columns, then cut the combined frame back apart by
# position: the first len(train) rows are the training rows.
dd = scaling(pd.concat([train, test]))

train = dd.iloc[:len(train)]
test = dd.iloc[len(train):]

MSSubClass
YrSold
SaleType
Condition1
Condition2
Neighborhood
Street
SaleCondition


In [95]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV

# sklearn.cross_validation and the "mean_squared_error" scorer were deprecated
# in scikit-learn 0.18 and removed in 0.20.  Prefer their replacements and fall
# back to the legacy names only on installations that predate them.  Both
# scorers return the *negated* MSE (greater is better).
try:
    from sklearn.model_selection import cross_val_score
    NEG_MSE_SCORING = "neg_mean_squared_error"
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import cross_val_score
    NEG_MSE_SCORING = "mean_squared_error"

try:
    # Legacy module, not used in this cell; imported only where it still exists.
    from sklearn import cross_validation
except ImportError:  # removed in scikit-learn 0.20
    pass

alg = RandomForestRegressor(n_estimators = 500, oob_score = False, n_jobs = -1,random_state =1,
                            max_features = 10, min_samples_leaf = 2)


def rmse_cv(model):
    """Return the per-fold RMSE of ``model`` under 5-fold cross-validation.

    The model is fitted on the module-level ``train`` frame against
    ``np.log(target)``, so the errors are on the log-price scale.

    Parameters
    ----------
    model : scikit-learn regressor
        Unfitted estimator; ``cross_val_score`` fits it once per fold.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold (length 5).
    """
    neg_mse = cross_val_score(model, train, np.log(target),
                              scoring=NEG_MSE_SCORING, cv=5)
    # The scorer reports negated MSE, so flip the sign before the square root.
    return np.sqrt(-neg_mse)


# Random forest: mean CV RMSE as the number of trees grows.
for x in [100,500,1000]:
    print(rmse_cv(RandomForestRegressor(n_estimators = x, n_jobs = -1,random_state =1,
                            max_features = 4, min_samples_leaf = 2)).mean())

# Ridge regression: mean CV RMSE across regularisation strengths.
for a in [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]:
    print(a, rmse_cv(Ridge(alpha = a)).mean())

#print(test[["SalePrice"]])
#test[["SalePrice"]].to_csv("submit.txt")

0.172407354487
0.171929755477
0.171594215489
0.05 0.1492163228
0.1 0.148650980244
0.3 0.14710080085
1 0.144798017187
3 0.143980380788
5 0.144949173657
10 0.149382645815
15 0.154782867603
30 0.171232675249
50 0.1901697541
75 0.209064268614


In [96]:
import xgboost as xgb


ImportError: No module named 'xgboost'