In [7]:
import pandas as pd
import matplotlib.pylab as plt
import numpy as np
%matplotlib inline

from sklearn.preprocessing import MinMaxScaler
train = pd.read_csv("train.csv",index_col="Id")
test = pd.read_csv("test.csv",index_col="Id")

def print_full(x):
    pd.set_option('display.max_columns', 999)
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_columns')
    pd.reset_option('display.max_rows')

def mape(actual, pred):
    return np.mean(np.abs((actual - pred) / actual)) * 100.0

def banding(series,n_bands):
    """
    Takes in a df column of type float/int and maps this into 'bands' of values between min and max values
    For example band(df["OverallQual"],3) splits into 3 bands evenly spaced between max and min.
    Useful for features like Year Built
    """
    bands = np.linspace(series.min(),series.max(),n_bands)
    band_dic = dict((val,int(key)) for (key,val) in zip(np.arange(1,len(bands)+1,1),bands))
    def allocate_band(y):
        closest = min(bands, key=lambda x:abs(x-y))
        return band_dic[closest]
    return series.apply(lambda x: int(allocate_band(x)))

def scaling(df):
    df = df.fillna(value=0)
    ###### TRANSFORMS
   # df["OverallQual"] = banding(df["OverallQual"],2) 
   # df["OverallCond"] = banding(df["OverallCond"],2) 
   # df["YearBuilt"] = banding(df["YearBuilt"],3)
   # df["TotRmsAbvGrd"] = banding(df["TotRmsAbvGrd"],3)

    df["OverallQual"] = MinMaxScaler().fit_transform(df["OverallQual"].apply(float).reshape(-1,1))
    df["OverallCond"] = MinMaxScaler().fit_transform(df["OverallCond"].apply(float).reshape(-1,1))
    df["YearBuilt"] = MinMaxScaler().fit_transform(df["YearBuilt"].apply(float).reshape(-1,1))
    df["TotRmsAbvGrd"] = MinMaxScaler().fit_transform(df["TotRmsAbvGrd"].apply(float).reshape(-1,1))
    
    
    ## Float > Log > Normalise

    df["LotArea"] = MinMaxScaler().fit_transform(np.log(df["LotArea"]).reshape(-1,1))

    #print(train.loc[train["TotalBsmtSF"] != 0, ["TotalBsmtSF"]])
    #print(train[["TotalBsmtSF"]])

    # Rescale for only the non-zero values
    df.loc[df["TotalBsmtSF"] != 0, ["TotalBsmtSF"]] = MinMaxScaler().fit_transform(
        np.log(df.loc[df["TotalBsmtSF"] != 0, ["TotalBsmtSF"]]))#.reshape(-1,1))

    # This is actually fairly Gaussian w/o log transform
    #train[train["GarageArea"] == 0] = 1
    df["GarageArea"] = MinMaxScaler().fit_transform((df["GarageArea"].apply(float)).reshape(-1,1))
    
    df["GrLivArea"] = MinMaxScaler().fit_transform((df["GrLivArea"].apply(float)).reshape(-1,1))

    # Maybe reonsider this scaling...might be shitty > Check histogram
    df.loc[df["PoolArea"] == 0, ["PoolArea"]] = 1
    df["PoolArea"] = MinMaxScaler().fit_transform((df["PoolArea"].apply(float)).reshape(-1,1))

    df["1stFlrSF"] = MinMaxScaler().fit_transform((df["1stFlrSF"].apply(float)).reshape(-1,1))

    df.loc[df["2ndFlrSF"] == 0, ["2ndFlrSF"]] = 1
    df["2ndFlrSF"] = MinMaxScaler().fit_transform((df["2ndFlrSF"].apply(float)).reshape(-1,1))

    # Now do some one hot encoding on all categoricals
    df["Condition2"] = df["Condition2"].apply(str)+"c2" # fix join bug with encoding
#    df["Utilities"] = df["Utilities"].fillna("AllPub")
    for cat in categoricals:
        print(cat)
        dummies = pd.get_dummies(df[cat])
        df = df.drop(cat,axis=1)
        df = df.join(dummies)
    return df


predictors = ["LotArea","MSSubClass","OverallQual","OverallCond","YearBuilt","TotalBsmtSF","1stFlrSF","2ndFlrSF",
              "TotRmsAbvGrd","GarageArea","PoolArea","YrSold","SaleCondition","SaleType","Condition1","Condition2",
              "Neighborhood","Street","GrLivArea"]

categoricals = ["MSSubClass","YrSold","SaleType","Condition1","Condition2",
              "Neighborhood","Street","SaleCondition"]

target = train["SalePrice"]
train = train.drop("SalePrice",axis=1)

train = train[predictors]
test = test[predictors]

dd = pd.concat([train,test])
dd = scaling(dd)

train = dd[:len(train)]
test = dd[len(train):]

MSSubClass
YrSold
SaleType
Condition1
Condition2
Neighborhood
Street
SaleCondition


In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn import cross_validation
from sklearn.cross_validation import cross_val_score
from sklearn import linear_model
import xgboost as xgb

#alg = RandomForestRegressor(n_estimators = 500, oob_score = False, n_jobs = -1,random_state =1,
#                            max_features = 10, min_samples_leaf = 2)

def rmse_cv(model):
    return(np.sqrt(-cross_val_score(model, train, np.log(target), scoring="mean_squared_error", cv = 5)))


# print("=====Random Forest (Regr.) RMSE=====")
# for x in [10,50]:
#     print(x, rmse_cv(RandomForestRegressor(n_estimators = x, n_jobs = -1,random_state =1,
#                             max_features = 4, min_samples_leaf = 2)).mean())

# print("=====Linear Regression RMSE=====")
# lr = linear_model.LinearRegression()
# # Train the model using the training sets
# print(rmse_cv(lr).mean())
    
print("=====Ridge Regression RMSE w/ alphas =====")
for a in [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]:
     print(a, rmse_cv(Ridge(alpha = a)).mean())
best_model = Ridge(alpha=5).fit(train,np.log(target))


#print("=====XGBOOST=====")
#gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(train, np.log(target))

#print(rmse_cv(gbm).mean())
#predictions = gbm.predict(test_X)
# for a in [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]:
#     print(a, rmse_cv(Ridge(alpha = a)).mean())
#best_model = gbm

test["SalePrice"] = np.exp(best_model.predict(test))
#print(test["SalePrice"])
    
#print(test[["SalePrice"]])
test[["SalePrice"]].to_csv("submit.txt")

=====XGBOOST=====


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
print(test["SalePrice"])

Id
1461    110000.0
1462    157900.0
1463    181000.0
1464    181000.0
1465    189000.0
1466    173000.0
1467    173000.0
1468    143000.0
1469    170000.0
1470    129000.0
1471    181000.0
1472    100000.0
1473    139000.0
1474    146000.0
1475    143000.0
1476    325000.0
1477    205000.0
1478    306000.0
1479    320000.0
1480    385000.0
1481    268000.0
1482    202500.0
1483    260000.0
1484    199900.0
1485    187500.0
1486    225000.0
1487    340000.0
1488    232000.0
1489    194000.0
1490    190000.0
          ...   
2890    100000.0
2891    140000.0
2892     84500.0
2893    145000.0
2894     84500.0
2895    275000.0
2896    275000.0
2897    207500.0
2898    188000.0
2899    201000.0
2900    165000.0
2901    215000.0
2902    153500.0
2903    320000.0
2904    290000.0
2905    115000.0
2906    176000.0
2907    115000.0
2908    130000.0
2909    171000.0
2910     84500.0
2911     88000.0
2912    180000.0
2913     88000.0
2914     84500.0
2915     84500.0
2916     88000.0
2917    143