In [37]:
import pandas as pd
import matplotlib.pylab as plt
import numpy as np
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler

train = pd.read_csv("train.csv",index_col="Id")
test = pd.read_csv("test.csv",index_col="Id")

def print_full(x):
    """
    Full printing of dataframes for error checking
    """
    pd.set_option('display.max_columns', 999)
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_columns')
    pd.reset_option('display.max_rows')

def clean(df):
    """
    Cleans NaNs and creates new features
    """
    
    # List of new features to be created: (new_feature, original_feature, transform_function)
    transform = [("sqLotArea","LotArea",np.sqrt),
                 ("sqGrLivArea","GrLivArea",np.sqrt),
                 ("sqBsmtFinSF1","BsmtFinSF1",np.sqrt),
                 ("sqBsmtFinSF2","BsmtFinSF2",np.sqrt),
                 ("sqBsmtUnfSF","BsmtUnfSF",np.sqrt),
                 ("sqTotalBsmtSF","TotalBsmtSF",np.sqrt),
                 ("sq1stFlrSF","1stFlrSF",np.sqrt),
                 ("sq2ndFlrSF","2ndFlrSF",np.sqrt),
                 ("sqLotFrontage","LotFrontage",np.sqrt),
                 ("sqMasVnrArea","MasVnrArea",np.sqrt),
                 ("sqPoolArea","PoolArea",np.sqrt),
                 ("sqGarageArea","GarageArea",np.sqrt),
                 ("sqWoodDeckSF","WoodDeckSF",np.sqrt),
                 ("sqOpenPorchSF","OpenPorchSF",np.sqrt),
                 ("sqEnclosedPorch","EnclosedPorch",np.sqrt),
#                  ("expLotArea","LotArea",np.exp),
#                  ("expGrLivArea","GrLivArea",np.exp),
#                  ("expBsmtFinSF1","BsmtFinSF1",np.exp),
#                  ("expBsmtFinSF2","BsmtFinSF2",np.exp),
#                  ("expBsmtUnfSF","BsmtUnfSF",np.exp),
#                  ("expTotalBsmtSF","TotalBsmtSF",np.exp),
#                  ("exp1stFlrSF","1stFlrSF",np.exp),
#                  ("exp2ndFlrSF","2ndFlrSF",np.exp),
#                  ("expLotFrontage","LotFrontage",np.exp),
#                  ("expMasVnrArea","MasVnrArea",np.exp),
#                  ("expPoolArea","PoolArea",np.exp),
#                  ("expGarageArea","GarageArea",np.exp),
#                  ("expWoodDeckSF","WoodDeckSF",np.exp),
#                  ("expOpenPorchSF","OpenPorchSF",np.exp),
#                  ("expEnclosedPorch","EnclosedPorch",np.exp),
                 ("lnLotArea","LotArea",np.log1p),
                 ("lnGrLivArea","GrLivArea",np.log1p),
                 ("lnBsmtFinSF1","BsmtFinSF1",np.log1p),
                 ("lnBsmtFinSF2","BsmtFinSF2",np.log1p),
                 ("lnBsmtUnfSF","BsmtUnfSF",np.log1p),
                 ("lnTotalBsmtSF","TotalBsmtSF",np.log1p),
                 ("ln1stFlrSF","1stFlrSF",np.log1p),
                 ("ln2ndFlrSF","2ndFlrSF",np.log1p),
                 ("lnLotFrontage","LotFrontage",np.log1p),
                 ("lnMasVnrArea","MasVnrArea",np.log1p),
                 ("lnPoolArea","PoolArea",np.log1p),
                 ("lnGarageArea","GarageArea",np.log1p),
                 ("lnWoodDeckSF","WoodDeckSF",np.log1p),
                 ("lnOpenPorchSF","OpenPorchSF",np.log1p),
                 ("lnEnclosedPorch","EnclosedPorch",np.log1p),
                ]
    
    # Find categorical and numerical features
    categoricals = train.select_dtypes(include=["object"]).columns.values
    numericals = [feat for feat in train.select_dtypes(include=["int","float"]).columns.values]
    
    # Remove NaNs... bear in mind this is a rough script. I recommend a more intelligent way of doing this,
    # for example a feature like LotFrontage may be NaN because the property has no Lot Frontage, so it makes
    # more sense to set this to zero instead of imputing the median value as shown below.
    
    # Transform to create new features, scale using MinMaxScaler
    for (new_feature,original_feature,f) in transform: 
        df[new_feature] = df[original_feature].fillna(df[original_feature].median(), inplace = False)
        df[new_feature] = MinMaxScaler().fit_transform(f(df[new_feature].apply(float)).reshape(-1,1))
    # Scale and remove NaNs for numerical features by imputing median value
    for feature in numericals: 
        df[feature].fillna(df[feature].median(), inplace = True)
        df[feature] = MinMaxScaler().fit_transform(df[feature].apply(float).reshape(-1,1))
    # Impute NaNs for categorical features
    for feature in categoricals: 
        df[feature].fillna(df[feature].value_counts().idxmax(), inplace = True)
    # Perform one hot encoding on the categorical features
    for cat in categoricals:
        dummies = pd.get_dummies(df[cat])
        dummies.columns = [col_name + cat for col_name in dummies.columns.values]            
        df = df.drop(cat,axis=1)
        df = df.join(dummies)
    return df

target = train["SalePrice"] # Note that we will take the Log of this when fitting - check the histogram of this feature
train = train.drop("SalePrice",axis=1)

dd = clean(pd.concat([train,test]))

train = dd[:len(train)]
test = dd[len(train):]

In [38]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.grid_search import GridSearchCV   #Perforing grid search
from sklearn.cross_validation import cross_val_score
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

def rmse_cv(model,X,y): # Cross val using the competition scoring metric
    return(np.sqrt(-cross_val_score(model, X, np.log(y), scoring="mean_squared_error", cv = 5)))

# def rmse(actual,pred):
#     return(np.sqrt( ( (actual-pred)**2).mean()))


# dtrain = xgb.DMatrix(train)
# dtarget = xgb.DMatrix(target)
# dtest =  xgb.DMatrix(test)

cv_params = {'learning_rate':[0.01,0.07,0.1],} #, 'min_child_weight': [1,3,5]}
ind_params = {'max_depth': 3,
              'n_estimators': 6000,
              'seed':0,
              "subsample": 0.8,
              "colsample_bytree": 0.8,
              "colsample_bylevel": 0.8, 
             'objective': 'reg:linear'}

# optimized_xgb = GridSearchCV(xgb.XGBRegressor(**ind_params), 
#                             cv_params, 
#                             scoring = 'mean_squared_error', cv = 3, n_jobs = -1) 

# optimized_xgb.fit(train, target)
# print(optimized_xgb.best_params_)
# stop

model = xgb.XGBRegressor(
                 gamma=0.030,                 
                 learning_rate=0.01,
                 max_depth=3,
                 min_child_weight=1.5,
                 n_estimators=10000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.8,
                 colsample_bytree= 0.8,
                 colsample_bylevel= 0.8,
                )

model.fit(train,np.log(target))
preds = np.exp(model.predict(test))



from sklearn.linear_model import ElasticNet
model2 = ElasticNet(alpha=0.0009).fit(train,np.log(target))
model2.fit(train,np.log(target))
preds2 = np.exp(model2.predict(test))


results = (preds + preds2) / 2.0

test["SalePrice"] = results
test[["SalePrice"]].to_csv("submit.txt")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [3]:

# train = pd.read_csv("train.csv",index_col="Id")
# print(train["LotFrontage"].isnull().sum())
# print(len(train["LotFrontage"]))