In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import xgboost as XGB
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import learning_curve

%matplotlib inline




In [3]:
#Add mean when calling the function if you want one score, eg: rmse_cv(model_lasso).mean()
def rmse_cv(model):
    rmse= np.sqrt(-cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv = 5))
    return(rmse)

In [4]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt


In [5]:
def train_model(X, y, parameters):
    scoring = make_scorer(mean_squared_error, greater_is_better=True)
    estimator = XGB.XGBRegressor(seed=42, max_depth=3, n_estimators=400)
    parameters = {}
    model = GridSearchCV(estimator, param_grid=parameters, scoring=scoring)
    model.fit(X, y)
    return model.best_estimator_

In [6]:
def create_total_sf(df):
    df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF'] 
    

In [7]:
def standardize_numerical_features(df,feature_list):
    for feature in feature_list:
        feature_series= df.loc[:,feature]
        df.loc[:,feature] = (feature_series - feature_series.mean())/feature_series.std()

In [8]:
def get_dfs_for_prediction(features, train,test, all_data):
    #creating matrices for sklearn:
    y_train = train.SalePrice
    y_test = test.SalePrice
    y_all_data = all_data.SalePrice
    X_train = train[features]
    X_test = test[features]
    X_all_data = all_data[features]
    return y_train, y_test, y_all_data, X_train, X_test, X_all_data


In [None]:
train = pd.read_csv('our_train.csv')
test = pd.read_csv('our_validation.csv')
all_data = train.append(test)


create_total_sf(train)
standardize_numerical_features(train,['TotalSF',"GrLivArea"])
create_total_sf(test)
standardize_numerical_features(test,['TotalSF',"GrLivArea"])
create_total_sf(all_data)
standardize_numerical_features(all_data,['TotalSF',"GrLivArea"])


parameters = {}
features = ['TotalSF',"GrLivArea"]


y_train, y_test, y_all_data, X_train, X_test, X_all_data = get_dfs_for_prediction(features, train, test, all_data)

target ="SalePrice"
model = train_model(X_train, y_train, parameters)

title = "Learning Curves (XGB)"
cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
print (mean_squared_error(y_test, model.predict(X_test)))
print (model)

print plot_learning_curve(model, title, X_all_data, y_all_data, cv=cv, n_jobs=4)


3048026800.69
XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=400, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=1)
