# STEP 1 - LOAD DATA

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
import catboost
from sklearn import preprocessing
from sklearn.model_selection import KFold

data_train = pd.read_csv('C:\\Users\\Kateryna\\Desktop\\kaggle\\Housing\\train2.csv')
data_test = pd.read_csv('test (1).csv')
print("Done")

def rmsle(actual, predicted):
    return np.sqrt(np.mean((np.power(np.log(np.array(actual)+1) - 
            np.log(np.array(predicted)+1), 2))))

all_data = pd.concat((data_train.loc[:,'MSSubClass':'SaleCondition'],
                      data_test.loc[:,'MSSubClass':'SaleCondition']))
all_data = all_data.reset_index()
all_data= all_data.drop("index", axis = 1)

Done


In [2]:
total = all_data.isnull().sum().sort_values(ascending=False)
percent = (all_data.isnull().sum()/all_data.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data[:10]

Unnamed: 0,Total,Percent
PoolQC,2901,0.996565
MiscFeature,2806,0.96393
Alley,2713,0.931982
Fence,2344,0.805222
FireplaceQu,1417,0.486774
LotFrontage,486,0.166953
GarageCond,157,0.053933
GarageQual,157,0.053933
GarageYrBlt,157,0.053933
GarageFinish,157,0.053933


From above, we can see that columns PoolQC, MiscFeature, Alley and Fence have the most NaN values. We can see that it identifies the absence of corresponding features - Pools, Alleys, Fences. MiscFeature does not seem to add importance, so we will remove it.
We also want to remove the columns where most data values are the same. Those are: 'Street', 'Utilities', 'Condition2', 'RoofMatl', 'Heating'. After several tries, turned out that only Utilities, RoofMatl, Heating are irrelevant, so we will keep the rest.
We should also pay attention to the features that have the same meaning. For example GarageArea = GarageCars, TotalBsmt = 1stFlrSF almost everywhere and GrLivArea = TotRmsAbvGrd. We will remove one of each pair.

In [3]:
#remove columns where most of values are NAs 
#all_data = all_data.drop('Alley', axis=1)
#all_data = all_data.drop('PoolQC', axis=1)
all_data = all_data.drop('MiscFeature', axis=1)

#try to remove features where most are 1 class
#all_data = all_data.drop('Street', axis = 1)
all_data = all_data.drop('Utilities', axis = 1)
#all_data = all_data.drop('Condition2', axis = 1)
all_data = all_data.drop('RoofMatl', axis = 1)
all_data = all_data.drop('Heating', axis = 1)
#GarageArea and GarageCars are basically the same, so we only need one to reduce redundancy
all_data = all_data.drop('GarageArea', axis = 1)
#same as TotalBsmt
all_data = all_data.drop('1stFlrSF', axis = 1)
#same as GrLivArea mor or less
all_data = all_data.drop('TotRmsAbvGrd', axis = 1)

Nest, we want to identify which features have missing values so that we can deal with them.

In [4]:
cat_hasnull = [col for col in all_data.select_dtypes(['object']) if all_data[col].isnull().any()]
cat_hasnull

['MSZoning',
 'Alley',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'SaleType']

It makes sense that Garage was built at the same time with the building. At least it is true in most cases in the dataset. So we will substitute NAs in GarageYrBlt with YearBuilt.
In Exterior1st and Exterior2 most likely NAs are present as no appropriate class is present. We will replace them with Other.

In [5]:
#it is reasonable that garage was built after the house was built, so we can replace missing values
all_data["GarageYrBlt"] = all_data["GarageYrBlt"].fillna(all_data["YearBuilt"])
#most likely Nas are due to the absense of appropriate class. Let's mark it with 'Other
all_data["Exterior1st"] = all_data["Exterior1st"].fillna('Other')
all_data["Exterior2nd"] = all_data["Exterior1st"].fillna('Other')

In 'Electical' we will replace NAs with SBrkr, as it is the most common class. In the same way, we replace NAs in 'Functional' with 'Typ'.
For the features identifying some feature of some facility, NA probably means the absense of this facility. We will replace NAs with 0 in such colunmns.

In [6]:
cat_hasnull = ['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType',
              'GarageFinish', 'GarageQual','GarageCond', 'Fence', "Alley", "PoolQC"]
for col in cat_hasnull:
    null_idx = all_data[col].isnull()
    all_data.loc[null_idx, col] = 0
null_idx_el = all_data['Electrical'].isnull()
all_data.loc[null_idx_el, 'Electrical'] = 'SBrkr'
null_idx_fu = all_data['Functional'].isnull()
all_data.loc[null_idx_fu, 'Functional'] = 'Typ'

Next, we create new features:
- Remodelled - indicates if the house was remodelled after construction
- YearSinceRemodelled - indicates how many years ago the house was remodelled
- Age - indicates how old the house is
- HowFastSold - indicates how wast the house was sold at the first time. Houses that were sold faster should be better.
- Has2ndFloor - indicates if the house has more than 1 floor
- Season - indicates the season when the house was built. The price range vary per season

In [7]:
all_data["Remodelled"] = (all_data["YearBuilt"] != all_data["YearRemodAdd"]).astype(int)
all_data["YearSinceRemodelled"] = (2017 - all_data["YearRemodAdd"])
all_data['Age'] = 2017 - all_data["YearBuilt"]
all_data['HowFastSold'] = all_data["YrSold"] - all_data["YearBuilt"]
all_data["Has2ndFloor"] = (all_data["2ndFlrSF"] != 0).astype(int)

#all_data["SaneSold"] = (all_data["YearBuilt"] == all_data["YrSold"]).astype(int)
for i in range(0,all_data.shape[0]):
    if ((all_data.iloc[i]['MoSold'] == 1) | (all_data.iloc[i]['MoSold'] == 12) |  (all_data.iloc[i]['MoSold'] == 2)):
           all_data.loc[i,"Season"] = "winter"
    if ((all_data.iloc[i]['MoSold'] == 3) | (all_data.iloc[i]['MoSold'] == 4) |  (all_data.iloc[i]['MoSold'] == 5)):
           all_data.loc[i,"Season"] = "spring"
    if ((all_data.iloc[i]['MoSold'] == 6) | (all_data.iloc[i]['MoSold'] == 7) |  (all_data.iloc[i]['MoSold'] == 8)):
           all_data.loc[i,"Season"] = "summer"
    if ((all_data.iloc[i]['MoSold'] == 9) | (all_data.iloc[i]['MoSold'] == 10) |  (all_data.iloc[i]['MoSold'] == 11)):
           all_data.loc[i,"Season"] = "autumn"

MSSubClass is represented as integers, although logically this is the categorical feature. We will transform it then. In CentralAir, we substitute Yes with 1, No with 0.

In [8]:
all_data['MSSubClass'] = all_data['MSSubClass'].astype('category')
#change Y with 1, N with 0
all_data['CentralAir'] = (all_data['CentralAir'] == 'Y').astype(int)
#all_data.YrSold = all_data.loc[:, "YrSold"].max() - all_data.YrSold

Change categorical values in appropriate features to numeric, as it is clear that Excellent condition is > than Good condition.

In [9]:
all_data = all_data.replace(
    {
        'ExterQual': {'Ex': 5,
                      'Gd': 4,
                      'TA': 3,
                      'Fa': 2,
                      'Po': 1
                     },
        'ExterCond': {'Ex': 5,
                      'Gd': 4,
                      'TA': 3,
                      'Fa': 2,
                      'Po': 1
                     }
    }
)
all_data = all_data.replace(
    {
        'BsmtQual': {
            'Ex': 5,
            'Gd': 4,
            'TA': 3,
            'Fa': 2,
            'Po': 1,
            '0': 0
        },
        'BsmtExposure': {
            'Gd': 3,
            'Av': 2,
            'Mn': 1,
            'No': 0,
            '0': 0
        },
        'BsmtCond': {
            'Ex': 5,
            'Gd': 4,
            'TA': 3,
            'Fa': 2,
            'Po': 1,
            '0': 0
        },
    }
)
all_data = all_data.replace(
    {
        'HeatingQC': {
            'Ex': 5,
            'Gd': 4,
            'TA': 3,
            'Fa': 2,
            'Po': 1            
        },
        'KitchenQual': {
            'Ex': 5,
            'Gd': 4,
            'TA': 3,
            'Fa': 2,
            'Po': 1
        },
        'Functional': {
            'Typ': 0,
            'Min1': 1,
            'Min2': 1,
            'Mod': 2,
            'Maj1': 3,
            'Maj2': 4,
            'Sev': 5,
            'Sal': 6
            },
        'FireplaceQu': {
            'Ex': 5,
            'Gd': 4,
            'TA': 3,
            'Fa': 2,
            'Po': 1,
            '0': 0
        },
        'GarageQual': {
            'Ex': 5,
            'Gd': 4,
            'TA': 3,
            'Fa': 2,
            'Po': 1,
            '0': 0
        },
        'GarageCond': {
            'Ex': 5,
            'Gd': 4,
            'TA': 3,
            'Fa': 2,
            'Po': 1,
            '0': 0
        },
        'PavedDrive': {
            'Y': 1,
            'P': 0.5,
            'N': 0
        },
        'PoolQC': {
            'Ex': 5,
            'Gd': 4,
            'TA': 3,
            'Fa': 2,
            'Po': 1,
            '0': 0,
        },
        'Fence': {
            'GdPrv': 2,
            'MnPrv': 1,
            'GdWo': 2,
            'MnWw': 1,
            '0': 0
        }
    }
)

Introduce new features.
- GoodNeighborhood for neighborhoods with most expensive houses on average 
- features that show that some facilities have good quality or poor quality
- linear modification of features with the highest correlation with SalePrice
- total area, calculated differently from GrLivArea
- indicators of presence of fireplace and wooddeck
- indicator of 'good sub class'

In [10]:
all_data['GoodNeighborhood'] = ((all_data['Neighborhood'] == 'StoneBr') | (all_data['Neighborhood'] == 'NridgHt') | (all_data['Neighborhood'] == 'NoRidge')).astype(str)
#make use of highly-correlated features
all_data['GoodQu'] = (all_data['OverallQual'] > 6).astype(int)
all_data['GoodCon'] = (all_data['OverallCond'] > 6).astype(int)
#PLOHO
all_data['GoodExQu'] = ((all_data['ExterQual'] == 5) | (all_data['ExterQual'] == 4)).astype(int)

#all_data['GoodExCond'] = ((all_data['ExterCond'] == 'Ex') | (all_data['ExterCond'] == 'Gd')).astype(int)
#all_data['ExBsmtCond'] = (all_data['BsmtCond'] == 'Ex').astype(int)

#OCHEN PLOHO
all_data['GoodBsmtQual'] = (all_data['BsmtQual'] == 5).astype(int)

#PLOHO
all_data['GoodHeat'] = (all_data['HeatingQC'] == 5).astype(int)

#all_data['GoodKitchen'] = ((all_data['KitchenQual'] == 'Ex') | (all_data['KitchenQual'] == 'Gd')).astype(int)
#PLOHO
all_data['FinGarage'] = (all_data["GarageFinish"] == 'Fin').astype(int)
all_data['New'] = (all_data["SaleType"] == 'New').astype(int)


all_data['PoorQ'] = (all_data["ExterQual"] == 1).astype(int)
all_data['PoorKi'] = (all_data["KitchenQual"] == 1).astype(int)

all_data["OverallQual-s2"] = np.sqrt(all_data["OverallQual"])
all_data["GrLivArea-s2"] = np.sqrt(all_data["GrLivArea"])
all_data["GarageCars-s2"] = np.sqrt(all_data["GarageCars"])
all_data["TotalBsmtSF-s2"] = all_data["TotalBsmtSF"] ** 2
all_data["FullBath-s2"] = all_data["FullBath"] ** 2
all_data["YearBuilt-s2"] = all_data["YearBuilt"] ** 2
all_data["YearRemodAdd-s2"] = all_data["YearRemodAdd"] ** 2

all_data["TotArea"] = all_data["GrLivArea"] + all_data["TotalBsmtSF"]
all_data["HasFirePlace"] =all_data["Fireplaces"]>0 
all_data["HasWoodDeck "] =all_data["WoodDeckSF"]>0
all_data["GoodSubClass"] = ((all_data["MSSubClass"] == 120) | (all_data["MSSubClass"] == 60) | (all_data["MSSubClass"] == 20)).astype(str)


Transform categorical features into dummies

In [11]:
all_data = pd.get_dummies(all_data)

In [12]:
all_data = all_data.fillna(all_data.mean())

In [13]:
rf = RandomForestRegressor()
rf.fit(all_data[:data_train.shape[0]], data_train['SalePrice'])
names = list(all_data[:data_train.shape[0]])
print "Features sorted by their score:"
print sorted(zip(map(lambda x: round(x, 20), rf.feature_importances_), names), 
             reverse=True)

Features sorted by their score:
[(0.29121488679358287, 'OverallQual'), (0.2718334838295551, 'TotArea'), (0.1941348303743891, 'OverallQual-s2'), (0.03965401587008142, '2ndFlrSF'), (0.018849545396280937, 'KitchenQual'), (0.014615507481852526, 'LotArea'), (0.013125796484790803, 'HowFastSold'), (0.012953131287493591, 'MasVnrArea'), (0.008845675330794934, 'BsmtUnfSF'), (0.007756166638203547, 'BsmtFinSF1'), (0.006645265059550332, 'GrLivArea-s2'), (0.005766963403953539, 'GrLivArea'), (0.005553358853751831, 'TotalBsmtSF-s2'), (0.005295069768921226, 'GarageYrBlt'), (0.005263234060356277, 'YearSinceRemodelled'), (0.004895714353773929, 'OpenPorchSF'), (0.004647319712244798, 'LotFrontage'), (0.004421433994466137, 'OverallCond'), (0.004092026219860453, 'GoodQu'), (0.003989397105370784, 'GarageCars-s2'), (0.003963921891659509, 'YearBuilt'), (0.003532788748772968, 'FireplaceQu'), (0.0027490017147206833, 'GoodBsmtQual'), (0.0027215076988404085, 'WoodDeckSF'), (0.002649801343220982, 'BsmtExposure'), (0

Drop features that cause overfitting or reduce accuracy, according to analysis

In [14]:
#drop irrelevant features
all_data = all_data.drop('MSZoning_C (all)', axis=1)
all_data = all_data.drop('MSSubClass_160', axis=1)
#all_data = all_data.drop('Electrical_Mix', 1)
#all_data = all_data.drop('GarageQual_Ex', 1)
#all_data = all_data.drop('HouseStyle_2.5Fin', 1)
all_data = all_data.drop('MSSubClass_150', axis = 1)
#all_data = all_data.drop('Functional_Sev', axis = 1)
#all_data = all_data.drop('HeatingQC_Po', 1)


Features Condition1_.. and Condition2_.. represent the presence of some facility. We analyze them and transform to the features, identifyingg the presence of this facility. For example, we can imagine a house with values 'Artery' and 'RRNn'. It can be represented either as {Condition1_Artery == 1, Condition1_RRNn == 0, Condition2_RRNn == 1, Condition2_Artery == 0} or {Condition1_Artery == 0, Condition1_RRNn == 1, Condition2_RRNn == 0, Condition2_Artery == 1}. Such features add noise, so we will create features 'Artery' and 'Feedr' respectively.
Same goes for features Exterior1st_.., Exterior2nd_.. and BsmtFinType1_.., BsmtFinType2_...

In [15]:
#After getting dummies of Condition1 and Condition2 we might add noize to the data. Let's group them:
all_data["Artery"] = (all_data["Condition1_Artery"] | all_data["Condition2_Artery"]).astype(int)
all_data["Feedr"] = (all_data["Condition1_Feedr"] | all_data["Condition2_Feedr"]).astype(int)
all_data["Norm"] = (all_data["Condition1_Norm"] | all_data["Condition2_Norm"]).astype(int)
all_data["RRNn"] = (all_data["Condition1_RRNn"] | all_data["Condition2_RRNn"]).astype(int)
all_data["RRAn"] = (all_data["Condition1_RRAn"] | all_data["Condition2_RRAn"]).astype(int)
all_data["PosN"] = (all_data["Condition1_PosN"] | all_data["Condition2_PosN"]).astype(int)
all_data["PosA"] = (all_data["Condition1_PosA"] | all_data["Condition2_PosA"]).astype(int)
all_data["RRNe"] = (all_data["Condition1_RRNe"]).astype(int)
all_data["RRAe"] = (all_data["Condition1_RRAe"] | all_data["Condition2_RRAe"]).astype(int)
all_data = all_data.drop("Condition1_Artery", axis = 1)
all_data = all_data.drop("Condition1_Feedr", axis = 1)
all_data = all_data.drop("Condition1_Norm", axis = 1)
all_data = all_data.drop("Condition1_RRNn", axis = 1)
all_data = all_data.drop("Condition1_RRAn", axis = 1)
all_data = all_data.drop("Condition1_PosN", axis = 1)
all_data = all_data.drop("Condition1_PosA", axis = 1)
all_data = all_data.drop("Condition1_RRNe", axis = 1)
all_data = all_data.drop("Condition1_RRAe", axis = 1)
all_data = all_data.drop("Condition2_Artery", axis = 1)
all_data = all_data.drop("Condition2_Feedr", axis = 1)
all_data = all_data.drop("Condition2_Norm", axis = 1)
all_data = all_data.drop("Condition2_RRNn", axis = 1)
all_data = all_data.drop("Condition2_RRAn", axis = 1)
all_data = all_data.drop("Condition2_PosN", axis = 1)
all_data = all_data.drop("Condition2_PosA", axis = 1)
#all_data = all_data.drop("Condition2_RRNe")
all_data = all_data.drop("Condition2_RRAe", axis = 1)

In [16]:
#same for Exteriors 1 and 2
all_data["AsbShng"] = (all_data["Exterior1st_AsbShng"] | all_data["Exterior2nd_AsbShng"]).astype(int)
all_data["AsphShn"] = (all_data["Exterior1st_AsphShn"] | all_data["Exterior2nd_AsphShn"]).astype(int)
all_data["BrkComm"] = (all_data["Exterior1st_BrkComm"] | all_data["Exterior2nd_BrkComm"]).astype(int)
all_data["BrkFace"] = (all_data["Exterior1st_BrkFace"] | all_data["Exterior2nd_BrkFace"]).astype(int)
all_data["CBlock"] = (all_data["Exterior1st_CBlock"] | all_data["Exterior2nd_CBlock"]).astype(int)
all_data["CemntBd"] = (all_data["Exterior1st_CemntBd"] | all_data["Exterior2nd_CemntBd"]).astype(int)
all_data["HdBoard"] = (all_data["Exterior1st_HdBoard"] | all_data["Exterior2nd_HdBoard"]).astype(int)
all_data["ImStucc"] = (all_data["Exterior1st_ImStucc"] | all_data["Exterior2nd_ImStucc"]).astype(int)
all_data["MetalSd"] = (all_data["Exterior1st_MetalSd"] | all_data["Exterior2nd_MetalSd"]).astype(int)
all_data["Other"] = (all_data["Exterior1st_Other"] | all_data["Exterior2nd_Other"]).astype(int)
all_data["Plywood"] = (all_data["Exterior1st_Plywood"] | all_data["Exterior2nd_Plywood"]).astype(int)
all_data["Stone"] = (all_data["Exterior1st_Stone"] | all_data["Exterior2nd_Stone"]).astype(int)
all_data["Stucco"] = (all_data["Exterior1st_Stucco"] | all_data["Exterior2nd_Stucco"]).astype(int)
all_data["VinylSd"] = (all_data["Exterior1st_VinylSd"] | all_data["Exterior2nd_VinylSd"]).astype(int)
all_data["Wd Sdng"] = (all_data["Exterior1st_Wd Sdng"] | all_data["Exterior2nd_Wd Sdng"]).astype(int)
all_data["WdShing"] = (all_data["Exterior1st_WdShing"] | all_data["Exterior2nd_WdShing"]).astype(int)
all_data = all_data.drop("Exterior1st_AsbShng", axis = 1)
all_data = all_data.drop("Exterior1st_AsphShn", axis = 1)
all_data = all_data.drop("Exterior1st_BrkComm", axis = 1)
all_data = all_data.drop("Exterior1st_BrkFace", axis = 1)
all_data = all_data.drop("Exterior1st_CBlock", axis = 1)
all_data = all_data.drop("Exterior1st_CemntBd", axis = 1)
all_data = all_data.drop("Exterior1st_HdBoard", axis = 1)
all_data = all_data.drop("Exterior1st_ImStucc", axis = 1)
all_data = all_data.drop("Exterior1st_MetalSd", axis = 1)
all_data = all_data.drop("Exterior1st_Other", axis = 1)
all_data = all_data.drop("Exterior1st_Plywood", axis = 1)
all_data = all_data.drop("Exterior1st_Stone", axis = 1)
all_data = all_data.drop("Exterior1st_Stucco", axis = 1)
all_data = all_data.drop("Exterior1st_VinylSd", axis = 1)
all_data = all_data.drop("Exterior1st_Wd Sdng", axis = 1)
all_data = all_data.drop("Exterior1st_WdShing", axis = 1)
all_data = all_data.drop("Exterior2nd_AsbShng", axis = 1)
all_data = all_data.drop("Exterior2nd_AsphShn", axis = 1)
all_data = all_data.drop("Exterior2nd_BrkComm", axis = 1)
all_data = all_data.drop("Exterior2nd_BrkFace", axis = 1)
all_data = all_data.drop("Exterior2nd_CBlock", axis = 1)
all_data = all_data.drop("Exterior2nd_CemntBd", axis = 1)
all_data = all_data.drop("Exterior2nd_HdBoard", axis = 1)
all_data = all_data.drop("Exterior2nd_ImStucc", axis = 1)
all_data = all_data.drop("Exterior2nd_MetalSd", axis = 1)
all_data = all_data.drop("Exterior2nd_Other", axis = 1)
all_data = all_data.drop("Exterior2nd_Plywood", axis = 1)
all_data = all_data.drop("Exterior2nd_Stone", axis = 1)
all_data = all_data.drop("Exterior2nd_Stucco", axis = 1)
all_data = all_data.drop("Exterior2nd_VinylSd", axis = 1)
all_data = all_data.drop("Exterior2nd_Wd Sdng", axis = 1)
all_data = all_data.drop("Exterior2nd_WdShing", axis = 1)

In [17]:
#same for BsmtFinType1 and BsmtFinType2
all_data["GLQ"] = (all_data["BsmtFinType1_GLQ"] | all_data["BsmtFinType2_GLQ"]).astype(int)
all_data["ALQ"] = (all_data["BsmtFinType1_ALQ"] | all_data["BsmtFinType2_ALQ"]).astype(int)
all_data["BLQ"] = (all_data["BsmtFinType1_BLQ"] | all_data["BsmtFinType2_BLQ"]).astype(int)
all_data["Rec"] = (all_data["BsmtFinType1_Rec"] | all_data["BsmtFinType2_Rec"]).astype(int)
all_data["LwQ"] = (all_data["BsmtFinType1_LwQ"] | all_data["BsmtFinType2_LwQ"]).astype(int)
all_data["Unf"] = (all_data["BsmtFinType1_Unf"] | all_data["BsmtFinType2_Unf"]).astype(int)
all_data = all_data.drop("BsmtFinType1_GLQ", axis = 1)
all_data = all_data.drop("BsmtFinType1_ALQ", axis = 1)
all_data = all_data.drop("BsmtFinType1_BLQ", axis = 1)
all_data = all_data.drop("BsmtFinType1_Rec", axis = 1)
all_data = all_data.drop("BsmtFinType1_LwQ", axis = 1)
all_data = all_data.drop("BsmtFinType1_Unf", axis = 1)
all_data = all_data.drop("BsmtFinType2_GLQ", axis = 1)
all_data = all_data.drop("BsmtFinType2_ALQ", axis = 1)
all_data = all_data.drop("BsmtFinType2_BLQ", axis = 1)
all_data = all_data.drop("BsmtFinType2_Rec", axis = 1)
all_data = all_data.drop("BsmtFinType2_LwQ", axis = 1)
all_data = all_data.drop("BsmtFinType2_Unf", axis = 1)

Drop another irrelevant feature

In [18]:
#all_data = all_data.drop('ImStucc', 1)
#all_data = all_data.drop('Stone', 1)
all_data = all_data.drop('Other', axis = 1)
#all_data = all_data.drop('CBlock', 1)

For 0s in Garage and Basement, introduce new features - NoGarage and NoBasement

In [19]:
all_data['NoGarage'] = 0
all_data['NoBasement'] = 0
for i in range(0,all_data.shape[0]):
    if ((all_data.iloc[i]['GarageType_0'] == 1) and (all_data.iloc[i]['GarageFinish_0'] == 1) 
        and (all_data.iloc[i]['GarageCond'] == 0) and (all_data.iloc[i]['GarageQual'] == 0)):
        all_data.loc[i,'NoGarage'] = 1
    if ((all_data.iloc[i]['BsmtQual'] == 0) and (all_data.iloc[i]['BsmtCond'] == 0) and (all_data.iloc[i]['BsmtExposure'] == 0)):
        all_data.loc[i,'NoBasement'] = 1
    

Find outliers:
(I actually removed them straight in the data file beforehand )

In [20]:
#find outliers and remember them (we can hardcode as we can only run this once)
train = all_data[:data_train.shape[0]]
y=data_train["SalePrice"]
import statsmodels.api as sm
model = sm.OLS(y,train)
results = model.fit()
print 'hi'
bonf_test = results.outlier_test()['bonf(p)']
bonf_outlier = list(bonf_test[bonf_test<1e-3].index) 
print(bonf_test[bonf_test<1e-3])

Unskew the most skewed columns:

In [20]:
cols_skew = [col for col in all_data if '_2num' in col or '_' not in col]
all_data[cols_skew].skew()
cols_unskew = all_data[cols_skew].columns[abs(all_data[cols_skew].skew()) > 1]
for col in cols_unskew:
    all_data[col] = np.log1p(all_data[col])

Separate data back to train, test and target:

In [21]:
train = all_data[:data_train.shape[0]]
test = all_data[data_train.shape[0]:]
y=data_train["SalePrice"]

Separate data for cross-validation:

In [22]:
X_train, x_test, y_train, y_test = model_selection.train_test_split(train, y, test_size=0.33, random_state=42)


# MODELS

Try base models:

In [23]:
model = Lasso(max_iter = 1000)
print "Computing..."
print model.fit(X_train, y_train)
#print model.fit(X_train, y_train, sample_weight=None)
predict_cv = model.predict(x_test)
acc=rmsle(y_test.values, predict_cv)
print acc

Computing...
Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
0.190865190745




In [24]:
from sklearn.linear_model import ElasticNet
model = ElasticNet(max_iter = 1000000)
print "Computing..."
print model.fit(X_train, y_train)
#print model.fit(X_train, y_train, sample_weight=None)
predict_cv = model.predict(x_test)
acc=rmsle(y_test.values, predict_cv)
print acc

Computing...
ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
0.171554443853


In [25]:
rf_test = RandomForestRegressor(max_depth=30, n_estimators=500, max_features = 200, oob_score=True, random_state=1234)
rf_test.fit(X_train, y_train)
predict_cv = rf_test.predict(x_test)
acc=rmsle(y_test.values, predict_cv)
print acc

0.145140301191


In [26]:
from sklearn.ensemble import GradientBoostingRegressor
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
GBoost.fit(X_train, y_train)
predict_cv = GBoost.predict(x_test)
acc=rmsle(y_test.values, predict_cv)
print acc

0.128163280694


In [27]:
cls = catboost.CatBoostRegressor(learning_rate=0.1, iterations=1000, random_seed=0)
print "Computing..."
print cls.fit(X_train, y_train)
#print model.fit(X_train, y_train, sample_weight=None)
predict_cv = cls.predict(x_test)
acc=rmsle(y_test.values, predict_cv)
print acc

Computing...
<catboost.core.CatBoostRegressor object at 0x000000001ED3F4E0>
0.126596081442


In [28]:
#cross-validation
kf = KFold(n_splits=5)
cls = catboost.CatBoostRegressor(learning_rate=0.1, iterations=1000, random_seed=0)
accuracy = 0
i = 1
t = train.values
for tr_index, te_index in kf.split(t):
    X_tr, X_te = t[tr_index], t[te_index]
    y_tr, y_te = y[tr_index], y[te_index]
    print 'Running fold [%s] ...' % (i)
    cls.fit(X_tr, y_tr)
    acc = rmsle(y_te.values, cls.predict(X_te))
    accuracy += acc
    i+=1
    
accuracy = accuracy/3
print accuracy

Running fold [1] ...
Running fold [2] ...
Running fold [3] ...
Running fold [4] ...
Running fold [5] ...
0.203253334096


## Stacking?

In [29]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models, meta_model, n_folds=5):
        self.models = models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        self.models_ = [list() for x in self.models]
        self.meta_model_ = clone(self.meta_model)

        kfold = KFold(n_splits=self.n_folds, shuffle=True)
        
        # Train base models then create out-of-fold predictions for training meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.models)))
        for i, clf in enumerate(self.models):
            for train_idx, holdout_idx in kfold.split(X, y):
                instance = clone(clf)
                self.models_[i].append(instance)

                instance.fit(X[train_idx], y[train_idx])
                y_pred = instance.predict(X[holdout_idx])
                out_of_fold_predictions[holdout_idx, i] = y_pred
                
        # Now train meta-model using the out-of-fold predictions
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in models]).mean(axis=1)
            for models in self.models_ ])
        return self.meta_model_.predict(meta_features)

In [31]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
cls = catboost.CatBoostRegressor(learning_rate=0.1, iterations=1000, random_seed=0)



stacked_averaged_models = StackingAveragedModels((ENet, GBoost, cls),lasso)

stacked_averaged_models.fit(X_train.values, y_train.values)
stacked_train_pred = stacked_averaged_models.predict(x_test)
stacked_pred = (stacked_averaged_models.predict(test.values))
print(rmsle(y_test, stacked_train_pred))

0.123276060914


In [19]:
cls = catboost.CatBoostRegressor(learning_rate=0.1, iterations=1000, random_seed=0)
cls.fit(train,y)
k = cls.predict(test)
k = pd.DataFrame(k, columns=['SalePrice']).to_csv('submit.csv')

In [19]:
m1 = pd.read_csv('C:\\Users\\Kateryna\\Desktop\\kaggle\\Housing\\results\\11756.csv')
m2 = pd.read_csv('C:\\Users\\Kateryna\\Desktop\\kaggle\\Housing\\results\\11510.csv')
m3 = pd.read_csv('C:\\Users\\Kateryna\\Desktop\\kaggle\\Housing\\results\\11590.csv')
m4 = pd.read_csv('C:\\Users\\Kateryna\\Desktop\\kaggle\\Housing\\11633.csv')
m5 = pd.read_csv('C:\\Users\\Kateryna\\Desktop\\kaggle\\Housing\\results\\11504.csv')
m6 = pd.read_csv('C:\\Users\\Kateryna\\Desktop\\kaggle\\Housing\\results\\11508.csv')
m7 = pd.read_csv('C:\\Users\\Kateryna\\Desktop\\kaggle\\Housing\\results\\111512.csv')

g = (m1['SalePrice'] + m2['SalePrice'] + m3['SalePrice'] + m4['SalePrice'] + m5['SalePrice'] + m6['SalePrice']
    +   m7['12016'])/7
pd.DataFrame(g, columns=['SalePrice']).to_csv('submit.csv')

In [19]:
fff = pd.DataFrame(all_data).to_csv('all_data.csv')