In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor , GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor

In [2]:
df_train = pd.read_csv('./all/train.csv')
df_test = pd.read_csv('./all/test.csv')
df = pd.concat((df_train.loc[:, 'Id' : 'SaleCondition'], df_test.loc[:, 'Id' : 'SaleCondition']))
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal


In [3]:
train_min = min(df_train.Id)
train_max = max(df_train.Id)
test_min = min(df_test.Id)
test_max = max(df_test.Id)

In [4]:
df.tail()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal
1458,2919,60,RL,74.0,9627,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,11,2006,WD,Normal


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 0 to 1458
Data columns (total 80 columns):
Id               2919 non-null int64
MSSubClass       2919 non-null int64
MSZoning         2915 non-null object
LotFrontage      2433 non-null float64
LotArea          2919 non-null int64
Street           2919 non-null object
Alley            198 non-null object
LotShape         2919 non-null object
LandContour      2919 non-null object
Utilities        2917 non-null object
LotConfig        2919 non-null object
LandSlope        2919 non-null object
Neighborhood     2919 non-null object
Condition1       2919 non-null object
Condition2       2919 non-null object
BldgType         2919 non-null object
HouseStyle       2919 non-null object
OverallQual      2919 non-null int64
OverallCond      2919 non-null int64
YearBuilt        2919 non-null int64
YearRemodAdd     2919 non-null int64
RoofStyle        2919 non-null object
RoofMatl         2919 non-null object
Exterior1st      2918 non-

In [6]:
def apply_preprocessing_to_data(df, drop_threshold, drop_first, dummy):
    
    
    cat_variables = []
    num_variables = []
    columns_to_be_dropped=[]
    
    empty_columns = df.columns[df.isna().any()].tolist()
    for i in empty_columns:
        add = df[i].isnull().sum()
        if((add / len(df) *100) > drop_threshold):
            columns_to_be_dropped.append(i)

    first_column = df.columns[0]
    
    if(drop_first):
        df =df.drop(axis =1 , columns= df.columns[0])

    df =  df.drop(axis = 1 , columns = columns_to_be_dropped)
    
    
    
    
    for column in df.columns:
        if(df[column].dtype == 'O'):
            cat_variables.append(column)
        else:
            num_variables.append(column)
    
    for i in cat_variables:
        if i in empty_columns:
            top_variable = df[i].describe().top
            df[i] = df[i].fillna(top_variable)

    for i in num_variables:
        if i in empty_columns:
            mean_ = df[i].mean()
            df[i] = df[i].fillna(mean_)

    if(dummy):
        df = pd.get_dummies(df, columns = cat_variables)
        
    return df


In [7]:
df = apply_preprocessing_to_data(df, 50 , False, True)
df.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706.0,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978.0,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486.0,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216.0,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655.0,...,0,0,0,1,0,0,0,0,1,0


In [8]:
def Get_X_y(df, target_variable, train_max, train_min, test_min, test_max):
    X_train  = df[((df.Id >= train_min) & (df.Id <= train_max))]
    y_train =  df_train[target_variable]
    
    X_test = df[((df.Id >= test_min) & (df.Id <= test_max))]
    
    X_train  =X_train.drop(columns = ['Id'])
    X_test  =X_test.drop(columns = ['Id'])
    
    X = np.array(X_train)
    y = np.array(y_train)
    
    return X,y, X_test

In [9]:
X ,y, X_test = Get_X_y(df, 'SalePrice', train_max, train_min, test_min, test_max)
X.shape, y.shape, X_test.shape

((1460, 275), (1460,), (1459, 275))

In [10]:

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.1, shuffle=True)
X_train.shape

(1314, 275)

In [11]:
def apply_advanced_preprocessing( pca, pca_variance , X_train, X_valid, X_test, scale=True):
    if(scale):
        min_max = MinMaxScaler()
        min_max.fit(X_train)
        X_train = min_max.transform(X_train)
        X_valid = min_max.transform(X_valid)
        X_test = min_max.transform(X_test)
    
    if(pca):
        pca = PCA(pca_variance).fit(X_train)
        X_train =  pca.transform(X_train)
        X_valid = pca.transform(X_valid)
        X_test = pca.transform(X_test)
        
    return X_train, X_valid, X_test

In [12]:
X_train, X_valid, X_test = apply_advanced_preprocessing(True, 0.95, X_train, X_valid, X_test, True)


In [13]:
X_train.shape, X_test.shape

((1314, 99), (1459, 99))

In [14]:
??mean_squared_log_error

In [15]:
y_true = [3, 5, 2.5, 7]
y_pred = [2.5, 5, 4, 8]

y_log_true = np.log(y_true)
y_pred_true = np.log(y_pred)

y_diff = y_log_true - y_pred_true

np.sum(y_diff ** 2) / 4

0.0679937985980446

In [16]:
def err(y_test, y_pred):
    return mean_squared_log_error(y_test, y_pred)

In [17]:
def GridSearch(classifier):
    Cs = [0.001, 0.01, 0.1, 1, 10,1000]
    gammas = [0.001, 0.01, 0.1, 1, 10]
    param_grid = {'C': Cs, 'gamma' : gammas}

    grid_search = GridSearchCV(classifier, param_grid)
    grid_search.fit(X_train, y_train)
    
    return grid_search.best_params_


In [18]:
GridSearch(SVR())

{'C': 1000, 'gamma': 0.01}

In [19]:
def GridSearch_New(classifier):
    Cs = [0.001, 0.01, 0.1, 1, 10,1000]
    gammas = [0.001, 0.01, 0.1, 1, 10]
    param_grid = {'C': Cs, 'gamma' : gammas}

    grid_search = GridSearchCV(classifier, param_grid)
    grid_search.fit(X_train, y_train)
    
    return grid_search.best_params_

In [23]:
ada = AdaBoostRegressor().fit(X_train, y_train)
gra = GradientBoostingRegressor().fit(X_train, y_train)
rf = RandomForestRegressor().fit(X_train, y_train)
xgb = XGBRegressor().fit(X_train,y_train)

In [24]:
svr = SVR(C=1000, gamma=0.01)

In [25]:
svr.fit(X_train, y_train)

SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.01,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [45]:
def predict_classifier(classifier_list,X,y_valid=None):
    pred = []
    for classifier in classifier_list:
        y_pred = classifier.predict(X)
        if(y_valid is not None):
            print(err(y_valid, y_pred))
        pred.append(y_pred)
    return np.mean(pred, axis = 0)

In [47]:
final_pred = predict_classifier ([svr , ada, gra, rf, xgb], X_valid,y_valid)

0.09750143836138983
0.0769389184300332
0.05199580415792052
0.048055740809842366
0.049857040343207225


In [49]:
print(err(y_valid, final_pred))

0.05311327332293477


In [50]:
y_final_pred = predict_classifier([svr , ada, gra, rf, xgb] , X_test)

In [51]:
ss = pd.read_csv('./all/sample_submission.csv')
ss.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [52]:
ss.loc[:, 'SalePrice']  = y_final_pred

In [53]:
y_final_pred

array([125131.50246271, 145375.1207053 , 195136.2736285 , ...,
       147061.31980079, 146524.91185276, 206331.49736134])

In [54]:
ss.to_csv('version2.csv',index=False)