In [2]:
import numpy as np 
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, MinMaxScaler , OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline 
from sklearn.preprocessing import PowerTransformer
from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import IterativeImputer

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
warnings.filterwarnings('ignore')
%matplotlib inline

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [5]:
train = pd.read_csv('train.csv').drop('Id',axis = 1 )
test = pd.read_csv('test.csv').drop('Id',axis = 1 )
test_Y = pd.read_csv('sample_submission.csv').drop("Id", axis = 1)

In [6]:
train["SalePrice"] = np.log1p(train["SalePrice"])
y_test = np.log1p(test_Y)

In [7]:
train.drop(train[(train['OverallQual']<5) & (train['SalePrice']>200000)].index, inplace=True)
train.drop(train[(train['GrLivArea']>4500) & (train['SalePrice']<300000)].index, inplace=True)
train.reset_index(drop=True, inplace=True)

In [8]:
train_labels = train['SalePrice'].reset_index(drop=True)
train_features = train.drop(['SalePrice'], axis=1)
test_features = test

# Объединяем тестовую и тренировочную выборку
all_features = pd.concat([train_features, test_features]).reset_index(drop=True)
all_features.shape

(2917, 79)

In [9]:
def percent_missing(df):
    data = pd.DataFrame(df)
    df_cols = list(pd.DataFrame(data))
    dict_x = {}
    for i in range(0, len(df_cols)):
        dict_x.update({df_cols[i]: round(data[df_cols[i]].isnull().mean()*100,2)})
    
    return dict_x

missing = percent_missing(all_features)
df_miss = sorted(missing.items(), key=lambda x: x[1], reverse=True)
print('Percent of missing data')
df_miss[0:10]

Percent of missing data


[('PoolQC', 99.69),
 ('MiscFeature', 96.4),
 ('Alley', 93.21),
 ('Fence', 80.43),
 ('FireplaceQu', 48.68),
 ('LotFrontage', 16.66),
 ('GarageYrBlt', 5.45),
 ('GarageFinish', 5.45),
 ('GarageQual', 5.45),
 ('GarageCond', 5.45)]

In [10]:
all_features['MSSubClass'] = all_features['MSSubClass'].apply(str)
all_features['YrSold'] = all_features['YrSold'].astype(str)
all_features['MoSold'] = all_features['MoSold'].astype(str)
all_features['YearRemodAdd'] = all_features['YearRemodAdd'].astype(str)
all_features['YearBuilt'] = all_features['YearBuilt'].astype(str)

In [11]:
def handle_missing(features):
    # the data description states that NA refers to typical ('Typ') values
    features['Functional'] = features['Functional'].fillna('Typ')
    # Replace the missing values in each of the columns below with their mode
    features['Electrical'] = features['Electrical'].fillna("SBrkr")
    features['KitchenQual'] = features['KitchenQual'].fillna("TA")
    features['Exterior1st'] = features['Exterior1st'].fillna(features['Exterior1st'].mode()[0])
    features['Exterior2nd'] = features['Exterior2nd'].fillna(features['Exterior2nd'].mode()[0])
    features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])
    features['MSZoning'] = features.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))
    
    # the data description stats that NA refers to "No Pool"
    features["PoolQC"] = features["PoolQC"].fillna("None")
    # Replacing the missing values with 0, since no garage = no cars in garage
    for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
        features[col] = features[col].fillna(0)
    # Replacing the missing values with None
    for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
        features[col] = features[col].fillna('None')
    # NaN values for these categorical basement features, means there's no basement
    for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
        features[col] = features[col].fillna('None')
        
    # Group the by neighborhoods, and fill in missing value by the median LotFrontage of the neighborhood
    features['LotFrontage'] = features.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

    # We have no particular intuition around how to fill in the rest of the categorical features
    # So we replace their missing values with None
    objects = []
    for i in features.columns:
        if features[i].dtype == object:
            objects.append(i)
    features.update(features[objects].fillna('None'))
        
    # And we do the same thing for numerical features, but this time with 0s
    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric = []
    for i in features.columns:
        if features[i].dtype in numeric_dtypes:
            numeric.append(i)
    features.update(features[numeric].fillna(0))    
    return features

all_features = handle_missing(all_features)

In [12]:
# Fetch all numeric features
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric = []
for i in all_features.columns:
    if all_features[i].dtype in numeric_dtypes:
        numeric.append(i)

In [13]:
# Find skewed numerical features
skew_features = all_features[numeric].apply(lambda x: skew(x)).sort_values(ascending=False)

high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index

print("There are {} numerical features with Skew > 0.5 :".format(high_skew.shape[0]))
skewness = pd.DataFrame({'Skew' :high_skew})
skew_features.head(10)

There are 25 numerical features with Skew > 0.5 :


MiscVal          21.939672
PoolArea         17.688664
LotArea          13.109495
LowQualFinSF     12.084539
3SsnPorch        11.372080
KitchenAbvGr      4.300550
BsmtFinSF2        4.144503
EnclosedPorch     4.002344
ScreenPorch       3.945101
BsmtHalfBath      3.929996
dtype: float64

In [16]:
mmx = MinMaxScaler()

In [17]:
# Normalize skewed features
for i in skew_index:
    all_features[i] = boxcox1p(all_features[i], boxcox_normmax(all_features[i] + 1))

In [18]:
for col in all_features.columns[all_features.dtypes != 'O']:
    all_features[col] = mmx.fit_transform(all_features[col].values.reshape(-1,1))

In [20]:
all_features['MSSubClass'] = all_features['MSSubClass'].apply(float)
all_features['YrSold'] = all_features['YrSold'].astype(float)
all_features['MoSold'] = all_features['MoSold'].astype(float)
all_features['YearRemodAdd'] = all_features['YearRemodAdd'].astype(float)
all_features['YearBuilt'] = all_features['YearBuilt'].astype(float)

In [21]:
all_features = pd.get_dummies(all_features).reset_index(drop=True)
all_features.shape

(2917, 302)

In [22]:
all_features = all_features.loc[:,~all_features.columns.duplicated()]

In [None]:
all 

In [23]:
all_features.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60.0,0.240646,0.310809,0.666667,0.535996,2003.0,2003.0,0.389766,0.291912,0.0,...,0,0,0,1,0,0,0,0,1,0
1,20.0,0.305353,0.334117,0.555556,0.888115,1976.0,1976.0,0.0,0.367996,0.0,...,0,0,0,1,0,0,0,0,1,0
2,60.0,0.254037,0.363492,0.666667,0.535996,2001.0,2002.0,0.356529,0.223753,0.0,...,0,0,0,1,0,0,0,0,1,0
3,70.0,0.217757,0.333158,0.666667,0.535996,1915.0,1970.0,0.0,0.125261,0.0,...,0,0,0,1,1,0,0,0,0,0
4,60.0,0.321749,0.408246,0.777778,0.535996,2000.0,2000.0,0.508896,0.276748,0.0,...,0,0,0,1,0,0,0,0,1,0


In [24]:
all_features.columns[all_features.isna().sum()>0]

Index([], dtype='object')

In [25]:
X_train = all_features.iloc[:len(train_labels), :]
X_test = all_features.iloc[len(train_labels):, :]
X_train.shape, train_labels.shape, X_test.shape

((1458, 302), (1458,), (1459, 302))

In [26]:
LR = LinearRegression()
LR.fit(X_train,train_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [27]:

y_pred = LR.predict(X_test)

In [28]:
y_pred

array([11.69963799, 12.02072757, 12.15148566, ..., 12.10311818,
       11.6932335 , 12.29492833])

In [29]:
LR.score(X_test,y_test.values)

-35.21370173750692