In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Show every row when a DataFrame or Series is displayed (no truncation).
# Uses the public `pd.set_option` entry point rather than the incidental `pd.pandas` attribute.
pd.set_option('display.max_rows', None)

In [2]:
# Load the training split of the competition data.
train_csv_path = '../input/house-prices-advanced-regression-techniques/train.csv'
df = pd.read_csv(train_csv_path)

In [3]:
# Preview the first rows of the training frame.
df.head()

In [4]:
# Columns with at least one missing value, printed with the fraction of rows missing.
# The threshold is > 0: a column with exactly one missing entry is still incomplete.
features_na = [col for col in df.columns if df[col].isnull().sum() > 0]
for feature in features_na:
    print(feature, np.round(df[feature].isnull().mean(), 4))

In [5]:
# For each incomplete feature, compare the median SalePrice of rows where the
# value is missing (1) with rows where it is present (0).
for feature in features_na:
    flagged = df.assign(**{feature: df[feature].isnull().astype(int)})
    median_price = flagged.groupby(feature)['SalePrice'].median()
    median_price.plot.bar()
    plt.title(feature)
    plt.show()

## Numerical variables

In [6]:
# Every column whose dtype is not object is treated as numerical.
column_dtypes = df.dtypes
numerical = column_dtypes[column_dtypes != "O"].index.tolist()
print(len(numerical))
df[numerical].head()

In [7]:
# Numerical columns whose name contains 'Yr' or 'Year'.
year_variables = [col for col in numerical if any(tag in col for tag in ('Yr', 'Year'))]
year_variables

In [8]:
# List the distinct sale years present in the data.
print('YrSold',df['YrSold'].unique())

In [9]:
# Median sale price for each year of sale.
median_by_sale_year = df['SalePrice'].groupby(df['YrSold']).median()
median_by_sale_year.plot()
plt.title('YrSold')
plt.show()

In [10]:
# Scatter SalePrice against the number of years between each year feature and the sale year.
for year in year_variables:
    if year == 'YrSold':
        continue  # YrSold is the reference point, not a feature to compare
    years_before_sale = df['YrSold'] - df[year]
    plt.scatter(years_before_sale, df['SalePrice'])
    plt.xlabel(year)
    plt.ylabel('SalePrice')
    # The title must use this loop's variable (`year`), not a name left over
    # from a loop in an earlier cell.
    plt.title(year + ' SalePrice')
    plt.show()

### Discrete variables

In [11]:
# Numerical columns with fewer than 25 distinct values (NaN counted as a value),
# leaving out the year columns and the Id column.
not_discrete = set(year_variables) | {'Id'}
discrete_feature = [
    col for col in numerical
    if col not in not_discrete and df[col].nunique(dropna=False) < 25
]
print(len(discrete_feature))

In [12]:
# Bar chart of the median SalePrice for each value of every discrete feature.
for i in discrete_feature:
    median_price = df.groupby(i)['SalePrice'].median()
    median_price.plot.bar()
    plt.xlabel(i)
    plt.ylabel('SalePrice')
    plt.title('Saleprice vs ' + i)
    plt.show()

### Continuous features

In [14]:
# Numerical columns that are neither discrete, year-based, nor the Id column.
excluded = discrete_feature + year_variables + ['Id']
cont_features = [col for col in numerical if col not in excluded]
len(cont_features)

In [15]:
# Log-log scatter of each continuous feature against SalePrice.
for i in cont_features:
    if (df[i] == 0).any():
        continue  # log(0) is undefined, so features containing zeros are skipped
    # Work on a copy so the log transform never touches `df`.
    data = df.copy()
    data[i] = np.log(data[i])
    data['SalePrice'] = np.log(data['SalePrice'])
    plt.scatter(data[i], data['SalePrice'])
    plt.xlabel(i)
    plt.ylabel('SalePrice')
    plt.show()

## Outliers

In [16]:
# Box plot of the log of each continuous feature, to expose outliers.
for i in cont_features:
    if (df[i] == 0).any():
        continue  # features containing zeros are skipped (log(0) is undefined)
    logged = np.log(df[[i]])
    logged.boxplot(column=i)
    plt.title(i)
    plt.show()

## Categorical variables

In [17]:
# Object-typed columns are treated as categorical.
column_dtypes = df.dtypes
categorical = column_dtypes[column_dtypes == 'O'].index.tolist()
print(len(categorical))

In [18]:
# Report how many distinct values (NaN included) each categorical feature takes.
for i in categorical:
    distinct_count = df[i].nunique(dropna=False)
    print("Feature {} has {} unique values".format(i, distinct_count))

In [19]:
# Bar chart of the median SalePrice for each level of every categorical feature.
for i in categorical:
    median_price = df['SalePrice'].groupby(df[i]).median()
    median_price.plot.bar()
    plt.xlabel(i)
    plt.ylabel('SalePrice')
    plt.show()

In [20]:
# Heatmap of the null mask of the training frame (row tick labels and colour bar turned off).
sns.heatmap(df.isnull(),yticklabels = False,cbar = False)

In [3]:
# Number of missing values in each column.
df.isnull().sum()

In [4]:
# Replace missing LotFrontage values with the column mean.
lot_frontage_mean = df['LotFrontage'].mean()
df['LotFrontage'] = df['LotFrontage'].fillna(lot_frontage_mean)

In [5]:
# Frequency of each BsmtFinType1 level.
df['BsmtFinType1'].value_counts()

In [6]:
# Remove the Alley column from the training frame.
df = df.drop(columns='Alley')

In [None]:
# (rows, columns) of the training frame at this point.
df.shape

# Feature Engineering

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [26]:
# Hold out 10% of the rows.
# `drop` returns a new frame, so its result must be kept; otherwise the target
# column stays in the features passed to the split.
data = df.drop('SalePrice', axis=1)
# A fixed seed makes the split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data, df['SalePrice'], test_size=0.1, random_state=0
)

In [None]:
# Shapes of the four pieces produced by the split, in the order
# train features, train target, test features, test target.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

## Missing values

In [8]:
# Object-typed columns that have at least one missing value, with their missing share.
categorical = [
    col for col in df.columns
    if df[col].dtypes == 'O' and df[col].isnull().any()
]
for i in categorical:
    missing_share = df[i].isnull().sum() / df.shape[0]
    print(i, "Percentage of missing values", "{:.2%}".format(missing_share))

In [9]:
# Treat a missing category as its own level ('Missing').
for col in categorical:
    df[col] = df[col].fillna('Missing')

# Re-print the missing share per column to confirm nothing is left to fill.
remaining_share = df[categorical].isnull().sum() / df.shape[0]
for col in categorical:
    print(col, "Percentage of missing values", "{:.2%}".format(remaining_share[col]))

### Numerical

In [10]:
# Non-object columns that have at least one missing value, with their missing share.
# The threshold is > 0 so a column with a single missing entry is not overlooked
# (the categorical counterpart of this selection uses >= 1).
numerical = [
    col for col in df.columns
    if df[col].dtypes != 'O' and df[col].isnull().sum() > 0
]
for i in numerical:
    print(i, "Percentage of missing values", "{:.2%}".format(df[i].isnull().sum()/df.shape[0]))

In [11]:
# Median-impute each incomplete numerical column and keep a 0/1 indicator
# column (`<name>NAN`) marking which rows were imputed.
for i in numerical:
    m = df[i].median()
    df[i + 'NAN'] = np.where(df[i].isnull(), 1, 0)
    # Assign the filled column back. `df[i].fillna(..., inplace=True)` is chained
    # assignment: it acts on a temporary object and does not update `df` under
    # pandas Copy-on-Write.
    df[i] = df[i].fillna(m)
# Re-print the missing share per column to confirm the imputation took effect.
for i in numerical:
    print(i, "Percentage of missing values", "{:.2%}".format(df[i].isnull().sum()/df.shape[0]))

In [12]:
# Replace each year column with the number of years between it and the sale year.
year_variables = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']
years_before_sale = {col: df['YrSold'] - df[col] for col in year_variables}
df = df.assign(**years_before_sale)

In [56]:
# Preview the three year columns after they were replaced by (YrSold - year).
df[['YearBuilt','YearRemodAdd','GarageYrBlt']].head()

### Continuous Features

In [13]:
# Natural-log transform of the listed columns, including the target SalePrice.
num_features = ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']

logged_columns = {col: np.log(df[col]) for col in num_features}
df = df.assign(**logged_columns)

## Rare Categorical Variables

For each categorical variable, we group the levels that appear in no more than 1% of the observations under a single `Rare variable` label.

In [15]:
# Collect the object-typed (categorical) columns.
column_dtypes = df.dtypes
categorical_features = column_dtypes[column_dtypes == 'O'].index.tolist()
len(categorical_features)

In [16]:
# Collapse infrequent levels of every categorical feature into one label.
for feature in categorical_features:
    # Share of rows held by each level of this feature.
    temp = df.groupby(feature)['SalePrice'].count() / len(df)
    # Levels seen in more than 1% of the rows are kept as they are.
    temp_df = temp[temp > 0.01].index
    # Membership is tested against the frequent levels (`temp_df`), not the
    # whole frame, so only genuinely rare levels are relabelled.
    df[feature] = np.where(df[feature].isin(temp_df), df[feature], 'Rare variable')

In [17]:
# Encode each categorical level as an integer rank, ordered by how many rows
# carry that level (the least frequent level gets 0).
for feature in categorical_features:
    levels_by_count = df.groupby([feature])['SalePrice'].count().sort_values().index
    rank_of_level = dict(zip(levels_by_count, range(len(levels_by_count))))
    df[feature] = df[feature].map(rank_of_level)

In [None]:
# (rows, columns) of the training frame after feature engineering.
df.shape

# Feature Scaling

In [20]:
# Remove the two missing-value indicator columns from the training frame.
indicator_columns = ['GarageYrBltNAN', 'MasVnrAreaNAN']
df = df.drop(columns=indicator_columns)

In [21]:
# Columns to scale: everything except the identifier and the target.
not_scaled = ('Id', 'SalePrice')
scale_features = [col for col in df.columns if col not in not_scaled]
len(scale_features)

# Test dataset

In [27]:
# Load the test split of the competition data.
test_csv_path = '../input/house-prices-advanced-regression-techniques/test.csv'
test_df = pd.read_csv(test_csv_path)

In [None]:
# (rows, columns) of the test frame.
test_df.shape

In [None]:
# Heatmap of the null mask of the test frame.
sns.heatmap(test_df.isnull())

In [28]:
# Fill missing MSZoning values with the most frequent level in the test frame.
most_common_zone = test_df['MSZoning'].mode()[0]
test_df['MSZoning'] = test_df['MSZoning'].fillna(most_common_zone)

In [None]:
# Test-set columns with at least one missing value, with the percentage missing.
# The threshold is > 0 so columns with a single missing entry are reported too.
null_features = [col for col in test_df.columns if test_df[col].isnull().sum() > 0]
for features in null_features:
    print(features,100*np.round(test_df[features].isnull().mean(),4),'% of missing values')

## Feature Engineering

In [29]:
# Object-typed test columns with at least one missing value.
# The threshold is > 0 so a column with a single missing entry is also selected.
null_cat_features = [
    col for col in test_df.columns
    if test_df[col].isnull().sum() > 0 and test_df[col].dtypes == 'O'
]
for features in null_cat_features:
    print(features,100*np.round(test_df[features].isnull().mean(),4),'% of missing values')

In [30]:
def replace_nan(dataset,features):
    """Return a copy of `dataset` with missing values in `features` set to 'Missing'.

    `dataset` itself is left unmodified; `features` selects the columns to fill.
    """
    filled_columns = dataset[features].fillna('Missing')
    result = dataset.copy()
    result[features] = filled_columns
    return result
# Fill the incomplete categorical test columns with 'Missing', then print the
# remaining null count per column as a check.
test_df = replace_nan(test_df,null_cat_features)
print(test_df[null_cat_features].isnull().sum())

In [31]:
# Non-object test columns with at least one missing value.
# The threshold is > 0: with `> 1`, a column holding a single missing entry is
# never selected and therefore never imputed by the cell that consumes this list.
numerical_features = [
    col for col in test_df.columns
    if test_df[col].isnull().sum() > 0 and test_df[col].dtypes != 'O'
]
for features in numerical_features:
    print(features,100*np.round(test_df[features].isnull().mean(),4),' % of missing values')

In [32]:
# Median-impute each incomplete numerical test column and keep a 0/1 indicator
# column (`<name>na`) marking which rows were imputed.
for feature in numerical_features:
    m = test_df[feature].median()
    test_df[feature + 'na'] = np.where(test_df[feature].isnull(), 1, 0)
    # Assign the filled column back. `fillna(..., inplace=True)` on a selected
    # column is chained assignment and does not update `test_df` under pandas
    # Copy-on-Write.
    test_df[feature] = test_df[feature].fillna(m)
# Remaining null count per imputed column.
test_df[numerical_features].isnull().sum()

In [33]:
# Test columns whose name contains 'Year' or 'Yr'.
year_markers = ('Year', 'Yr')
year_features = [col for col in test_df.columns if any(tag in col for tag in year_markers)]
year_features

In [34]:
# Replace each year column with the number of years between it and the sale year.
year_features = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']
years_before_sale = {col: test_df['YrSold'] - test_df[col] for col in year_features}
test_df = test_df.assign(**years_before_sale)

In [35]:
# Preview the transformed year columns next to YrSold.
test_df[['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']].head()

In [36]:
# Natural-log transform of the listed test columns.
num_features = ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea']

logged_columns = {col: np.log(test_df[col]) for col in num_features}
test_df = test_df.assign(**logged_columns)

In [37]:
# Object-typed (categorical) columns of the test frame.
test_dtypes = test_df.dtypes
categorical_features = test_dtypes[test_dtypes == 'O'].index.tolist()

In [38]:
# Collapse levels seen in no more than 1% of the test rows into one label.
for feature in categorical_features:
    level_share = test_df[feature].value_counts() / len(test_df)
    frequent_levels = level_share[level_share > 0.01].index
    is_frequent = test_df[feature].isin(frequent_levels)
    test_df[feature] = np.where(is_frequent, test_df[feature], 'Rare variable')

In [39]:
# Encode each categorical level as an integer rank, ordered by how many test
# rows carry that level (the least frequent level gets 0).
# NOTE(review): these ranks come from test-set counts only, so the same level may
# get a different code than in the training frame - confirm this is intended.
for feature in categorical_features:
    levels_by_count = test_df.groupby([feature])[feature].count().sort_values().index
    rank_of_level = dict(zip(levels_by_count, range(len(levels_by_count))))
    test_df[feature] = test_df[feature].map(rank_of_level)

# Creating the model

In [41]:
# Columns to scale: every training column except the identifier and the target.
unscaled_columns = {'Id', 'SalePrice'}
scale_features = [col for col in df.columns if col not in unscaled_columns]
len(scale_features)

In [42]:
# Fit a min-max scaler on the training features selected for scaling.
training_matrix = df[scale_features]
scaler = MinMaxScaler()
scaler.fit(training_matrix)

In [43]:
# Rebuild the training frame: Id and SalePrice as they are, followed by the scaled features.
unscaled_part = df[['Id', 'SalePrice']].reset_index(drop=True)
scaled_part = pd.DataFrame(scaler.transform(df[scale_features]), columns=scale_features)
df = pd.concat([unscaled_part, scaled_part], axis=1)

In [44]:
# Write the processed training frame to X_train.csv without the index column.
df.to_csv('X_train.csv', index = False)

In [45]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [46]:
# Build the scaled test frame: Id as it is, followed by the features transformed
# with the scaler that was fitted on the training data.
test_ids = test_df['Id'].reset_index(drop=True)
test_scaled = pd.DataFrame(scaler.transform(test_df[scale_features]), columns=scale_features)
test_df_new = pd.concat([test_ids, test_scaled], axis=1)

In [47]:
# Target vector: the SalePrice column (log-transformed earlier in the notebook).
y = df['SalePrice']

In [48]:
# Feature matrix: every column except the identifier and the target.
X = df.drop(['Id','SalePrice'],axis = 1)

In [49]:
# Fit a Lasso-based feature selector on the training data.
lasso = Lasso(alpha=0.005, random_state=0)
select = SelectFromModel(lasso)
select.fit(X, y)

In [50]:
# Names of the columns kept by the selector (`get_support()` is used as a mask over X.columns).
selected_feat = X.columns[select.get_support()]

In [51]:
# Compare the number of features before and after selection.
print("Original features {}".format(X.shape[1]))
print("Selected features {}".format(len(selected_feat)))

In [52]:
# Keep only the selected features in the training matrix.
X = X[selected_feat]

In [53]:
# (rows, columns) of the training matrix after feature selection.
X.shape

In [54]:
# Test features without the Id column.
# NOTE(review): the following cell rebinds X_test before this value is used.
X_test = test_df_new.drop(['Id'],axis = 1)

In [55]:
# Restrict the test matrix to the features selected on the training data.
X_test = test_df_new[selected_feat]

In [56]:
# (rows, columns) of the test matrix.
X_test.shape

In [58]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
## Replace the missing value with the most common one
for column in X_test.columns:
    X_test[column].fillna(X_test[column].mode()[0], inplace=True)

In [64]:
# Random forest: fit on the selected training features and predict the test set.
regr = RandomForestRegressor(random_state=0)
regr.fit(X, y)
log_price_pred = regr.predict(X_test)
# The target was log-transformed earlier, so exponentiate to get prices back.
y_test = np.exp(log_price_pred)
y_test_df = pd.DataFrame({'SalePrice': y_test})
df_output = pd.concat([test_df_new['Id'], y_test_df], axis=1)
df_output.to_csv('submission.csv', index=False)

In [65]:
# Gradient boosting: fit on the selected training features and predict the test set.
# This writes to the same 'submission.csv' path as the random-forest cell.
regr = GradientBoostingRegressor(random_state=0)
regr.fit(X, y)
log_price_pred = regr.predict(X_test)
# The target was log-transformed earlier, so exponentiate to get prices back.
y_test = np.exp(log_price_pred)
y_test_df = pd.DataFrame({'SalePrice': y_test})
df_output = pd.concat([test_df_new['Id'], y_test_df], axis=1)
df_output.to_csv('submission.csv', index=False)