In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

%matplotlib inline

In [None]:
# Training and test CSV locations, relative to the working directory.
# Forward slashes are used because they resolve on Windows, Linux and macOS,
# whereas a backslash is an ordinary filename character outside Windows.
train_data_path = 'data/house_price_pred/train.csv'
test_data_path = 'data/house_price_pred/test.csv'

In [None]:
# Load the training CSV into the working DataFrame and display its shape.
df = pd.read_csv(train_data_path)
df.shape

# Exploratory Data Analysis

## Missing Values

In [None]:
# Lift the row-display cap so the per-column summaries below are shown in full.
# Uses the public `pd.set_option` entry point rather than the `pd.pandas` attribute.
pd.set_option('display.max_rows', None)

In [None]:
# Fraction of null entries in each column.
df.isna().mean()

In [None]:
# Columns that contain at least one null value. The check is "any null",
# matching the `> 0` threshold used by the categorical and numerical
# missing-value cells later in this notebook; the previous `> 1` silently
# skipped any column with exactly one missing entry.
features_with_na = df.columns[df.isnull().any()].tolist()
len(features_with_na), features_with_na

In [None]:
# Report the share of missing values per feature. `isnull().mean()` is a
# fraction in [0, 1], so it is formatted with the `%` presentation type to
# make the printed number agree with the "%" label.
for feature in features_with_na:
    print(f"{feature}: {df[feature].isnull().mean():.1%} missing")

Find the relationship between missing values and the target (SalePrice).

In [None]:
# List the column labels of the working DataFrame.
df.columns

In [None]:
# For each feature with nulls, compare the median SalePrice of rows where the
# feature is missing (1) against rows where it is present (0).
for feature in features_with_na:
    # Build the 0/1 indicator as a standalone Series instead of copying the
    # whole DataFrame on every iteration. It keeps the feature's name, so the
    # x-axis is still labelled with the feature.
    is_missing = df[feature].isnull().astype(int)

    df.groupby(is_missing)['SalePrice'].median().plot.bar()
    plt.title(feature)
    plt.ylabel('Median SalePrice')
    plt.show()

## Find Numerical variables

In [None]:
# Every column whose dtype is not object is treated as numerical.
numerical_features = [col for col, dtype in df.dtypes.items() if dtype != 'O']
print(len(numerical_features))
df[numerical_features].head(2)

## Handle Temporal Features (Date-Time Features)

In [None]:
# Numerical columns whose name contains 'Yr' or 'Year'.
year_markers = ('Yr', 'Year')
year_features = [
    col for col in numerical_features
    if any(marker in col for marker in year_markers)
]
year_features

In [None]:
# Preview the first two rows of the year columns.
df[year_features].head(2)

In [None]:
# Print the distinct values found in each year column.
for year_col in year_features:
    print(year_col, df[year_col].unique())

In [None]:
# Median SalePrice for each YrSold value, drawn as a line chart.
median_price_by_year = df.groupby('YrSold')['SalePrice'].median()
ax = median_price_by_year.plot()
ax.set_xlabel('Year Sold')
ax.set_ylabel('Sales Price')
plt.show()

## Analyze Year Sold vs other year features

In [None]:
# For every year column other than YrSold, plot (YrSold - that year) against
# SalePrice.
for feature in year_features:
    if feature == 'YrSold':
        continue
    # Compute the difference as a standalone Series; the full DataFrame does
    # not need to be copied on each iteration.
    years_before_sale = df['YrSold'] - df[feature]
    plt.scatter(years_before_sale, df['SalePrice'])
    # Label the axes so the figures produced by this loop can be told apart.
    plt.xlabel(f'YrSold - {feature}')
    plt.ylabel('SalePrice')
    plt.show()

## Numerical features can be 1) Continuous or 2) Discrete

## Analyze Discrete numerical features

In [None]:
# List the column labels of the working DataFrame.
df.columns

In [None]:
# Check the container type of year_features; the next cell concatenates it with a list.
type(year_features)

In [None]:
# Numerical columns with fewer than 25 distinct values (a null counts as one
# value), leaving out the year columns and Id.
not_discrete = year_features + ['Id']
discrete_features = [
    col for col in numerical_features
    if col not in not_discrete and df[col].nunique(dropna=False) < 25
]
discrete_features

In [None]:
# Bar chart of median SalePrice per value of each discrete feature.
for col in discrete_features:
    ax = df.groupby(col)['SalePrice'].median().plot.bar()
    ax.set_xlabel(col)
    ax.set_ylabel('Sale Price')
    plt.show()

## Analyze Continuous numerical features

In [None]:
# Numerical columns that are neither discrete, nor year columns, nor Id.
not_continuous = discrete_features + year_features + ['Id']
cont_features = [col for col in numerical_features if col not in not_continuous]
cont_features

These are continuous features, so plot a histogram of each.

In [None]:
# 25-bin histogram of every continuous feature.
for col in cont_features:
    ax = df[col].hist(bins=25)
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    plt.show()

## Except 'SalePrice' (Gaussian dist.), all other continuous features are skewed, so a logarithmic transformation is needed

In [None]:
# Scatter log(feature) against log(SalePrice) for each continuous feature that
# contains no zeros (the logarithm of zero is undefined).
#
# log(SalePrice) is computed once, outside the loop. Previously every iteration
# copied the whole DataFrame, and when the loop reached SalePrice itself the
# column was log-transformed twice.
log_sale_price = np.log(df['SalePrice'])

for feature in cont_features:
    if feature == 'SalePrice':
        # Plotting the target against itself carries no information.
        continue
    if (df[feature] == 0).any():
        continue

    plt.scatter(np.log(df[feature]), log_sale_price)
    plt.xlabel(feature)
    plt.ylabel('Sale Price')
    plt.show()
        

## Handle Outliers

Note: Outliers exist only in continuous variables, not in categorical ones. For example, if a categorical variable Sex has 9 Male values and 1 Female value, we can't say that Female is an outlier.

In [None]:
# Box plot of log(feature) for each continuous feature that contains no zeros.
for col in cont_features:
    if (df[col] == 0).any():
        continue
    ax = np.log(df[col]).to_frame().boxplot(column=col)
    ax.set_ylabel(col)
    plt.show()
        

## Handle Categorical Variables

In [None]:
# Columns stored with object dtype are treated as categorical.
cat_features = [col for col, dtype in df.dtypes.items() if dtype == 'O']
cat_features

In [None]:
# Number of distinct values (a null counts as one) in each categorical column.
for feature in cat_features:
    n_unique = df[feature].nunique(dropna=False)
    print(f'The {feature} has {n_unique} unique values')

## Check Categorical variables relationship with Target (SalePrice)

In [None]:
# Bar chart of median SalePrice per category of each categorical feature.
for col in cat_features:
    ax = df.groupby(col)['SalePrice'].median().plot.bar()
    ax.set_xlabel(col)
    ax.set_ylabel('Sale price')
    plt.show()

# Feature Engineering

## Handle missing values in Categorical features

In [None]:
# Object-dtype columns that contain at least one null.
cat_features_has_miss_values = [
    col for col in df.columns
    if df[col].dtypes == 'O' and df[col].isnull().any()
]
cat_features_has_miss_values

In [None]:
# Report the share of missing values per categorical feature. The fraction
# returned by `isnull().mean()` is formatted as a percentage so the printed
# number agrees with the "%" label.
for feature in cat_features_has_miss_values:
    print(f'Feature {feature} has {df[feature].isnull().mean():.2%} missing values')

In [None]:
def replace_cat_features(ds, nan_features):
    """Return a copy of ``ds`` with nulls in ``nan_features`` set to 'Missing'.

    ds: the DataFrame to read from; it is not modified.
    nan_features: the column labels whose null entries are replaced.
    """
    patched_columns = ds[nan_features].fillna('Missing')
    result = ds.copy()
    result[nan_features] = patched_columns
    return result

In [None]:
# Keep a copy of the frame as it is before the cells below reassign and modify df.
df_bkup = df.copy()

In [None]:
# Replace nulls in the categorical columns with 'Missing', then display the
# remaining null count per column as a check.
df = replace_cat_features(df, cat_features_has_miss_values)
df[cat_features_has_miss_values].isnull().sum()

In [None]:
# Preview the first rows after the categorical fill.
df.head()

## Handling missing value in Numerical features

In [None]:
# Non-object columns that contain at least one null.
num_feature_with_miss_values = [
    col for col in df.columns
    if df[col].dtypes != 'O' and df[col].isnull().any()
]
num_feature_with_miss_values


In [None]:
# Report the share of missing values per numerical feature. The fraction
# returned by `isnull().mean()` is formatted as a percentage so the printed
# number agrees with the "%" label.
for feature in num_feature_with_miss_values:
    print(f'{feature} has {df[feature].isnull().mean():.2%} of missing values')

### replace numerical feature nulls.

In [None]:
# Impute nulls in each numerical feature and record where they were.
for feature in num_feature_with_miss_values:
    # Capture the missing-value mask BEFORE filling. Building the indicator
    # after the fill (as was done previously) always yields a column of zeros.
    was_missing = df[feature].isnull()
    df[feature + '_nan'] = was_missing.astype(int)

    # Fill with the median rather than the mean, since the EDA above found
    # many outliers in the numerical values. The result is assigned back
    # instead of calling fillna(inplace=True) on the selected column, which
    # is not guaranteed to update df itself.
    median_value = df[feature].median()
    df[feature] = df[feature].fillna(median_value)
    

In [None]:
# Display the remaining null count per imputed numerical column as a check.
df[num_feature_with_miss_values].isnull().sum()

In [None]:
# List the column labels, which now include the added '_nan' indicator columns.
df.columns

## Apply a log transformation to the skewed numerical features

In [None]:
# Columns that the cells below log-transform.
num_features_has_skewed_data = [
    'LotFrontage',
    'LotArea',
    '1stFlrSF',
    'GrLivArea',
    'SalePrice',
]

In [None]:
# Preview the first rows before the log transform.
df.head()

In [None]:
# Replace each listed column with its natural logarithm, all columns at once.
df[num_features_has_skewed_data] = np.log(df[num_features_has_skewed_data])

df.head()

In [None]:
# 25-bin histogram of each log-transformed column. The axes are labelled the
# same way as the histogram loop over continuous features in the EDA section,
# so the figures can be told apart.
for feature in num_features_has_skewed_data:
    ax = df[feature].hist(bins=25)
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')
    plt.show()

## Handling Rare Categorical Features


Categorical features: categories that make up 1% or less of the total observations are replaced with 'Rare_val'.

In [None]:
# Object-dtype columns, recomputed on the current state of df.
cat_features1 = [col for col, dtype in df.dtypes.items() if dtype == 'O']
cat_features1

In [None]:
# Within each categorical column, keep the categories whose share of rows is
# above the cutoff and replace every other value with 'Rare_val'.
rare_cutoff = 0.01
n_rows = len(df)
for col in cat_features1:
    share = df.groupby(col)['SalePrice'].count() / n_rows
    frequent = share[share > rare_cutoff].index
    df[col] = np.where(df[col].isin(frequent), df[col], 'Rare_val')

# Feature Scaling

In [None]:
# Number of columns currently in df.
len(df.columns)

In [None]:
# Every column except Id and SalePrice.
not_scaled = ('Id', 'SalePrice')
feature_scale = [col for col in df.columns if col not in not_scaled]
len(feature_scale)

In [None]:
# Preview the first rows after the rare-category replacement.
df.head()

## Label encoding

In [None]:
# Compare the sizes of the two categorical column lists built earlier.
len(cat_features), len(cat_features1)

In [None]:
# Mean SalePrice per MSZoning category (SalePrice was log-transformed above).
df.groupby('MSZoning')['SalePrice'].mean()

In [None]:
# labels_ordered = df.groupby('MSZoning')['SalePrice'].mean().sort_values().index
# labels_ordered = [{k:i} for i,k in enumerate(labels_ordered, 0)]
# labels_ordered

In [None]:
# Encode each categorical column as integers 0, 1, 2, ... assigned in order
# of ascending mean SalePrice of the category.
for col in cat_features:
    mean_price_ascending = df.groupby(col)['SalePrice'].mean().sort_values()
    rank_by_label = {label: rank for rank, label in enumerate(mean_price_ascending.index)}
    print(rank_by_label)
    df[col] = df[col].map(rank_by_label)
    


In [None]:
# Preview the categorical columns after encoding.
df[cat_features].head()

# Feature Scaling

In [None]:
# Every column except Id and SalePrice, recomputed on the current state of df.
excluded_from_scaling = {'Id', 'SalePrice'}
feature_scale = list(filter(lambda col: col not in excluded_from_scaling, df.columns))
len(feature_scale)

In [None]:
# Create a MinMaxScaler and fit it on the columns listed in feature_scale
# (every column except Id and SalePrice).
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(df[feature_scale])

In [None]:
# Put the unscaled Id and SalePrice columns next to the scaled feature columns.
id_and_target = df[['Id', 'SalePrice']].reset_index(drop=True)
scaled_features = pd.DataFrame(scaler.transform(df[feature_scale]), columns=feature_scale)
df_scaled = pd.concat([id_and_target, scaled_features], axis=1)

df_scaled.head()

In [None]:
# Write the scaled training frame to CSV. The path uses forward slashes so it
# resolves on Windows, Linux and macOS; a backslash is an ordinary filename
# character outside Windows.
# NOTE(review): the row index is written as the first, unnamed column —
# confirm whether downstream readers expect it.
df_scaled.to_csv('data/house_price_pred/train_scaled.csv')

## Pickle the trained objects

In [None]:
import pickle

In [None]:
# Destination for the pickled scaler. Forward slashes keep the path valid on
# Windows, Linux and macOS.
file_name = 'data/house_price_pred/scaler.pkl'

# The context manager closes the file even if pickle.dump raises, which the
# previous explicit open()/close() pair did not guarantee.
with open(file_name, 'wb') as file_to_pickle:
    pickle.dump(scaler, file_to_pickle)