## Exploratory data analysis

Read in and analyse the training data, then save important information to a JSON file for later use in feature engineering and model training.

In [None]:
# Imports

import pandas as pd
import copy

In [None]:
# Load the raw training and test sets from disk into DataFrames.
data = pd.read_csv('../data/raw/train.csv')
test = pd.read_csv('../data/raw/test.csv')

In [None]:
# Show the shape of the training set, then of the test set, one per line.
print(data.shape, test.shape, sep='\n')

In [None]:
# Print pandas' concise summary of the training dataframe.
data.info()

In [None]:
# Descriptive statistics of the training dataframe (shown as the cell output).
data.describe()

In [None]:
# Collect dataset metadata that is meant to be written to a JSON file later.
# Values are stored as plain Python types (a list of names, a dict of dtype
# strings) because a pandas Index and dtype objects cannot be serialised by
# the json module.
info = {}
info['columns'] = data.columns.tolist()
info['data_types'] = data.dtypes.astype(str).to_dict()

In [None]:
# Count the missing entries in every column of the training data
missing_values = data.isna().sum()
print(missing_values)

In [None]:
# Keep an independent copy of the per-column missing counts, together with
# every training row that has at least one missing value.
# NOTE(review): both entries are pandas objects rather than plain Python
# types - confirm how `info` is meant to be written out.
info['missing_values'] = missing_values.copy(deep=True)
rows_with_missing_values = data.loc[data.isnull().any(axis=1)]
info['rows_with_missing_values'] = rows_with_missing_values
rows_with_missing_values.shape

In [None]:
# How many columns contain at least one missing value
missing_values = data.isnull().sum()
missing_values = missing_values.loc[missing_values.gt(0)]
num_cols_missing_vals = len(missing_values)
info['num_cols_missing_vals'] = num_cols_missing_vals
num_cols_missing_vals

## Data Editing

### No encoding, just remove cols with missing values

In [None]:
def remove_cols_with_missing_vals(train, test):
    """Drop every column that has a missing value in either dataframe.

    The columns containing at least one missing value are collected from
    both frames, and the combined set is removed from each of them.

    Args:
        train: Training dataframe.
        test: Test dataframe.

    Returns:
        A ``(train, test)`` tuple of new dataframes; the inputs are not
        modified.
    """
    missing_in_train = train.columns[train.isnull().any().to_numpy()]
    missing_in_test = test.columns[test.isnull().any().to_numpy()]
    cols_to_remove = list(set(missing_in_train) | set(missing_in_test))

    # errors='ignore': a column flagged in one frame does not necessarily
    # exist in the other one (e.g. a test-only column), and that must not
    # raise a KeyError.
    _train = train.drop(columns=cols_to_remove, errors='ignore')
    _test = test.drop(columns=cols_to_remove, errors='ignore')
    return _train, _test

### One-Hot-Encoding

In [None]:
def one_hot_encode(train, test):
    """One-hot encode the non-numerical columns of both dataframes.

    Every column whose dtype is not int64/float64 is expanded with
    ``pd.get_dummies``. The encoded test frame is then aligned to the
    training frame: it gets exactly the training columns (minus
    ``SalePrice``) in the same order. Dummy columns that only exist in
    train are added to test filled with 0, and columns that only exist in
    test (categories never seen in train) are dropped.

    Args:
        train: Training dataframe.
        test: Test dataframe.

    Returns:
        A ``(one_hot_train, one_hot_test)`` tuple of new dataframes.
    """
    numerical_dtypes = ['int64', 'float64']
    non_numerical_cols = train.select_dtypes(exclude=numerical_dtypes).columns
    test_non_numerical_cols = test.select_dtypes(exclude=numerical_dtypes).columns

    # Convert non-numerical columns to one-hot encoding
    one_hot_data = pd.get_dummies(train, columns=non_numerical_cols)
    one_hot_test = pd.get_dummies(test, columns=test_non_numerical_cols)

    # Align test to train's feature columns in a single step instead of
    # inserting the missing columns one by one. SalePrice is the target and
    # is never added to the test frame.
    feature_cols = one_hot_data.columns.drop('SalePrice', errors='ignore')
    one_hot_test = one_hot_test.reindex(columns=feature_cols, fill_value=0)

    return one_hot_data, one_hot_test

### Impute missing vals with column mean

In [None]:
def impute_with_mean(train, test, cols):
    """Fill missing values in ``cols`` with the column means.

    The means are computed over the rows of ``train`` and ``test`` stacked
    together, and the same values are used to fill both frames.

    Args:
        train: Training dataframe.
        test: Test dataframe.
        cols: Columns to impute; they are selected from both frames.

    Returns:
        A ``(train, test)`` tuple of imputed copies; the inputs are not
        modified.
    """
    column_means = pd.concat([train[cols], test[cols]]).mean()

    def _filled_copy(frame):
        # Work on a copy so the caller's dataframe stays untouched.
        result = frame.copy()
        result[cols] = frame[cols].fillna(column_means)
        return result

    return _filled_copy(train), _filled_copy(test)


### Categorical

In [None]:
def to_categorical(train, test):
    """Integer-encode the non-numerical columns of both dataframes.

    Every column whose dtype is not int64/float64 in either frame is
    replaced by integer codes. The codes of a column are shared between
    train and test and are assigned in order of first appearance (train
    rows first). Missing values are NOT given a code: they stay missing,
    so that they can still be detected and dropped or imputed afterwards.

    Args:
        train: Training dataframe.
        test: Test dataframe.

    Returns:
        ``(train, test, non_numerical_cols)``: encoded copies of both
        frames and the list of encoded column names (train column order
        first, then columns that are non-numerical only in test).
    """
    numerical_dtypes = ['int64', 'float64']
    non_numerical_cols_train = train.select_dtypes(exclude=numerical_dtypes).columns.tolist()
    non_numerical_cols_test = test.select_dtypes(exclude=numerical_dtypes).columns.tolist()

    # Ordered union: unlike a set, this keeps the returned list identical
    # from run to run.
    non_numerical_cols = list(dict.fromkeys(non_numerical_cols_train + non_numerical_cols_test))

    # Work on copies to avoid modifying the original dataframes
    _train = train.copy()
    _test = test.copy()

    for col in non_numerical_cols:
        # A column may exist in only one of the two frames.
        frames_with_col = [frame for frame in (_train, _test) if col in frame.columns]

        # Build the category -> integer mapping from the observed values
        # only; dropping NaN first keeps missing entries out of the mapping.
        observed = pd.concat([frame[col] for frame in frames_with_col]).dropna().unique()
        category_to_int = {value: idx for idx, value in enumerate(observed)}

        # Values without a mapping entry (i.e. missing ones) map to NaN.
        for frame in frames_with_col:
            frame[col] = frame[col].map(category_to_int)

    return _train, _test, non_numerical_cols

In [None]:
def impute_categoricals_with_mode(train, test, categorical_cols):
    """Fill missing values in the categorical columns with the column mode.

    The mode of each column is computed over the rows of ``train`` and
    ``test`` stacked together; when a column has several modes the first
    one returned by pandas is used.

    Args:
        train: Training dataframe.
        test: Test dataframe.
        categorical_cols: Columns to impute; selected from both frames.

    Returns:
        A ``(train, test)`` tuple of imputed copies; the inputs are not
        modified. When there is nothing to impute (no columns given, or no
        mode can be computed) plain copies are returned.
    """
    _train = train.copy()
    _test = test.copy()

    cols = list(categorical_cols)
    if not cols:
        return _train, _test

    # Calculate the mode of every categorical column over both dataframes
    modes = pd.concat([train[cols], test[cols]]).mode()
    if modes.empty:
        # No non-missing value at all: .iloc[0] would raise an IndexError.
        return _train, _test
    all_modes = modes.iloc[0]

    # Apply the calculated modes to fill missing values in both dataframes
    _train[cols] = train[cols].fillna(all_modes)
    _test[cols] = test[cols].fillna(all_modes)

    return _train, _test


In [None]:
def order_test_cols(train, test, target='SalePrice'):
    """Reorder the columns of ``test`` to match the column order of ``train``.

    Args:
        train: Training dataframe that defines the column order.
        test: Test dataframe; it must contain every non-target column of
            ``train`` (a missing column raises ``KeyError``).
        target: Name of the target column, which is left out of the
            selection. It does not have to be present in ``train``.

    Returns:
        A dataframe with the train feature columns of ``test``, in train
        order. Columns that exist only in ``test`` are not included.
    """
    # Filtering instead of list.remove() so that a train frame without the
    # target column does not raise a ValueError.
    feature_cols = [col for col in train.columns if col != target]
    return test[feature_cols]

### Save Data

In [None]:
# Variant: drop every column that has missing values in train or test
(
    train_no_missing_vals_cols_removed,
    test_no_missing_vals_cols_removed,
) = remove_cols_with_missing_vals(data, test)

# Write both frames to csv without the index, then report their shapes
train_no_missing_vals_cols_removed.to_csv('../data/processed/train_no_missing_vals_cols_removed.csv', index=False)
test_no_missing_vals_cols_removed.to_csv('../data/processed/test_no_missing_vals_cols_removed.csv', index=False)
print(
    train_no_missing_vals_cols_removed.shape,
    test_no_missing_vals_cols_removed.shape,
    sep='\n',
)

In [None]:
# Variant: one-hot encoded train and test sets
one_hot_data, one_hot_test = one_hot_encode(data, test)

# Write both frames to csv without the index, then report their shapes
one_hot_data.to_csv('../data/processed/train_one-hot-encoded.csv', index=False)
one_hot_test.to_csv('../data/processed/test_one-hot-encoded.csv', index=False)
print(one_hot_data.shape, one_hot_test.shape, sep='\n')

In [None]:
# Variant: one-hot encoded, with the columns that contain missing values removed
(
    one_hot_cols_removed_train,
    one_hot_cols_removed_test,
) = remove_cols_with_missing_vals(one_hot_data, one_hot_test)

# Give the test frame the same column order as the train frame
one_hot_cols_removed_test = order_test_cols(one_hot_cols_removed_train, one_hot_cols_removed_test)

# Write both frames to csv without the index, then report their shapes
one_hot_cols_removed_train.to_csv('../data/processed/train_one-hot-encoded_no-missing_vals_cols_removed.csv', index=False)
one_hot_cols_removed_test.to_csv('../data/processed/test_one-hot-encoded_no-missing_vals_cols_removed.csv', index=False)
print(one_hot_cols_removed_train.shape, one_hot_cols_removed_test.shape, sep='\n')

In [None]:
# One Hot Mean Imputed
# Impute the numerical (int64/float64) feature columns: after encoding, they
# are the only ones that can still hold missing values, since the dummy
# columns produced by get_dummies never do. SalePrice is left out because it
# does not exist in the test frame.
one_hot_numerical_cols = (
    one_hot_data.select_dtypes(include=['int64', 'float64'])
    .columns.drop('SalePrice', errors='ignore')
    .tolist()
)
train_one_hot_imputed, test_one_hot_imputed = impute_with_mean(one_hot_data, one_hot_test, one_hot_numerical_cols)

test_one_hot_imputed = order_test_cols(train_one_hot_imputed, test_one_hot_imputed)

# save to csv
train_one_hot_imputed.to_csv('../data/processed/train_one-hot-encoded_no-missing_vals_mean_imputed.csv', index=False)
test_one_hot_imputed.to_csv('../data/processed/test_one-hot-encoded_no-missing_vals_mean_imputed.csv', index=False)
print(train_one_hot_imputed.shape)
print(test_one_hot_imputed.shape)

In [None]:
# Variant: non-numerical columns replaced by integer codes
train_categorical, test_categorical, categorical_cols = to_categorical(data, test)

# Give the test frame the same column order as the train frame
test_categorical = order_test_cols(train_categorical, test_categorical)

# Write both frames to csv without the index, then report their shapes
train_categorical.to_csv('../data/processed/train_categorical.csv', index=False)
test_categorical.to_csv('../data/processed/test_categorical.csv', index=False)
print(train_categorical.shape, test_categorical.shape, sep='\n')
train_categorical.head()

In [None]:
# Variant: integer-coded categoricals, with the columns that contain missing values removed
(
    train_categorical_cols_removed,
    test_categorical_cols_removed,
) = remove_cols_with_missing_vals(train_categorical, test_categorical)

# Give the test frame the same column order as the train frame
test_categorical_cols_removed = order_test_cols(train_categorical_cols_removed, test_categorical_cols_removed)

# Write both frames to csv without the index, then report their shapes
train_categorical_cols_removed.to_csv('../data/processed/train_categorical_no_missing_vals_cols_removed.csv', index=False)
test_categorical_cols_removed.to_csv('../data/processed/test_categorical_no_missing_vals_cols_removed.csv', index=False)
print(train_categorical_cols_removed.shape, test_categorical_cols_removed.shape, sep='\n')

In [None]:
# Categorical, imputed: mode for the integer-coded categorical columns, mean
# for the numerical feature columns.
train_categorical_imputed, test_categorical_imputed = impute_categoricals_with_mode(train_categorical, test_categorical, categorical_cols)

# The mean imputation must target the numerical columns, not the categorical
# ones that were just mode-imputed. SalePrice is skipped because it does not
# exist in the test frame.
_already_imputed = set(categorical_cols)
categorical_numerical_cols = [
    col for col in train_categorical.columns
    if col not in _already_imputed
    and col != 'SalePrice'
    and col in test_categorical.columns
]
train_categorical_imputed, test_categorical_imputed = impute_with_mean(train_categorical_imputed, test_categorical_imputed, categorical_numerical_cols)

test_categorical_imputed = order_test_cols(train_categorical_imputed, test_categorical_imputed)

# save to csv
train_categorical_imputed.to_csv('../data/processed/train_categorical_no_missing_vals_mean_imputed.csv', index=False)
test_categorical_imputed.to_csv('../data/processed/test_categorical_no_missing_vals_mean_imputed.csv', index=False)
print(train_categorical_imputed.shape)
print(test_categorical_imputed.shape)
train_categorical_imputed.head()