In [1]:
import os
from argparse import Namespace

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Notebook-wide configuration: file locations, split fractions and the RNG seed.
args = Namespace()
args.save_directory = "NumpyData/Apartment/"  # prefix for every saved .npy file
args.data_path = "./apartments_small.csv"     # semicolon-separated input file
args.train_split = 0.8                        # complement of test_split
args.test_split = 0.2                         # share of all rows held out for testing
args.val_split = 0.25                         # share of the training rows held out for validation
args.seed = 5059                              # random_state passed to both splits

In [3]:
# Load the raw listings; the file uses ';' rather than ',' as its field delimiter.
df = pd.read_csv(
    args.data_path,
    sep=";",
)

# 1 Data Exploration

In [7]:
# (rows, columns) of the freshly loaded frame.
df.shape

(1000, 22)

In [8]:
# Per-column dtypes as inferred by read_csv.
df.dtypes

id                 int64
category          object
title             object
body              object
amenities         object
bathrooms        float64
bedrooms           int64
currency          object
fee               object
has_photo         object
pets_allowed      object
price              int64
price_display     object
price_type        object
square_feet        int64
address           object
cityname          object
state             object
latitude         float64
longitude        float64
source            object
time               int64
dtype: object

In [9]:
# First row of the frame, shown as an example of one raw record.
df.iloc[0]

id                                                      5668610646
category                                    housing/rent/apartment
title                                 Three BR 128 Magazine Street
body             This unit is located at 128 Magazine Street, D...
amenities                                  Dishwasher,Refrigerator
bathrooms                                                      2.0
bedrooms                                                         3
currency                                                       USD
fee                                                             No
has_photo                                                Thumbnail
pets_allowed                                             Cats,Dogs
price                                                         1149
price_display                                               $1,149
price_type                                                 Monthly
square_feet                                                   

## 1.1 Distributions

# 2 Data Pre-processing

In [4]:
'''  
    Custom Estimators 
'''

class DropColumns(BaseEstimator, TransformerMixin):
    '''
        Stateless transformer that removes a fixed set of columns from a DataFrame.

        Parameters
        ----------
        columns_to_drop : list of str
            Labels of the columns removed by `transform`.
    '''

    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        ''' Nothing is learned; returns self so the step can be used in a Pipeline. '''
        return self

    def transform(self, X):
        ''' Return a new frame without the configured columns; X itself is not modified. '''
        # DataFrame.drop already returns a new object, so the input stays untouched.
        remaining = X.drop(columns=self.columns_to_drop)
        return remaining

class ExplodeHotEncode(BaseEstimator, TransformerMixin):
    '''
        Explodes and onehot-encodes multi-valued string columns.

        Every cell of a listed column is split on `sep`; each distinct token seen
        during `fit` becomes a 0/1 indicator column named "<column>__<token>".
        The source columns themselves are removed from the output.

        Missing cells (None, NaN, pd.NA, empty/whitespace strings or the text "nan")
        yield no tokens, i.e. an all-zero row for that column. Tokens are stripped of
        surrounding whitespace and empty tokens (e.g. from a trailing separator) are
        ignored.

        Parameters
        ----------
        columns : list of str
            Columns to explode.
        sep : str, default ","
            Separator between the values inside one cell.

        Attributes
        ----------
        encoders : dict
            Column name -> fitted MultiLabelBinarizer. Rebuilt on every call to `fit`.
    '''

    def __init__(self, columns, sep=","):
        self.columns = columns
        self.sep = sep
        self.encoders = {}

    def fit(self, X, y=None):
        ''' Learn the token vocabulary of every configured column. Returns self. '''
        # Start from a clean slate so a refit never keeps encoders from an earlier fit.
        self.encoders = {}
        for col in self.columns:
            mlb = MultiLabelBinarizer()
            mlb.fit(self._tokenize(X[col]))
            self.encoders[col] = mlb
        return self

    def transform(self, X):
        ''' Return a new frame where each configured column is replaced by its indicator columns. '''
        indicator_frames = []
        for col in self.columns:
            mlb = self.encoders[col]
            indicator_frames.append(
                pd.DataFrame(
                    mlb.transform(self._tokenize(X[col])),
                    columns=[f"{col}__{cls}" for cls in mlb.classes_],
                    index=X.index,
                )
            )

        # Untouched columns first, then the indicator blocks in the order of self.columns.
        return pd.concat([X.drop(columns=self.columns)] + indicator_frames, axis=1)

    def _tokenize(self, values):
        ''' Turn every cell of `values` into a list of unique, stripped, non-empty tokens. '''
        token_lists = []
        for value in values:
            if self.isNa(value):
                token_lists.append([])
                continue
            tokens = {token.strip() for token in str(value).split(self.sep)}
            tokens.discard("")
            token_lists.append(sorted(tokens))
        return token_lists

    def isNa(self, x):
        ''' True when `x` represents a missing value (see the class docstring for the rules). '''
        # Check real missing-value objects before touching any string method.
        if x is None or x is pd.NA:
            return True
        if isinstance(x, float) and x != x:  # NaN is the only float unequal to itself
            return True
        text = str(x).strip()
        return text == "" or text.lower() == "nan"
    
class HandleNaN(BaseEstimator, TransformerMixin):
    '''
        Normalises text cells and fills in missing values.

        Steps applied by `transform`, in order:
          1. every string cell is lower-cased and stripped of surrounding whitespace;
          2. the placeholders 'unknown' and 'nonexistent' are replaced by np.nan;
          3. missing values are forward-filled down each column (row order matters,
             and a missing value in the first row has nothing to be filled from).
    '''

    def __init__(self):
        pass

    def fit(self, X, y=None):
        ''' Nothing is learned; returns self. '''
        return self

    def transform(self, X):
        ''' Return a cleaned copy of X; X itself is not modified. '''

        def _normalise(cell):
            # Only strings are touched; numbers and NaN pass through unchanged.
            if isinstance(cell, str):
                return cell.lower().strip()
            return cell

        cleaned = (
            X.map(_normalise)
            .replace('unknown', np.nan)
            .replace('nonexistent', np.nan)
            .ffill()
        )
        return cleaned
    

class EncodeStateToRegion(BaseEstimator, TransformerMixin):
    '''
        Reduces two-letter US state codes to one of nine regions.

        The state column is removed and a string column `region` is appended.
        Matching ignores case and surrounding whitespace, so the transformer also
        works on input whose strings were lower-cased by an earlier pipeline step.
        Missing, non-string or unrecognised states are labelled UNKNOWN_REGION.

        Parameters
        ----------
        column : str, default "state"
            Name of the column holding the state code.
    '''

    # Region label -> member state codes (50 states plus DC).
    # NOTE(review): the grouping looks like the US Census Bureau divisions - confirm.
    REGION_STATES = {
        "New_England": ("CT", "ME", "MA", "NH", "RI", "VT"),
        "Mid_Atlantic": ("NJ", "NY", "PA"),
        "E_N_Central": ("IL", "IN", "MI", "OH", "WI"),
        "W_N_Central": ("IA", "KS", "MN", "MO", "NE", "ND", "SD"),
        "South_Atlantic": ("DE", "FL", "GA", "MD", "NC", "SC", "VA", "DC", "WV"),
        "E_S_Central": ("AL", "KY", "MS", "TN"),
        "W_S_Central": ("AR", "LA", "OK", "TX"),
        "Mountain": ("AZ", "CO", "ID", "MT", "NV", "NM", "UT", "WY"),
        "Pacific": ("AK", "CA", "HI", "OR", "WA"),
    }

    # Label for rows whose state cannot be mapped. An explicit string is required:
    # np.select's implicit integer default 0 cannot be combined with string choices
    # on recent NumPy releases (TypeError) and became the label '0' on older ones.
    UNKNOWN_REGION = "Unknown"

    # Region labels in their original order; kept as an attribute for existing readers.
    choices = list(REGION_STATES)

    def __init__(self, column="state"):
        self.column = column

    def fit(self, X, y=None):
        ''' Nothing is learned; returns self. '''
        return self

    def transform(self, X):
        ''' Return a new frame with `self.column` replaced by a trailing `region` column. '''
        state_to_region = {
            state: region
            for region, states in self.REGION_STATES.items()
            for state in states
        }

        def _region_of(state):
            # NaN and other non-string cells have no region.
            if not isinstance(state, str):
                return self.UNKNOWN_REGION
            return state_to_region.get(state.strip().upper(), self.UNKNOWN_REGION)

        regions = X[self.column].map(_region_of).to_numpy()
        return X.drop(columns=self.column).assign(region=regions)
    

    
class EncodeCityPrice(BaseEstimator, TransformerMixin):
    '''
        Encodes cities into city price categories.
        2 : high
        1 : med
        0 : low

        `fit` computes the mean of the `price` column per `cityname` and remembers
        which cities fall above `high_threshold` or below `low_threshold`.
        `transform` appends the integer column `cityprice` and removes both
        `cityname` and `price`. Cities not seen during `fit`, or with a mean price
        between the thresholds (inclusive), are encoded as 1.

        Parameters
        ----------
        high_threshold : float, default 2000
            Cities whose mean price is strictly greater are encoded as 2.
        low_threshold : float, default 1000
            Cities whose mean price is strictly smaller are encoded as 0.

        Attributes
        ----------
        high_cities, low_cities : pandas.Index
            City names assigned to the high / low category by the last `fit`.
    '''

    def __init__(self, high_threshold=2000, low_threshold=1000):
        self.high_threshold = high_threshold
        self.low_threshold = low_threshold

    def fit(self, X, y=None):
        ''' Learn the high- and low-price city sets from X. Returns self. '''
        if self.low_threshold > self.high_threshold:
            raise ValueError(
                "low_threshold must not exceed high_threshold "
                f"(got {self.low_threshold} > {self.high_threshold})"
            )

        mean_price = X.groupby('cityname')['price'].mean()
        self.high_cities = mean_price[mean_price > self.high_threshold].index
        self.low_cities = mean_price[mean_price < self.low_threshold].index
        return self

    def transform(self, X):
        ''' Return a new frame with `cityname` and `price` replaced by `cityprice`. '''
        city = X['cityname']
        tier = np.select(
            [city.isin(self.high_cities), city.isin(self.low_cities)],
            [2, 0],
            default=1,  # medium: unseen cities and everything between the thresholds
        )
        return X.drop(columns=["cityname", "price"]).assign(cityprice=tier)


class Binarize(BaseEstimator, TransformerMixin):
    '''
        Replaces the exact cell values 'yes'/'success' with 1 and 'no'/'failure' with 0.
        Every other cell is returned unchanged. The comparison is case-sensitive.
    '''

    def __init__(self):
        pass

    def fit(self, X, y=None):
        ''' Nothing is learned; returns self. '''
        return self

    @staticmethod
    def _to_flag(value):
        ''' Map one cell to 1, 0 or itself. '''
        if value == 'yes' or value == 'success':
            return 1
        if value == 'no' or value == 'failure':
            return 0
        return value

    def transform(self, X):
        ''' Return a new frame with the flag strings converted element-wise. '''
        return X.map(self._to_flag)

In [6]:
# Column groups routed to the three sub-pipelines of the ColumnTransformer below.
numerical_columns = ['id','bathrooms','bedrooms','square_feet','latitude','longitude','time']
# 'price' travels with this group only because EncodeCityPrice reads it in fit();
# that step drops it again in transform(), so it is not part of the prepared features.
categorical_columns = ['title','body','currency','price_display','address', 'pets_allowed','category','amenities','cityname','price']
onehot_columns = ['has_photo', 'price_type', 'source', 'state']

# Numeric features: discard the identifier, fill gaps with the median, standardise.
numerical_pipeline = Pipeline([
    ('drop', DropColumns(columns_to_drop=['id'])),
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Text-like features: drop the columns not used as features, clean the rest, expand the
# multi-valued columns into indicator columns and bucket cities by their mean price.
categorical_pipeline = Pipeline([
    ('drop', DropColumns(columns_to_drop=['title','body','currency','price_display','address'])),
    ('binarize', Binarize()),
    ("handleNaN", HandleNaN()),
    ('explodeHotEncodeComma', ExplodeHotEncode(['amenities', 'pets_allowed'], ',')),
    ('explodeHotEncodeSlash', ExplodeHotEncode(['category'], '/')),
    ('encodeCityPrice', EncodeCityPrice())
])

# Low-cardinality features: clean, collapse states into regions, then one-hot encode.
onehot_pipeline = Pipeline([
    ("handleNaN", HandleNaN()),
    ('stateToRegion', EncodeStateToRegion()),
    ('onehot', OneHotEncoder(drop="first", handle_unknown='ignore'))
])


full_pipeline = ColumnTransformer(
    [
        ('numerical', numerical_pipeline, numerical_columns),
        ('categorical', categorical_pipeline, categorical_columns),
        ('onehot', onehot_pipeline, onehot_columns),
    ],
    # Always return a dense ndarray. With the default threshold the result switches to a
    # SciPy sparse matrix whenever the stacked output is sparse enough, and np.save would
    # then write a pickled object array instead of a plain numeric array.
    sparse_threshold=0,
)

# Scaler for the regression target (price).
labelScaler = StandardScaler()

In [7]:
y = df['price'].copy()
# X deliberately keeps the 'price' column: EncodeCityPrice needs it during fit and
# removes it again in transform.
X = df.copy()

# Create Splits
# First hold out the test set, then carve the validation set out of the remaining rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=args.test_split, random_state=args.seed)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=args.val_split, random_state=args.seed)

# Pre-process Data
# All statistics are learned on the training rows only and re-used for val/test.
X_train_prepared = full_pipeline.fit_transform(X_train)
X_val_prepared = full_pipeline.transform(X_val)
X_test_prepared = full_pipeline.transform(X_test)

# The label scaler is fitted on the training labels only. Validation and test labels are
# transformed with those same statistics so all three sets share one scale and
# labelScaler.inverse_transform maps predictions back to prices consistently.
y_train_prepared = labelScaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_prepared = labelScaler.transform(y_test.values.reshape(-1, 1))
y_val_prepared = labelScaler.transform(y_val.values.reshape(-1, 1))




Save data to files

In [16]:
# Input data
np.save(args.save_directory + 'X_train_prepared.npy', X_train_prepared)
np.save(args.save_directory + 'X_test_prepared.npy', X_test_prepared)
np.save(args.save_directory + 'X_val_prepared.npy', X_val_prepared)

# Raw Labels
np.save(args.save_directory + 'y_train.npy', y_train)
np.save(args.save_directory + 'y_test.npy', y_test)
np.save(args.save_directory + 'y_val.npy', y_val)

# Processed Labels
np.save(args.save_directory + 'y_train_prepared.npy', y_train_prepared)
np.save(args.save_directory + 'y_test_prepared.npy', y_test_prepared)
np.save(args.save_directory + 'y_val_prepared.npy', y_val_prepared)