In [12]:
### Libraries

from tqdm import tqdm_notebook as tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import os

import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.nn as nn
import torch.optim as optim

In [13]:
### Setting Random Seeds

def set_all_seeds(RANDOM_SEED):
    """Seed every random number generator used by this notebook.

    The same seed is handed to Python's ``random`` module, NumPy's global
    generator, ``torch.manual_seed`` and ``torch.cuda.manual_seed_all``.
    Afterwards cuDNN is configured with ``deterministic = True`` and
    ``benchmark = False``.

    Args:
        RANDOM_SEED: seed value passed unchanged to each seeding function.
    """
    seeders = (
        random.seed,                 # python random generator
        np.random.seed,              # numpy random generator
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(RANDOM_SEED)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_all_seeds(42)

In [22]:
### Read raw data

# The data directory is expected to sit next to the current working directory,
# i.e. <parent of cwd>/data.
project_root = os.path.dirname(os.getcwd())
data_path = f'{project_root}/data'

# Load the train split first, then the test split, from the same directory.
train_raw, test_raw = (
    pd.read_csv(f'{data_path}/{split}.csv') for split in ('train', 'test')
)

In [29]:
### Clean Property Information

def clean(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a listings frame with engineered property features.

    Rows with a missing ``beds`` or ``bathrooms_text`` value are dropped. The
    frame passed in is NOT modified, so the cell can be re-run against the same
    raw data and always produce the same result.

    Columns added to the returned frame:
        rooms_regrouped   ``room_type`` with every value other than
                          'Entire home/apt' and 'Private room' mapped to 'Other'.
        entire_bin, private_bin, other_room_bin
                          0/1 indicators for the three regrouped room types.
        bathrooms_shared  1 when ``bathrooms_text`` mentions "shared" in any
                          casing (e.g. "Shared half-bath"), else 0.
        bathrooms_num     First number found in ``bathrooms_text``, decimals
                          included ("1.5 baths" -> 1.5). Text without any
                          number (e.g. "Half-bath") falls back to 0.5.
        amenities_ref     Set of amenity names parsed from ``amenities``.
        amenities_count   Number of entries in ``amenities_ref``.

    Args:
        df: frame containing at least the columns ``beds``, ``bathrooms_text``,
            ``room_type`` and ``amenities``.

    Returns:
        A new DataFrame holding the surviving rows (original index labels are
        kept) plus the columns listed above.
    """
    # Drop NaNs from beds and bathrooms_text columns; .copy() keeps the raw
    # frame intact and avoids chained-assignment warnings on the result.
    cleaned = df.dropna(subset=['beds', 'bathrooms_text']).copy()

    # Group hotel and shared rooms into 'Other' category
    room_type = cleaned['room_type']
    keep_as_is = room_type.isin(['Entire home/apt', 'Private room'])
    cleaned['rooms_regrouped'] = room_type.where(keep_as_is, 'Other')
    indicator_columns = (
        ('entire_bin', 'Entire home/apt'),
        ('private_bin', 'Private room'),
        ('other_room_bin', 'Other'),
    )
    for column, label in indicator_columns:
        cleaned[column] = (cleaned['rooms_regrouped'] == label).astype(int)

    # No NaNs remain after the dropna above, so the cast to str is lossless
    # and guarantees the .str accessor works even on an empty frame.
    bath_text = cleaned['bathrooms_text'].astype(str)

    # Extract 'shared' keyword from bathrooms_text column. Lower-casing first
    # catches values such as "Shared half-bath".
    cleaned['bathrooms_shared'] = (
        bath_text.str.lower().str.contains('shared', regex=False).astype(int)
    )

    # Extract number of baths from bathrooms_text column. The pattern accepts
    # an optional decimal part so "1.5 baths" is read as 1.5.
    first_number = bath_text.str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    cleaned['bathrooms_num'] = (
        pd.to_numeric(first_number, errors='coerce')
        .fillna(0.5)  # number-free descriptions such as "Half-bath"
        .astype(float)
    )

    # Extract number of amenities from amenities column
    def parse_amenities(text):
        # Values look like '["Wifi", "Kitchen"]'; [2:-2] strips the outer '["'
        # and '"]'. An empty list ('[]') or a missing value yields an empty set
        # rather than a bogus single entry.
        if not isinstance(text, str):
            return set()
        inner = text[2:-2]
        return set(inner.split('''", "''')) if inner else set()

    cleaned['amenities_ref'] = cleaned['amenities'].map(parse_amenities)
    cleaned['amenities_count'] = cleaned['amenities_ref'].map(len).astype(int)

    return cleaned


In [24]:
### Drop non-important features

# Feature columns selected for modelling.
# NOTE(review): this list is only defined here; nothing in this cell applies it
# to the frames below.
important_features = [
    'host_listings_count',
    'calculated_host_listings_count_private_rooms',
    'entire_bin',
    'private_bin',
    'other_room_bin',
    'bathrooms_shared',
    'bathrooms_num',
    'amenities_count',
    'beds',
    'accommodates',
    'longitude',
    'number_of_reviews',
    'number_of_reviews_ltm',
    'number_of_reviews_l30d',
    'availability_30',
    'availability_365',
]

# Run the cleaning step on the train split first, then on the test split.
train_df = clean(train_raw)
test_df = clean(test_raw)

In [None]:
class AirbnbClassifier(nn.module)