In [2]:
import pandas as pd

def set_parser(data):
  """Count the comma-separated entries in an amenities string.

  Leading and trailing '*', '{' and '}' characters are stripped before the
  remaining text is split on commas.

  Args:
    data: the raw cell text (used below as a ``read_csv`` converter).

  Returns:
    The number of comma-separated entries, 0 when nothing is left after
    stripping (e.g. '' or '{}'), or the string 'problem' when ``data`` is
    not a text value.
  """
  try:
    inner = data.strip('*}*{')
  except (AttributeError, TypeError):
    # Non-string input (None, numbers, bytes): keep the original sentinel.
    return 'problem'
  if not inner.strip():
    # ''.split(',') would yield [''] and be miscounted as one amenity.
    return 0
  return len(inner.split(','))

# Read the training CSV; every 'amenities' cell is passed through set_parser
# while the file is parsed.
df = pd.read_csv(
    'train.csv',
    header=0,
    converters={'amenities': set_parser},
)

# The 'price' column as a one-column DataFrame.
y = df[['price']]
y.columns  # bare expression in the middle of the cell: nothing is displayed

# One-element list wrapping the parsed amenities Series.
amenity_vals = [df['amenities']]
print(amenity_vals)

[0        33
1        19
2        22
3        34
4        11
         ..
23472    25
23473    23
23474    16
23475    10
23476    19
Name: amenities, Length: 23477, dtype: int64]


In [3]:
import json
from math import isnan

def to_float(s):
    """Convert ``s`` to a float, using -1.0 as the missing-value sentinel.

    Args:
        s: any value accepted by ``float()`` (number, numeric string, ...).

    Returns:
        ``float(s)``, or -1.0 when the conversion fails or yields NaN. The
        sentinel is a float so the return type is the same on every path.
    """
    try:
        ret = float(s)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "missing"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return -1.0
    return -1.0 if isnan(ret) else ret

def to_float_avg(s, name_of_col, data_frame):
    """Convert ``s`` to a float, falling back to a column mean.

    Args:
        s: value to convert with ``float()``.
        name_of_col: key of the column whose mean is the fallback.
        data_frame: object indexed as ``data_frame[name_of_col]`` whose result
            provides ``.mean()``.

    Returns:
        ``float(s)`` when it converts to a non-NaN number, otherwise
        ``data_frame[name_of_col].mean()``.
    """
    try:
        ret = float(s)
    except (TypeError, ValueError, OverflowError):
        ret = float('nan')
    if isnan(ret):
        # The mean is computed only when needed instead of on every call.
        return data_frame[name_of_col].mean()
    return ret

def to_int(s):
    """Convert ``s`` to an int, using -1 as the missing-value sentinel.

    Args:
        s: any value accepted by ``int()``.

    Returns:
        ``int(s)``, or -1 when the conversion fails. NaN and infinity fail
        the conversion, as do decimal strings such as '3.0'.
    """
    try:
        return int(s)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures map to the sentinel; other exceptions
        # (e.g. KeyboardInterrupt) propagate.
        return -1

def to_int_avg(s, name_of_col, data_frame):
    """Convert ``s`` to an int, falling back to a column mean.

    Args:
        s: value to convert with ``int()``.
        name_of_col: key of the column whose mean is the fallback.
        data_frame: object indexed as ``data_frame[name_of_col]`` whose result
            provides ``.mean()``.

    Returns:
        ``int(s)`` when the conversion succeeds, otherwise
        ``data_frame[name_of_col].mean()``. The fallback is returned as-is
        and is not coerced to int.
    """
    try:
        return int(s)
    except (TypeError, ValueError, OverflowError):
        # The mean is computed only when needed instead of on every call.
        return data_frame[name_of_col].mean()

def json_parser(data):
    """Decode the JSON document in ``data`` and return the resulting object."""
    return json.loads(data)

def _str_categories(column):
    """Return the distinct string values of ``column`` in sorted order.

    Non-string entries are skipped (presumably the NaN placeholders of
    missing cells -- confirm against the data). Sorting gives a stable
    order: iterating a set of strings can differ between kernel sessions,
    which would reorder the one-hot columns from run to run.
    """
    return sorted({value for value in column if isinstance(value, str)})


# Vocabularies of the categorical columns, used for one-hot encoding.
categories = _str_categories(df['neighbourhood_group_cleansed'])
state_categories = _str_categories(df['state'])
room_categories = _str_categories(df['room_type'])
super_host_categories = _str_categories(df['host_is_superhost'])
bed_type_categories = _str_categories(df['bed_type'])
cancellation_policy_categories = _str_categories(df['cancellation_policy'])
property_type_categories = _str_categories(df['property_type'])
instant_bookable_categories = _str_categories(df['instant_bookable'])
is_business_travel_categories = _str_categories(df['is_business_travel_ready'])
host_verification_categories = _str_categories(df['host_identity_verified'])
print(len(categories))

def _one_hot(value, vocabulary):
    """Return a 0/1 list with a 1 at each vocabulary entry equal to ``value``."""
    return [int(value == entry) for entry in vocabulary]


def create_feature(row, data_frame):
    """Build the numeric feature vector for one listing row.

    Args:
        row: object exposing the listing columns as attributes (the rows
            yielded by ``DataFrame.iterrows()`` in this notebook).
        data_frame: unused; kept so existing call sites keep working.

    Returns:
        A flat list: 17 numeric values (missing or unparsable ones become -1
        via ``to_int`` / ``to_float``) followed by the one-hot encodings of
        neighbourhood group, room type, superhost flag, property type,
        instant-bookable flag, business-travel flag and host-verified flag.

    NOTE(review): review_scores_communication, state, bed_type and
    cancellation_policy were previously computed here but never placed in the
    returned vector. That dead per-row work has been removed; the returned
    features are unchanged. Confirm whether leaving them out is intentional.
    """
    numeric = [
        to_int(row.guests_included),
        to_float(row.extra_people),
        to_float(row.bathrooms),
        to_int(row.accommodates),
        to_int(row.beds),
        to_int(row.minimum_nights),
        to_int(row.bedrooms),
        to_float(row.review_scores_rating),
        to_int(row.number_of_reviews),
        to_int(row.calculated_host_listings_count),
        to_float(row.review_scores_location),
        to_float(row.reviews_per_month),
        to_float(row.review_scores_value),
        # Parsed with to_int, unlike the other review scores.
        to_int(row.review_scores_cleanliness),
        to_float(row.review_scores_accuracy),
        to_float(row.review_scores_checkin),
        to_int(row.amenities),
    ]

    return (
        numeric
        + _one_hot(row.neighbourhood_group_cleansed, categories)
        + _one_hot(row.room_type, room_categories)
        + _one_hot(row.host_is_superhost, super_host_categories)
        + _one_hot(row.property_type, property_type_categories)
        + _one_hot(row.instant_bookable, instant_bookable_categories)
        + _one_hot(row.is_business_travel_ready, is_business_travel_categories)
        + _one_hot(row.host_identity_verified, host_verification_categories)
    )


# Build the feature list and the price targets, one entry per row of df.
train_X, train_y = [], []

for _, listing in df.iterrows():
    target = to_float(listing.price)
    features = create_feature(listing, df)
    train_X.append(features)
    train_y.append(target)

print(len(train_X), len(train_y))

5
23477 23477


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import cross_val_score
import numpy as np

#enc = OneHotEncoder()
#print(amenity_vals)
#enc.fit(amenity_vals)
#amenity_encoded = enc.transform(amenity_vals)
#print(amenity_encoded.toarray())

In [5]:

# This is for finding the best alpha value
# 1.5 seems to be a good value
import matplotlib.pyplot as plt

import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if gpus:
  # Expose the first GPU as one logical device capped at memory_limit=3072
  # (3072 MB, i.e. 3 GB).
  try:
    memory_cap = tf.config.LogicalDeviceConfiguration(memory_limit=3072)
    tf.config.set_logical_device_configuration(gpus[0], [memory_cap])
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as err:
    # Logical devices can only be configured before the GPUs are initialized.
    print(err)

2022-04-13 17:15:51.871759: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


1 Physical GPUs, 1 Logical GPUs


2022-04-13 17:15:52.461588: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-04-13 17:15:52.462035: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-04-13 17:15:52.484340: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-13 17:15:52.484421: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.65GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2022-04-13 17:15:52.484438: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2022-04-13 17:15:52.485387: I tensorflow/stream_executor/platform

In [6]:

# Creating the neural network
def nn():
  """Build and compile a fresh fully connected regression network.

  Seven ReLU hidden layers (400, 320, 280, 200, 150, 80, 30 units) feed a
  single linear output unit; the loss is mean absolute error.

  The Adam optimizer is created here rather than at module level so that
  every model built by the wrapper (one per cross-validation fold, plus the
  final fit) gets its own optimizer state, and so that a later cell
  rebinding a global ``optimizer`` cannot silently change this network.

  Returns:
    The compiled ``tf.keras`` Sequential model.
  """
  hidden_units = (400, 320, 280, 200, 150, 80, 30)
  model = tf.keras.models.Sequential()
  for units in hidden_units:
    model.add(tf.keras.layers.Dense(units, activation='relu'))
  model.add(tf.keras.layers.Dense(1))
  model.compile(
      optimizer=tf.keras.optimizers.Adam(epsilon=0.00005),
      loss=tf.keras.losses.MeanAbsoluteError())
  return model

nn_regr = tf.keras.wrappers.scikit_learn.KerasRegressor(build_fn=nn, epochs=200, batch_size=4096, verbose=False)



In [7]:

# The value printed here is the regressor's score averaged over 3 folds.
# The wrapper reports the negated loss (minus the mean absolute error), so
# the number is negative and values closer to zero are better.
cv_scores = cross_val_score(nn_regr, train_X, train_y, cv=3, n_jobs=1)
print(np.mean(cv_scores))

2022-04-13 17:15:54.229082: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2022-04-13 17:15:54.244327: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 3601000000 Hz
2022-04-13 17:15:54.560268: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10


-45.205248514811196


In [8]:

# Fit the wrapped regressor on all of train_X / train_y (no hold-out here).
nn_regr.fit(train_X, train_y)

<tensorflow.python.keras.callbacks.History at 0x7f6047bf92b0>

In [1]:
# NOTE: this cell needs pd, set_parser, create_feature and the fitted nn_regr
# from the cells above; run on a fresh kernel it fails with a NameError.
#
# The test file must be parsed with the same 'amenities' converter as the
# training file. Without it, row.amenities is raw text, to_int() maps it to
# -1, and every test row gets a constant amenities feature.
test_df = pd.read_csv('test.csv', converters={'amenities': set_parser}, header=0)
test_ids, test_X = [], []
for (idx, row) in test_df.iterrows():
  test_ids.append(row.id)
  test_X.append(create_feature(row, test_df))
test_y = nn_regr.predict(test_X)

print(test_y)

# Write the predictions as an (Id, Predicted) CSV.
output_df = pd.DataFrame()
output_df['Id'] = test_ids
output_df['Predicted'] = test_y
output_df.to_csv('mlpregressor_prediction.csv', index=False)



NameError: name 'pd' is not defined

In [None]:

# Creating the neural network
# NOTE: this rebinds the name nn that an earlier cell already defined.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
def nn():
  """Build and compile a small fully connected regression network.

  Two ReLU hidden layers (180 and 80 units) feed a single linear output
  unit; the loss is mean absolute error.

  The Adam optimizer is created per call so that models built from this
  function never share optimizer state.

  Returns:
    The compiled ``tf.keras`` Sequential model.
  """
  model = tf.keras.models.Sequential()
  for units in (180, 80):
    model.add(tf.keras.layers.Dense(units, activation='relu'))
  model.add(tf.keras.layers.Dense(1))
  model.compile(
      optimizer=tf.keras.optimizers.Adam(),
      loss=tf.keras.losses.MeanAbsoluteError())
  return model

