In [34]:
import numpy as np
import pandas as pd
import seaborn as sns
from category_encoders import TargetEncoder
import networkx as nx

from utils_task2 import * # Functions written by team ghostbusters
pd.options.display.float_format = '{:,.2f}'.format # float display formatting

In [35]:
# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<h1>User profile generation</h1>
This notebook contains code to (1) Utilize task 1 data for task 2. (2) Generate user profile similating real searching scanerios.

<h2> Utilize task 1 data for task 2</h2>

In [51]:
#   Use preprocessed data for Task 1
df_train = pd.read_csv('./train_final_complete_nodrop.csv') # preprocessed data without dropping columns

<h2> Create viewing history for user </h2>

In [37]:
# Set parameters
num_viewed_ppt = 15 # Number of view history for each user
time_scale = 5 # The viewing time of user range from 0~5, which serves as a ranking

In [38]:
df_train.columns

Index(['listing_id', 'title', 'address', 'property_name', 'property_type',
       'tenure', 'built_year', 'num_beds', 'num_baths', 'size_sqft',
       'floor_level', 'furnishing', 'available_unit_types', 'total_num_units',
       'property_details_url', 'lat', 'lng', 'elevation', 'subzone',
       'planning_area', 'price', 'general_property_type',
       'property_type_ordinal', 'tenure_group', 'bed2bath', 'per_price',
       'name_of_nearest_CR', 'name_of_nearest_IHL', 'name_of_nearest_BN',
       'name_of_nearest_IHL_ordinal', 'name_of_nearest_BN_ordinal',
       'name_of_nearest_CR_ordinal', 'dist_to_nearest_important_mrt',
       'dist_to_nearest_important_mrt_rounded',
       'number_of_nearby_shopping_malls', 'number_of_nearby_primary_schools',
       'number_of_nearby_secondary_schools', 'subzone_per_price_encoded'],
      dtype='object')

In [39]:
# Include name of nearest mrt as a search criteria
df_mrt = pd.read_csv('./sg-mrt-stations.csv')
df = add_info_of_nearest_mrt(df_train,df_mrt)

In [40]:

"""Features selected based on observation from 99.co"""
#   range_based: size_sqft, price_range, built_year, per_price_range
#   Select multiple close numbers:num_beds, num_baths
#   Select random multiple: mrt,   subzone,    property_type, tenure
mandatory_features = ['num_beds','price','property_type']
locational_features = ['planning_area','subzone','None']
optional_features = ['None','name_of_nearest_mrt','num_baths','tenure','built_year','size_sqft','per_price']

num_beds = df.num_beds.unique()
price_range = df.price.unique()
ppt_types = df.property_type.unique()

planning_area = df.planning_area.unique()
subzone = df.subzone.unique()
mrt = df.name_of_nearest_mrt.unique()
num_baths = df.num_baths.unique()

In [41]:
# Function for randomly choosing filters for selected search criterion attributes.
def search_features(features,df):
    res = df.copy()
    feat_dicts =[]
    for feat in features:
        if feat == 'size_sqft' or feat == 'built_year' or feat =='price' or feat == 'per_price':
            minthres,maxthres = define_range(df[feat]) # range based, need minimum threshold and maximum threshold
            res = res[(res[feat]>=minthres) & (res[feat]<=maxthres) ]
            dic = feat+':'+str(minthres)+','+str(maxthres)+';'
            feat_dicts.append(dic)
        elif feat == 'num_beds' or feat == 'num_baths':
            choices = define_close_num(df[feat]) # Constrained select close discrete values(e.g. search 2beds & 3beds, search 6beds&7beds&8beds)
            res = res[res[feat].isin(choices)]
            dic = feat+':'+','.join(str(x) for x in choices)+';'
            feat_dicts.append(dic)
        else:
            choices = define_select_multiple(df[feat]) # Randomly select multiple values without constraints
            res = res[res[feat].isin(choices)]
            dic = feat+':'+','.join(str(x) for x in choices) +';'
            feat_dicts.append(dic)
    return res, feat_dicts


In [42]:
# Generate  feature for each subtype
def gen_features(mandatory_features,locational_features,optional_features):
    selected = []
    selected.extend(get_search_features(mandatory_features,mandatory = True))
    selected.extend(get_search_features(locational_features,single=True))
    selected.extend(get_search_features(optional_features))
    return selected

In [43]:

# Generate single user profile
def gen_single_profile(mandatory_features,locational_features,optional_features,df):
    features =gen_features(mandatory_features,locational_features,optional_features)
    return search_features(features,df)

# Generate multiple user profiles
def gen_multiple_profiles(mandatory_features,locational_features,optional_features,df,num_of_profile):
    profiles = []
    search_criterions = []
    view_times = []
    while len(profiles) < num_of_profile:
        prof,search_info= gen_single_profile(mandatory_features,locational_features,optional_features,df)
        if(len(prof)) < num_viewed_ppt:
            continue
        rand_id = np.random.choice(len(prof),num_viewed_ppt)
        rand_property = prof.iloc[rand_id]
        profiles.append(rand_property.listing_id)
        search_criterions.append(" ".join(str(x) for x in search_info))
        view_time = np.random.choice(np.arange(1,11),num_viewed_ppt)
        view_times.append(view_time)
    return np.array(profiles),search_criterions,np.array(view_times)

In [44]:
# Generate final user profile
profile_number = 500 # 500 users
profiles,criterions,view_times = gen_multiple_profiles(mandatory_features,locational_features,optional_features,df,profile_number)

In [45]:
splits = criterions[0].split(';')
print(len(splits))
print(splits[:-1])
def critetion_str_to_dataframe(criterion):
    splits = criterion.split(';')[:-1]
    y = {'num_beds':np.nan,'price':np.nan,'property_type':np.nan,'planning_area':np.nan,'subzone':np.nan,'name_of_nearest_mrt':np.nan,'num_baths':np.nan,'tenure':np.nan,'built_year':np.nan,'size_sqft':np.nan,'per_price':np.nan} 
    for cri in splits:
        feature_name = cri.split(':')[0].lstrip()
        y[feature_name] = cri.split(':')[1]
    return y

5
['num_beds:2.0,3.0,4.0', ' price:0,9044712', ' property_type:condo', ' subzone:kebun bahru,maritime square,yio chu kang west']


In [46]:
final_df = []
for i in range(profile_number):
    d = {'profile_id': i, 'listing_id': profiles[i], 'view_time': view_times[i]}
    search_cris = critetion_str_to_dataframe(criterions[i])
    s = pd.DataFrame(search_cris, index=[0])
    s = pd.concat([s]*num_viewed_ppt, axis=0)
    s = s.reset_index()
    df = pd.DataFrame(data=d)
    df = pd.concat([df,s], axis=1)
    final_df.append(df)

result = pd.concat(final_df)
result.head(30)
#

Unnamed: 0,profile_id,listing_id,view_time,index,num_beds,price,property_type,planning_area,subzone,name_of_nearest_mrt,num_baths,tenure,built_year,size_sqft,per_price
0,0,677525,4,0,"2.0,3.0,4.0",9044712,condo,,"kebun bahru,maritime square,yio chu kang west",,,,,,
1,0,978007,5,0,"2.0,3.0,4.0",9044712,condo,,"kebun bahru,maritime square,yio chu kang west",,,,,,
2,0,727332,9,0,"2.0,3.0,4.0",9044712,condo,,"kebun bahru,maritime square,yio chu kang west",,,,,,
3,0,737066,6,0,"2.0,3.0,4.0",9044712,condo,,"kebun bahru,maritime square,yio chu kang west",,,,,,
4,0,493857,5,0,"2.0,3.0,4.0",9044712,condo,,"kebun bahru,maritime square,yio chu kang west",,,,,,
5,0,456472,9,0,"2.0,3.0,4.0",9044712,condo,,"kebun bahru,maritime square,yio chu kang west",,,,,,
6,0,463714,8,0,"2.0,3.0,4.0",9044712,condo,,"kebun bahru,maritime square,yio chu kang west",,,,,,
7,0,677525,7,0,"2.0,3.0,4.0",9044712,condo,,"kebun bahru,maritime square,yio chu kang west",,,,,,
8,0,276697,10,0,"2.0,3.0,4.0",9044712,condo,,"kebun bahru,maritime square,yio chu kang west",,,,,,
9,0,342846,9,0,"2.0,3.0,4.0",9044712,condo,,"kebun bahru,maritime square,yio chu kang west",,,,,,


In [47]:
result.to_csv('./user_profile.csv',index = False)

<h3> Clean data </h3>
Categorical data should be one-hot encoded for measures cosine similarities

In [57]:
# One hot encoding property type
s = pd.Series(df_train.property_type)
d = pd.get_dummies(s).rename(columns=lambda x:'property_type_' +str(x.replace(" ","_").replace("-","_")))
df_train = pd.concat([df_train,d], axis=1)
d.head()

Unnamed: 0,property_type_apartment,property_type_bungalow,property_type_cluster_house,property_type_condo,property_type_conservation_house,property_type_corner_terrace,property_type_executive_condo,property_type_hdb,property_type_land_only,property_type_landed,property_type_semi_detached_house,property_type_shophouse,property_type_terraced_house,property_type_townhouse,property_type_walk_up
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [58]:
# One hot encoding tenure
s = pd.Series(df_train.tenure_group)
d = pd.get_dummies(s).set_axis(['Nan', 'freehold', '99-110 year', '999+year'], axis=1, inplace=False)
d = d.rename(columns=lambda x:'tenure_' +str(x.lower().replace(" ","_")))
df_train = pd.concat([df_train,d], axis=1)
d.head()

Unnamed: 0,tenure_nan,tenure_freehold,tenure_99-110_year,tenure_999+year
0,1,0,0,0
1,0,1,0,0
2,0,0,0,1
3,0,0,0,1
4,0,1,0,0


In [54]:
kept_features =['listing_id','num_beds', 'num_baths', 
            'lat', 'lng', # easy to find properties with the same property name
            'property_type_apartment', 'property_type_bungalow', 'property_type_cluster_house', 'property_type_condo',	'property_type_conservation_house', 'property_type_corner_terrace', 'property_type_executive_condo', 'property_type_hdb',	'property_type_land_only', 'property_type_landed', 'property_type_semi_detached_house', 'property_type_shophouse',	'property_type_terraced_house', 'property_type_townhouse', 'property_type_walk_up', 
            'tenure_nan', 'tenure_freehold',	'tenure_99-110_year', 'tenure_999+year',
            'size_sqft', 'dist_to_nearest_important_mrt_rounded', 'price', 'built_year','property_type','tenure']

In [59]:
df_final = df_train[kept_features]

In [60]:
df_final.isna().sum()

listing_id                                  0
num_beds                                    0
num_baths                                   0
lat                                         0
lng                                         0
property_type_apartment                     0
property_type_bungalow                      0
property_type_cluster_house                 0
property_type_condo                         0
property_type_conservation_house            0
property_type_corner_terrace                0
property_type_executive_condo               0
property_type_hdb                           0
property_type_land_only                     0
property_type_landed                        0
property_type_semi_detached_house           0
property_type_shophouse                     0
property_type_terraced_house                0
property_type_townhouse                     0
property_type_walk_up                       0
tenure_nan                                  0
tenure_freehold                   

In [62]:
df_final.to_csv('./df_task2_onehot.csv',index=False)