In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
from category_encoders import TargetEncoder
import networkx as nx

# NOTE(review): the star import hides where names come from. The helpers this notebook
# calls without defining them (add_info_of_nearest_mrt, define_range, define_close_num,
# define_select_multiple, get_search_features) presumably live in utils_task2 -- confirm,
# and consider importing them by name.
from utils_task2 import * # Functions written by team ghostbusters
pd.options.display.float_format = '{:,.2f}'.format # show floats with thousands separators and 2 decimals

In [4]:
# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
# Running this cell again in the same kernel only prints an "already loaded" notice.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<h1>User profile generation</h1>
This notebook contains code to (1) reuse the Task 1 data for Task 2 and (2) generate user profiles that simulate real search scenarios.

<h2> Utilize task 1 data for task 2</h2>

In [22]:
# Load the Task 1 data that Task 2 builds on:
#   pre_train -> raw training data
#   df        -> preprocessed data without dropping columns
pre_train = pd.read_csv('../data/train.csv')
df = pd.read_csv('../data/train_final_complete_nodrop.csv')
print("number of na in built_year raw data:", pre_train['built_year'].isna().sum())

number of na in built_year raw data: 922


In [23]:
def reset_built_year(r,df_na):
    """Return ``r.built_year`` unless the row's listing id appears in ``df_na``.

    Parameters
    ----------
    r : row-like object exposing ``listing_id`` and ``built_year`` attributes.
    df_na : object whose ``listing_id`` attribute holds the ids to blank out.

    Returns
    -------
    ``np.nan`` when ``r.listing_id`` is found in ``df_na.listing_id``;
    otherwise ``r.built_year`` unchanged.
    """
    listed_in_na = np.isin(r.listing_id, df_na.listing_id)
    if not listed_in_na:
        return r.built_year
    return np.nan

In [26]:
# Rows of the raw data whose built_year is missing.
df_na = pre_train[pre_train.built_year.isna()]
# Blank out built_year in the preprocessed frame for exactly those listing ids.
# Vectorised replacement for the former row-wise `df.apply(reset_built_year, axis=1)`,
# which re-scanned every NaN listing id once per row; `mask` sets the matching rows to NaN.
df['built_year'] = df['built_year'].mask(df['listing_id'].isin(df_na['listing_id']))
print("number of na in built_year processed data:", df.built_year.isna().sum())

number of na in built_year processed data: 920


<h2> Create viewing history for user </h2>

In [27]:
# Set parameters
num_viewed_ppt = 15 # Number of listings sampled into each user's viewing history
# NOTE(review): time_scale is not referenced in the cells visible here, and
# gen_multiple_profiles draws view times from the integers 1..10 rather than 0~5 --
# confirm which range is intended.
time_scale = 5 # The viewing time of user range from 0~5, which serves as a ranking

In [28]:
# List the columns of the preprocessed frame.
df.columns

Index(['listing_id', 'title', 'address', 'property_name', 'property_type',
       'tenure', 'built_year', 'num_beds', 'num_baths', 'size_sqft',
       'floor_level', 'furnishing', 'available_unit_types', 'total_num_units',
       'property_details_url', 'lat', 'lng', 'elevation', 'subzone',
       'planning_area', 'price', 'general_property_type',
       'property_type_ordinal', 'tenure_group', 'bed2bath', 'per_price',
       'name_of_nearest_CR', 'name_of_nearest_IHL', 'name_of_nearest_BN',
       'name_of_nearest_IHL_ordinal', 'name_of_nearest_BN_ordinal',
       'name_of_nearest_CR_ordinal', 'dist_to_nearest_important_mrt',
       'dist_to_nearest_important_mrt_rounded',
       'number_of_nearby_shopping_malls', 'number_of_nearby_primary_schools',
       'number_of_nearby_secondary_schools', 'subzone_per_price_encoded'],
      dtype='object')

In [31]:
# Include the name of the nearest MRT station as a search criterion.
# The station file is read relative to the notebook, like the other inputs under ../data,
# instead of from a path that only exists on one developer's machine.
df_mrt = pd.read_csv('../data/auxiliary-data/sg-mrt-stations.csv')
df = add_info_of_nearest_mrt(df,df_mrt)

In [32]:

"""Features selected based on observation from 99.co"""
# How search_features picks a filter for each attribute:
#   min/max range:               size_sqft, price, built_year, per_price
#   several close numbers:       num_beds, num_baths
#   several arbitrary values:    every other attribute (mrt, subzone, property_type, tenure, ...)
# 'None' presumably stands for "no filter from this group" -- confirm against get_search_features.
mandatory_features = ['num_beds', 'price', 'property_type']
locational_features = ['planning_area', 'subzone', 'None']
optional_features = [
    'None',
    'name_of_nearest_mrt',
    'num_baths',
    'tenure',
    'built_year',
    'size_sqft',
    'per_price',
]

# Distinct values present in the data for the mandatory attributes ...
num_beds = df['num_beds'].unique()
price_range = df['price'].unique()
ppt_types = df['property_type'].unique()

# ... and for the locational / optional ones.
planning_area = df['planning_area'].unique()
subzone = df['subzone'].unique()
mrt = df['name_of_nearest_mrt'].unique()
num_baths = df['num_baths'].unique()

In [33]:
# Function for randomly choosing filters for selected search criterion attributes.
def search_features(features,df):
    """Filter ``df`` by a randomly chosen criterion for every attribute in ``features``.

    The criterion for each attribute is derived from the full, unfiltered column
    ``df[feat]``; the filters are then applied one after another to a copy of ``df``.

    Parameters
    ----------
    features : iterable of column names to filter on.
    df : DataFrame containing those columns.

    Returns
    -------
    (filtered, descriptions) where ``filtered`` is the subset of ``df`` matching every
    criterion and ``descriptions`` holds one ``'<feature>:<v1>,<v2>,...;'`` string per
    feature, in the order given.
    """
    range_based = ('size_sqft', 'built_year', 'price', 'per_price')
    close_numbers = ('num_beds', 'num_baths')

    matched = df.copy()
    descriptions = []
    for feat in features:
        column = df[feat]
        if feat in range_based:
            # Range filter: keep rows between a lower and an upper threshold (inclusive).
            lower, upper = define_range(column)
            keep = (matched[feat] >= lower) & (matched[feat] <= upper)
            values = str(lower) + ',' + str(upper)
        else:
            # Discrete filter: neighbouring numbers for bed/bath counts
            # (e.g. 2 & 3 beds), an unconstrained multi-select otherwise.
            if feat in close_numbers:
                choices = define_close_num(column)
            else:
                choices = define_select_multiple(column)
            keep = matched[feat].isin(choices)
            values = ','.join(str(choice) for choice in choices)
        matched = matched[keep]
        descriptions.append(feat + ':' + values + ';')
    return matched, descriptions


In [34]:
# Generate  feature for each subtype
def gen_features(mandatory_features,locational_features,optional_features):
    """Pick the search attributes for one profile from the three feature groups.

    ``get_search_features`` is called once per group, in the order mandatory ->
    locational -> optional, with ``mandatory=True`` for the first group and
    ``single=True`` for the second (presumably "take all" and "take at most one" --
    confirm in utils_task2).

    Returns
    -------
    A single list with the selections of all three groups concatenated in that order.
    """
    groups = (
        (mandatory_features, {'mandatory': True}),
        (locational_features, {'single': True}),
        (optional_features, {}),
    )
    chosen = []
    for candidates, options in groups:
        chosen += get_search_features(candidates, **options)
    return chosen

In [35]:

# Generate single user profile
def gen_single_profile(mandatory_features,locational_features,optional_features,df):
    """Draw one random set of search attributes and apply it to ``df``.

    Returns whatever ``search_features`` returns for the attributes picked by
    ``gen_features``: the matching listings and the criterion description strings.
    """
    return search_features(
        gen_features(mandatory_features, locational_features, optional_features),
        df,
    )

# Generate multiple user profiles
def gen_multiple_profiles(mandatory_features,locational_features,optional_features,df,num_of_profile,num_viewed=None):
    """Generate ``num_of_profile`` synthetic user profiles.

    For each profile a random search is drawn with ``gen_single_profile``; searches that
    match fewer than ``num_viewed`` listings are discarded and redrawn. From an accepted
    search, ``num_viewed`` *distinct* listings are sampled as the user's viewing history
    and each one gets a random integer view time in 1..10.

    Parameters
    ----------
    mandatory_features, locational_features, optional_features : feature-name lists
        forwarded to ``gen_single_profile``.
    df : DataFrame of listings; must contain a ``listing_id`` column.
    num_of_profile : number of profiles to generate.
    num_viewed : listings per profile. Defaults to the notebook-level ``num_viewed_ppt``.

    Returns
    -------
    (profiles, search_criterions, view_times)
        ``profiles``: array of shape (num_of_profile, num_viewed) with listing ids;
        ``search_criterions``: list of criterion strings, one per profile;
        ``view_times``: integer array of shape (num_of_profile, num_viewed).

    Raises
    ------
    ValueError
        If profiles are requested but ``df`` has fewer than ``num_viewed`` rows. The
        searches only ever filter ``df``, so no search could be accepted and the retry
        loop would never terminate.

    Notes
    -----
    The retry loop has no attempt limit: it can still run for a long time if searches
    rarely match ``num_viewed`` listings.
    """
    if num_viewed is None:
        num_viewed = num_viewed_ppt

    if num_of_profile > 0 and len(df) < num_viewed:
        raise ValueError(
            "df has %d rows but each profile needs %d distinct listings" % (len(df), num_viewed)
        )

    profiles = []
    search_criterions = []
    view_times = []
    while len(profiles) < num_of_profile:
        prof, search_info = gen_single_profile(mandatory_features, locational_features, optional_features, df)
        if len(prof) < num_viewed:
            continue  # too few matches for a full viewing history; draw another search
        # replace=False: a user's history must not list the same property twice
        # (the length check above guarantees enough rows to sample from).
        rand_id = np.random.choice(len(prof), num_viewed, replace=False)
        profiles.append(prof.iloc[rand_id].listing_id)
        search_criterions.append(" ".join(str(x) for x in search_info))
        view_times.append(np.random.choice(np.arange(1, 11), num_viewed))
    return np.array(profiles), search_criterions, np.array(view_times)

In [37]:
# Generate final user profile
# Seed NumPy's global RNG so the np.random.choice draws in gen_multiple_profiles are
# repeatable across runs. NOTE(review): helpers from utils_task2 may use a different
# random source (e.g. the `random` module) -- confirm and seed that as well if so.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
profile_number = 500 # 500 users
profiles,criterions,view_times = gen_multiple_profiles(mandatory_features,locational_features,optional_features,df,profile_number)

In [39]:
# Quick look at the first generated criterion string. Every entry ends with ';', so the
# last element of the split is an empty string; it is dropped with [:-1] when printing.
splits = criterions[0].split(';')
print(len(splits))
print(splits[:-1])
def critetion_str_to_dataframe(criterion):
    """Parse a criterion string into a ``{feature: value}`` dict.

    ``criterion`` is a sequence of ``'<feature>:<value>;'`` entries, optionally separated
    by whitespace, e.g. ``'price:0,100; num_beds:2.0,3.0;'``. Despite the function name,
    a plain dict is returned (the caller wraps it in a DataFrame).

    Parameters
    ----------
    criterion : str
        The criterion string. Blank entries (such as the one after the final ';') are
        ignored, and a final entry without a trailing ';' is still parsed.

    Returns
    -------
    dict
        One key per known search feature, ``np.nan`` where the feature is not part of the
        search, otherwise the raw value text after the first ':'. Feature names not in
        the predefined set are added as extra keys.

    Raises
    ------
    ValueError
        If a non-blank entry contains no ':' separator.
    """
    y = {'num_beds':np.nan,'price':np.nan,'property_type':np.nan,'planning_area':np.nan,'subzone':np.nan,'name_of_nearest_mrt':np.nan,'num_baths':np.nan,'tenure':np.nan,'built_year':np.nan,'size_sqft':np.nan,'per_price':np.nan}
    for cri in criterion.split(';'):
        if not cri.strip():
            continue  # empty piece produced by a trailing or doubled ';'
        # Split on the first ':' only, so values that contain ':' stay intact.
        feature_name, separator, value = cri.partition(':')
        if not separator:
            raise ValueError("malformed criterion entry (missing ':'): %r" % cri)
        y[feature_name.strip()] = value
    return y

4
['price:0,24862755', ' property_type:executive condo,condo', ' num_beds:6.0,4.0']


In [40]:
# Build one long table: one row per (profile, viewed listing), with the profile's search
# criteria repeated on each of its rows.
# The per-profile frames use their own names so that the listings frame `df` is no longer
# overwritten here; earlier cells that read `df` stay re-runnable after this cell.
final_df = []
for profile_id in range(profile_number):
    views = pd.DataFrame(data={
        'profile_id': profile_id,
        'listing_id': profiles[profile_id],
        'view_time': view_times[profile_id],
    })
    criteria = pd.DataFrame(critetion_str_to_dataframe(criterions[profile_id]), index=[0])
    # Repeat the single criteria row once per viewed listing. reset_index() (without
    # drop=True) keeps an all-zero 'index' column; retained so the columns written to
    # user_profile.csv stay as before.
    criteria = pd.concat([criteria]*num_viewed_ppt, axis=0).reset_index()
    final_df.append(pd.concat([views, criteria], axis=1))

result = pd.concat(final_df)
result.head(30)

Unnamed: 0,profile_id,listing_id,view_time,index,num_beds,price,property_type,planning_area,subzone,name_of_nearest_mrt,num_baths,tenure,built_year,size_sqft,per_price
0,0,661512,7,0,"6.0,4.0",24862755,"executive condo,condo",,,,,,,,
1,0,813855,7,0,"6.0,4.0",24862755,"executive condo,condo",,,,,,,,
2,0,893999,3,0,"6.0,4.0",24862755,"executive condo,condo",,,,,,,,
3,0,611943,9,0,"6.0,4.0",24862755,"executive condo,condo",,,,,,,,
4,0,951209,1,0,"6.0,4.0",24862755,"executive condo,condo",,,,,,,,
5,0,107785,2,0,"6.0,4.0",24862755,"executive condo,condo",,,,,,,,
6,0,375083,5,0,"6.0,4.0",24862755,"executive condo,condo",,,,,,,,
7,0,788301,7,0,"6.0,4.0",24862755,"executive condo,condo",,,,,,,,
8,0,882197,1,0,"6.0,4.0",24862755,"executive condo,condo",,,,,,,,
9,0,935524,5,0,"6.0,4.0",24862755,"executive condo,condo",,,,,,,,


In [212]:
# Write the generated profiles next to the notebook; index=False leaves the row index out of the file.
result.to_csv('./user_profile.csv',index = False)