# Location Check-in Dataset

* Dingqi YANG 블로그의 Foursquare Check-in dataset. 2012~2013년 사이 약 18개월간의 전 세계 Check-in dataset. 33,278,683번의 check-in과 266,909명의 유저, 3,680,126개의 장소로 이루어져 있음 (77개 국가의 415개 도시)

## 데이터 처리

### 파일로부터 데이터 읽기
* Cities_data : 도시명, 국가 이름, 좌표 등의 정보
* Checkins_data : 유저-장소 방문 기록
* POI_data : 각 장소에 대한 ID, 좌표, 카테고리 정보

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
# Load the three tab-separated TIST2015 files; column names are supplied
# explicitly through `names`.
# The Windows paths are raw strings: sequences such as `\d` and `\D` are not
# valid escapes, and newer Python versions warn about them in normal literals.
Cities_data = pd.read_csv(
    r'D:\dataset_TIST2015\Data\dataset_TIST2015_Cities.txt',
    sep='\t',
    lineterminator='\n',
    names=['city name', 'Latitude', 'Longitude', 'Country Code', 'Country name', 'City type'],
)
Checkins_data = pd.read_csv(
    r'D:\dataset_TIST2015\Data\dataset_TIST2015_Checkins.txt',
    sep='\t',
    lineterminator='\n',
    names=['User ID', 'Venue ID', 'UTC Time', 'Timezone Offset'],
)
POI_data = pd.read_csv(
    r'D:\dataset_TIST2015\Data\dataset_TIST2015_POIs.txt',
    sep='\t',
    lineterminator='\n',
    names=['Venue ID', 'Latitude', 'Longitude', 'Venue category', 'Country Code'],
)

In [3]:
#Cities_data.head()

In [4]:
#Checkins_data.head()

In [5]:
#POI_data.head()

* POI data로부터, country code가 KR인 것들만 추출

In [6]:
# Keep only the venues whose country code is 'KR'.
is_korean_venue = POI_data['Country Code'] == 'KR'
POI_data_KR = POI_data.loc[is_korean_venue]

* 추천받을만하지 않은 장소 카테고리 모두 제외......

In [7]:
# Venue categories to remove from the Korean POI table.
# NOTE(review): the two 'Caf\x1a...' entries look like a mangled 'Café' -
# confirm they match the category strings actually present in the data.
excluded_categories = [
    'Caf\x1a\x1a', 'Road', 'History Museum', 'Building',
    'Light Rail', 'Historic Site', 'Tourist Information Center', 'Other Great Outdoors', 'Park', 'Cemetery', 'Monument / Landmark', 'Art Gallery', 'Skating Rink', 'Casino', 'Racetrack', 'Mountain', 'University',
    'Subway', 'Office', 'Coworking Space', 'Meeting Room', 'Lake', 'Concert Hall', 'Library', 'College Library', 'Residential Building (Apartment / Condo)', 'Multiplex', 'Hospital', 'Government Building', 'Home (private)',
    'Bank', 'Caf\x1a', 'Bus Station', 'Funeral Home', 'Church', 'Movie Theater', 'General Entertainment', "Doctor's Office", 'High School', 'Airport Lounge', 'Gas Station / Garage', 'College Academic Building',
    'Music Venue', 'Tech Startup', 'Capitol Building', 'Airport Terminal', 'Other Nightlife', 'School', 'City Hall', 'Hiking Trail', 'Temple', 'Stadium', 'Indie Movie Theater', 'Courthouse', 'College Technology Building',
    'Train Station', 'Elementary School', 'Arcade', 'Gym', 'Scenic Lookout', 'Aquarium', 'Bridal Shop', 'Rest Area', 'Wings Joint', 'Spa / Massage', 'Breakfast Spot', 'College Gym', 'Airport Food Court', 'Gym Pool', 'Design Studio', 'Bus Line',
    "Dentist's Office", 'Bridge', 'River', 'Deli / Bodega', 'Island', 'Baseball Stadium', 'Performing Arts Venue', 'Gaming Cafe', 'Automotive Shop', 'Theater', 'Pool', 'Nursery School', 'Neighborhood', 'Boat or Ferry',
    'Harbor / Marina', 'Beach', 'Police Station', 'College Science Building', 'College Classroom', 'College Engineering Building', 'Plaza', 'General College & University', 'Post Office', 'College Quad',
    'Bowling Alley', 'College Cafeteria', 'Medical Center', 'Gym / Fitness Center', 'Middle School', 'Playground', 'Conference Room', 'College Lab', 'Basketball Stadium', 'Speakeasy', 'Arts & Crafts Store',
    'Rock Club', 'Lounge', 'Baseball Field', 'Pool Hall', 'General Travel', 'Basketball Court', 'Hot Spring', 'Parking', 'Optical Shop', 'Student Center', 'College Auditorium', 'College Administrative Building',
    'Resort', 'Hookah Bar', 'Music Store', 'Fraternity House', 'Flower Shop', 'Medical School', 'Indie Theater', 'Theme Park', 'Airport', 'Athletics & Sports', 'Golf Course', 'Convenience Store', 'College Theater',
    'College History Building', 'Community College', 'Bike Shop', 'Drugstore / Pharmacy', 'College Communications Building', 'Art Museum', 'Law School', 'College Basketball Court', 'Furniture / Home Store', 'Shrine',
    'College Residence Hall', 'Hostel', 'Radio Station', 'Track Stadium', 'Flea Market', 'Soccer Stadium', 'Music School', 'Factory', 'Museum', 'Auditorium', 'Hockey Arena', 'Field', 'Spiritual Center',
    'Fish Market', 'Hardware Store', 'Science Museum', "Women's Store", 'College Bookstore', 'Video Game Store', 'Military Base', 'Camera Store', 'Opera House', 'Record Shop', 'Garden', 'College Track',
    'Jewelry Store', 'Airport Gate', 'Boutique', 'College Arts Building', 'Motel', 'Yoga Studio', 'Lighthouse', 'Track', 'Video Store', 'Skate Park', 'College Stadium', 'Dive Bar', 'Zoo', 'Campground', 'Car Dealership',
    'College Math Building', 'Gourmet Shop', 'Emergency Room', 'Fire Station', 'Pier', 'Soccer Field', 'Mosque', 'Embassy / Consulate', 'Mobile Phone Shop', 'Farm', 'Antique Shop', 'Dance Studio', 'Laundry Service',
    'Martial Arts Dojo', 'Trail', 'Tennis Court', 'Winery', 'Daycare', 'Food & Drink Shop', 'Sculpture Garden', 'Ferry', 'Bed & Breakfast', 'Dog Run', 'Travel Lounge', 'Car Wash', 'Voting Booth',
    'Board Shop', 'Planetarium', 'Sorority House', 'Non-Profit', 'Water Park', 'Trade School', 'Plane', 'Veterinarian', 'Travel Agency', 'Nail Salon', 'Internet Cafe', 'Housing Development', 'College Soccer Field', 'Pet Store',
    'Comedy Club', 'Gay Bar', 'Bagel Shop', 'Thrift / Vintage Store', 'Athletic & Sport', 'Photography Lab', 'Platform', 'College & University', 'Hotel Pool', 'Train', 'Outdoors & Recreation', 'Kids Store',
    'Theme Park Ride / Attraction', 'Cheese Shop', 'Liquor Store', 'Smoke Shop', 'Ski Area', 'Shop & Service', 'Moving Target', 'College Baseball Diamond', 'Taxi', 'Vineyard', 'Airport Tram', 'Professional & Other Places',
    'Lingerie Store', 'Nightlife Spot', 'Arts & Entertainment', 'Ski Lodge', 'Travel & Transport', 'Bike Rental / Bike Share', 'Surf Spot', 'Distillery', 'Tanning Salon', 'Tattoo Parlor', 'Cricket Ground', 'Animal Shelter',
    'Strip Club', 'Roof Deck', 'Trails', 'Ski Chairlift', 'Fair', 'Storage Facility', 'Rental Car Location', 'College Rec Center', 'Recycling Facility', 'Boarding House', 'Garden Center', 'Well', 'Stables', 'Apres Ski Bar',
    'Laboratory', 'Campaign Office', 'Climbing Gym', 'Assisted Living', 'Public Art', 'Credit Union', 'Financial or Legal Service', 'Hockey Field', 'Motorcycle Shop', 'Pet Service', 'Real Estate Office', 'City', 'Hunting Supply',
    'Tailor Shop', 'Newsstand', 'Frozen Yogurt', 'Castle',
]

# Select the rows whose category is on the exclusion list, then drop them by
# their index labels.
excluded_rows = POI_data_KR['Venue category'].isin(excluded_categories)
POI_data_KR = POI_data_KR.drop(POI_data_KR.loc[excluded_rows].index)


In [8]:
# Display the Korean POI table that remains after the category filter.
POI_data_KR

Unnamed: 0,Venue ID,Latitude,Longitude,Venue category,Country Code
80613,4b058781f964a520659622e3,37.555686,127.005097,Hotel,KR
80614,4b058781f964a520689622e3,37.565280,126.980946,Hotel,KR
80615,4b058781f964a520699622e3,37.509309,127.060715,Hotel,KR
80616,4b058781f964a5206a9622e3,37.504845,127.027166,Hotel,KR
80617,4b058781f964a5206b9622e3,37.513982,127.035497,Hotel,KR
...,...,...,...,...,...
3679642,52357a4b498eedc5b7608a56,37.568444,127.086018,Korean Restaurant,KR
3679853,5236b9ec11d2b03024e2d9a0,35.212177,129.077942,Event Space,KR
3679890,5236c8c7498ea8a785215787,37.483829,126.928474,Japanese Restaurant,KR
3679920,5236da2311d211f58f2ec319,35.181935,129.072854,Korean Restaurant,KR


* 장소 카테고리 종류 출력

In [9]:
#POI_data_KR['Venue category'].unique()

* Checkins_data로부터 위에서 걸러낸 한국 장소(POI_data_KR)에서 발생한 check-in 기록만 추출

In [10]:
# Keep only the check-ins whose venue is still present in POI_data_KR.
# `isin` accepts the Series directly, so the IDs are not first copied into a
# Python list.
kr_venue_ids = POI_data_KR['Venue ID']
Checkins_data_KR = Checkins_data[Checkins_data['Venue ID'].isin(kr_venue_ids)]
Checkins_data_KR

Unnamed: 0,User ID,Venue ID,UTC Time,Timezone Offset
15347,4999,4c1d96d7eac020a1149148c2,Tue Apr 03 21:40:51 +0000 2012,540
18072,56813,4cae8b3e1168a09381881b23,Tue Apr 03 22:14:34 +0000 2012,540
19504,4628,4c78e8d33badb1f704404d54,Tue Apr 03 22:30:55 +0000 2012,540
20592,4999,4c1d96d7eac020a1149148c2,Tue Apr 03 22:42:44 +0000 2012,540
20599,148509,4efdc14a6da1cd0356d0cc77,Tue Apr 03 22:42:50 +0000 2012,540
...,...,...,...,...
33240868,198586,522a718011d20dc0af8ccf60,Mon Sep 16 13:07:02 +0000 2013,540
33241301,779,4fabc1c5e4b0c3c98b30f089,Mon Sep 16 13:15:49 +0000 2013,540
33241579,46275,4e2d7600b0fbc2edac363a95,Mon Sep 16 13:20:56 +0000 2013,540
33243703,6222,5219d471498e9d3262dffa49,Mon Sep 16 14:04:38 +0000 2013,540


In [11]:
# Per-user count of non-null values in each remaining column, i.e. how many
# check-in rows each user has.
Checkins_data_KR.groupby('User ID').count()

Unnamed: 0_level_0,Venue ID,UTC Time,Timezone Offset
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
47,11,11,11
54,1,1,1
55,3,3,3
56,2,2,2
109,2,2,2
...,...,...,...
266092,10,10,10
266298,12,12,12
266514,12,12,12
266624,19,19,19


In [12]:
#Checkins_data_KR.loc[Checkins_data_KR['User ID'] == 9889]
#Checkins_data_KR.loc[Checkins_data_KR['Venue ID'] == '4b058781f964a520659622e3']

In [13]:
#POI_data_KR.loc[POI_data_KR['Venue ID'] == '4b058781f964a520659622e3']

* Checkins_data_KR에서 User ID를 row로, Venue ID를 column으로 하는 Dataframe Checkins_table 생성. 방문 횟수를 집계한 뒤 방문 여부(1 = 한 번 이상 방문, 0 = 방문 없음)로 이진화한 값을 사용

In [14]:
# Build the user x venue interaction matrix from the Korean check-ins.
user_venue_pairs = pd.DataFrame(
    {'UserID': Checkins_data_KR['User ID'], 'VenueID': Checkins_data_KR['Venue ID']}
)

# Count the check-ins per (user, venue) pair and pivot the venues into columns;
# pairs that never occur become 0.
visit_counts = user_venue_pairs.VenueID.groupby(
    [user_venue_pairs.UserID, user_venue_pairs.VenueID]
).size()
visit_matrix = visit_counts.unstack().fillna(0).astype(int)

# Collapse the counts to a 0/1 "visited at least once" flag.
visit_matrix = visit_matrix.gt(0).astype(int)

# Remember the row/column labels before the labelled frame is replaced by a
# bare array.
Checkins_table_rows = visit_matrix.index.values.tolist()
Checkins_table_columns = visit_matrix.columns.tolist()
unique_Checkins = np.unique(visit_matrix)
Checkins_table = visit_matrix.to_numpy()

In [15]:
#메모리 부족으로 필요없는 변수 제거
#Checkins_data = None

In [16]:
from typing import List

def wide_to_long(wide: np.ndarray, possible_ratings: List[int]) -> np.ndarray:
    """Convert a 2-D wide matrix into long ``(row, column, rating)`` triples.

    :param wide: 2-D array whose cells hold rating values
    :param possible_ratings: rating values to extract; cells holding any other
        value do not appear in the result
    :return: array of shape ``(n, 3)`` with one row per matching cell. Triples
        are grouped by rating, in the order given by ``possible_ratings``. An
        empty ``possible_ratings`` yields an empty ``(0, 3)`` array.
    """

    def _get_ratings(arr: np.ndarray, rating: int) -> np.ndarray:
        # Positions of every cell equal to `rating`.
        idx = np.where(arr == rating)
        # np.full picks a dtype that can hold `rating`; an int8 buffer would
        # overflow for rating values above 127.
        return np.column_stack((idx[0], idx[1], np.full(idx[0].size, rating)))

    long_arrays = [_get_ratings(wide, r) for r in possible_ratings]

    # np.vstack refuses an empty list, so return an explicit empty result.
    if not long_arrays:
        return np.empty((0, 3), dtype=np.intp)

    return np.vstack(long_arrays)

In [17]:
# Flatten the interaction matrix into (user, item, interaction) rows and wrap
# them in a labelled DataFrame.
interaction_columns = ["user_id", "item_id", "interaction"]
long_train = wide_to_long(wide=Checkins_table, possible_ratings=unique_Checkins)
df_train = pd.DataFrame(data=long_train, columns=interaction_columns)

In [18]:
# Display the long-format table built from the interaction matrix.
df_train

Unnamed: 0,user_id,item_id,interaction
0,0,0,0
1,0,1,0
2,0,2,0
3,0,3,0
4,0,4,0
...,...,...,...
87582381,3060,18841,1
87582382,3060,23442,1
87582383,3060,25170,1
87582384,3060,25827,1


In [19]:
import tensorflow.keras as keras
from tensorflow.keras.layers import (
    Concatenate,
    Dense,
    Embedding,
    Flatten,
    Input,
    Multiply,
)
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2


def create_ncf(
    number_of_users: int,
    number_of_items: int,
    latent_dim_mf: int = 4,
    latent_dim_mlp: int = 32,
    reg_mf: float = 0,
    reg_mlp: float = 0.01,
    dense_layers: List[int] = [8, 4],
    reg_layers: List[float] = [0.01, 0.01],
    activation_dense: str = "relu",
) -> keras.Model:
    """Build a two-branch user/item interaction model.

    One branch multiplies a user embedding and an item embedding element-wise
    (the "MF" branch). The other concatenates a second pair of embeddings and
    passes them through a stack of dense layers (the "MLP" branch). Both
    branches are concatenated and fed to a single sigmoid unit named
    ``interaction``.

    :param number_of_users: size of the user embedding vocabularies
    :param number_of_items: size of the item embedding vocabularies
    :param latent_dim_mf: embedding width of the MF branch
    :param latent_dim_mlp: embedding width (per side) of the MLP branch
    :param reg_mf: L2 factor applied to the MF embeddings
    :param reg_mlp: L2 factor applied to the MLP embeddings
    :param dense_layers: number of units of each dense layer in the MLP branch
    :param reg_layers: L2 factor for each dense layer; needs at least as many
        entries as ``dense_layers``
    :param activation_dense: activation used by the dense layers
    :return: an uncompiled model with inputs ``user_id`` and ``item_id``
    """
    # The list defaults are only read, never mutated, so sharing them between
    # calls is harmless.

    def _embedding(name, input_dim, output_dim, reg):
        # All four embeddings differ only in name, sizes and L2 factor.
        # `input_length` is not passed: the inputs are scalar ids
        # (shape=()), and the argument is deprecated in recent Keras.
        return Embedding(
            input_dim=input_dim,
            output_dim=output_dim,
            name=name,
            embeddings_initializer="RandomNormal",
            embeddings_regularizer=l2(reg),
        )

    # input layer: one scalar id per sample
    user = Input(shape=(), dtype="int32", name="user_id")
    item = Input(shape=(), dtype="int32", name="item_id")

    # embedding layers
    mf_user_embedding = _embedding(
        "mf_user_embedding", number_of_users, latent_dim_mf, reg_mf
    )
    mf_item_embedding = _embedding(
        "mf_item_embedding", number_of_items, latent_dim_mf, reg_mf
    )
    mlp_user_embedding = _embedding(
        "mlp_user_embedding", number_of_users, latent_dim_mlp, reg_mlp
    )
    mlp_item_embedding = _embedding(
        "mlp_item_embedding", number_of_items, latent_dim_mlp, reg_mlp
    )

    # MF vector: element-wise product of the user and item embeddings
    mf_user_latent = Flatten()(mf_user_embedding(user))
    mf_item_latent = Flatten()(mf_item_embedding(item))
    mf_cat_latent = Multiply()([mf_user_latent, mf_item_latent])

    # MLP vector: concatenated user and item embeddings
    mlp_user_latent = Flatten()(mlp_user_embedding(user))
    mlp_item_latent = Flatten()(mlp_item_embedding(item))
    mlp_vector = Concatenate()([mlp_user_latent, mlp_item_latent])

    # build dense layers for model
    # NOTE(review): the L2 penalty is applied to the layer activations
    # (`activity_regularizer`), not to the kernels - confirm this is intended.
    for i, units in enumerate(dense_layers):
        layer = Dense(
            units,
            activity_regularizer=l2(reg_layers[i]),
            activation=activation_dense,
            name="layer%d" % i,
        )
        mlp_vector = layer(mlp_vector)

    # join both branches and map them to a single probability
    predict_layer = Concatenate()([mf_cat_latent, mlp_vector])
    output = Dense(
        1, activation="sigmoid", kernel_initializer="lecun_uniform", name="interaction"
    )(predict_layer)

    return Model(
        inputs=[user, item],
        outputs=[output],
    )

In [20]:
from tensorflow.keras.optimizers import Adam

# Size the model from the interaction matrix (first axis -> users,
# second axis -> items).
n_users, n_items = Checkins_table.shape
ncf_model = create_ncf(number_of_users=n_users, number_of_items=n_items)

# Confusion-matrix counts plus the usual binary-classification summaries.
keras_metrics = tf.keras.metrics
training_metrics = [
    keras_metrics.TruePositives(name="tp"),
    keras_metrics.FalsePositives(name="fp"),
    keras_metrics.TrueNegatives(name="tn"),
    keras_metrics.FalseNegatives(name="fn"),
    keras_metrics.BinaryAccuracy(name="accuracy"),
    keras_metrics.Precision(name="precision"),
    keras_metrics.Recall(name="recall"),
    keras_metrics.AUC(name="auc"),
]

ncf_model.compile(
    optimizer=Adam(),
    loss="binary_crossentropy",
    metrics=training_metrics,
)

# NOTE(review): this writes a private attribute to rename the model - confirm
# it still takes effect on the Keras version in use.
ncf_model._name = "neural_collaborative_filtering"
ncf_model.summary()

Model: "neural_collaborative_filtering"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_id (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
item_id (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
mlp_user_embedding (Embedding)  (None, 32)           97984       user_id[0][0]                    
__________________________________________________________________________________________________
mlp_item_embedding (Embedding)  (None, 32)           915296      item_id[0][0]                    
_____________________________________________________________________

In [21]:
def make_tf_dataset(
    df: pd.DataFrame,
    targets: List[str],
    val_split: float = 0.1,
    batch_size: int = 512,
    seed=42,
):
    """Make TensorFlow train and validation datasets from a Pandas DataFrame.

    :param df: input DataFrame - only contains features and target(s)
    :param targets: list of columns names corresponding to targets
    :param val_split: fraction of the data that should be used for validation
    :param batch_size: batch size for training
    :param seed: random seed for shuffling data - `None` won't shuffle the data
    :return: tuple ``(ds_train, ds_val)`` of batched datasets; the first
        ``round(len(df) * val_split)`` rows (after the optional shuffle) form
        the validation set and the remaining rows the training set
    """

    n_val = round(df.shape[0] * val_split)

    # Compare against None explicitly: 0 is a valid seed and must still
    # shuffle, whereas a plain truthiness test would skip it.
    if seed is not None:
        # shuffle all the rows
        x = df.sample(frac=1, random_state=seed).to_dict("series")
    else:
        x = df.to_dict("series")

    # Move the target columns out of the feature dict.
    y = dict()
    for t in targets:
        y[t] = x.pop(t)
    ds = tf.data.Dataset.from_tensor_slices((x, y))

    ds_val = ds.take(n_val).batch(batch_size)
    ds_train = ds.skip(n_val).batch(batch_size)
    return ds_train, ds_val

In [22]:
# Split the long-format interactions into batched training and validation
# datasets, with "interaction" as the only target column.
ds_train, ds_val = make_tf_dataset(df=df_train, targets=["interaction"])