# Location Check-in Dataset

* Dingqi YANG 블로그에서 제공하는 Foursquare Check-in dataset (dataset_TIST2015). 2012~2013년 사이 약 18개월간 수집된 전 세계 check-in 기록으로, 33,278,683건의 check-in, 266,909명의 유저, 3,680,126개의 장소로 이루어져 있음 (77개 국가의 415개 도시)

## 데이터 처리

### 파일로부터 데이터 읽기
* Cities_data : 도시명, 국가 이름, 좌표 등의 정보
* Checkins_data : 유저-장소 방문 기록
* POI_data : 각 장소에 대한 ID, 좌표, 카테고리 정보

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
# Load the three tab-separated, header-less TIST2015 files, supplying the
# column names by hand. The paths are raw strings so the Windows backslashes
# stay literal: in a normal string literal '\d' and '\D' are invalid escape
# sequences, which Python only tolerates with a warning.
Cities_data = pd.read_csv(
    r'D:\dataset_TIST2015\Data\dataset_TIST2015_Cities.txt',
    sep='\t',
    lineterminator='\n',
    names=['city name', 'Latitude', 'Longitude', 'Country Code', 'Country name', 'City type'],
)
Checkins_data = pd.read_csv(
    r'D:\dataset_TIST2015\Data\dataset_TIST2015_Checkins.txt',
    sep='\t',
    lineterminator='\n',
    names=['User ID', 'Venue ID', 'UTC Time', 'Timezone Offset'],
)
POI_data = pd.read_csv(
    r'D:\dataset_TIST2015\Data\dataset_TIST2015_POIs.txt',
    sep='\t',
    lineterminator='\n',
    names=['Venue ID', 'Latitude', 'Longitude', 'Venue category', 'Country Code'],
)

In [3]:
#Cities_data.head()

In [4]:
#Checkins_data.head()

In [5]:
#POI_data.head()

* POI data로부터, country code가 KR인 것들만 추출

In [6]:
# Keep only the venues whose country code is 'KR'.
POI_data_KR = POI_data[POI_data['Country Code'].eq('KR')]

In [7]:
#POI_data_KR.head()

* 장소 카테고리 종류 출력

In [8]:
#POI_data_KR['Venue category'].unique()

* Checkins_data로부터 한국에 있는 장소(Venue ID)에 대한 check-in 기록만 추출

In [9]:
# Keep only the check-ins whose venue appears in the Korean POI table.
# `isin` accepts the Series directly, so there is no need to materialise a
# Python list of venue ids first.
Checkins_data_KR = Checkins_data[Checkins_data['Venue ID'].isin(POI_data_KR['Venue ID'])]
Checkins_data_KR.head(5)

Unnamed: 0,User ID,Venue ID,UTC Time,Timezone Offset
2719,66388,4edf00ad6da10302870475f7,Tue Apr 03 18:34:05 +0000 2012,540
5358,48332,4e6cb240e4cd4bedebb992f4,Tue Apr 03 19:11:43 +0000 2012,540
11293,141697,4cf7a6231cfea0939be9e539,Tue Apr 03 20:45:29 +0000 2012,540
11995,89193,4e5414fb1f6e850d277ec012,Tue Apr 03 20:54:55 +0000 2012,540
12436,317,4b9b0d50f964a520f1ee35e3,Tue Apr 03 21:01:02 +0000 2012,540


In [10]:
# Number of Korean check-in rows per user (non-null count of every other column).
Checkins_data_KR.groupby(by='User ID').count()

Unnamed: 0_level_0,Venue ID,UTC Time,Timezone Offset
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
47,19,19,19
54,1,1,1
55,8,8,8
56,3,3,3
66,1,1,1
...,...,...,...
266092,22,22,22
266298,72,72,72
266514,75,75,75
266624,31,31,31


In [11]:
#Checkins_data_KR.loc[Checkins_data_KR['User ID'] == 9889]
#Checkins_data_KR.loc[Checkins_data_KR['Venue ID'] == '4b058781f964a520659622e3']

In [12]:
#POI_data_KR.loc[POI_data_KR['Venue ID'] == '4b058781f964a520659622e3']

* Checkins_data_KR에서 User ID를 row로, Venue ID를 column으로 하고, 방문 여부(한 번이라도 방문했으면 1, 아니면 0)가 값이 되는 DataFrame Checkins_table 생성 (방문 횟수를 집계한 뒤 0/1로 이진화)

In [13]:
# Start from the (user, venue) pairs of every Korean check-in.
Checkins_table = pd.DataFrame(
    {
        'UserID': Checkins_data_KR['User ID'],
        'VenueID': Checkins_data_KR['Venue ID'],
    }
)
# Count the check-ins per (user, venue) pair and pivot the venues into
# columns; pairs that never occur become 0.
Checkins_table = (
    Checkins_table['VenueID']
    .groupby([Checkins_table['UserID'], Checkins_table['VenueID']])
    .size()
    .unstack()
    .fillna(0)
    .astype(int)
)
# Collapse the counts into a 0/1 "visited at least once" flag.
Checkins_table = Checkins_table.gt(0).astype(int)

# Row labels, column labels and raw values of the matrix, plus the distinct
# values that occur in it.
Checkins_table_rows = Checkins_table.index.tolist()
Checkins_table_columns = list(Checkins_table.columns)
Checkins_table_numpy = Checkins_table.values
unique_Checkins = np.unique(Checkins_table_numpy)

In [14]:
# Memory is short: drop the reference to the full check-in table, which is no
# longer needed, so that it can be garbage-collected.
Checkins_data = None

In [15]:
from typing import List

def wide_to_long(wide: np.ndarray, possible_ratings: List[int]) -> np.ndarray:
    """Convert a 2-D rating matrix into long ``(row, column, rating)`` triples.

    :param wide: 2-D array whose cell ``[i, j]`` holds the rating for row
        ``i`` and column ``j``
    :param possible_ratings: rating values to extract; cells holding any
        other value are left out of the result
    :return: array of shape ``(n, 3)`` with one ``[row, column, rating]`` line
        per matching cell, grouped by rating in the order given and in
        row-major order within each rating; shape ``(0, 3)`` when
        ``possible_ratings`` is empty
    """

    def _get_ratings(arr: np.ndarray, rating: int) -> np.ndarray:
        # Positions of every cell equal to this rating.
        rows, cols = np.where(arr == rating)
        # Fill a preallocated block instead of scaling an int8 ones-array:
        # that approach overflows for ratings outside the int8 range and
        # needs an extra temporary column.
        triples = np.empty(
            (rows.size, 3),
            dtype=np.result_type(rows.dtype, np.asarray(rating).dtype),
        )
        triples[:, 0] = rows
        triples[:, 1] = cols
        triples[:, 2] = rating
        return triples

    long_arrays = [_get_ratings(wide, r) for r in possible_ratings]
    if not long_arrays:
        # np.vstack refuses an empty sequence; return an empty table instead.
        return np.empty((0, 3), dtype=np.intp)

    return np.vstack(long_arrays)

In [16]:
# Flatten the user x venue matrix into one (user, item, interaction) row for
# each cell whose value is one of `unique_Checkins`, then wrap it in a DataFrame.
long_train = wide_to_long(wide=Checkins_table_numpy, possible_ratings=unique_Checkins)
df_train = pd.DataFrame(
    data=long_train,
    columns=["user_id", "item_id", "interaction"],
)

In [17]:
# Display the long-format interaction table.
df_train

Unnamed: 0,user_id,item_id,interaction
0,0,0,0
1,0,1,0
2,0,2,0
3,0,3,0
4,0,4,0
...,...,...,...
197738163,3864,45110,1
197738164,3864,45270,1
197738165,3864,46242,1
197738166,3865,862,1


In [18]:
import tensorflow.keras as keras
from tensorflow.keras.layers import (
    Concatenate,
    Dense,
    Embedding,
    Flatten,
    Input,
    Multiply,
)
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2


def create_ncf(
    number_of_users: int,
    number_of_items: int,
    latent_dim_mf: int = 4,
    latent_dim_mlp: int = 32,
    reg_mf: float = 0,
    reg_mlp: float = 0.01,
    dense_layers: List[int] = [8, 4],
    reg_layers: List[float] = [0.01, 0.01],
    activation_dense: str = "relu",
) -> keras.Model:
    """Build an (uncompiled) two-branch Keras model over user and item ids.

    The MF branch multiplies a user embedding and an item embedding
    element-wise. The MLP branch concatenates a second pair of embeddings and
    passes them through a stack of dense layers. Both branch outputs are
    concatenated and fed to a single sigmoid unit named ``"interaction"``.

    :param number_of_users: size of the user embedding tables (``input_dim``)
    :param number_of_items: size of the item embedding tables (``input_dim``)
    :param latent_dim_mf: embedding width of the MF branch
    :param latent_dim_mlp: embedding width (per input) of the MLP branch
    :param reg_mf: L2 factor applied to the MF embeddings
    :param reg_mlp: L2 factor applied to the MLP embeddings
    :param dense_layers: number of units of each dense layer of the MLP branch
    :param reg_layers: L2 activity-regularisation factor of each dense layer;
        indexed in step with ``dense_layers``, so it needs at least as many
        entries
    :param activation_dense: activation used by the dense layers
    :return: model taking the inputs ``"user_id"`` and ``"item_id"``

    The list defaults are only read, never modified, by this function.
    """

    # input layer: one scalar integer id per sample for each input
    user = Input(shape=(), dtype="int32", name="user_id")
    item = Input(shape=(), dtype="int32", name="item_id")

    # embedding layers: separate tables for the MF and the MLP branch
    mf_user_embedding = Embedding(
        input_dim=number_of_users,
        output_dim=latent_dim_mf,
        name="mf_user_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mf),
        input_length=1,
    )
    mf_item_embedding = Embedding(
        input_dim=number_of_items,
        output_dim=latent_dim_mf,
        name="mf_item_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mf),
        input_length=1,
    )

    mlp_user_embedding = Embedding(
        input_dim=number_of_users,
        output_dim=latent_dim_mlp,
        name="mlp_user_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mlp),
        input_length=1,
    )
    mlp_item_embedding = Embedding(
        input_dim=number_of_items,
        output_dim=latent_dim_mlp,
        name="mlp_item_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mlp),
        input_length=1,
    )

    # MF vector: element-wise product of the user and item embeddings
    mf_user_latent = Flatten()(mf_user_embedding(user))
    mf_item_latent = Flatten()(mf_item_embedding(item))
    mf_cat_latent = Multiply()([mf_user_latent, mf_item_latent])

    # MLP vector: concatenated user and item embeddings
    mlp_user_latent = Flatten()(mlp_user_embedding(user))
    mlp_item_latent = Flatten()(mlp_item_embedding(item))
    mlp_cat_latent = Concatenate()([mlp_user_latent, mlp_item_latent])

    mlp_vector = mlp_cat_latent

    # build dense layers for model
    # NOTE(review): the L2 penalty is attached to the layer activations
    # (activity_regularizer), not to the kernel weights - confirm this is
    # intended.
    for i, units in enumerate(dense_layers):
        layer = Dense(
            units,
            activity_regularizer=l2(reg_layers[i]),
            activation=activation_dense,
            name="layer%d" % i,
        )
        mlp_vector = layer(mlp_vector)

    # merge both branches and map them to a single value through a sigmoid
    predict_layer = Concatenate()([mf_cat_latent, mlp_vector])

    result = Dense(
        1, activation="sigmoid", kernel_initializer="lecun_uniform", name="interaction"
    )

    output = result(predict_layer)

    model = Model(
        inputs=[user, item],
        outputs=[output],
    )

    return model

In [19]:
from tensorflow.keras.optimizers import Adam

# The embedding tables are sized by the user x venue matrix: one row per
# user, one column per venue.
n_users = Checkins_table.shape[0]
n_items = Checkins_table.shape[1]
ncf_model = create_ncf(n_users, n_items)

# Compile for binary classification, tracking the confusion-matrix counts
# together with accuracy, precision, recall and AUC.
ncf_model.compile(
    loss="binary_crossentropy",
    optimizer=Adam(),
    metrics=[
        metric_class(name=metric_name)
        for metric_class, metric_name in (
            (tf.keras.metrics.TruePositives, "tp"),
            (tf.keras.metrics.FalsePositives, "fp"),
            (tf.keras.metrics.TrueNegatives, "tn"),
            (tf.keras.metrics.FalseNegatives, "fn"),
            (tf.keras.metrics.BinaryAccuracy, "accuracy"),
            (tf.keras.metrics.Precision, "precision"),
            (tf.keras.metrics.Recall, "recall"),
            (tf.keras.metrics.AUC, "auc"),
        )
    ],
)

# Rename the model so the summary header shows a descriptive name.
ncf_model._name = "neural_collaborative_filtering"
ncf_model.summary()

Model: "neural_collaborative_filtering"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_id (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
item_id (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
mlp_user_embedding (Embedding)  (None, 32)           123712      user_id[0][0]                    
__________________________________________________________________________________________________
mlp_item_embedding (Embedding)  (None, 32)           1636736     item_id[0][0]                    
_____________________________________________________________________

In [20]:
def make_tf_dataset(
    df: pd.DataFrame,
    targets: List[str],
    val_split: float = 0.1,
    batch_size: int = 512,
    seed=42,
):
    """Make batched training and validation TensorFlow datasets from a DataFrame.

    :param df: input DataFrame - only contains features and target(s)
    :param targets: list of column names corresponding to targets
    :param val_split: fraction of the data that should be used for validation
    :param batch_size: batch size of both returned datasets
    :param seed: random seed for shuffling the rows - `None` won't shuffle
        the data; any integer, including 0, shuffles
    :return: tuple ``(ds_train, ds_val)``; each element yields
        ``(features, targets)`` pairs of dicts keyed by column name
    """

    n_val = round(df.shape[0] * val_split)
    # Compare against None explicitly: 0 is a valid seed, and a plain
    # truthiness check would silently skip the shuffle for it.
    if seed is not None:
        # shuffle all the rows
        x = df.sample(frac=1, random_state=seed).to_dict("series")
    else:
        x = df.to_dict("series")

    # Move the target columns out of the feature dict into their own dict.
    y = {t: x.pop(t) for t in targets}
    ds = tf.data.Dataset.from_tensor_slices((x, y))

    # The first n_val rows form the validation set, the rest the training set.
    ds_val = ds.take(n_val).batch(batch_size)
    ds_train = ds.skip(n_val).batch(batch_size)
    return ds_train, ds_val

In [22]:
# Downcast to save memory. int16 tops out at 32767, but item_id goes beyond
# that (values such as 45110 and 46242 occur), so casting every column to
# int16 would silently wrap those ids. Keep the two id columns at int32 and
# only shrink the 0/1 interaction flag.
df_train = df_train.astype(
    {"user_id": "int32", "item_id": "int32", "interaction": "int8"}
)

In [23]:
# Split the interaction table into training and validation datasets, with
# the "interaction" column as the prediction target.
ds_train, ds_val = make_tf_dataset(df_train, targets=["interaction"])

MemoryError: Unable to allocate 1.47 GiB for an array with shape (197738168,) and data type int64