In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('/Users/yuyizhou/Documents/2017Fall/CMPT741/project/data/train_rating.txt', parse_dates=True)
df = df[['user_id','business_id', 'date', 'rating']]

In [3]:
num_of_u = len(df['user_id'].unique())
num_of_b = len(df['business_id'].unique())

In [4]:
u_start = 0
b_start = u_start + num_of_u
dow_start = b_start + num_of_b
moy_start = dow_start + 7
dom_start = moy_start + 12
woy_start = dom_start + 31
doy_start = woy_start + 52

In [5]:
def convert_category_type(id, start):
    return str(start+id) + ':1'

In [6]:
def add_time(df_t):
    df = df_t.copy()
    df['date'] = pd.to_datetime(df['date'])
    df['dow'] = df['date'].dt.dayofweek
    df['doy'] = df['date'].dt.dayofyear
    df['woy'] = df['date'].dt.weekofyear
    df['moy'] = df['date'].dt.month
    df['dom'] = df['date'].dt.day    
    return df

In [7]:
def add_libFM_features(df_t):
    df = df_t.copy()
    df['uid_c'] = df['user_id'].apply(lambda x: convert_category_type(x, u_start))
    df['bid_c'] = df['business_id'].apply(lambda x: convert_category_type(x, b_start))
    df['dow_c'] = df['dow'].apply(lambda x: convert_category_type(x, dow_start))
    df['moy_c'] = df['moy'].apply(lambda x: convert_category_type(x, moy_start))
    df['dom_c'] = df['dom'].apply(lambda x: convert_category_type(x, dom_start))
    df['woy_c'] = df['woy'].apply(lambda x: convert_category_type(x, woy_start))
    df['doy_c'] = df['doy'].apply(lambda x: convert_category_type(x, doy_start))
    return df

In [8]:
columns = ['rating','uid_c', 'bid_c', 'dow_c', 'moy_c', 'dom_c', 'woy_c', 'doy_c']

In [9]:
train, test = train_test_split(df, test_size=0.2, random_state=888)

In [10]:
train = add_time(train)
train = add_libFM_features(train)
train = train[columns]

In [11]:
train.head()

Unnamed: 0,rating,uid_c,bid_c,dow_c,moy_c,dom_c,woy_c,doy_c
1292090,5,1478:1,798187:1,838513:1,838529:1,838554:1,838605:1,838911:1
1538597,5,580468:1,817177:1,838518:1,838523:1,838537:1,838576:1,838710:1
631159,4,9829:1,762181:1,838514:1,838525:1,838533:1,838584:1,838767:1
1352505,1,298441:1,808443:1,838518:1,838522:1,838537:1,838571:1,838680:1
752459,1,75197:1,771895:1,838516:1,838528:1,838547:1,838599:1,838874:1


In [12]:
train.to_csv('/Users/yuyizhou/Documents/2017Fall/CMPT741/project/libFM/data/train.libfm', sep=' ', header=False, index=False)

In [15]:
test = add_time(test)
test = add_libFM_features(test)
test = test[columns]

In [16]:
test.head()

Unnamed: 0,rating,uid_c,bid_c,dow_c,moy_c,dom_c,woy_c,doy_c
1698523,4,135846:1,820154:1,838517:1,838530:1,838536:1,838606:1,838923:1
1663480,4,11291:1,822627:1,838518:1,838524:1,838539:1,838580:1,838743:1
513878,5,21374:1,749304:1,838513:1,838525:1,838543:1,838586:1,838778:1
1780354,4,60010:1,827435:1,838517:1,838525:1,838549:1,838586:1,838783:1
427359,5,239995:1,739418:1,838515:1,838529:1,838540:1,838603:1,838896:1


In [14]:
test.to_csv('/Users/yuyizhou/Documents/2017Fall/CMPT741/project/libFM/data/test.libfm', sep=' ', header=False, index=False)

In [17]:
# Create Group.txt
g_u = np.full((num_of_u,), 0)
g_b = np.full((num_of_b,), 1)
g_dow = np.full((7,), 2)
g_moy = np.full((12,), 3)
g_dom = np.full((31,), 4)
g_woy = np.full((52,), 5)
g_doy = np.full((365,), 6)
g_all = np.concatenate([g_u,g_b,g_dow,g_moy,g_dom,g_woy,g_doy])
df_group = pd.DataFrame(g_all)
df_group.to_csv('group.txt', index=False)