In [1]:
import numpy as np
import pandas as pd
from config import *

In [2]:
def filter_interaction(df_inter, min_interactions=3):
    """Keep only the interactions of users with at least ``min_interactions`` rows.

    Prints the shape of the frame before and after filtering.

    Args:
        df_inter: Interaction DataFrame containing a 'user_id' column.
        min_interactions: Inclusive lower bound on the number of rows a user
            must have to be kept. Defaults to 3.

    Returns:
        A tuple ``(df_filtered, ls_user)``: the filtered frame with a fresh
        0..n-1 index, and the pandas Index of the retained user ids.
    """
    print('origin shape: ', df_inter.shape)
    vc_user = df_inter['user_id'].value_counts()
    ls_user = vc_user[vc_user >= min_interactions].index
    mask = df_inter['user_id'].isin(ls_user)
    df_inter = df_inter.loc[mask].reset_index(drop=True)
    # The threshold is inclusive, so the label says '>=' (the previous '> 3' label was misleading).
    print(f'>= {min_interactions} interactions: ', df_inter.shape)
    return df_inter, ls_user

In [3]:
def multi_hot(df, by, key):
    """Build one indicator column per distinct ``key`` value for each ``by`` group.

    Args:
        df: Long-format DataFrame with one row per (``by``, ``key``) pair.
        by: Name of the grouping column (kept as the first output column).
        key: Name of the column whose values become indicator columns; the
            values are joined with '|', so they are expected to be strings.

    Returns:
        DataFrame with the ``by`` column followed by 0/1 columns named
        ``key + value``.
    """
    # Collapse each group to its distinct values (duplicates within a group are dropped).
    grouped = df.groupby(by=by)[key].apply(set).apply(list).reset_index()
    # Join the values with '|', the default separator that str.get_dummies splits on.
    joined_values = grouped[key].str.join('|')
    indicators = joined_values.str.get_dummies().add_prefix(key)
    return grouped[[by]].join(indicators)

## douban book
### interactions

In [4]:
# Douban book ratings: tab-separated, no header row, both id columns read as strings.
df_douban_inter = pd.read_csv(
    data_path_douban + 'user_book.dat',
    sep='\t',
    header=None,
    dtype={0: str, 1: str},
)
df_douban_inter = df_douban_inter.rename(columns={0: 'user_id', 1: 'item_id', 2: 'rating'})
df_douban_inter = df_douban_inter[['user_id', 'item_id', 'rating']]

# Filter users via filter_interaction, then persist the filtered interactions.
df_douban_inter, ls_user_douban = filter_interaction(df_douban_inter)
df_douban_inter.to_csv(data_preprocessing + "douban_inter.csv", index=False)

origin shape:  (792062, 3)
> 3 interactions:  (790197, 3)


### user features

In [8]:
# Per-user location for douban, limited to the users returned by filter_interaction.
df_douban_location = (
    pd.read_csv(data_path_douban + 'user_location.dat', sep='\t', header=None, dtype={0: str, 1: str})
    .rename(columns={0: 'user_id', 1: 'location'})
    .loc[lambda frame: frame['user_id'].isin(ls_user_douban)]
    .reset_index(drop=True)
)
# Equal shapes below mean no user_id appears on more than one row.
print(df_douban_location.shape)
print(df_douban_location.drop_duplicates(subset='user_id').shape)
df_douban_location.head(3)

(9614, 2)
(9614, 2)


Unnamed: 0,user_id,location
0,3587,33
1,3210,179
2,7993,394


In [9]:
# multi-hot user-group features: not used
# df_douban_group = pd.read_csv(data_path_douban + 'user_group.dat', sep='\t', header=None, dtype={0: str, 1: str})
# df_douban_group = df_douban_group.rename(columns={0: 'user_id', 1: 'group'})
# df_douban_group = df_douban_group.loc[(df_douban_group['user_id'].isin(ls_user_douban))].reset_index(drop=True)

# df_douban_group = multi_hot(df_douban_group, by='user_id', key='group')
# df_douban_group.to_csv(data_preprocessing + 'douban_group.csv', index=False)
# print(df_douban_group.shape)
# df_douban_group

In [10]:
# Location is the only douban user feature written out here (same object, not a copy).
df_douban_user_features = df_douban_location
df_douban_user_features.to_csv(
    data_preprocessing + 'douban_user_features.csv',
    index=False,
)

### item features

In [11]:
# Book/author table: tab-separated, no header row, both columns read as strings.
df_douban_author = pd.read_csv(
    data_path_douban + 'book_author.dat',
    sep='\t',
    header=None,
    dtype={0: str, 1: str},
)
df_douban_author = df_douban_author.rename(columns={0: 'item_id', 1: 'author'})
# Equal shapes below mean each item_id occurs on a single row.
print(df_douban_author.shape)
print(df_douban_author.drop_duplicates(subset='item_id', keep='last').shape)
df_douban_author.head(3)

(21907, 2)
(21907, 2)


Unnamed: 0,item_id,author
0,12131,3871
1,20995,10690
2,9905,3845


In [12]:
# Book/publisher table: tab-separated, no header row, both columns read as strings.
df_douban_publisher = (
    pd.read_csv(
        data_path_douban + 'book_publisher.dat',
        sep='\t',
        header=None,
        dtype={0: str, 1: str},
    )
    .rename(columns={0: 'item_id', 1: 'publisher'})
)
# Equal shapes below mean each item_id occurs on a single row.
print(df_douban_publisher.shape)
print(df_douban_publisher.drop_duplicates(subset='item_id', keep='last').shape)
df_douban_publisher.head(3)

(21773, 2)
(21773, 2)


Unnamed: 0,item_id,publisher
0,12131,108
1,20995,1470
2,9905,1696


In [13]:
# Book/year table: tab-separated, no header row, both columns read as strings.
df_douban_year = pd.read_csv(data_path_douban + 'book_year.dat', sep='\t', header=None, dtype={0: str, 1: str})\
    .rename(columns={0: 'item_id', 1: 'year'})
print(df_douban_year.shape)
# Check uniqueness per item_id rather than per full row, matching the author and
# publisher checks; without `subset`, an item with two different years would go unnoticed.
print(df_douban_year.drop_duplicates(subset='item_id', keep='last').shape)
df_douban_year.head(3)

(21192, 2)
(21192, 2)


Unnamed: 0,item_id,year
0,9905,16
1,21153,15
2,12823,15


In [14]:
# Outer joins keep every item_id that appears in any of the three tables;
# attributes missing for an item are filled with -1.
df_douban_item_features = (
    df_douban_author
    .merge(df_douban_publisher, on='item_id', how='outer')
    .merge(df_douban_year, on='item_id', how='outer')
    .fillna(-1)
)
df_douban_item_features.to_csv(data_preprocessing + 'douban_item_features.csv', index=False)
df_douban_item_features

Unnamed: 0,item_id,author,publisher,year
0,12131,3871,108,39
1,20995,10690,1470,20
2,9905,3845,1696,16
3,21153,10712,708,15
4,12823,6297,961,15
...,...,...,...,...
22041,7130,-1,-1,15
22042,4199,-1,-1,15
22043,11072,-1,-1,15
22044,17992,-1,-1,18


## movie lens

In [15]:
# MovieLens ratings: tab-separated, no header row, both id columns read as strings.
df_movie_inter = pd.read_csv(
    data_path_movie + 'user_movie.dat',
    sep='\t',
    header=None,
    dtype={0: str, 1: str},
)
df_movie_inter = df_movie_inter.rename(columns={0: 'user_id', 1: 'item_id', 2: 'rating'})
df_movie_inter = df_movie_inter[['user_id', 'item_id', 'rating']]

# Filter users via filter_interaction, then persist the filtered interactions.
df_movie_inter, ls_user_movie = filter_interaction(df_movie_inter)
df_movie_inter.to_csv(data_preprocessing + "movie_inter.csv", index=False)

origin shape:  (100000, 3)
> 3 interactions:  (100000, 3)


### user features

In [16]:
# User age table (user_id read as string, age as int), limited to the users
# returned by filter_interaction.
df_movie_age = (
    pd.read_csv(data_path_movie + "user_age.dat", sep='\t', header=None, dtype={0: str, 1: int})
    .rename(columns={0: 'user_id', 1: 'age'})
    .loc[lambda frame: frame['user_id'].isin(ls_user_movie)]
    .reset_index(drop=True)
)
# Equal shapes below mean no user_id appears on more than one row.
print(df_movie_age.shape)
print(df_movie_age.drop_duplicates(subset='user_id', keep='last').shape)
df_movie_age.head(3)

(943, 2)
(943, 2)


Unnamed: 0,user_id,age
0,1,3
1,2,6
2,3,3


In [17]:
# User occupation table (both columns read as strings), limited to the users
# returned by filter_interaction.
df_movie_occupation = pd.read_csv(
    data_path_movie + "user_occupation.dat",
    sep='\t',
    header=None,
    dtype={0: str, 1: str},
)
df_movie_occupation = df_movie_occupation.rename(columns={0: 'user_id', 1: 'occupation'})
df_movie_occupation = (
    df_movie_occupation
    .loc[lambda frame: frame['user_id'].isin(ls_user_movie)]
    .reset_index(drop=True)
)
# Equal shapes below mean no user_id appears on more than one row.
print(df_movie_occupation.shape)
print(df_movie_occupation.drop_duplicates(subset='user_id', keep='last').shape)
df_movie_occupation.head(3)

(943, 2)
(943, 2)


Unnamed: 0,user_id,occupation
0,1,1
1,2,2
2,3,3


In [18]:
# Outer join keeps users present in either table; a missing attribute is filled with -1.
df_movie_user_features = (
    df_movie_age
    .merge(df_movie_occupation, on='user_id', how='outer')
    .fillna(-1)
)
df_movie_user_features.to_csv(data_preprocessing + 'movie_user_features.csv', index=False)
df_movie_user_features

Unnamed: 0,user_id,age,occupation
0,1,3,1
1,2,6,2
2,3,3,3
3,4,3,1
4,5,4,2
...,...,...,...
938,939,3,6
939,940,4,5
940,941,3,6
941,942,5,12


### item features 

In [19]:
# Movie/genre pairs (both columns read as strings), turned into one indicator
# column per genre by multi_hot.
df_movie_genre = pd.read_csv(
    data_path_movie + 'movie_genre.dat',
    sep='\t',
    header=None,
    dtype={0: str, 1: str},
)
df_movie_genre = df_movie_genre.rename(columns={0: 'item_id', 1: 'genre'})
df_movie_genre = multi_hot(df_movie_genre, by='item_id', key='genre')
# Equal shapes below mean each item_id occurs on a single row.
print(df_movie_genre.shape)
print(df_movie_genre.drop_duplicates(subset='item_id').shape)
df_movie_genre.head(3)

(1680, 19)
(1680, 19)


Unnamed: 0,item_id,genre1,genre10,genre11,genre12,genre13,genre14,genre15,genre16,genre17,genre18,genre2,genre3,genre4,genre5,genre6,genre7,genre8,genre9
0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0
1,10,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
2,100,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0


In [20]:
# Genre indicators are the only movie item features written out here (same object, not a copy).
df_movie_item_features = df_movie_genre
df_movie_item_features.to_csv(
    data_preprocessing + 'movie_item_features.csv',
    index=False,
)

## yelp

In [21]:
# Yelp user/business ratings: tab-separated, no header row, both id columns read as strings.
df_yelp_inter = pd.read_csv(
    data_path_yelp + 'user_business.dat',
    sep='\t',
    header=None,
    dtype={0: str, 1: str},
)
df_yelp_inter = df_yelp_inter.rename(columns={0: 'user_id', 1: 'item_id', 2: 'rating'})
df_yelp_inter = df_yelp_inter[['user_id', 'item_id', 'rating']]

# Filter users via filter_interaction, then persist the filtered interactions.
df_yelp_inter, ls_user_yelp = filter_interaction(df_yelp_inter)
df_yelp_inter.to_csv(data_preprocessing + "yelp_inter.csv", index=False)

origin shape:  (198397, 3)
> 3 interactions:  (188456, 3)


### user features

In [22]:
# User/compliment pairs (both columns read as strings); the third raw column is dropped.
df_yelp_compliment = pd.read_csv(
    data_path_yelp + 'user_compliment.dat',
    sep='\t',
    header=None,
    dtype={0: str, 1: str},
)
df_yelp_compliment = df_yelp_compliment.rename(columns={0: 'user_id', 1: 'compliment'})
df_yelp_compliment = df_yelp_compliment.drop(columns=2)

# Keep only users returned by filter_interaction, then build one indicator column per compliment.
df_yelp_compliment = (
    df_yelp_compliment
    .loc[lambda frame: frame['user_id'].isin(ls_user_yelp)]
    .reset_index(drop=True)
)
df_yelp_compliment = multi_hot(df_yelp_compliment, by='user_id', key='compliment')
# Equal shapes below mean no user_id appears on more than one row.
print(df_yelp_compliment.shape)
print(df_yelp_compliment.drop_duplicates(subset=['user_id'], keep='last').shape)

df_yelp_compliment.head(3)

(7571, 12)
(7571, 12)


Unnamed: 0,user_id,compliment1,compliment10,compliment11,compliment2,compliment3,compliment4,compliment5,compliment6,compliment7,compliment8,compliment9
0,10,0,0,0,0,1,1,0,0,0,0,1
1,10000,1,1,1,1,1,1,0,1,1,1,1
2,10002,0,0,0,0,0,1,0,0,0,0,0


In [23]:
# DataFrame.to_csv returns None when given a path, so bind the frame first and
# write it afterwards; otherwise df_yelp_user_features would end up as None.
df_yelp_user_features = df_yelp_compliment
df_yelp_user_features.to_csv(data_preprocessing + 'yelp_user_features.csv', index=False)

### item features

In [24]:
# Business/city pairs (both columns read as strings); the third raw column is dropped.
df_yelp_city = pd.read_csv(
    data_path_yelp + 'business_city.dat',
    sep='\t',
    header=None,
    dtype={0: str, 1: str},
)
df_yelp_city = df_yelp_city.rename(columns={0: 'item_id', 1: 'city'})
df_yelp_city = df_yelp_city.drop(columns=[2])

# One indicator column per city value.
df_yelp_city = multi_hot(df_yelp_city, by='item_id', key='city')

# Equal shapes below mean each item_id occurs on a single row.
print(df_yelp_city.shape)
print(df_yelp_city.drop_duplicates(subset='item_id').shape)
df_yelp_city.head(3)

(14267, 48)
(14267, 48)


Unnamed: 0,item_id,city1,city10,city11,city12,city13,city14,city15,city16,city17,...,city43,city44,city45,city46,city47,city5,city6,city7,city8,city9
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,100,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Business/category pairs (both columns read as strings); the third raw column is dropped.
df_yelp_cat = (
    pd.read_csv(
        data_path_yelp + 'business_category.dat',
        sep='\t',
        header=None,
        dtype={0: str, 1: str},
    )
    .rename(columns={0: 'item_id', 1: 'category'})
    .drop(columns=[2])
)

# One indicator column per category value.
df_yelp_cat = multi_hot(df_yelp_cat, by='item_id', key='category')

# Equal shapes below mean each item_id occurs on a single row.
print(df_yelp_cat.shape)
print(df_yelp_cat.drop_duplicates(subset='item_id').shape)
df_yelp_cat.head(3)

(14180, 512)
(14180, 512)


Unnamed: 0,item_id,category1,category10,category100,category101,category102,category103,category104,category105,category106,...,category90,category91,category92,category93,category94,category95,category96,category97,category98,category99
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,100,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Outer join keeps businesses present in either table; indicators missing on one
# side are filled with 0.
# NOTE(review): the displayed result shows the indicator columns as floats (0.0)
# after this merge + fillna — confirm downstream consumers expect floats.
df_yelp_item_features = (
    df_yelp_city
    .merge(df_yelp_cat, on='item_id', how='outer')
    .fillna(0)
)
df_yelp_item_features.to_csv(data_preprocessing + 'yelp_item_features.csv', index=False)
df_yelp_item_features

Unnamed: 0,item_id,city1,city10,city11,city12,city13,city14,city15,city16,city17,...,category90,category91,category92,category93,category94,category95,category96,category97,category98,category99
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14279,5776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14280,6182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14281,6582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14282,802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
