# Create Dataset

Here we decide how to split the dataset into training/validation/test sets.  
We decided to:
1. Build the graph from all the reviews written before 2021, using reviews as nodes.
2. Label the held-out reviews (2021 onwards, which are removed from the graph) as positive (1) if they have 4 or 5 stars and negative (0) if they have 1 or 2 stars; 3-star reviews are left out of both classes:
   - reviews from 2021 form the training and validation sets (80/20 split)
   - reviews from 2022 form the test set


In [49]:
import os
import pickle

import pandas as pd
import torch
import dgl

In [3]:
# Raw review table (one row per review) exported for the Neo4j import.
review_csv_path = 'neo4j_csvs/review.csv'
review_df = pd.read_csv(review_csv_path)

In [22]:
# Lookup tables produced by the preprocessing step.  The row position of an id in
# `review_ids` is used further down as the id of the matching 'review' graph node.
review_ids, business_ids, user_ids = map(
    pd.read_csv,
    (
        'preprocessed/review_ids.csv',
        'preprocessed/business_ids.csv',
        'preprocessed/user_ids.csv',
    ),
)

In [11]:
# Parse the review timestamps so the splits below can filter on `.dt.year`.
# An explicit format replaces `infer_datetime_format=True`, which is deprecated
# since pandas 2.0; it also fails loudly on a malformed timestamp.
# NOTE(review): format taken from the rows displayed below — confirm all rows match.
review_df['date'] = pd.to_datetime(review_df['date'], format='%Y-%m-%d %H:%M:%S')

In [29]:
# Preview the loaded reviews; pandas truncates the display to the first and last rows.
review_df

Unnamed: 0,:LABEL,review_id:ID,stars:float,useful:float,funny:float,cool:float,text,date
0,Review,14b7eabcf7250cac8884af3677c0b8af,3.0,0,0,0,If you decide to eat here just be aware it is ...,2018-07-07 22:09:11
1,Review,0baa75e21e14c71f057e9ad453082867,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,Review,af722966fda3d69d36a0ac9de0292e51,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,Review,50d32c50663bd2b424f57561a09cbfda,5.0,1,0,1,Wow! Yummy different delicious. Our favori...,2015-01-04 00:01:03
4,Review,a3383e594a74583aabcda6de8e3cb8a8,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15
...,...,...,...,...,...,...,...,...
6989553,Review,94e309064b7859538c514528e80db157,5.0,1,2,1,Latest addition to services from ICCU is Apple...,2014-12-17 21:45:20
6989554,Review,2d0aa703beebc1996c58711a6e4b04e3,5.0,2,1,2,This spot offers a great affordable east weeke...,2021-03-31 16:55:10
6989555,Review,eaac0ea0e1fb9aab851008575a49c98c,4.0,1,0,0,This Home Depot won me over when I needed to g...,2019-12-30 03:56:30
6989556,Review,0dab15fea0a38d0f706afeed71945194,5.0,1,0,0,For when I'm feeling like ignoring my calorie-...,2022-01-19 18:59:27


In [43]:
# Boolean masks shared by the four selections below.
review_year = review_df['date'].dt.year
in_2021 = review_year == 2021
in_2022 = review_year == 2022
is_positive = review_df['stars:float'] > 3   # 4 or 5 stars
is_negative = review_df['stars:float'] < 3   # 1 or 2 stars (3-star reviews fall in neither class)

# 2021 reviews feed training/validation, 2022 reviews feed the test set.
train_pos = review_df[in_2021 & is_positive]
train_neg = review_df[in_2021 & is_negative]
test_pos = review_df[in_2022 & is_positive]
test_neg = review_df[in_2022 & is_negative]

In [44]:
def _review_node_ids(selected_reviews):
    """Return the row positions in `review_ids` of the given reviews as a tensor.

    `selected_reviews` is a slice of `review_df`; its 'review_id:ID' values are
    looked up in `review_ids` and the matching index values are returned.
    """
    is_selected = review_ids['review_id:ID'].isin(selected_reviews['review_id:ID'])
    return torch.tensor(review_ids[is_selected].index.values)


# NOTE: each name is rebound from a DataFrame to a tensor of node ids, so the
# previous cell has to be re-run before this one can be executed again.
train_pos = _review_node_ids(train_pos)
train_neg = _review_node_ids(train_neg)

test_pos = _review_node_ids(test_pos)
test_neg = _review_node_ids(test_neg)

In [50]:
# `load_graphs` returns (list of graphs, labels); only the first stored graph is used.
saved_graphs, _saved_labels = dgl.load_graphs('preprocessed/graph.dgl')
graph = saved_graphs[0]

In [55]:
def _aligned_user_business(review_nodes):
    """Return the `(user, business)` endpoint tensors for `review_nodes`.

    The business of each review comes from its outgoing 'review_to_business'
    edge, the user from its incoming 'user_to_review' edge.  The two tensors are
    later paired element by element, so both edge queries must list the same
    reviews in the same order; this is verified on the review endpoints instead
    of being silently assumed.

    Raises:
        ValueError: if the two queries do not line up review by review (for
            example a review without exactly one user and one business).
    """
    review_src, business_dst = graph.out_edges(review_nodes, etype='review_to_business')
    user_src, review_dst = graph.in_edges(review_nodes, etype='user_to_review')
    if not torch.equal(review_src, review_dst):
        raise ValueError(
            'user_to_review and review_to_business edges are not aligned per review'
        )
    return user_src, business_dst


train_user_pos, train_business_pos = _aligned_user_business(train_pos)
train_user_neg, train_business_neg = _aligned_user_business(train_neg)
test_user_pos, test_business_pos = _aligned_user_business(test_pos)
test_user_neg, test_business_neg = _aligned_user_business(test_neg)

In [66]:
# Two separate generators with the same fixed seed keep both splits reproducible.
gen1, gen2 = (torch.Generator().manual_seed(42) for _ in range(2))

# Randomly split the positions 0..n-1 of each class 80/20 (train/validation).
n_pos = train_user_pos.shape[0]
n_neg = train_user_neg.shape[0]
split_pos = torch.utils.data.random_split(range(n_pos), [0.8, 0.2], generator=gen1)
split_neg = torch.utils.data.random_split(range(n_neg), [0.8, 0.2], generator=gen2)

In [67]:
# Positions selected for the train / validation part of the positive reviews.
# `Subset.indices` already holds them as a list of ints (the split was made over
# `range(n)`), so each index tensor is built once from it instead of converting
# the `Subset` object element by element for every selection.
train_idx_pos = torch.tensor(split_pos[0].indices)
val_idx_pos = torch.tensor(split_pos[1].indices)

# The same positions are applied to users and businesses to keep the pairs intact.
train_user_pos_final = torch.index_select(train_user_pos, 0, train_idx_pos)
val_user_pos_final = torch.index_select(train_user_pos, 0, val_idx_pos)

train_business_pos_final = torch.index_select(train_business_pos, 0, train_idx_pos)
val_business_pos_final = torch.index_select(train_business_pos, 0, val_idx_pos)

In [68]:
# Positions selected for the train / validation part of the negative reviews.
# `Subset.indices` already holds them as a list of ints (the split was made over
# `range(n)`), so each index tensor is built once from it instead of converting
# the `Subset` object element by element for every selection.
train_idx_neg = torch.tensor(split_neg[0].indices)
val_idx_neg = torch.tensor(split_neg[1].indices)

# The same positions are applied to users and businesses to keep the pairs intact.
train_user_neg_final = torch.index_select(train_user_neg, 0, train_idx_neg)
val_user_neg_final = torch.index_select(train_user_neg, 0, val_idx_neg)

train_business_neg_final = torch.index_select(train_business_neg, 0, train_idx_neg)
val_business_neg_final = torch.index_select(train_business_neg, 0, val_idx_neg)

In [80]:
# Bundle every split as a (user tensor, business tensor) pair.
# NOTE: `train_*` / `test_*` are rebound here from review-node tensors to tuples.
train_pos, train_neg = (
    (train_user_pos_final, train_business_pos_final),
    (train_user_neg_final, train_business_neg_final),
)

val_pos, val_neg = (
    (val_user_pos_final, val_business_pos_final),
    (val_user_neg_final, val_business_neg_final),
)

test_pos, test_neg = (
    (test_user_pos, test_business_pos),
    (test_user_neg, test_business_neg),
)

In [56]:
# Drop every review written in 2021 or later from the graph; those reviews only
# serve as train/validation/test labels.
# WARNING: `graph` is rebound to the pruned graph, so run this cell exactly once
# per loaded graph (reload the graph before re-running).
held_out_review_keys = review_df.loc[review_df['date'].dt.year >= 2021, 'review_id:ID']
is_held_out = review_ids['review_id:ID'].isin(held_out_review_keys)
held_out_node_ids = torch.tensor(review_ids[is_held_out].index.values)
graph = dgl.remove_nodes(graph, held_out_node_ids, ntype='review')

In [58]:
# Show the pruned graph's summary (node counts, edge counts and metagraph).
# The original `graph.num_nodes` referenced the method without calling it, so the
# cell printed a bound-method repr instead of the summary itself.
graph

<bound method DGLGraph.num_nodes of Graph(num_nodes={'business': 150243, 'category': 1311, 'review': 6339837, 'tip': 908878, 'user': 1987897},
      num_edges={('business', 'business_has_category', 'category'): 668592, ('review', 'review_to_business', 'business'): 6339837, ('tip', 'tip_to_business', 'business'): 908878, ('user', 'user_to_review', 'review'): 6339837, ('user', 'user_to_tip', 'tip'): 908878, ('user', 'user_to_user', 'user'): 437928},
      metagraph=[('business', 'category', 'business_has_category'), ('review', 'business', 'review_to_business'), ('tip', 'business', 'tip_to_business'), ('user', 'review', 'user_to_review'), ('user', 'tip', 'user_to_tip'), ('user', 'user', 'user_to_user')])>

In [81]:
# Persist the pruned graph and the labelled splits under training_data/.
# The directory is created first so the cell also succeeds when it does not exist yet.
os.makedirs('training_data', exist_ok=True)
dgl.save_graphs('training_data/graph.dgl', [graph])

# One pickle file per split, each a dict with the 'pos' and 'neg' pairs.
split_payloads = {
    'training_data/train.obj': {'pos': train_pos, 'neg': train_neg},
    'training_data/val.obj': {'pos': val_pos, 'neg': val_neg},
    'training_data/test.obj': {'pos': test_pos, 'neg': test_neg},
}
for out_path, payload in split_payloads.items():
    with open(out_path, 'wb') as fp:
        pickle.dump(payload, fp)