# Getting Datasets from PyTorch Geometric

In [4]:
from torch_geometric.datasets import AmazonBook

In [7]:
# Build the AmazonBook dataset, using ./amazonbook as its root directory.
dataset = AmazonBook(root='./amazonbook')

Processing...
Done!


In [None]:
# Display the first (index 0) element of the dataset.
dataset[0]

In [None]:
# Display the dataset's `num_features` attribute.
dataset.num_features

In [None]:
# Print the first (index 0) element of the dataset
print(dataset[0])

# Report the node count stored for the 'user' and 'book' node types
print(f"User nodes: {dataset[0]['user'].num_nodes}")
print(f"Book nodes: {dataset[0]['book'].num_nodes}")

In [None]:

# Inspect the edge indices for both edge types between users and books:
# ('user', 'rates', 'book') and ('book', 'rated_by', 'user').
print(f"Edge index (user -> book): \n {dataset[0]['user', 'rates', 'book']['edge_index']}")
print(f"Edge index (book -> user): \n {dataset[0]['book', 'rated_by', 'user']['edge_index']}")


In [9]:
# Bind the first element of the dataset to `data` so the following cells
# can use it without indexing the dataset again.
data = dataset[0]

In [None]:
# List the node types and edge types present in `data`.
print("Node types:", data.node_types)
print("Edge types:", data.edge_types)


In [None]:
# Pull the edge index stored on the ('user', 'rates', 'book') edge type and show it.
user_to_book_edge_index = data['user', 'rates', 'book'].edge_index
print('Edge index (user -> book):\n', user_to_book_edge_index)


In [8]:
from torch_geometric.datasets import MovieLens1M

In [9]:
# Build the MovieLens1M dataset, using ./movielens as its root directory.
# NOTE(review): `df` holds a dataset object here, not a DataFrame; the same
# name is rebound to a BigQuery result DataFrame further down the notebook.
df = MovieLens1M(root='./movielens')

Downloading https://files.grouplens.org/datasets/movielens/ml-1m.zip
Extracting movielens/ml-1m.zip
Processing...
Done!


In [11]:
# First element of the MovieLens1M dataset. Despite the name, the displayed
# output shows a HeteroData holding both 'movie' and 'user' nodes plus their edges.
movies = df[0]

In [12]:
# Display the structure of the MovieLens graph object.
movies

HeteroData(
  movie={ x=[3883, 18] },
  user={ x=[6040, 30] },
  (user, rates, movie)={
    edge_index=[2, 1000209],
    rating=[1000209],
    time=[1000209],
  },
  (movie, rated_by, user)={
    edge_index=[2, 1000209],
    rating=[1000209],
    time=[1000209],
  }
)

In [14]:
from torch_geometric.datasets import Taobao

In [15]:
# Build the Taobao dataset with 'taobao' as its root directory. This rebinds
# `dataset`, which previously held the AmazonBook dataset.
# NOTE(review): force_reload=True presumably repeats the processing step on
# every run of this cell — confirm it is still needed.
dataset = Taobao(root='taobao', force_reload=True)

Downloading https://alicloud-dev.oss-cn-hangzhou.aliyuncs.com/UserBehavior.csv.zip
Extracting taobao/raw/UserBehavior.csv.zip
Processing...
Done!


# Connecting to Google BigQuery

In [40]:
from google.cloud import bigquery
from google.oauth2 import service_account

In [41]:
import os

# Path to the service-account key file. The GOOGLE_APPLICATION_CREDENTIALS
# environment variable takes precedence, so the notebook can run on another
# machine without editing this cell; the original file name is the fallback.
key_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", "cpsc483-49e73bf4d565.json")
credentials = service_account.Credentials.from_service_account_file(key_path)
# Create the BigQuery client with those credentials, using the project id
# exposed by the loaded credentials object.
client = bigquery.Client(credentials=credentials, project=credentials.project_id)


In [42]:
# BigQuery dataset and table names used to build the table reference below.
dataset_id = "final_project"
table_id = "taobao"

In [43]:
# Reference to <client project>.<dataset_id>.<table_id>.
# Built with DatasetReference because Client.dataset() is deprecated in
# google-cloud-bigquery; the project is the client's own, as before.
table_ref = bigquery.DatasetReference(client.project, dataset_id).table(table_id)

In [44]:
# Look up the table behind `table_ref` through the client.
table = client.get_table(table_ref)

In [45]:
# Query for up to 10,000 rows of the Taobao table.
# NOTE(review): there is no ORDER BY, so which rows come back is presumably
# not guaranteed to be stable between runs — confirm that is acceptable.
query = "Select * from cpsc483.final_project.taobao limit 10000"

In [46]:
# Submit the sample query; `query_job` is rebound by later query cells.
query_job = client.query(query)  # Make an API request to execute the query

In [47]:
# Convert the query result into a DataFrame. This rebinds `df`, which
# previously held the MovieLens1M dataset object.
df = query_job.to_dataframe()



In [48]:
# Display the sampled Taobao rows.
df

Unnamed: 0,userId,itemId,categoryId,behaviorType,timestamp
0,580045,4664713,721664,pv,1512276886
1,832241,808112,721664,pv,1512282159
2,575635,854934,721664,pv,1511710445
3,805537,1217839,721664,pv,1512287749
4,904909,3211819,721664,pv,1511705511
...,...,...,...,...,...
9995,299908,2608238,4244487,pv,1512227359
9996,746029,4897873,4244487,pv,1512138805
9997,952799,261976,4244487,pv,1511628515
9998,551540,3379970,4244487,pv,1512122057


# Node Degree Distribution 

Out-Degree for Users: Number of items each user has interacted with.

In-Degree for Items (from Users): Number of users who have interacted with each item.

Out-Degree for Items: Number of categories each item belongs to.

In-Degree for Categories: Number of items in each category.


In [52]:
# Per-user out-degree: for each userId, count the distinct itemIds it appears with.
outdegreeUsersQ = """
SELECT userId, COUNT(DISTINCT itemId) AS out_degree
FROM cpsc483.final_project.taobao
GROUP BY userId;
"""

In [53]:
# Run the out-degree query (rebinding `query_job`) and load the result
# into a DataFrame with one row per userId.
query_job = client.query(outdegreeUsersQ)
outdegreeUsers = query_job.to_dataframe()



In [54]:
# (rows, columns) of the out-degree table; rows correspond to distinct userIds.
outdegreeUsers.shape

(987994, 2)

In [57]:
# Show users ordered from highest to lowest out-degree.
outdegreeUsers.sort_values('out_degree', ascending=False)

Unnamed: 0,userId,out_degree
108371,435009,710
297571,208813,699
256158,503757,698
105645,419460,687
239005,54206,681
...,...,...
294793,489899,1
179696,195014,1
65176,190075,1
294601,350058,1


In [60]:
import numpy as np

In [61]:
# Mean number of distinct items per user.
np.mean(outdegreeUsers['out_degree'])

76.89281412640157

In [62]:
# Median number of distinct items per user.
np.median(outdegreeUsers['out_degree'])

58.0