# Getting Datasets from PyTorch Geometric

In [4]:
from torch_geometric.datasets import AmazonBook

In [7]:
# Instantiate the AmazonBook dataset, keeping its files under ./amazonbook.
dataset = AmazonBook(root='./amazonbook')

Processing...
Done!


In [None]:
# Display the first (index 0) item of the dataset.
dataset[0]

In [None]:
# Display the feature count reported by the dataset object.
dataset.num_features

In [None]:
# Fetch the first item once and reuse it for every printout in this cell.
_amazon_graph = dataset[0]
print(_amazon_graph)

# Report the node count for each of the two node types.
print(f"User nodes: {_amazon_graph['user'].num_nodes}")
print(f"Book nodes: {_amazon_graph['book'].num_nodes}")

In [None]:

# Show the edge indices for both directions of the user/book relation.
# Each lookup is done right before its print, so the output order is unchanged.
_rates_edges = dataset[0]['user', 'rates', 'book']['edge_index']
print(f"Edge index (user -> book): \n {_rates_edges}")
_rated_by_edges = dataset[0]['book', 'rated_by', 'user']['edge_index']
print(f"Edge index (book -> user): \n {_rated_by_edges}")


In [9]:
# Keep the first item of the dataset under a short name for the cells below.
data = dataset[0]

In [None]:
# List the node types and edge types present in the graph object.
print("Node types:", data.node_types)
print("Edge types:", data.edge_types)


In [None]:
# Key identifying the user -> book relation in the graph.
_rates_edge_type = ('user', 'rates', 'book')

# Look up that relation's edge index and print it.
user_to_book_edge_index = data[_rates_edge_type]['edge_index']
print('Edge index (user -> book):\n', user_to_book_edge_index)


In [8]:
from torch_geometric.datasets import MovieLens1M

In [9]:
# Instantiate the MovieLens-1M dataset, keeping its files under ./movielens.
# NOTE: the name `df` is rebound to the BigQuery query result by a later cell,
# so that cell must not be run before `movies = df[0]`.
df = MovieLens1M(root='./movielens')

Downloading https://files.grouplens.org/datasets/movielens/ml-1m.zip
Extracting movielens/ml-1m.zip
Processing...
Done!


In [11]:
# Take the first item of the MovieLens dataset. Despite the name, the repr
# displayed by the next cell is a HeteroData object holding both movie and
# user nodes plus the rating edges between them.
movies = df[0]

In [12]:
# Display the graph summary (feature shapes per node type, attributes per edge type).
movies

HeteroData(
  movie={ x=[3883, 18] },
  user={ x=[6040, 30] },
  (user, rates, movie)={
    edge_index=[2, 1000209],
    rating=[1000209],
    time=[1000209],
  },
  (movie, rated_by, user)={
    edge_index=[2, 1000209],
    rating=[1000209],
    time=[1000209],
  }
)

# Connecting to Google BigQuery

In [27]:
from google.cloud import bigquery
from google.oauth2 import service_account

In [30]:
import os

# Location of the service-account key file. The GOOGLE_APPLICATION_CREDENTIALS
# environment variable takes precedence so the path is not tied to one
# checkout; the original file name is kept as the fallback so existing setups
# keep working. `or` also covers the variable being set but empty.
key_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") or "cpsc483-e259a971f86e.json"

# Build explicit credentials from the key file, then create a BigQuery client
# bound to the project recorded in those credentials.
credentials = service_account.Credentials.from_service_account_file(key_path)
client = bigquery.Client(credentials=credentials, project=credentials.project_id)


In [31]:
# Names of the BigQuery dataset and table referenced by the cells below.
dataset_id, table_id = "final_project", "taobao"

In [32]:
# Build a reference to the table identified by dataset_id / table_id.
# `Client.dataset()` is deprecated in google-cloud-bigquery, so the
# DatasetReference is constructed directly; passing client.project reproduces
# the default project that the deprecated helper applied.
table_ref = bigquery.DatasetReference(client.project, dataset_id).table(table_id)

In [33]:
# Fetch the table that table_ref points to from BigQuery.
table = client.get_table(table_ref)

In [37]:
# Maximum number of rows to pull; a named constant so it is easy to tune.
ROW_LIMIT = 10000

# NOTE(review): the table path repeats the dataset/table names already defined
# in an earlier cell (dataset_id, table_id); keep the two in sync.
query = f"Select * from cpsc483.final_project.taobao limit {ROW_LIMIT}"

In [38]:
# Run the query and load its result into a DataFrame.
# NOTE: this rebinds `df`, which an earlier cell used for the MovieLens dataset.
query_job = client.query(query)  # Make an API request to execute the query
df = query_job.to_dataframe() 

In [39]:
# Display the query result; the rendered output elides the middle rows.
df

Unnamed: 0,int64_field_0,int64_field_1,int64_field_2,string_field_3,int64_field_4
0,295176,588711,721664,pv,1511866603
1,398393,3214715,721664,pv,1511969910
2,139186,1112150,721664,pv,1511894242
3,42193,2503879,721664,pv,1511671762
4,17532,4619479,721664,pv,1512054627
...,...,...,...,...,...
9995,797396,383797,4244487,pv,1511875079
9996,938143,4806800,4244487,pv,1511938787
9997,569927,3593392,4244487,pv,1511624257
9998,578059,3706904,4244487,pv,1512046788
