### Overview
##### In this file, we apply complex transformations and feature engineering to the processed BigQuery datasets, and store the resulting training examples in Cloud Storage as TFRecords.

### Retrieve datasets from BigQuery

In [None]:
import tensorflow as tf
import pandas as pd
from google.cloud import bigquery
# Single BigQuery client shared by all queries below
bigquery_client = bigquery.Client()

# One full-table query per source table: users, posts, and user-post interactions
USERS_QUERY = "SELECT * FROM `teamu-542ac.user_dataset.users`"
POSTS_QUERY = "SELECT * FROM `teamu-542ac.post_dataset.posts`"
INTERACTIONS_QUERY = "SELECT * FROM `teamu-542ac.interaction_dataset.user_post_interactions`"

# Run each query and load its result set into a pandas DataFrame.
# The query job is consumed immediately, so no shared scratch variable is
# rebound between queries or left behind in the notebook namespace.
users_df = bigquery_client.query(USERS_QUERY).to_dataframe()
posts_df = bigquery_client.query(POSTS_QUERY).to_dataframe()
interactions_df = bigquery_client.query(INTERACTIONS_QUERY).to_dataframe()

### NaN replacements and datetime type conversion

In [None]:
def _as_list(value):
    """Return ``value`` as a plain list, or ``[]`` when it is missing or not list-like."""
    # Array columns may come back as Python lists or as numpy arrays/tuples depending on
    # how the query results were converted, so accept any list-like container here.
    # Strings are not list-like for this check, and dicts are excluded explicitly.
    if pd.api.types.is_list_like(value) and not isinstance(value, dict):
        return list(value)
    return []


def _as_text(value):
    """Return ``value`` as a string, mapping None/NaN and other empty values to ``''``."""
    # NaN is truthy, so a bare truthiness test would turn it into the string 'nan'.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ''
    return str(value) if value else ''


# Fill any missing values (posts without comments) with an empty list
posts_df['comments'] = posts_df['comments'].apply(_as_list)

# Convert the interaction_time timestamp to hour of day and day of week.
# The column is parsed once and reused for both derived features.
interaction_times = pd.to_datetime(interactions_df['interaction_time'])
interactions_df['hour_of_day'] = interaction_times.dt.hour
interactions_df['day_of_week'] = interaction_times.dt.weekday  # 0 = Monday, 6 = Sunday

# Flatten passions and project titles into space-joined strings; users without
# any entries (missing value or empty list) end up with an empty string
users_df['passions'] = users_df['passions'].apply(lambda items: ' '.join(_as_list(items)))
users_df['project_titles'] = users_df['project_titles'].apply(lambda items: ' '.join(_as_list(items)))

# Replace missing bios with empty strings
users_df['bio'] = users_df['bio'].apply(_as_text)

### Tokenize string data with BERT and pad token sequences

In [None]:
# Initialize the BERT tokenizer from the pretrained "bert-base-uncased" checkpoint.
# NOTE(review): `BertTokenizer` is not imported anywhere in the visible source —
# presumably `from transformers import BertTokenizer` is required; confirm and add it.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Function to tokenize text fields using the BERT tokenizer
def tokenize_with_bert(texts, max_length, padding=True):
    """
    Tokenizes input texts using the module-level BERT tokenizer.

    Args:
    texts (str or list of str): The input text data. When a list is given, missing
        entries (None or NaN) are tokenized as empty strings instead of raising.
    max_length (int): The maximum length of tokenized sequences; longer inputs are truncated.
    padding (bool or str): Forwarded unchanged to the tokenizer's ``padding`` argument.
        The default ``True`` keeps the previous behavior (pad to the longest sequence in
        this batch, so the output width depends on the data). Pass ``'max_length'`` to
        pad every sequence to exactly ``max_length`` for a fixed output width.

    Returns:
    The tokenizer output holding TensorFlow tensors (``return_tensors='tf'``); callers in
    this file read its ``'input_ids'`` entry.
    """
    if not isinstance(texts, str):
        # Null values in a text column would otherwise make the tokenizer fail;
        # the rest of this file already treats missing text as an empty string.
        texts = [
            '' if text is None or (isinstance(text, float) and text != text) else text
            for text in texts
        ]

    return bert_tokenizer(
        texts,
        padding=padding,
        truncation=True,
        max_length=max_length,
        return_tensors='tf'
    )

# Per-field caps on the tokenized sequence length for the user text columns
max_length_passions = 10
max_length_project_titles = 10
max_length_bio = 50

# Each user text field goes through the same three steps:
#   1. pull the column out of users_df as a plain Python list
#   2. run it through the BERT tokenizer with that field's length cap
#   3. keep the 'input_ids' entry of the tokenizer output as the padded sequence

# passions
passions_texts = users_df['passions'].to_list()
passions_tokens = tokenize_with_bert(passions_texts, max_length_passions)
passions_padded = passions_tokens['input_ids']

# project_titles
project_titles_texts = users_df['project_titles'].to_list()
project_titles_tokens = tokenize_with_bert(project_titles_texts, max_length_project_titles)
project_titles_padded = project_titles_tokens['input_ids']

# bio
bio_texts = users_df['bio'].to_list()
bio_tokens = tokenize_with_bert(bio_texts, max_length_bio)
bio_padded = bio_tokens['input_ids']

### Processing Post Data with BERT ###

# Get post title, description, and comments text data.
# Titles and descriptions are not null-cleaned anywhere earlier in this file, so missing
# values are replaced with empty strings here before the text reaches the tokenizer
# (same convention as the user text fields).
title_texts = posts_df['title'].fillna('').astype(str).tolist()
description_texts = posts_df['description'].fillna('').astype(str).tolist()
# Each post's comments are flattened into one space-joined string; non-list values become ""
comments_texts = [" ".join(comment_list) if isinstance(comment_list, list) else "" for comment_list in posts_df['comments']]

# Maximum tokenized sequence length for each post field
max_length_title = 10
max_length_description = 100
max_length_comments = 100

# Generate tokens for post fields
title_tokens = tokenize_with_bert(title_texts, max_length=max_length_title)
description_tokens = tokenize_with_bert(description_texts, max_length=max_length_description)
comments_tokens = tokenize_with_bert(comments_texts, max_length=max_length_comments)

# For padded sequences, use the 'input_ids' returned by the tokenizer
title_padded = title_tokens['input_ids']
description_padded = description_tokens['input_ids']
comments_padded = comments_tokens['input_ids']

### Apply RBF transformation to user location data

In [None]:
from shapely.geometry import Point
from shapely.wkt import loads as load_wkt
from shapely import wkb
import numpy as np
from sklearn.preprocessing import StandardScaler

def _decode_location(wkb_hex):
    """Decode a hex-encoded WKB value into a shapely geometry; null values become Point(1, 1)."""
    if pd.notnull(wkb_hex):
        return wkb.loads(bytes.fromhex(wkb_hex))
    # NOTE(review): Point(1, 1) is the placeholder used for users without a location;
    # confirm that this default coordinate is intended.
    return Point(1, 1)


def _h3_cell_for(row):
    """Return the resolution-8 H3 cell for a row, or None when either coordinate is null."""
    has_coordinates = pd.notnull(row['latitude']) and pd.notnull(row['longitude'])
    if not has_coordinates:
        return None
    return h3.latlng_to_cell(row['latitude'], row['longitude'], 8)


# Replace every lat_long value with a geometry object (decoded, or the default Point)
users_df['lat_long'] = users_df['lat_long'].apply(_decode_location)

# Split the geometries into separate coordinate columns (y -> latitude, x -> longitude);
# a falsy geometry yields None for both
users_df['latitude'] = users_df['lat_long'].apply(lambda geom: geom.y if geom else None)
users_df['longitude'] = users_df['lat_long'].apply(lambda geom: geom.x if geom else None)

# Optional geospatial representation: H3 index computed from the raw coordinates.
# NOTE(review): `h3` is not imported anywhere in the visible source, and the
# 'h3_location' column is overwritten by the RBF step further down in this file —
# confirm whether this step is still needed.
users_df['h3_location'] = users_df.apply(_h3_cell_for, axis=1)

# Step 1: Standardize latitude and longitude in place with StandardScaler
coordinate_columns = ['latitude', 'longitude']
scaler = StandardScaler()
users_df[coordinate_columns] = scaler.fit_transform(users_df[coordinate_columns])

# Step 2: Apply Radial Basis Function (RBF) Transformation
def rbf_transform(lat, lon, centers, gamma=0.1):
    """Compute Gaussian RBF activations of one coordinate pair against a set of centers.

    Each output entry is ``exp(-gamma * d ** 2)``, where ``d`` is the Euclidean
    distance between ``(lat, lon)`` and the corresponding row of ``centers``.

    Args:
        lat (float): Latitude value.
        lon (float): Longitude value.
        centers (np.array): RBF centers, one ``(lat, lon)`` pair per row.
        gamma (float): Decay rate; larger values make the response fall off
            faster with distance.

    Returns:
        np.array: One RBF value per center, in the order of ``centers``.
    """
    point = np.array([lat, lon])
    # Row-wise offsets from every center to the query point
    offsets = centers - point
    center_distances = np.linalg.norm(offsets, axis=1)
    return np.exp(-gamma * np.square(center_distances))

# Step 3: Choose RBF Centers
# Centers are drawn from the users' own latitude/longitude values. The draw is seeded so
# the same centers (and therefore the same RBF features) are produced on every run.
num_centers = 5
rbf_center_seed = 42
# Rows with a missing coordinate cannot serve as a center
center_candidates = users_df[['latitude', 'longitude']].dropna()
centers = center_candidates.sample(
    # Never request more centers than there are candidate rows
    n=min(num_centers, len(center_candidates)),
    random_state=rbf_center_seed,
).to_numpy()

# Step 4: Apply RBF Transformation to All Users
# Each user gets an array with one RBF value per center, stored in 'h3_location'
users_df['h3_location'] = users_df.apply(
    lambda row: rbf_transform(row['latitude'], row['longitude'], centers), axis=1
)

### Post numerical feature normalization and further NaN filling

In [None]:
# Standardize the numeric post features (post length, vote/upvote/view/comment counts,
# average time viewed) in place with a fresh StandardScaler
post_numeric_columns = [
    'post_length', 'vote_count', 'upvote_count',
    'view_count', 'comment_count', 'avg_time_viewed',
]
scaler = StandardScaler()
posts_df[post_numeric_columns] = scaler.fit_transform(posts_df[post_numeric_columns])

# Users with no interactions: missing values in the per-user post columns become 0
user_post_columns = ['viewed_posts', 'upvoted_posts', 'commented_posts']
users_df[user_post_columns] = users_df[user_post_columns].fillna(0)

# Missing numeric interaction values default to 0, one column at a time
for numeric_column in ("view_duration_secs", "comment_length", "hour_of_day", "day_of_week"):
    interactions_df[numeric_column] = interactions_df[numeric_column].fillna(0)

# Missing interaction types get the placeholder label 'unknown'
interactions_df["interaction_type"] = interactions_df["interaction_type"].fillna('unknown')

### Making sure to assign padded embeddings to dataframes

In [None]:
def _token_rows(token_ids):
    """Convert a batch of token ids into a list holding one list of ints per row."""
    return np.asarray(token_ids).tolist()


# A DataFrame column stores one value per row, so the 2-D token-id batches
# (assumed shape: rows x sequence length — TODO confirm) cannot be assigned to a
# single column as-is. Each batch is split into per-row lists first.
users_df['passions'] = _token_rows(passions_padded)
users_df['project_titles'] = _token_rows(project_titles_padded)
users_df['bio'] = _token_rows(bio_padded)

posts_df['title'] = _token_rows(title_padded)
posts_df['description'] = _token_rows(description_padded)
posts_df['comments'] = _token_rows(comments_padded)

### Use tfrecorder library to convert dataframes to TFRecords and store in GCS buckets

In [None]:
import tfrecorder

# Export each DataFrame to TFRecords through the `tensorflow` DataFrame accessor
# (presumably registered by the `tfrecorder` import above — confirm).
# NOTE(review): the output directories are absolute local-style paths rather than
# gs:// URIs, although this section is meant to write to GCS buckets — confirm.
for frame, output_directory in (
    (users_df, '/user_bucket'),
    (posts_df, '/post_bucket'),
    (interactions_df, '/interaction_bucket'),
):
    frame.tensorflow.to_tfr(
        output_dir=output_directory,
        project='teamu-542ac',
        region='us-central1')