# Load Libraries

In [None]:
import os
import numpy as np
from sklearn.datasets import load_svmlight_file
# from tensorflow_serving.apis import input_pb2 # KEEP THIS COMMENTED OUT FIRST!!
import tensorflow as tf
import tensorflow_ranking as tfr

In [None]:
import modin.pandas as pd

In [None]:
from tensorflow_serving.apis import input_pb2

# Feature List Extraction and Data Preprocessing

In [None]:
def preprocess_features(features_path):
    """Load the feature description file and derive one column name per feature.

    The CSV is read with its first line as a throw-away header; the first data
    row holds the real column titles (spaces are turned into underscores).
    The resulting table is expected to expose ``feature_id``,
    ``feature_description`` and ``stream`` columns.

    Args:
        features_path: Path to the features CSV file.

    Returns:
        The features dataframe with ``feature_description`` forward-filled,
        ``feature_id`` converted to strings, and a new ``cols`` column equal to
        ``<feature_description>_<stream>`` when a stream is present, or just
        ``<feature_description>`` otherwise.
    """
    # Read in the features file
    raw = pd.read_csv(features_path)

    # The first data row carries the real header; replace spaces with underscores
    new_header = raw.iloc[0].str.replace(' ', '_', regex=False)

    # Drop that row and work on an explicit copy so the assignments below
    # never write into a view of the raw frame
    features = raw[1:].copy()
    features.columns = list(new_header)

    # Only the first cell of each category is filled; forward filling maps
    # every sub-category row (stream column) to its category
    features['feature_description'] = features['feature_description'].ffill()

    # Remember which rows really have a stream before the column is stringified
    has_stream = features['stream'].notna()
    features['stream'] = features['stream'].astype(str)

    # Replace characters TensorFlow feature names do not accept. regex=False
    # keeps '(', ')' and '*' literal regardless of the pandas default.
    character_removal = [' ', '(', ')', '*']
    for char in character_removal:
        features['feature_description'] = features['feature_description'].str.replace(
            char, '_', regex=False)
        features['stream'] = features['stream'].str.replace(char, '_', regex=False)

    # String ids so they can be matched against stringified dataset column labels
    features['feature_id'] = features['feature_id'].astype(str)

    # Build the new column names in one vectorized step (no chained
    # ``features['cols'].iloc[idx] = ...`` assignment, which is not guaranteed
    # to write through to the dataframe)
    with_stream = features['feature_description'] + '_' + features['stream']
    features['cols'] = features['feature_description'].mask(has_stream, with_stream)

    return features

In [None]:
# Get the current working directory; every data path in this notebook is
# built relative to it
current_working_directory = os.getcwd()

# Construct the path to the features.csv file (expected under ./data)
features_path = os.path.join(current_working_directory, "data", "features.csv")

# Run feature preprocessing; features_df carries the 'feature_id' -> 'cols'
# mapping produced by preprocess_features
features_df=preprocess_features(features_path)

In [None]:
def replace_relevance_qid(df):
    """Name the two leading columns and shift the remaining integer labels down by one.

    Column ``0`` becomes ``'relevance'`` and column ``1`` becomes ``'qid'``.
    Every other integer column label ``i`` is renamed to ``i - 1``.
    The dataframe is modified in place and also returned.
    """
    df.rename(columns={0: 'relevance', 1: 'qid'}, inplace=True)

    # Collect the shifted labels for whatever integer columns are left
    shifted_labels = {}
    for label in df.columns:
        if isinstance(label, int) and label not in [0, 1]:
            shifted_labels[label] = label - 1

    df.rename(columns=shifted_labels, inplace=True)
    return df

def drop_column_137(df):
    """Drop the column labelled ``137`` in place, if the dataframe has one.

    When the column is absent a message is printed and the dataframe is
    returned untouched.
    """
    if 137 not in df.columns:
        print("Column 137 does not exist in the dataframe.")
        return df

    df.drop(columns=[137], inplace=True)
    return df

def rename_cols(df, features):
    """Rename the feature columns of ``df`` using the feature description table.

    Every column except ``'relevance'`` and ``'qid'`` is looked up by its
    stringified label in ``features['feature_id']`` and renamed to the matching
    ``features['cols']`` value. The dataframe is modified in place and returned.

    Args:
        df: Dataset whose feature columns are labelled with feature ids.
        features: Table with ``feature_id`` (strings) and ``cols`` columns.

    Returns:
        The same dataframe object with renamed columns.

    Raises:
        KeyError: If a column has no matching ``feature_id``.
    """
    # Build the id -> name lookup once. setdefault keeps the first match when
    # an id is duplicated. The lookup is independent of the table's index
    # labels, unlike ``series[0]`` which is a label lookup and fails when the
    # index does not contain 0.
    name_by_id = {}
    for feature_id, col_name in zip(features['feature_id'], features['cols']):
        name_by_id.setdefault(feature_id, col_name)

    renames = {}
    for col in df.columns:
        if col != 'relevance' and col != 'qid':
            try:
                renames[col] = name_by_id[str(col)]
            except KeyError:
                raise KeyError(
                    f"No feature description found for column {col!r}") from None

    # A single rename instead of one in-place rename per column
    df.rename(columns=renames, inplace=True)

    return df

def replace_colon_values(df):
    """Strip the ``key:`` prefix from every string cell that contains a colon.

    For a string cell such as ``'12:0.5'`` the piece between the first and the
    second colon (``'0.5'``) is kept; all other cells are left as they are.
    The dataframe is updated column by column and returned.
    """
    def _value_after_colon(cell):
        # Only string cells with a colon are rewritten
        if isinstance(cell, str) and ':' in cell:
            return cell.split(':')[1]
        return cell

    for name in df.columns:
        df[name] = df[name].apply(_value_after_colon)
    return df


def full_preprocess_pipeline(df, features):
    """Run every preprocessing step on a raw fold dataframe, in order.

    Steps: name the relevance/qid columns, drop column 137 (entirely null
    according to the original author's note), rename feature columns from the
    ``features`` table, then strip the ``key:`` prefixes from the cell values.

    Returns the processed dataframe.
    """
    # Steps that only need the dataframe itself
    for step in (replace_relevance_qid, drop_column_137):
        df = step(df)

    # Map feature ids to descriptive names, then clean the cell values
    df = rename_cols(df, features)
    return replace_colon_values(df)


In [None]:
# Base directory path
data_dir = os.path.join(current_working_directory, "data")

# Folders within the base directory
folders = [f'Fold{i}' for i in range(1, 6)]

# Preprocess every raw '.txt' split in every fold and store the result next to
# it as '<name>_preprocessed.csv'
for folder in folders:
    folder_path = os.path.join(data_dir, folder)
    for filename in os.listdir(folder_path):

        file_path = os.path.join(folder_path, filename)

        # Skip anything that is not a raw split (e.g. previously written CSVs)
        if not (os.path.isfile(file_path) and file_path.endswith('.txt')):
            continue

        print(f"On: {filename}")

        # Read the file (space separated, no header row)
        df = pd.read_csv(file_path, sep=" ", header=None)

        # Preprocess the dataframe. The feature table lives in `features_df`;
        # the name `features` only exists inside preprocess_features.
        df = full_preprocess_pipeline(df, features_df)

        print(df.head())

        # Swap only the file extension; str.replace would also rewrite any
        # '.txt' occurring elsewhere in the path
        preprocessed_file_path = os.path.splitext(file_path)[0] + '_preprocessed.csv'

        # Save the preprocessed dataframe
        df.to_csv(preprocessed_file_path, index=False)


# Building TFRecords

In [None]:
def output_to_path(df, file_name):
    """Write ``df`` as CSV to ``<cwd>/data/Combined/<file_name>``.

    The ``Combined`` directory is created when missing. The dataframe index is
    written as the first CSV column (``to_csv`` default).

    Args:
        df: Dataframe to save.
        file_name: Name of the CSV file inside the ``Combined`` directory.
    """
    # Directory that holds the combined train/test outputs
    combined_path = os.path.join(current_working_directory, "data", "Combined")

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(combined_path, exist_ok=True)

    df.to_csv(os.path.join(combined_path, file_name))

In [None]:
# Root folder holding the Fold1..Fold5 directories
fold_path = os.path.join(current_working_directory, "data")


def _read_fold_splits(fold_number):
    """Return the (train, vali, test) preprocessed dataframes of one fold."""
    fold_dir = f"{fold_path}/Fold{fold_number}"
    return tuple(
        pd.read_csv(f"{fold_dir}/{split}_preprocessed.csv")
        for split in ("train", "vali", "test")
    )


# One train/val/test triple per fold, read in fold order
f1_train_df, f1_val_df, f1_test_df = _read_fold_splits(1)
f2_train_df, f2_val_df, f2_test_df = _read_fold_splits(2)
f3_train_df, f3_val_df, f3_test_df = _read_fold_splits(3)
f4_train_df, f4_val_df, f4_test_df = _read_fold_splits(4)
f5_train_df, f5_val_df, f5_test_df = _read_fold_splits(5)

In [None]:
# Stack the per-fold frames of each split into a single train/val/test dataframe.
# NOTE(review): only folds 1-3 are combined although folds 4 and 5 are loaded
# above - confirm this is intentional. If the folds are rotations of the same
# underlying partitions, rows may also be shared between the combined splits.
def _stack_rows(frames):
    """Concatenate dataframes row-wise with a fresh 0..n-1 index."""
    stacked = pd.concat(frames, ignore_index=True, axis=0)
    return stacked.reset_index(drop=True)


train_df = _stack_rows([f1_train_df, f2_train_df, f3_train_df])
val_df = _stack_rows([f1_val_df, f2_val_df, f3_val_df])
test_df = _stack_rows([f1_test_df, f2_test_df, f3_test_df])


In [None]:
# Fold the validation rows into the training set (the test set is kept separate)
train_df = pd.concat([train_df, val_df], axis=0, ignore_index=True)
train_df = train_df.reset_index(drop=True)

# Write both splits to the 'Combined' directory
for split_frame, csv_name in ((train_df, "train.csv"), (test_df, "test.csv")):
    output_to_path(split_frame, csv_name)

In [None]:
# Reload the combined splits from disk. index_col=0 consumes the index column
# that output_to_path wrote (it calls to_csv without index=False).
train_df=pd.read_csv(f"{fold_path}/Combined/train.csv", index_col=0)
test_df=pd.read_csv(f"{fold_path}/Combined/test.csv", index_col=0)

# Build Tensorflow Records

In [None]:
def float_feature(value):
    """Wrap a single value in a ``tf.train.Feature`` backed by a one-element FloatList."""
    wrapped = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=wrapped)

def int64_feature(value):
    """Wrap a single value in a ``tf.train.Feature`` backed by a one-element Int64List."""
    wrapped = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=wrapped)

def build_tfrs(df, features_df, file_name, output_dir=None):
    """Serialize a ranking dataframe into a TFRecord file of ExampleListWithContext.

    Each row becomes a ``tf.train.Example`` whose float features are named
    after ``features_df['cols']`` (paired positionally with the dataframe
    columns from the third one onwards) plus an int64 ``relevance_label``.
    Consecutive rows sharing the same ``qid`` are grouped into one
    ``ExampleListWithContext`` record; a change of ``qid`` starts a new record.

    Args:
        df: Dataframe with ``relevance`` and ``qid`` columns followed by the
            feature columns.
        features_df: Feature table whose ``cols`` column holds the feature
            names, in the same order as the feature columns of ``df``.
        file_name: Name of the TFRecord file to create.
        output_dir: Directory to write into. Defaults to
            ``<current working directory>/data/Combined`` instead of a
            machine-specific absolute path; created when missing.
    """
    if output_dir is None:
        output_dir = os.path.join(os.getcwd(), "data", "Combined")
    os.makedirs(output_dir, exist_ok=True)
    output_file_path = os.path.join(output_dir, file_name)

    relevance_values = df['relevance'].to_numpy()
    query_ids = df['qid'].to_numpy()
    feature_matrix = df.iloc[:, 2:].to_numpy()

    # Feature names, paired positionally with the feature columns.
    # NOTE(review): zip() silently truncates if the number of names and the
    # number of feature columns differ - confirm they always match.
    labels = list(features_df['cols'])

    # The writer is intentionally uncompressed: the training pipeline reads the
    # files with a plain tf.data.TFRecordDataset.
    with tf.io.TFRecordWriter(output_file_path) as writer:

        # Example list of the query currently being accumulated
        elwc = None
        last_query_id = None

        for relevance_label, query_id, features in zip(
                relevance_values, query_ids, feature_matrix):

            # Map each feature name to its value
            example_dict = {
                f'{feat_name}': float_feature(float(feat_val))
                for feat_name, feat_val in zip(labels, features)
            }

            # Add in the relevance label as an int64
            example_dict['relevance_label'] = int64_feature(int(relevance_label))

            example = tf.train.Example(
                features=tf.train.Features(feature=example_dict))

            # A new qid closes the previous query's list and opens a fresh one
            if elwc is None or query_id != last_query_id:
                if elwc is not None:
                    writer.write(elwc.SerializeToString())
                elwc = input_pb2.ExampleListWithContext()
                last_query_id = query_id

            elwc.examples.append(example)

        # Write the final query (nothing is written for an empty dataframe)
        if elwc is not None:
            writer.write(elwc.SerializeToString())

    print ("Done outputing to tfrecord")
    

In [None]:
# Serialize the combined train and test splits, one TFRecord file each
for split_frame, record_name in ((train_df, "train.tfrecords"),
                                 (test_df, "test.tfrecords")):
    build_tfrs(split_frame, features_df, record_name)

# Build Out Pre-training Pipeline

In [None]:
# ============ INPUT CREATOR ====================

# No per-query (context) features are used
context_feature_spec = {}

# One scalar float feature per name in features_df['cols'], defaulting to 0.0
example_spec = {
    feat: tf.io.FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=0.0)
    for feat in list(features_df['cols'])
}

# (label name, spec) pair; examples without a label default to -1
label_spec = (
    'relevance_label',
    tf.io.FixedLenFeature(shape=(1,), dtype=tf.int64, default_value=-1),
)

input_creator = tfr.keras.model.FeatureSpecInputCreator(
    context_feature_spec,
    example_spec,
)

In [None]:
# ============ PREPROCESSOR ====================

def _signed_log1p(t):
    """Return sign(t) * log1p(t * sign(t)), a log1p transform that keeps the sign."""
    return tf.math.log1p(t * tf.sign(t)) * tf.sign(t)


# Every example feature gets the same signed log1p transformation
preprocessor_specs = {name: _signed_log1p for name in example_spec.keys()}

In [None]:
# ============ SCORER ====================
# Feed-forward scorer: hidden layers of 64, 32 and 16 units with ReLU
# activation and batch normalization, producing a single output unit.
scorer = tfr.keras.model.DNNScorer(
    hidden_layer_dims=[64, 32, 16],
    output_units=1,
    activation=tf.nn.relu,
    use_batch_norm=True)

In [None]:
# ============ MODEL STRUCTURE ====================
# Ties together the input creator, the per-feature preprocessing functions and
# the scorer. The mask feature name "list_mask" is the same one passed to the
# dataset builder further down.
model_builder = tfr.keras.model.ModelBuilder(
    input_creator=input_creator,
    preprocessor=tfr.keras.model.PreprocessorWithSpec(preprocessor_specs),
    scorer=scorer,
    mask_feature_name="list_mask",
    name="model_builder",
)

In [None]:
# ======= DATASET  HYPERPARAMETERS ==========
# The directory is spelled "Combined" (capital C), matching the directory the
# combined outputs are written to; a lowercase "combined" only resolves on
# case-insensitive filesystems.
combined_train_path = os.path.join(current_working_directory, "data", "Combined", "train.tfrecords")
combined_test_path = os.path.join(current_working_directory, "data", "Combined", "test.tfrecords")

# NOTE(review): the test TFRecords are used as the validation input here -
# confirm that validating on the test split is intended.
dataset_hparams = tfr.keras.pipeline.DatasetHparams(
    train_input_pattern=combined_train_path,
    valid_input_pattern=combined_test_path,
    train_batch_size=32,
    valid_batch_size=32,
    list_size=50,
    dataset_reader=tf.data.TFRecordDataset)

# ======= DATASET BUILDER ==========
# Uses the same feature specs and mask feature name as the model builder
dataset_builder = tfr.keras.pipeline.SimpleDatasetBuilder(
    context_feature_spec,
    example_spec,
    mask_feature_name="list_mask",
    label_spec=label_spec,
    hparams=dataset_hparams)

In [None]:
# ======= MODEL HYPERPARAMETERS ==========
# Model artifacts go into the same "Combined" directory (capital C) that holds
# the combined data, so case-sensitive filesystems do not end up with a second
# "combined" directory.
combined_path = os.path.join(current_working_directory, "data", "Combined")

pipeline_hparams = tfr.keras.pipeline.PipelineHparams(
    model_dir=combined_path,
    num_epochs=5,
    steps_per_epoch=1000,
    validation_steps=100,
    learning_rate=0.05,
    loss="approx_ndcg_loss",
    strategy="MirroredStrategy")

In [None]:
# ======= RANKING PIPELINE ==========
# Combine the model builder, the dataset builder and the training
# hyperparameters into one pipeline object
ranking_pipeline = tfr.keras.pipeline.SimplePipeline(
    model_builder,
    dataset_builder=dataset_builder,
    hparams=pipeline_hparams)

In [None]:
# ======= TRAIN RANKING PIPELINE ==========
# Run training and validation with the settings from pipeline_hparams
# (verbose=1 requests progress output)
ranking_pipeline.train_and_validate(verbose=1)

In [None]:
# ======= RUN ON TEST SET ==========

def compute_ndcg(dataset, model):
    """Accumulate NDCG over every ``(features, labels)`` batch of ``dataset``.

    Within each batch, entries whose label is negative receive a score just
    below the batch minimum before the metric is updated (presumably these are
    padded or unlabeled items, given the label default of -1 - confirm).

    Returns the final metric value as a NumPy scalar.
    """
    metric = tfr.keras.metrics.NDCGMetric(name="ndcg_metric")
    for batch_features, batch_labels in dataset:
        predictions = model.predict(batch_features)
        lowest = tf.reduce_min(predictions)
        has_label = tf.greater_equal(batch_labels, 0.)
        masked_scores = tf.where(has_label, predictions, lowest - 1e-5)
        metric.update_state(y_true=batch_labels, y_pred=masked_scores)
    return metric.result().numpy()

# The "valid" dataset of the builder was configured with the test TFRecords
# (valid_input_pattern=combined_test_path), so this is the test set
ds_test = dataset_builder.build_valid_dataset()

# Get input features from the first batch of the test data
# NOTE(review): x and y are not used again in this cell - presumably kept for
# interactive inspection; confirm or remove
for x, y in ds_test.take(1):
    break

# Load the model from the export directory under the pipeline's model_dir
loaded_model = tf.keras.models.load_model(f"{combined_path}/export/latest_model")

# Compute NDCG for the test set
ndcg_score = compute_ndcg(ds_test, loaded_model)
print("NDCG Score on Test Set: ", ndcg_score)