In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from datetime import datetime, timedelta
import itertools
import os
os.environ["OMP_NUM_THREADS"] = "4"

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [3]:
df = pd.read_csv('agg_time_results_fbp_coop.csv')

In [4]:
df['model_name'] = df['model_name'].replace('tars_xmt_final_sum', 'txmt')

In [5]:
df.head()

Unnamed: 0,model_name,split_type,max_days,pred_length,avg_f1_score,avg_precision,avg_recall,avg_hit_score,avg_training_time_seconds,avg_pred_time_seconds
0,top,10,0,2,0.095092,0.403906,0.054511,0.646001,0.098021,1.5e-05
1,top,10,0,3,0.13197,0.396776,0.080422,0.781773,0.098021,8e-06
2,top,10,0,4,0.158209,0.379727,0.101963,0.83819,0.098021,7e-06
3,top,10,0,5,0.183876,0.373094,0.124989,0.890267,0.098021,7e-06
4,top,10,0,6,0.207522,0.369085,0.148404,0.921265,0.098021,7e-06


In [6]:
import pandas as pd
import numpy as np

def generate_latex_table(df, split_type, pred_lengths, metric):
    """
    Generate a LaTeX table comparing model performances across different prediction lengths and max_days.
    
    Parameters:
    df (pd.DataFrame): Input dataframe
    split_type (int): Split type to filter for (10, 20, ..., 90)
    pred_lengths (list): List of prediction lengths to include
    metric (str): Metric to compare ('f1_score', 'precision', or 'recall')
    """
    # Map metric names to DataFrame columns
    metric_map = {
        'f1_score': 'avg_f1_score',
        'precision': 'avg_precision',
        'recall': 'avg_recall'
    }
    
    # Filter data
    filtered_df = df[df['split_type'] == split_type]
    filtered_df = filtered_df[filtered_df['pred_length'].isin(pred_lengths)]
    
    # Get unique model names
    models = sorted(filtered_df['model_name'].unique())
    
    # Create table rows
    table_rows = []
    
    # Generate rows for each prediction length and max_days combination
    for k in sorted(pred_lengths):
        for max_day in [0, 1, 2]:
            # Get data for this combination
            condition_data = filtered_df[
                (filtered_df['pred_length'] == k) & 
                (filtered_df['max_days'] == max_day)
            ]
            
            # Create row header
            if max_day == 0:
                row_header = f'k={int(k)}'
            else:
                row_header = ''
            
            # Add max_days information
            row = [row_header, f'd={max_day}']
            
            # Get all values for this row to determine the maximum
            row_values = []
            for model in models:
                value = condition_data[condition_data['model_name'] == model][metric_map[metric]].values
                if len(value) > 0:
                    row_values.append(value[0])
                else:
                    row_values.append(float('-inf'))
            
            row_max = max(row_values)
            
            # Add values for each model
            for i, model in enumerate(models):
                value = condition_data[condition_data['model_name'] == model][metric_map[metric]].values
                if len(value) > 0:
                    formatted_value = f'{value[0]:.3f}'
                    if abs(value[0] - row_max) < 1e-10:
                        formatted_value = f'\\textbf{{{formatted_value}}}'
                else:
                    formatted_value = '-'
                row.append(formatted_value)
            
            table_rows.append(' & '.join(row) + r' \\')
            
            # Add \cline after each pred_length group, now starting from column 1
            if max_day == 2:
                table_rows.append('\\cline{1-' + str(len(models) + 2) + '}')
    
    # Create column specification with smaller column spacing
    col_spec = r'@{}l@{\hspace{4pt}}|@{\hspace{4pt}}l|' + r'@{\hspace{4pt}}c@{\hspace{4pt}}' * len(models) + r'@{}'
    
    # Create header rows with split type and model names
    split_type_header = f'\\multicolumn{{{len(models) + 2}}}{{c}}{{Split Type = {split_type}\\%}} \\\\'
    model_header = '& & ' + ' & '.join(models) + r' \\'
    
    # Create the LaTeX table with smaller font and tighter spacing
    latex_table = f"""\\begin{{table}}[h]
\\centering
\\small
\\setlength{{\\tabcolsep}}{{4pt}}
\\begin{{tabular}}{{{col_spec}}}
\\hline
{split_type_header}
\\hline
{model_header}
\\hline
{chr(10).join(table_rows)}
\\hline
\\end{{tabular}}
\\caption{{{metric.replace('_', ' ').title()} comparison for different prediction lengths (k) and days (d)}}
\\label{{tab:comparison_{metric}_{split_type}}}
\\end{{table}}
"""
    
    # Save to file
    filename = f'table_{metric}_{split_type}.tex'
    with open(filename, 'w') as f:
        f.write(latex_table)
    
    print(f"Table saved to {filename}")
    return latex_table

# Example usage:
# df = pd.read_csv('your_data.csv')
# generate_latex_table(df, split_type=90, pred_lengths=[5.0, 10.0, 15.0, 20.0], metric='f1_score')

In [9]:
# Example usage:

generate_latex_table(
    df=df,
    split_type=70,
    pred_lengths=[5.0, 10.0, 15.0, 20.0],
    metric='f1_score'
)

generate_latex_table(
    df=df,
    split_type=70,
    pred_lengths=[5.0, 10.0, 15.0, 20.0],
    metric='recall'
)


Table saved to table_f1_score_70.tex
Table saved to table_recall_70.tex


'\\begin{table}[h]\n\\centering\n\\small\n\\setlength{\\tabcolsep}{4pt}\n\\begin{tabular}{@{}l@{\\hspace{4pt}}|@{\\hspace{4pt}}l|@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{}}\n\\hline\n\\multicolumn{13}{c}{Split Type = 70\\%} \\\\\n\\hline\n& & clf & fpmc & hrm & ibp & last & markov & nmf & tbp & top & txmt & xmt \\\\\n\\hline\nk=5 & d=0 & 0.095 & 0.018 & 0.110 & 0.094 & 0.035 & 0.123 & 0.126 & 0.125 & 0.124 & \\textbf{0.170} & 0.168 \\\\\n & d=1 & 0.095 & 0.015 & 0.085 & 0.085 & 0.037 & 0.125 & 0.122 & 0.124 & 0.124 & \\textbf{0.159} & 0.155 \\\\\n & d=2 & 0.094 & 0.015 & 0.079 & 0.083 & 0.037 & 0.123 & 0.123 & 0.123 & 0.123 & \\textbf{0.157} & 0.153 \\\\\n\\cline{1-13

### Time table

In [10]:
import pandas as pd
import numpy as np

def generate_latex_table(df, split_type, pred_lengths, metric_type='performance', metric='f1_score'):
    """
    Generate a LaTeX table comparing model performances or timing metrics across different prediction lengths and max_days.
    
    Parameters:
    df (pd.DataFrame): Input dataframe
    split_type (int): Split type to filter for (10, 20, ..., 90)
    pred_lengths (list): List of prediction lengths to include
    metric_type (str): Type of metric to compare ('performance' or 'time')
    metric (str): Specific metric to compare:
        - If metric_type='performance': 'f1_score', 'precision', or 'recall'
        - If metric_type='time': 'training_time' or 'pred_time'
    """
    # Map metric names to DataFrame columns
    performance_metrics = {
        'f1_score': 'avg_f1_score',
        'precision': 'avg_precision',
        'recall': 'avg_recall'
    }
    
    time_metrics = {
        'training_time': 'avg_training_time_seconds',
        'pred_time': 'avg_pred_time_seconds'
    }
    
    # Select appropriate metric mapping
    metric_map = time_metrics if metric_type == 'time' else performance_metrics
    if metric not in metric_map:
        raise ValueError(f"Invalid metric '{metric}' for metric_type '{metric_type}'")
    
    # Filter data
    filtered_df = df[df['split_type'] == split_type]
    filtered_df = filtered_df[filtered_df['pred_length'].isin(pred_lengths)]
    
    # Get unique model names
    models = sorted(filtered_df['model_name'].unique())
    
    # Create table rows
    table_rows = []
    
    # Generate rows for each prediction length and max_days combination
    for k in sorted(pred_lengths):
        for max_day in [0, 1, 2]:
            # Get data for this combination
            condition_data = filtered_df[
                (filtered_df['pred_length'] == k) & 
                (filtered_df['max_days'] == max_day)
            ]
            
            # Create row header
            if max_day == 0:
                row_header = f'k={int(k)}'
            else:
                row_header = ''
            
            # Add max_days information
            row = [row_header, f'd={max_day}']
            
            # Get all values for this row to determine the minimum (for time metrics) or maximum (for performance metrics)
            row_values = []
            for model in models:
                value = condition_data[condition_data['model_name'] == model][metric_map[metric]].values
                if len(value) > 0:
                    row_values.append(value[0])
                else:
                    row_values.append(float('inf') if metric_type == 'time' else float('-inf'))
            
            # For time metrics, we want to highlight the minimum value
            # For performance metrics, we want to highlight the maximum value
            row_best = min(row_values) if metric_type == 'time' else max(row_values)
            
            # Add values for each model
            for i, model in enumerate(models):
                value = condition_data[condition_data['model_name'] == model][metric_map[metric]].values
                if len(value) > 0:
                    # Format time values differently
                    if metric_type == 'time':
                        formatted_value = f'{value[0]:.3f}'
                        if abs(value[0] - row_best) < 1e-10:
                            formatted_value = f'\\textbf{{{formatted_value}}}'
                    else:
                        formatted_value = f'{value[0]:.3f}'
                        if abs(value[0] - row_best) < 1e-10:
                            formatted_value = f'\\textbf{{{formatted_value}}}'
                else:
                    formatted_value = '-'
                row.append(formatted_value)
            
            table_rows.append(' & '.join(row) + r' \\')
            
            # Add \cline after each pred_length group
            if max_day == 2:
                table_rows.append('\\cline{1-' + str(len(models) + 2) + '}')
    
    # Create column specification with smaller column spacing
    col_spec = r'@{}l@{\hspace{4pt}}|@{\hspace{4pt}}l|' + r'@{\hspace{4pt}}c@{\hspace{4pt}}' * len(models) + r'@{}'
    
    # Create header rows with split type and model names
    split_type_header = f'\\multicolumn{{{len(models) + 2}}}{{c}}{{Split Type = {split_type}\\%}} \\\\'
    model_header = '& & ' + ' & '.join(models) + r' \\'
    
    # Create metric description for caption
    if metric_type == 'time':
        metric_desc = 'Training Time' if metric == 'training_time' else 'Prediction Time'
        metric_desc += ' (in seconds)'
    else:
        metric_desc = metric.replace('_', ' ').title()
    
    # Create the LaTeX table with smaller font and tighter spacing
    latex_table = f"""\\begin{{table}}[h]
\\centering
\\small
\\setlength{{\\tabcolsep}}{{4pt}}
\\begin{{tabular}}{{{col_spec}}}
\\hline
{split_type_header}
\\hline
{model_header}
\\hline
{chr(10).join(table_rows)}
\\hline
\\end{{tabular}}
\\caption{{{metric_desc} comparison for different prediction lengths (k) and days (d)}}
\\label{{tab:comparison_{metric_type}_{metric}_{split_type}}}
\\end{{table}}
"""
    
    # Save to file
    filename = f'table_{metric_type}_{metric}_{split_type}.tex'
    with open(filename, 'w') as f:
        f.write(latex_table)
    
    print(f"Table saved to {filename}")
    return latex_table

# Example usage:
df = df
# # For performance metrics:
# generate_latex_table(df, split_type=90, pred_lengths=[5.0, 10.0, 15.0, 20.0], metric_type='performance', metric='f1_score')
# # For timing metrics:
generate_latex_table(df, split_type=70, pred_lengths=[5.0, 10.0, 15.0, 20.0], metric_type='time', metric='training_time')

Table saved to table_time_training_time_70.tex


'\\begin{table}[h]\n\\centering\n\\small\n\\setlength{\\tabcolsep}{4pt}\n\\begin{tabular}{@{}l@{\\hspace{4pt}}|@{\\hspace{4pt}}l|@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{}}\n\\hline\n\\multicolumn{13}{c}{Split Type = 70\\%} \\\\\n\\hline\n& & clf & fpmc & hrm & ibp & last & markov & nmf & tbp & top & txmt & xmt \\\\\n\\hline\nk=5 & d=0 & 19.197 & 0.362 & 150.771 & 0.198 & \\textbf{0.044} & 0.220 & 0.357 & 9.460 & 0.051 & 595.684 & 0.653 \\\\\n & d=1 & 154.077 & 3.350 & 2046.357 & 1.538 & \\textbf{0.505} & 2.284 & 42.659 & 8.820 & 0.535 & 2197.969 & 5.852 \\\\\n & d=2 & 188.839 & 3.416 & 2434.319 & 1.753 & \\textbf{0.530} & 3.105 & 47.549 & 7.585 & 0.568 & 3486.571 & 6

In [11]:
generate_latex_table(df, split_type=70, pred_lengths=[5.0, 10.0, 15.0, 20.0], metric_type='time', metric='pred_time')

Table saved to table_time_pred_time_70.tex


'\\begin{table}[h]\n\\centering\n\\small\n\\setlength{\\tabcolsep}{4pt}\n\\begin{tabular}{@{}l@{\\hspace{4pt}}|@{\\hspace{4pt}}l|@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{\\hspace{4pt}}c@{\\hspace{4pt}}@{}}\n\\hline\n\\multicolumn{13}{c}{Split Type = 70\\%} \\\\\n\\hline\n& & clf & fpmc & hrm & ibp & last & markov & nmf & tbp & top & txmt & xmt \\\\\n\\hline\nk=5 & d=0 & 0.004 & 0.000 & 0.000 & 0.000 & \\textbf{0.000} & 0.000 & 0.000 & 0.113 & 0.000 & 0.000 & 0.000 \\\\\n & d=1 & 0.004 & 0.000 & 0.000 & 0.000 & \\textbf{0.000} & 0.000 & 0.000 & 0.148 & 0.000 & 0.000 & 0.000 \\\\\n & d=2 & 0.004 & 0.000 & 0.000 & 0.000 & \\textbf{0.000} & 0.000 & 0.000 & 0.141 & 0.000 & 0.000 & 0.000 \\\\\n\\cline{1-13