In [None]:
import pandas as pd

# Method 1: If your datasets are in separate TSV files
def stack_tsv_files(file_paths, output_path):
    """
    Stack multiple TSV files with the same structure into one dataset.

    Each file is read as tab-separated text, the frames are concatenated
    vertically in the order given (the result gets a fresh 0..n-1 index),
    and the combined frame is written to ``output_path``.

    Args:
        file_paths: Iterable of paths to TSV files; must contain at least one.
        output_path: Path where to save the combined dataset. May be a string
            or a path-like object such as ``pathlib.Path``. A path ending in
            ``.tsv`` is written tab-separated; anything else is written
            comma-separated.

    Returns:
        The combined pandas DataFrame.

    Raises:
        ValueError: If ``file_paths`` is empty.
    """
    # Materialise first so one-shot iterables (generators) can be checked for
    # emptiness and then still be read.
    file_paths = list(file_paths)
    if not file_paths:
        raise ValueError("file_paths must contain at least one TSV file to stack")

    # Read each TSV file (tab-separated)
    dataframes = [pd.read_csv(file_path, sep='\t') for file_path in file_paths]

    # Stack all dataframes vertically. pd.concat aligns on column names, so
    # files whose columns differ produce the union of columns padded with NaN.
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Save to new file (as TSV or CSV). str() is needed because path-like
    # objects have no .endswith(); to_csv itself accepts them unchanged.
    if str(output_path).endswith('.tsv'):
        combined_df.to_csv(output_path, sep='\t', index=False)
    else:
        combined_df.to_csv(output_path, index=False)
    print(f"Combined dataset saved to {output_path}")
    print(f"Total rows: {len(combined_df)}")
    return combined_df

In [None]:
# Raw result files to merge, listed in the order their rows should appear
# in the combined dataset.
file_paths = [
    '../data/raw_tsv_files/smolvlm_m2_vti_results_0_to_2500.tsv',
    '../data/raw_tsv_files/smolvlm_m2_vti_results_2500_to_5000.tsv',
]
# The '.tsv' suffix on the output name makes stack_tsv_files write
# tab-separated output; the file lands in the current working directory.
combined_data = stack_tsv_files(
    file_paths=file_paths,
    output_path='smolvlm_m2_vti.tsv',
)

In [None]:
# Optional: quick look at the stacked result — leading rows, then shape and
# column names.
print("\nFirst few rows of combined dataset:", combined_data.head(), sep="\n")
print(
    f"\nDataset shape: {combined_data.shape}",
    f"Columns: {list(combined_data.columns)}",
    sep="\n",
)

# Optional: sanity check — number of missing values in each column.
print("\nMissing values per column:", combined_data.isna().sum(), sep="\n")