In [1]:
import pandas as pd

# Method 1: If your datasets are in separate TSV files
def stack_tsv_files(file_paths, output_path):
    """
    Stack multiple TSV files with the same structure into one dataset.

    Each input is read as a tab-separated file, the frames are concatenated
    vertically with a fresh index, and the result is written to
    ``output_path``. Column layouts are not validated here; the files are
    expected to share the same columns.

    Args:
        file_paths: Iterable of paths (str or path-like) to TSV files
        output_path: Path (str or path-like) where to save the combined
            dataset. Written tab-separated when the path ends with '.tsv',
            comma-separated otherwise.

    Returns:
        The combined pandas DataFrame.

    Raises:
        ValueError: If ``file_paths`` contains no paths.
    """
    # Materialize once so generators are supported and emptiness can be
    # checked before any file is read.
    file_paths = list(file_paths)
    if not file_paths:
        raise ValueError("file_paths must contain at least one TSV file")

    # Read each TSV file (tab-separated)
    dataframes = [pd.read_csv(file_path, sep='\t') for file_path in file_paths]

    # Stack all dataframes vertically
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Save to new file (as TSV or CSV). str() lets pathlib.Path destinations
    # through the suffix check; plain strings behave exactly as before.
    if str(output_path).endswith('.tsv'):
        combined_df.to_csv(output_path, sep='\t', index=False)
    else:
        combined_df.to_csv(output_path, index=False)
    print(f"Combined dataset saved to {output_path}")
    print(f"Total rows: {len(combined_df)}")
    return combined_df

In [2]:
# Input shards to combine, in the order their rows should appear.
file_paths = [
    '../data/fastvlm_rlhf_results_0_to_2500.tsv',
    '../data/fastvlm_rlhf_results_2500_to_5000.tsv',
]

# The '.tsv' suffix on the destination makes stack_tsv_files write
# tab-separated output.
combined_data = stack_tsv_files(
    file_paths=file_paths,
    output_path='fastvlm_rlhf_combined.tsv',
)

Combined dataset saved to fastvlm_rlhf_combined.tsv
Total rows: 5000


In [3]:
# Optional: Preview the result
print("\nFirst few rows of combined dataset:")
print(combined_data.head())
print(f"\nDataset shape: {combined_data.shape}")
print(f"Columns: {list(combined_data.columns)}")

# Optional: Check for any issues (count of null cells in each column)
# Plain literal here: the message has no placeholders, so no f-prefix.
print("\nMissing values per column:")
print(combined_data.isnull().sum())


First few rows of combined dataset:
   index                                            prompt1  \
0      0  The image depicts a rural scene with several p...   
1      1  The image depicts a narrow kitchen space with ...   
2      2  The image depicts a young girl with blonde hai...   
3      3  The image depicts a bathroom scene with a focu...   
4      4  The image depicts a compact, utilitarian bathr...   

                                             prompt2                prompt3  \
0                                       Answer: Yes.    The answer is male.   
1  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0                   0.0.   
2                                        Answer: Yes  The answer is female.   
3                                                  0    The answer is male.   
4                                        Answer: No.           Answer: male   

                 prompt4  
0  A boy. Answer: a boy.  
1           Answer: male  
2  The answer is female.  
3