In [5]:
import pandas as pd

# Method 1: If your datasets are in separate TSV files
def stack_tsv_files(file_paths, output_path):
    """
    Stack multiple TSV files with the same structure into one DataFrame.

    Each input is read as a tab-separated file, the resulting frames are
    concatenated vertically in the order given, and the combined frame is
    written to ``output_path``.

    Args:
        file_paths: Iterable of paths (str or path-like) to TSV files. A
            single str / path-like value is treated as a one-element list.
        output_path: Path (str or path-like) where the combined dataset is
            saved. A path ending in '.tsv' is written tab-separated; any
            other path is written comma-separated.

    Returns:
        The combined DataFrame, with a fresh 0..n-1 row index.

    Raises:
        ValueError: If ``file_paths`` contains no paths.
    """
    # A bare string is itself iterable; without this guard it would be walked
    # character by character and each character opened as a file.
    if isinstance(file_paths, str) or hasattr(file_paths, '__fspath__'):
        file_paths = [file_paths]
    paths = list(file_paths)

    # Fail before touching the disk, with a message that names the argument,
    # instead of pandas' generic "No objects to concatenate".
    if not paths:
        raise ValueError("file_paths is empty: nothing to combine")

    # Read each TSV file (tab-separated)
    dataframes = [pd.read_csv(path, sep='\t') for path in paths]

    # Stack all dataframes vertically. ignore_index discards the per-file row
    # labels so the result is numbered 0..n-1. Columns are aligned by name, so
    # files with differing columns yield NaN-filled cells rather than an error.
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Save to new file (as TSV or CSV). str() lets pathlib.Path objects take
    # the same suffix check as plain strings.
    if str(output_path).endswith('.tsv'):
        combined_df.to_csv(output_path, sep='\t', index=False)
    else:
        combined_df.to_csv(output_path, index=False)
    print(f"Combined dataset saved to {output_path}")
    print(f"Total rows: {len(combined_df)}")
    return combined_df

In [6]:
# Result shards to merge, in the order they should appear in the output
# (judging by the file names: rows 0-2500, then 2500-5000).
file_paths = [
    '../data/smolvlm_rlhf_results_0_to_2500.tsv',
    '../data/smolvlm_rlhf_results_2500_to_5000.tsv',
]

# The output name is relative, so the merged file lands in the notebook's
# working directory; the '.tsv' suffix selects tab-separated output.
combined_data = stack_tsv_files(
    file_paths=file_paths,
    output_path='smolvlm_rlhf_combined.tsv',
)

Combined dataset saved to smolvlm_rlhf_combined.tsv
Total rows: 5000


In [7]:
# Optional sanity check: eyeball the first rows, then the overall shape and
# the column names of the merged frame.
print("\nFirst few rows of combined dataset:", combined_data.head(), sep="\n")
print(
    f"\nDataset shape: {combined_data.shape}",
    f"Columns: {list(combined_data.columns)}",
    sep="\n",
)

# Optional sanity check: per-column null counts, to spot cells that went
# missing during the merge.
print("\nMissing values per column:", combined_data.isnull().sum(), sep="\n")


First few rows of combined dataset:
   index                                            prompt1  \
0      0  In the image we can see a person holding a blu...   
1      1  This image is taken indoors. In the foreground...   
2      2  A girl is holding a small cat in her arms. The...   
3      3  In the image, there is a toilet seat, a toilet...   
4      4  In the image, there is a room with a floor, tw...   

                                 prompt2  prompt3  prompt4  
0                                   Yes.  Female.  Female.  
1  No, there is no person in this image.  Female.  Female.  
2                                   Yes.  Female.  Female.  
3  No, there is no person in this image.  Female.  Female.  
4  No, there is no person in this image.  Female.  Female.  

Dataset shape: (5000, 5)
Columns: ['index', 'prompt1', 'prompt2', 'prompt3', 'prompt4']

Missing values per column:
index      0
prompt1    0
prompt2    0
prompt3    0
prompt4    0
dtype: int64
