In [8]:
import pandas as pd

# Method 1: If your datasets are in separate TSV files
def stack_tsv_files(file_paths, output_path):
    """
    Stack multiple TSV files with the same structure into one dataset.

    Each input is read as tab-separated text, the resulting frames are
    concatenated vertically in the order given, and the combined frame is
    written to ``output_path`` without the row index.

    Args:
        file_paths: Iterable of paths to TSV files. A single string is
            treated as a one-element list rather than iterated per character.
        output_path: Path (str or path-like) where to save the combined
            dataset. A name ending in ``.tsv`` (case-insensitive) is written
            tab-separated; any other name is written comma-separated.

    Returns:
        The combined DataFrame, re-indexed from 0.

    Raises:
        ValueError: If ``file_paths`` contains no paths.
    """
    # A bare string would otherwise be looped over character by character.
    if isinstance(file_paths, str):
        file_paths = [file_paths]
    # Materialize so generators work and emptiness can be checked up front.
    file_paths = list(file_paths)
    if not file_paths:
        raise ValueError("file_paths is empty: there is nothing to combine")

    # Read each TSV file (tab-separated)
    dataframes = [pd.read_csv(file_path, sep='\t') for file_path in file_paths]

    # Stack all dataframes vertically; ignore_index renumbers rows 0..n-1
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Pick the delimiter from the extension. str() lets pathlib.Path objects
    # through, and lower() makes '.TSV' behave like '.tsv'.
    separator = '\t' if str(output_path).lower().endswith('.tsv') else ','
    combined_df.to_csv(output_path, sep=separator, index=False)

    print(f"Combined dataset saved to {output_path}")
    print(f"Total rows: {len(combined_df)}")
    return combined_df

In [9]:
# The two result shards, listed in ascending range order; rows are stacked
# in the order the paths appear here.
first_shard = '../data/smolvlm_m2_a02_vti_results_0_to_2500.tsv'
second_shard = '../data/smolvlm_m2_a02_vti_results_2500_to_5000.tsv'
file_paths = [first_shard, second_shard]

# Merge the shards and write the result to a single TSV in the working directory.
combined_data = stack_tsv_files(file_paths, 'smolvlm_m2_vti.tsv')

Combined dataset saved to smolvlm_m2_vti.tsv
Total rows: 5000


In [10]:
# Optional: quick look at the stacked result
head_rows = combined_data.head()
print("\nFirst few rows of combined dataset:")
print(head_rows)

frame_shape = combined_data.shape
column_names = list(combined_data.columns)
print(f"\nDataset shape: {frame_shape}")
print(f"Columns: {column_names}")

# Optional: sanity check — number of null entries in each column
null_counts = combined_data.isnull().sum()
print("\nMissing values per column:")
print(null_counts)


First few rows of combined dataset:
   index                                            prompt1 prompt2 prompt3  \
0      0  In this image we can see a person holding an u...    Yes.   Male.   
1      1           A kitchen with a white door and a stove.     No.   Male.   
2      2               A girl is holding a cat in her arms.    Yes.   GIRL.   
3      3  In this picture we can see a toilet, bottle, r...     No.   Male.   
4      4  In this image we can see a washroom. There are...     No.   Male.   

   prompt4  
0    Male.  
1  Female.  
2    GIRL.  
3  Female.  
4    Male.  

Dataset shape: (5000, 5)
Columns: ['index', 'prompt1', 'prompt2', 'prompt3', 'prompt4']

Missing values per column:
index      0
prompt1    0
prompt2    0
prompt3    0
prompt4    0
dtype: int64
