In [None]:
import os
import pandas as pd
from pathlib import Path

In [12]:
def multiple_folders_to_csv(folders_dict, output_csv='reviews_with_folders.csv'):
    """
    Convert text files from multiple folders to a CSV with folder name column.
    
    Args:
        folders_dict (dict): Dictionary with {folder_name: folder_path}
        output_csv (str): Name of the output CSV file
    """
    all_data = []
    
    for folder_name, folder_path in folders_dict.items():
        folder = Path(folder_path)
        
        if not folder.exists():
            print(f"Warning: Folder {folder_path} does not exist. Skipping.")
            continue
            
        text_files = list(folder.glob('*.txt'))
        print(f"Found {len(text_files)} text files in {folder_name}")
        
        for file_path in text_files:
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read().strip()
                
                if content:
                    all_data.append({
                        # 'filename': file_path.name,
                        'review_text': content,
                        'sentiment': folder_name,
                    })
                    
            except Exception as e:
                print(f"Error reading {file_path}: {e}")
    
    # Create DataFrame and save as CSV
    if all_data:
        df = pd.DataFrame(all_data)
        df.to_csv(output_csv, index=False, encoding='utf-8')
        print(f"Successfully created {output_csv} with {len(df)} reviews from {len(folders_dict)} folders")
        print(f"Folder distribution:")
        print(df['folder_name'].value_counts())
    else:
        print("No data found to process")

In [13]:
# Usage Example 1: Using dictionary
if __name__ == "__main__":
    folders = {
        'neg': '/home/ilya/Downloads/archive/dataset/neg',
        'neu': '/home/ilya/Downloads/archive/dataset/neu', 
        'pos': '/home/ilya/Downloads/archive/dataset/pos'
    }
    
    multiple_folders_to_csv(folders, 'all_reviews.csv')

Found 19827 text files in neg
Found 24704 text files in neu
Found 87138 text files in pos
Successfully created all_reviews.csv with 131669 reviews from 3 folders
Folder distribution:


KeyError: 'folder_name'