# In [None]:
import pandas as pd

def _select_top_comments(df, min_comments, top_n_threads, top_n_comments):
    """Return the best-scored comments of the best-scored threads in *df*.

    Args:
        df: DataFrame with ``thread_id``, ``score_submission`` and
            ``score_comment`` columns; the score columns must already be numeric.
        min_comments: Threads with fewer rows than this are discarded.
        top_n_threads: Number of threads to keep, ranked by ``score_submission``.
        top_n_comments: Number of rows to keep per thread, ranked by
            ``score_comment``.

    Returns:
        A new DataFrame whose rows are grouped by thread (highest
        ``score_submission`` first) and, inside each thread, ordered by
        descending ``score_comment``. ``thread_id`` is an ordered categorical.
    """
    # Filtering out threads with fewer than `min_comments` rows
    rows_per_thread = df['thread_id'].value_counts()
    busy_thread_ids = rows_per_thread[rows_per_thread >= min_comments].index
    filtered_df = df[df['thread_id'].isin(busy_thread_ids)]

    # One row per thread, ranked by submission score. nlargest already returns
    # the rows in descending order, so no additional sort is required.
    top_threads = (
        filtered_df[['thread_id', 'score_submission']]
        .drop_duplicates('thread_id')
        .nlargest(top_n_threads, 'score_submission')
    )
    top_thread_ids = top_threads['thread_id'].tolist()

    # Explicit copy: the column assignment below must not target a filtered
    # view of filtered_df (this is what triggers SettingWithCopyWarning).
    top_thread_comments = filtered_df[
        filtered_df['thread_id'].isin(top_thread_ids)
    ].copy()

    # An ordered categorical makes sort_values follow the thread ranking
    # instead of the natural ordering of the ids.
    top_thread_comments['thread_id'] = pd.Categorical(
        top_thread_comments['thread_id'], categories=top_thread_ids, ordered=True
    )

    # observed=True silences the pandas FutureWarning raised when grouping on a
    # categorical column; it does not change what head() returns.
    return (
        top_thread_comments
        .sort_values(by=['thread_id', 'score_comment'], ascending=[True, False])
        .groupby('thread_id', observed=True)
        .head(top_n_comments)
    )


def process_multiple_csv_files(input_files, output_files, *, min_comments=5,
                               top_n_threads=10, top_n_comments=5):
    """Extract the top comments of the top threads from each CSV file.

    Each input file is paired with the output file at the same position. For
    every pair the CSV is loaded, threads with fewer than ``min_comments`` rows
    are dropped, the ``top_n_threads`` threads with the highest
    ``score_submission`` are selected, and the ``top_n_comments`` rows with the
    highest ``score_comment`` of each selected thread are printed and written
    to the output file.

    A failure while handling one file is reported on stdout and does not stop
    the processing of the remaining files.

    Args:
        input_files: Paths of the CSV files to read. Each file needs the
            columns ``thread_id``, ``score_submission`` and ``score_comment``.
        output_files: Paths of the CSV files to write, in the same order as
            ``input_files``.
        min_comments: Minimum number of rows a thread needs to be considered.
        top_n_threads: Number of threads kept per file.
        top_n_comments: Number of comments kept per thread.

    Returns:
        None.
    """
    for input_file, output_file in zip(input_files, output_files):
        print(f"Processing file: {input_file}")

        try:
            # Loading the data, skipping bad lines and undecodable bytes
            df = pd.read_csv(
                input_file,
                on_bad_lines='skip',
                encoding='utf-8',
                encoding_errors='ignore'
            )

            # Ensuring numeric types for scores; unparsable values become NaN
            df['score_submission'] = pd.to_numeric(df['score_submission'], errors='coerce')
            df['score_comment'] = pd.to_numeric(df['score_comment'], errors='coerce')

            top_comments_per_thread = _select_top_comments(
                df, min_comments, top_n_threads, top_n_comments
            )
            print(top_comments_per_thread)

            # Saving the result to a CSV file
            top_comments_per_thread.to_csv(output_file, index=False, encoding='utf-8', lineterminator='\n')
            print(f"Saved to {output_file}")
        except Exception as e:
            # Best-effort batch run: report the failure and move on to the next file
            print(f"Error processing {input_file}: {e}")


# CSV files to read; each entry is matched with the entry at the same
# position in output_files.
input_files = [
    '/content/2020_depression_linked_llama_gemma_qwen.csv',
    '/content/2020_suicide_linked_llama_gemma_qwen.csv',
    '/content/2023_depression_linked_llama_gemma_qwen.csv',
    '/content/2023_suicide_linked_llama_gemma_qwen.csv',
    '/content/2024_depression_linked_llama_gemma_qwen.csv',
    '/content/2024_suicide_linked_llama_gemma_qwen.csv',
]

# CSV files to write, listed in the same order as input_files.
output_files = [
    '2020_depression_human_output.csv',
    '2020_suicide_human_output.csv',
    '2023_depression_human_output.csv',
    '2023_suicide_human_output.csv',
    '2024_depression_human_output.csv',
    '2024_suicide_human_output.csv',
]

# Run the extraction for every input/output pair.
process_multiple_csv_files(input_files, output_files)