# In [75]:
import pandas as pd
from collections import Counter
from itertools import chain
import random

# Combine several GPT labelling runs of the same reviews into one table by
# voting, per user, on every column.

# NOTE(review): the original cell loaded run_1 three times, which made the
# vote across runs a no-op. The run_2 / run_3 file names below are assumed to
# follow the same naming pattern as run_1 -- confirm they exist.
RUN_PATHS = [
    'gpt_processed_reviews/iOS/run_1_replika_reviews.csv',
    'gpt_processed_reviews/iOS/run_2_replika_reviews.csv',
    'gpt_processed_reviews/iOS/run_3_replika_reviews.csv',
]

# Columns to apply majority voting on (adjust according to your dataset's structure)
vote_columns = ['support_types', 'other_despair_types', 'limitations_of_ai']


def is_missing(value):
    """Return True when *value* is null or the literal string 'nan'."""
    return bool(pd.isnull(value)) or str(value) == 'nan'


def majority_value(values, rng=random):
    """Pick the winning value among one cell's values across runs.

    Args:
        values: Non-missing values, one per run that has the user.
        rng: Object exposing ``choice(seq)``; defaults to the ``random``
            module. Pass a seeded ``random.Random`` for reproducible output.

    Returns:
        The single most frequent value; a random pick among the tied leaders
        when there is no unique winner; ``pd.NA`` when *values* is empty.
    """
    counts = Counter(values)
    if not counts:
        return pd.NA
    top = max(counts.values())
    leaders = [value for value, count in counts.items() if count == top]
    if len(leaders) == 1:
        return leaders[0]
    # No unique winner: fall back to a random choice among the tied values.
    return rng.choice(leaders)


def majority_labels(cells, min_votes=2, sep=', '):
    """Keep the labels that at least *min_votes* runs agree on.

    Args:
        cells: One raw cell per run; each is a *sep*-separated label list.
            Missing cells (None, NaN, 'nan', '') cast no votes.
        min_votes: Minimum number of runs that must contain a label.
        sep: Separator used both to split the cells and to join the result.

    Returns:
        The selected labels joined with *sep*, in first-seen order
        (an empty string when nothing reaches *min_votes*).
    """
    votes = Counter()
    for cell in cells:
        if is_missing(cell) or cell == '':
            continue
        # De-duplicate within a run so a repeated label is a single vote.
        labels = dict.fromkeys(part.strip() for part in str(cell).split(sep))
        labels.pop('', None)
        votes.update(labels.keys())
    return sep.join(label for label, count in votes.items() if count >= min_votes)


def combine_runs(frames, vote_columns, rng=random):
    """Merge several runs into one row per user.

    Args:
        frames: DataFrames with a 'user' column. The first frame defines the
            output columns; only the first row per user in each frame is used.
        vote_columns: Multi-label columns resolved with ``majority_labels``;
            every other column is resolved with ``majority_value``.
        rng: Tie-break source forwarded to ``majority_value``.

    Returns:
        A DataFrame with 'user' first, followed by the first frame's other
        columns in their original order. Cells equal to 'nan' become ``pd.NA``.

    Raises:
        KeyError: If a vote column is absent from the first frame.
    """
    value_columns = [col for col in frames[0].columns if col != 'user']
    absent = [col for col in vote_columns if col not in value_columns]
    if absent:
        raise KeyError(f"vote columns missing from the data: {absent}")

    users = pd.concat([frame['user'] for frame in frames]).unique()
    # Index each run once by user (first occurrence wins) instead of scanning
    # every frame again for every user.
    first_rows = [
        frame.drop_duplicates(subset='user').set_index('user') for frame in frames
    ]

    records = []
    for user in users:
        rows = [frame.loc[user] for frame in first_rows if user in frame.index]
        record = {'user': user}
        for col in value_columns:
            cells = [row[col] for row in rows]
            if col in vote_columns:
                record[col] = majority_labels(cells)
            else:
                present = [cell for cell in cells if not is_missing(cell)]
                record[col] = majority_value(present, rng)
        records.append(record)

    combined = pd.DataFrame(records, columns=['user'] + value_columns, dtype=object)
    # Handle conversion of 'nan' strings back to NaN values if necessary
    return combined.replace('nan', pd.NA)


if __name__ == "__main__":
    # keep_default_na=False keeps empty fields as '' instead of NaN.
    df1, df2, df3 = (pd.read_csv(path, keep_default_na=False) for path in RUN_PATHS)

    # Kept as module-level names in case later cells rely on them.
    all_users = pd.concat([df1['user'], df2['user'], df3['user']]).unique()
    single_value_columns = [
        col for col in df1.columns if col not in vote_columns and col != 'user'
    ]

    final_df = combine_runs([df1, df2, df3], vote_columns)

    # Save the combined and processed DataFrame
    final_df.to_csv('combined_reviews_final.csv', index=False)


