In [1]:
import pandas as pd

# Load the three GPT-processed review frames. keep_default_na=False keeps
# empty cells as '' instead of converting them to NaN.
# NOTE(review): all three frames are read from the same run_1 file; presumably
# df2 and df3 were meant to come from the second and third runs -- confirm the
# intended paths before relying on the majority vote further down.
df1, df2, df3 = (
    pd.read_csv(path, keep_default_na=False)
    for path in ['gpt_processed_reviews/android/run_1_replika_reviews.csv'] * 3
)

### Make sure the three separate files have the same columns. A few corner cases made this necessary.

In [2]:
# Column names to keep, in this order. Must stay a list: it is used as a
# DataFrame column selector.
columns = [
    'User Name',
    'Review Date',
    'Star Rating',
    'Review Text',
    'Response Date',
    'Response Text',
    'echoed_user_name',
    'coherence_and_clarity_of_review',
    'gender_of_user',
    'gender_of_ai',
    'name_user_gave_ai',
    'age_of_user',
    'duration_of_app_usage',
    'frequency_of_app_usage',
    'relationship_status_of_user',
    'empathy_of_ai',
    'behavior_of_ai',
    'inappropriate_frequency',
    'inappropriate_nature',
    'ai_support_level',
    'support_types',
    'user_mental_state_before_ai',
    'effect_of_ai_on_user_mental_state',
    'stress_before_ai',
    'effect_of_ai_on_stress',
    'loneliness_before_ai',
    'effect_of_ai_on_loneliness',
    'depression_or_anxiety_before_ai',
    'effect_of_ai_on_depression_or_anxiety',
    'suicidal_thoughts_presence',
    'effect_of_ai_on_suicidal_thoughts',
    'other_despair_types',
    'effect_of_ai_on_other_despair',
    'user_dependence',
    'real_life_relationship_impact',
    'limitations_of_ai',
    'technical_issues',
    'privacy_concerns',
    'feature_restriction_impact',
    'cost_impact_on_accessibility',
    'impact_of_ai_updates',
    'user_satisfaction_with_policy_decisions',
    'overall_mental_health_impact_of_company_decisions',
]

def _select_and_key(reviews, wanted_columns):
    """Return a copy of ``reviews`` limited to ``wanted_columns`` plus a 'key' column.

    The key is the user name, the review date and the first ten
    whitespace-separated words of the review text, concatenated with no
    separator. It is used to match the same review across frames.

    Parameters:
        reviews: DataFrame containing at least ``wanted_columns``.
        wanted_columns: list of column names to keep, in output order.

    Returns:
        A new DataFrame with ``wanted_columns`` followed by 'key'.

    Raises:
        KeyError: if any of ``wanted_columns`` is absent from ``reviews``.
    """
    # .copy() yields an independent frame, so adding 'key' below does not
    # trigger pandas' chained-assignment (SettingWithCopy) warning.
    selected = reviews[wanted_columns].copy()
    short_review_text = selected['Review Text'].apply(
        lambda text: ' '.join(text.split()[:10])
    )
    selected['key'] = selected['User Name'] + selected['Review Date'] + short_review_text
    return selected


df1, df2, df3 = (_select_and_key(frame, columns) for frame in (df1, df2, df3))

# Keep only the reviews whose key occurs in all three frames, one row per key
# (the first occurrence wins when a key is repeated inside a frame).
common_keys = set(df1['key']).intersection(df2['key']).intersection(df3['key'])
df1, df2, df3 = (
    frame[frame['key'].isin(common_keys)].drop_duplicates(subset='key')
    for frame in (df1, df2, df3)
)

### Majority voting

In [3]:
import pandas as pd
from collections import Counter
from itertools import chain
import random


# Columns whose cells hold ', '-separated lists; every list item is voted on
# separately.
vote_columns = ['support_types', 'other_despair_types', 'limitations_of_ai']
# Every other column (except the join key) receives a single winning value.
single_value_columns = [col for col in df1.columns if col not in vote_columns and col != 'key']

# A list item is kept when at least this many runs report it (2 of the 3 runs).
MIN_VOTES = 2


def _is_missing(value):
    """Return True for None, NaN/NA and the placeholder strings 'nan' and ''."""
    if value is None or pd.isna(value):
        return True
    return isinstance(value, str) and value in ('nan', '')


def _vote_single(values):
    """Return the most frequent non-missing value, or pd.NA when there is none.

    Ties are resolved in favour of the value encountered first, because
    Counter.most_common orders equal counts by first insertion.
    """
    usable = [value for value in values if not _is_missing(value)]
    if not usable:
        return pd.NA
    return Counter(usable).most_common(1)[0][0]


def _vote_multi(values):
    """Vote item-by-item over ', '-separated list cells.

    Returns the items reported by at least MIN_VOTES runs, joined with ', '
    in order of first appearance. Returns '' when no item reaches the
    threshold and pd.NA when every run's cell is missing.
    """
    tally = Counter()
    for value in values:
        if _is_missing(value):
            continue
        # De-duplicate within one run (order-preserving) so a run that repeats
        # an item still casts only one vote for it.
        tally.update(list(dict.fromkeys(str(value).split(', '))))
    if not tally:
        return pd.NA
    return ', '.join(item for item, votes in tally.items() if votes >= MIN_VOTES)


# One {key: {column: value}} lookup per run replaces a full-frame scan for
# every key. drop_duplicates keeps the first row per key, and to_dict('index')
# requires the index to be unique.
run_rows = [
    frame.drop_duplicates(subset='key').set_index('key').to_dict('index')
    for frame in (df1, df2, df3)
]

all_keys = df1['key'].unique()

# Collect one record per review and build the frame once at the end, rather
# than growing it cell by cell.
records = []
for key in all_keys:
    runs = [rows[key] for rows in run_rows if key in rows]
    record = {'key': key}
    for col in single_value_columns:
        record[col] = _vote_single(run[col] for run in runs)
    for col in vote_columns:
        record[col] = _vote_multi(run[col] for run in runs)
    records.append(record)

# Explicit column list: 'key' first, then a fixed order that does not depend
# on which review happened to be processed first.
final_df = pd.DataFrame(records, columns=['key', *single_value_columns, *vote_columns])

# Ensure all 'nan' strings and None values are treated uniformly as missing values
final_df = final_df.replace({'nan': pd.NA, None: pd.NA})

# Save the DataFrame
final_df.to_csv('combined_reviews_final.csv', index=False)
