In [5]:
import pandas as pd

# Analyzed-review CSVs to load: three run directories per platform.
iOS_files = [
    '../iOS/run_1_results/analyzed_replika_reviews_final.csv',
    '../iOS/run_2_results/analyzed_replika_reviews_final.csv',
    '../iOS/run_3_results/analyzed_replika_reviews_final.csv',
]
android_files = [
    '../Android/run_1_results/analyzed_reviews_final.csv',
    '../Android/run_2_results/analyzed_reviews_final.csv',
    '../Android/run_3_results/analyzed_reviews_final.csv',
]

# Each platform's CSV uses its own headers for these four fields; both maps
# rename them to the shared names user_name / review_date / star_rating /
# review_text so the frames can be processed together afterwards.
iOS_column_names = {
    'user': 'user_name',
    'date': 'review_date',
    'stars': 'star_rating',
    'text': 'review_text',
}
android_columns_names = {
    'User Name': 'user_name',
    'Review Date': 'review_date',
    'Star Rating': 'star_rating',
    'Review Text': 'review_text',
}

# Read every file and apply its platform's rename map. Columns that are not
# listed in the map keep their original names.
iOS_data_frames = [
    frame.rename(columns=iOS_column_names)
    for frame in map(pd.read_csv, iOS_files)
]
android_data_frames = [
    frame.rename(columns=android_columns_names)
    for frame in map(pd.read_csv, android_files)
]

# Pool the runs from both platforms so they all go through the same filter.
data_frames = iOS_data_frames + android_data_frames
filtered_dfs = []

for df in data_frames:
    # Keep reviews whose coherence rating is High or Medium ...
    temp = df.query("coherence_and_clarity_of_review in ['High', 'Medium']")
    # ... and whose suicidal-thoughts field is anything but 'Not Mentioned'.
    temp = temp.query("effect_of_ai_on_suicidal_thoughts != 'Not Mentioned'")
    # Take an explicit copy of the selected columns: assigning a column on a
    # frame derived from query()/column selection is chained assignment and
    # triggers pandas' SettingWithCopyWarning.
    temp = temp[["user_name", "review_date", "review_text", "star_rating", "effect_of_ai_on_suicidal_thoughts"]].copy()
    # Parse the dates and reset the time-of-day part to midnight.
    temp['review_date'] = pd.to_datetime(temp['review_date']).dt.normalize()
    filtered_dfs.append(temp)

# Stack every filtered run into a single frame with a fresh 0..n-1 index.
results_df = pd.concat(filtered_dfs, ignore_index=True)

# Two rows count as the same review when the first 50 characters of their
# text are identical; only the first such row is kept.
text_prefix = results_df['review_text'].str[:50]
results_df = results_df[~text_prefix.duplicated(keep='first')]

# Add the calendar year of each review, then write the table out.
results_df = results_df.assign(year=results_df['review_date'].dt.year)
results_df.to_csv('suicide_reviews.csv', index=False)

In [7]:
import pandas as pd

# Load the complete (unanalyzed) review exports for both platforms.
iOS_reviews_full = pd.read_csv('../full_review_sets/iOS_reviews_full.csv')
android_reviews_full = pd.read_csv('../full_review_sets/replika_reviews_full.csv')

# iOS: overwrite `date` with the parsed, midnight-normalized timestamp and
# append the calendar year.
ios_day = pd.to_datetime(iOS_reviews_full['date']).dt.normalize()
iOS_reviews_full = iOS_reviews_full.assign(date=ios_day, year=ios_day.dt.year)

# Android: the source column is `Review Date`; it is left as-is and the parsed
# value goes into a new `date` column, followed by `year`.
android_day = pd.to_datetime(android_reviews_full['Review Date']).dt.normalize()
android_reviews_full = android_reviews_full.assign(date=android_day, year=android_day.dt.year)

# Terms to look for. They are OR-ed together inside a non-capturing group and
# wrapped in word boundaries so each only matches as a whole word/phrase.
suicide_words = ['suicide', 'kill myself', 'kms']
suicide_regex = r'\b(?:{})\b'.format('|'.join(suicide_words))

# Word count per review via whitespace split. `.str.len()` is used instead of
# `.apply(len)` because a missing review text makes `.str.split()` yield NaN,
# and `len(NaN)` raises TypeError; with `.str.len()` the count is NaN, the
# comparison is False, and the row is simply excluded.
# Note the cutoff is inclusive: reviews of exactly 50 words are kept.
short_reviews = iOS_reviews_full['text'].str.split().str.len() <= 50
iOS_reviews_shorter_than_50_words = iOS_reviews_full[short_reviews]

# Case-insensitive search for any of the suicide terms in the short reviews.
reviews_with_suicide_terms = (iOS_reviews_shorter_than_50_words['text']
    .str
    .contains(suicide_regex, case=False, regex=True, na=False)
)

# Save the matching short reviews for later analysis.
iOS_short_suicide_reviews_to_analyze = iOS_reviews_shorter_than_50_words[reviews_with_suicide_terms]
iOS_short_suicide_reviews_to_analyze.to_csv('iOS_short_suicide_reviews_to_analyze.csv', index=False)

# Word count per review via whitespace split. `.str.len()` is used instead of
# `.apply(len)` because a missing review text makes `.str.split()` yield NaN,
# and `len(NaN)` raises TypeError; with `.str.len()` the count is NaN, the
# comparison is False, and the row is simply excluded.
# Note the cutoff is inclusive: reviews of exactly 50 words are kept.
short_reviews = android_reviews_full['Review Text'].str.split().str.len() <= 50
android_reviews_shorter_than_50_words = android_reviews_full[short_reviews]

# Case-insensitive search for any of the suicide terms in the short reviews.
reviews_with_suicide_terms = (android_reviews_shorter_than_50_words['Review Text']
    .str
    .contains(suicide_regex, case=False, regex=True, na=False)
)

# Save the matching short reviews for later analysis.
android_short_suicide_reviews_to_analyze = android_reviews_shorter_than_50_words[reviews_with_suicide_terms]
android_short_suicide_reviews_to_analyze.to_csv('android_short_suicide_reviews_to_analyze.csv', index=False)


In [16]:
# Shape after keeping only the first row for each distinct `user` value.
iOS_reviews_full.drop_duplicates(subset=['user'], keep='first').shape

(37533, 5)

In [5]:
# (rows, columns) of the full Android review frame, with no de-duplication.
android_reviews_full.shape

(23019, 7)

In [15]:
# Number of Android reviews per calendar year, listed in year order.
android_reviews_full['year'].value_counts().sort_index()

year
2018     326
2019    1897
2020    8699
2021    4035
2022    3490
2023    3724
2024     848
Name: count, dtype: int64

In [10]:
# Number of iOS reviews per calendar year, listed in year order.
iOS_reviews_full['year'].value_counts().sort_index()

year
2017    17463
2018      643
2019     2017
2020    12611
2021     7623
2022     7115
2023     5867
2024    21261
Name: count, dtype: int64

In [18]:
# Combined Android + iOS review counts for 2019 and then 2020, computed from
# the `year` columns rather than retyped from earlier cell outputs, so the
# figures cannot go stale when the underlying data changes.
for target_year in (2019, 2020):
    android_count = int((android_reviews_full['year'] == target_year).sum())
    ios_count = int((iOS_reviews_full['year'] == target_year).sum())
    print(android_count + ios_count)

# Every Android review plus one row per distinct iOS `user`.
# NOTE(review): the iOS side is de-duplicated by user while the Android side
# is not -- confirm that this asymmetry is intended.
print(len(android_reviews_full) + len(iOS_reviews_full.drop_duplicates(subset='user')))

3914
21310
60552
