# *App Scraping*

**Load Packages**

In [None]:
pip install google-play-scraper

In [None]:
from google_play_scraper import Sort, reviews, app
import pandas as pd

In [None]:
import re

# Matches a single HTML/XML tag such as <b> or </p>.
TAG_RE = re.compile(r'<[^>]+>')


def clean_text(text):
    """Return ``text`` with HTML tags removed and whitespace normalised.

    Every run of whitespace (spaces, newlines, tabs) is collapsed to a single
    space, and leading/trailing whitespace is trimmed.
    """
    without_tags = TAG_RE.sub('', text)
    single_spaced = re.sub(r'\s+', ' ', without_tags)
    return single_spaced.strip()

In [None]:
import time

The following code uses the downloaded scraper to take app descriptions, number of downloads, and user reviews for 17 different dating apps from the Google Play Store.

 ***Tinder***

In [None]:
# --- Reviews: request the 200 newest English reviews from the US store ---
result_tinder, continuation_token = reviews(
    'com.tinder', lang='en', country='us', sort=Sort.NEWEST, count=200
)

time.sleep(0.1)  # short pause between requests

# Follow-up request driven by the continuation token. Its result is bound to
# result_tinder again, so the batch returned by the first call is not kept.
# NOTE(review): confirm that dropping the first batch is intended.
result_tinder, _ = reviews('com.tinder', continuation_token=continuation_token)

# --- Tabulate: drop unused columns and label every row with the app name ---
result_tinder_df = pd.DataFrame(result_tinder)
result_tinder_df = result_tinder_df.drop(columns=['reviewId', 'userImage', 'reviewCreatedVersion'])
result_tinder_df['App'] = 'Tinder'

time.sleep(0.1)  # short pause before the app-details request

# --- Store listing: cleaned description text and install count ---
result_tinder_des = app('com.tinder', lang='en', country='us')
tinder_description = clean_text(result_tinder_des['description'])
tinder_installs = result_tinder_des['realInstalls']

***Hinge***

In [None]:
# --- Reviews: request the 200 newest English reviews from the US store ---
result_hinge, continuation_token = reviews(
    'co.hinge.app',
    lang='en', # English reviews
    country='us', # reviews from US
    sort=Sort.NEWEST, # return newest reviews
    count=200, # return 200 reviews
)

time.sleep(0.1) # pause before next request

# Follow-up request driven by the continuation token. Its result is bound to
# result_hinge again, so the batch returned by the first call is not kept.
# NOTE(review): confirm that dropping the first batch is intended.
result_hinge, _ = reviews(
    'co.hinge.app',
    continuation_token=continuation_token
)

result_hinge_df = pd.DataFrame(result_hinge)
result_hinge_df = result_hinge_df.drop(['reviewId', 'userImage', 'reviewCreatedVersion'], axis=1) # remove columns
result_hinge_df['App'] = 'Hinge' # create new column with app name

time.sleep(0.1) # pause before fetching app description

# fetch the store listing (description and install count)
result_hinge_des = app(
    'co.hinge.app',
    lang='en', # defaults to 'en'
    country='us' # defaults to 'us'
)

hinge_description = clean_text(result_hinge_des['description'])
# The install count must come from Hinge's own listing (result_hinge_des).
hinge_installs = result_hinge_des["realInstalls"]

***Bumble***

In [None]:
# --- Reviews: request the 200 newest English reviews from the US store ---
result_bumble, continuation_token = reviews(
    'com.bumble.app', lang='en', country='us', sort=Sort.NEWEST, count=200
)

time.sleep(0.1)  # short pause between requests

# Follow-up request driven by the continuation token. Its result is bound to
# result_bumble again, so the batch returned by the first call is not kept.
# NOTE(review): confirm that dropping the first batch is intended.
result_bumble, _ = reviews('com.bumble.app', continuation_token=continuation_token)

# --- Tabulate: drop unused columns and label every row with the app name ---
result_bumble_df = pd.DataFrame(result_bumble)
result_bumble_df = result_bumble_df.drop(columns=['reviewId', 'userImage', 'reviewCreatedVersion'])
result_bumble_df['App'] = 'Bumble'

time.sleep(0.1)  # short pause before the app-details request

# --- Store listing: cleaned description text and install count ---
result_bumble_des = app('com.bumble.app', lang='en', country='us')
bumble_description = clean_text(result_bumble_des['description'])
bumble_installs = result_bumble_des['realInstalls']

***Her***

In [None]:
# --- Reviews: request the 200 newest English reviews from the US store ---
result_her, continuation_token = reviews(
    'com.weareher.her', lang='en', country='us', sort=Sort.NEWEST, count=200
)

time.sleep(0.1)  # short pause between requests

# Follow-up request driven by the continuation token. Its result is bound to
# result_her again, so the batch returned by the first call is not kept.
# NOTE(review): confirm that dropping the first batch is intended.
result_her, _ = reviews('com.weareher.her', continuation_token=continuation_token)

# --- Tabulate: drop unused columns and label every row with the app name ---
result_her_df = pd.DataFrame(result_her)
result_her_df = result_her_df.drop(columns=['reviewId', 'userImage', 'reviewCreatedVersion'])
result_her_df['App'] = 'Her'

time.sleep(0.1)  # short pause before the app-details request

# --- Store listing: cleaned description text and install count ---
result_her_des = app('com.weareher.her', lang='en', country='us')
her_description = clean_text(result_her_des['description'])
her_installs = result_her_des['realInstalls']

***Zoe***

In [None]:
# --- Reviews: request the 200 newest English reviews from the US store ---
result_zoe, continuation_token = reviews(
    'com.surgeapp.zoe', lang='en', country='us', sort=Sort.NEWEST, count=200
)

time.sleep(0.1)  # short pause between requests

# Follow-up request driven by the continuation token. Its result is bound to
# result_zoe again, so the batch returned by the first call is not kept.
# NOTE(review): confirm that dropping the first batch is intended.
result_zoe, _ = reviews('com.surgeapp.zoe', continuation_token=continuation_token)

# --- Tabulate: drop unused columns and label every row with the app name ---
result_zoe_df = pd.DataFrame(result_zoe)
result_zoe_df = result_zoe_df.drop(columns=['reviewId', 'userImage', 'reviewCreatedVersion'])
result_zoe_df['App'] = 'Zoe'

time.sleep(0.1)  # short pause before the app-details request

# --- Store listing: cleaned description text and install count ---
result_zoe_des = app('com.surgeapp.zoe', lang='en', country='us')
zoe_description = clean_text(result_zoe_des['description'])
zoe_installs = result_zoe_des['realInstalls']

***Grindr***

In [None]:
# --- Reviews: request the 200 newest English reviews from the US store ---
result_grindr, continuation_token = reviews(
    'com.grindrapp.android', lang='en', country='us', sort=Sort.NEWEST, count=200
)

time.sleep(0.1)  # short pause between requests

# Follow-up request driven by the continuation token. Its result is bound to
# result_grindr again, so the batch returned by the first call is not kept.
# NOTE(review): confirm that dropping the first batch is intended.
result_grindr, _ = reviews('com.grindrapp.android', continuation_token=continuation_token)

# --- Tabulate: drop unused columns and label every row with the app name ---
result_grindr_df = pd.DataFrame(result_grindr)
result_grindr_df = result_grindr_df.drop(columns=['reviewId', 'userImage', 'reviewCreatedVersion'])
result_grindr_df['App'] = 'Grindr'

time.sleep(0.1)  # short pause before the app-details request

# --- Store listing: cleaned description text and install count ---
result_grindr_des = app('com.grindrapp.android', lang='en', country='us')
grindr_description = clean_text(result_grindr_des['description'])
grindr_installs = result_grindr_des['realInstalls']

***SCRUFF***

In [None]:
# --- Reviews: request the 200 newest English reviews from the US store ---
result_scruff, continuation_token = reviews(
    'com.appspot.scruffapp', lang='en', country='us', sort=Sort.NEWEST, count=200
)

time.sleep(0.1)  # short pause between requests

# Follow-up request driven by the continuation token. Its result is bound to
# result_scruff again, so the batch returned by the first call is not kept.
# NOTE(review): confirm that dropping the first batch is intended.
result_scruff, _ = reviews('com.appspot.scruffapp', continuation_token=continuation_token)

# --- Tabulate: drop unused columns and label every row with the app name ---
result_scruff_df = pd.DataFrame(result_scruff)
result_scruff_df = result_scruff_df.drop(columns=['reviewId', 'userImage', 'reviewCreatedVersion'])
result_scruff_df['App'] = 'Scruff'

time.sleep(0.1)  # short pause before the app-details request

# --- Store listing: cleaned description text and install count ---
result_scruff_des = app('com.appspot.scruffapp', lang='en', country='us')
scruff_description = clean_text(result_scruff_des['description'])
scruff_installs = result_scruff_des['realInstalls']

***Taimi***

In [None]:
# --- Reviews: request the 200 newest English reviews from the US store ---
result_taimi, continuation_token = reviews(
    'com.takimi.android', lang='en', country='us', sort=Sort.NEWEST, count=200
)

time.sleep(0.1)  # short pause between requests

# Follow-up request driven by the continuation token. Its result is bound to
# result_taimi again, so the batch returned by the first call is not kept.
# NOTE(review): confirm that dropping the first batch is intended.
result_taimi, _ = reviews('com.takimi.android', continuation_token=continuation_token)

# --- Tabulate: drop unused columns and label every row with the app name ---
result_taimi_df = pd.DataFrame(result_taimi)
result_taimi_df = result_taimi_df.drop(columns=['reviewId', 'userImage', 'reviewCreatedVersion'])
result_taimi_df['App'] = 'Taimi'

time.sleep(0.1)  # short pause before the app-details request

# --- Store listing: cleaned description text and install count ---
result_taimi_des = app('com.takimi.android', lang='en', country='us')
taimi_description = clean_text(result_taimi_des['description'])
taimi_installs = result_taimi_des['realInstalls']

***Millionaire Match***

In [None]:
# --- Reviews: request the 200 newest English reviews from the US store ---
result_mil_match, continuation_token = reviews(
    'com.millionairedating.millionairematch',
    lang='en', country='us', sort=Sort.NEWEST, count=200
)

time.sleep(0.1)  # short pause between requests

# Follow-up request driven by the continuation token. Its result is bound to
# result_mil_match again, so the batch returned by the first call is not kept.
# NOTE(review): confirm that dropping the first batch is intended.
result_mil_match, _ = reviews(
    'com.millionairedating.millionairematch', continuation_token=continuation_token
)

# --- Tabulate: drop unused columns and label every row with the app name ---
result_mil_match_df = pd.DataFrame(result_mil_match)
result_mil_match_df = result_mil_match_df.drop(columns=['reviewId', 'userImage', 'reviewCreatedVersion'])
result_mil_match_df['App'] = 'Millionaire Match'

time.sleep(0.1)  # short pause before the app-details request

# --- Store listing: cleaned description text and install count ---
result_mil_match_des = app('com.millionairedating.millionairematch', lang='en', country='us')
mil_match_description = clean_text(result_mil_match_des['description'])
mil_match_installs = result_mil_match_des['realInstalls']

***Luxy***

In [None]:
# --- Reviews: request the 200 newest English reviews from the US store ---
result_luxy, continuation_token = reviews(
    'com.luxy',
    lang='en', # English reviews
    country='us', # reviews from US
    sort=Sort.NEWEST, # return newest reviews
    count=200, # return 200 reviews
)

time.sleep(0.1) # pause before next request

# Follow-up request driven by the continuation token. Its result is bound to
# result_luxy again, so the batch returned by the first call is not kept.
# NOTE(review): confirm that dropping the first batch is intended.
result_luxy, _ = reviews(
    'com.luxy',
    continuation_token=continuation_token
)

result_luxy_df = pd.DataFrame(result_luxy)
result_luxy_df = result_luxy_df.drop(['reviewId', 'userImage', 'reviewCreatedVersion'], axis=1) # remove columns
result_luxy_df['App'] = 'Luxy' # create new column with app name

time.sleep(0.1) # pause before fetching app description

# fetch the store listing (description and install count)
result_luxy_des = app(
    'com.luxy',
    lang='en', # defaults to 'en'
    country='us' # defaults to 'us'
)


luxy_description = clean_text(result_luxy_des['description'])
# The install count must come from Luxy's own listing (result_luxy_des).
luxy_installs = result_luxy_des['realInstalls']

***The League***

In [None]:
# --- Reviews: request the 200 newest English reviews from the US store ---
result_league, continuation_token = reviews(
    'com.league.theleague', lang='en', country='us', sort=Sort.NEWEST, count=200
)

time.sleep(0.1)  # short pause between requests

# Follow-up request driven by the continuation token. Its result is bound to
# result_league again, so the batch returned by the first call is not kept.
# NOTE(review): confirm that dropping the first batch is intended.
result_league, _ = reviews('com.league.theleague', continuation_token=continuation_token)

# --- Tabulate: drop unused columns and label every row with the app name ---
result_league_df = pd.DataFrame(result_league)
result_league_df = result_league_df.drop(columns=['reviewId', 'userImage', 'reviewCreatedVersion'])
result_league_df['App'] = 'League'

time.sleep(0.1)  # short pause before the app-details request

# --- Store listing: cleaned description text and install count ---
result_league_des = app('com.league.theleague', lang='en', country='us')
league_description = clean_text(result_league_des['description'])
league_installs = result_league_des['realInstalls']

***Christian Mingle***

In [None]:
# --- Reviews: request the 200 newest English reviews from the US store ---
result_christian_mingle, continuation_token = reviews(
    'com.spark.christianmingle', lang='en', country='us', sort=Sort.NEWEST, count=200
)

time.sleep(0.1)  # short pause between requests

# Follow-up request driven by the continuation token. Its result is bound to
# result_christian_mingle again, so the batch returned by the first call is not kept.
# NOTE(review): confirm that dropping the first batch is intended.
result_christian_mingle, _ = reviews(
    'com.spark.christianmingle', continuation_token=continuation_token
)

# --- Tabulate: drop unused columns and label every row with the app name ---
result_christian_mingle_df = pd.DataFrame(result_christian_mingle)
result_christian_mingle_df = result_christian_mingle_df.drop(
    columns=['reviewId', 'userImage', 'reviewCreatedVersion']
)
result_christian_mingle_df['App'] = 'Christian_Mingle'

time.sleep(0.1)  # short pause before the app-details request

# --- Store listing: cleaned description text and install count ---
result_christian_mingle_des = app('com.spark.christianmingle', lang='en', country='us')
chris_mingle_description = clean_text(result_christian_mingle_des['description'])
chris_mingle_installs = result_christian_mingle_des['realInstalls']

***Muzz***

In [None]:
# --- Reviews: request the 200 newest English reviews from the US store ---
result_muzz, continuation_token = reviews(
    'com.muzmatch.muzmatchapp', lang='en', country='us', sort=Sort.NEWEST, count=200
)

time.sleep(0.1)  # short pause between requests

# Follow-up request driven by the continuation token. Its result is bound to
# result_muzz again, so the batch returned by the first call is not kept.
# NOTE(review): confirm that dropping the first batch is intended.
result_muzz, _ = reviews('com.muzmatch.muzmatchapp', continuation_token=continuation_token)

# --- Tabulate: drop unused columns and label every row with the app name ---
result_muzz_df = pd.DataFrame(result_muzz)
result_muzz_df = result_muzz_df.drop(columns=['reviewId', 'userImage', 'reviewCreatedVersion'])
result_muzz_df['App'] = 'Muzz'

time.sleep(0.1)  # short pause before the app-details request

# --- Store listing: cleaned description text and install count ---
result_muzz_des = app('com.muzmatch.muzmatchapp', lang='en', country='us')
muzz_description = clean_text(result_muzz_des['description'])
muzz_installs = result_muzz_des['realInstalls']

***Jswipe***

In [None]:
# --- Reviews: request the 200 newest English reviews from the US store ---
result_jswipe, continuation_token = reviews(
    'com.smooch.labs.jswipe', lang='en', country='us', sort=Sort.NEWEST, count=200
)

time.sleep(0.1)  # short pause between requests

# Follow-up request driven by the continuation token. Its result is bound to
# result_jswipe again, so the batch returned by the first call is not kept.
# NOTE(review): confirm that dropping the first batch is intended.
result_jswipe, _ = reviews('com.smooch.labs.jswipe', continuation_token=continuation_token)

# --- Tabulate: drop unused columns and label every row with the app name ---
result_jswipe_df = pd.DataFrame(result_jswipe)
result_jswipe_df = result_jswipe_df.drop(columns=['reviewId', 'userImage', 'reviewCreatedVersion'])
result_jswipe_df['App'] = 'JSwipe'

time.sleep(0.1)  # short pause before the app-details request

# --- Store listing: cleaned description text and install count ---
result_jswipe_des = app('com.smooch.labs.jswipe', lang='en', country='us')
jswipe_description = clean_text(result_jswipe_des['description'])
jswipe_installs = result_jswipe_des['realInstalls']

***Mutual***

In [None]:
# --- Reviews: request the 200 newest English reviews from the US store ---
result_mutual, continuation_token = reviews(
    'com.mutualapp', lang='en', country='us', sort=Sort.NEWEST, count=200
)

time.sleep(0.1)  # short pause between requests

# Follow-up request driven by the continuation token. Its result is bound to
# result_mutual again, so the batch returned by the first call is not kept.
# NOTE(review): confirm that dropping the first batch is intended.
result_mutual, _ = reviews('com.mutualapp', continuation_token=continuation_token)

# --- Tabulate: drop unused columns and label every row with the app name ---
result_mutual_df = pd.DataFrame(result_mutual)
result_mutual_df = result_mutual_df.drop(columns=['reviewId', 'userImage', 'reviewCreatedVersion'])
result_mutual_df['App'] = 'Mutual'

time.sleep(0.1)  # short pause before the app-details request

# --- Store listing: cleaned description text and install count ---
result_mutual_des = app('com.mutualapp', lang='en', country='us')
mutual_description = clean_text(result_mutual_des['description'])
mutual_installs = result_mutual_des['realInstalls']

***Senior Match***

In [None]:
# --- Reviews: request the 200 newest English reviews from the US store ---
result_senior_match, continuation_token = reviews(
    'com.successfulmatch.seniormatchdating',
    lang='en', country='us', sort=Sort.NEWEST, count=200
)

time.sleep(0.1)  # short pause between requests

# Follow-up request driven by the continuation token. Its result is bound to
# result_senior_match again, so the batch returned by the first call is not kept.
# NOTE(review): confirm that dropping the first batch is intended.
result_senior_match, _ = reviews(
    'com.successfulmatch.seniormatchdating', continuation_token=continuation_token
)

# --- Tabulate: drop unused columns and label every row with the app name ---
result_senior_match_df = pd.DataFrame(result_senior_match)
result_senior_match_df = result_senior_match_df.drop(columns=['reviewId', 'userImage', 'reviewCreatedVersion'])
result_senior_match_df['App'] = 'Senior_Match'

time.sleep(0.1)  # short pause before the app-details request

# --- Store listing: cleaned description text and install count ---
result_senior_match_des = app('com.successfulmatch.seniormatchdating', lang='en', country='us')
senior_match_description = clean_text(result_senior_match_des['description'])
senior_match_installs = result_senior_match_des['realInstalls']

***Ourtime***

In [None]:
# --- Reviews: request the 200 newest English reviews from the US store ---
result_ourtime, continuation_token = reviews(
    'com.peoplemedia.ourtime', lang='en', country='us', sort=Sort.NEWEST, count=200
)

time.sleep(0.1)  # short pause between requests

# Follow-up request driven by the continuation token. Its result is bound to
# result_ourtime again, so the batch returned by the first call is not kept.
# NOTE(review): confirm that dropping the first batch is intended.
result_ourtime, _ = reviews('com.peoplemedia.ourtime', continuation_token=continuation_token)

# --- Tabulate: drop unused columns and label every row with the app name ---
result_ourtime_df = pd.DataFrame(result_ourtime)
result_ourtime_df = result_ourtime_df.drop(columns=['reviewId', 'userImage', 'reviewCreatedVersion'])
result_ourtime_df['App'] = 'Ourtime'

time.sleep(0.1)  # short pause before the app-details request

# --- Store listing: cleaned description text and install count ---
result_ourtime_des = app('com.peoplemedia.ourtime', lang='en', country='us')
ourtime_description = clean_text(result_ourtime_des['description'])
ourtime_installs = result_ourtime_des['realInstalls']

# *Dataframe Creation*


This section compiles the scraped data from the different apps into two dataframes. The first dataframe, "App Marketing", contains each app's description and number of downloads. The second dataframe, "App Reviews", contains 200 user reviews from each dating app.

***App Marketing Dataframe***

In [None]:
# One (name, description, installs) record per app, so each app's three
# fields sit side by side instead of in three parallel lists.
app_rows = [
    ('Tinder', tinder_description, tinder_installs),
    ('Bumble', bumble_description, bumble_installs),
    ('Hinge', hinge_description, hinge_installs),
    ('HER', her_description, her_installs),
    ('Zoe', zoe_description, zoe_installs),
    ('Grindr', grindr_description, grindr_installs),
    ('SCRUFF', scruff_description, scruff_installs),
    ('Taimi', taimi_description, taimi_installs),
    ('Millionaire Match', mil_match_description, mil_match_installs),
    ('Luxy', luxy_description, luxy_installs),
    ('The League', league_description, league_installs),
    ('Christian Mingle', chris_mingle_description, chris_mingle_installs),
    ('Muzz', muzz_description, muzz_installs),
    ('Jswipe', jswipe_description, jswipe_installs),
    ('Mutual', mutual_description, mutual_installs),
    ('Senior Match', senior_match_description, senior_match_installs),
    ('Ourtime', ourtime_description, ourtime_installs),
]

# Column-oriented view of the records above (same keys and order as before).
app_data = {
    'App Name': [row[0] for row in app_rows],
    'Description': [row[1] for row in app_rows],
    'Downloads': [row[2] for row in app_rows],
}

app_marketing_df = pd.DataFrame(app_data)

**App Reviews Dataframe**

In [None]:
# Per-app review frames, in the order they are stacked.
review_frames = [
    result_tinder_df,
    result_hinge_df,
    result_bumble_df,
    result_her_df,
    result_zoe_df,
    result_grindr_df,
    result_scruff_df,
    result_taimi_df,
    result_mil_match_df,
    result_luxy_df,
    result_league_df,
    result_christian_mingle_df,
    result_muzz_df,
    result_jswipe_df,
    result_mutual_df,
    result_senior_match_df,
    result_ourtime_df,
]

# Stack every app's reviews into one frame with a fresh 0..n-1 index.
app_reviews_df = pd.concat(review_frames, ignore_index=True)

In [None]:
# write the combined reviews to app_reviews.csv, leaving out the row index
app_reviews_df.to_csv('app_reviews.csv', index=False)

# Creating Training Data

In [None]:
# grab only the first 6 reviews of each app for qualitative coding training dataset

def first_n_each_app(df, column_name, n):
    """
    Return the leading ``n`` rows of every group formed by ``column_name``.

    Args:
        df (pd.DataFrame): The input DataFrame.
        column_name (str): Column whose values define the groups (e.g. the app name).
        n (int): Number of rows to keep from the start of each group.

    Returns:
        pd.DataFrame: The selected rows of ``df``.
    """
    rows_by_group = df.groupby(column_name)
    return rows_by_group.head(n)

# keep the first 6 rows per app, restricted to the userName, content and App columns
reviews_qual_coding_df = first_n_each_app(app_reviews_df, 'App', n=6)[['userName', 'content', 'App']]

In [None]:
# save the qualitative-coding sample as an Excel workbook
reviews_qual_coding_df.to_excel("reviews_qual_coding_df.xlsx")

See this [link](https://docs.google.com/document/d/1NhzBAHmDzahXzocFSfMaU2BbAGm5frJBv2Z1Ao64u9U/edit?usp=sharing) to the supplemental materials for access to our training data and more information on our coding scheme. See this [link](https://colab.research.google.com/drive/1rRJ8CcOaPDsrmtZ15cTgKrQ9u2lrqNyf?usp=sharing) for the code we used to train GPT on our coding scheme and then instruct GPT to analyze our review text data and conduct the qualitative coding.

# *Data Analysis*

**Data Cleaning**

**NOTE 1:** We randomly selected 500 reviews and then trained GPT on our qualitative coding scheme to determine whether each review contained a specific qualitative code (1) or did not contain that code (0). Below, we load the csv file of the dataframe recording, for each review, whether GPT judged each qualitative code to be present. [Here](https://drive.google.com/file/d/1dKUAB1avzPZrwhm_2OsvV5U0GKsMovLT/view?usp=sharing) is a link to the csv file.

**NOTE 2:** 'App type' and 'app genre' will be used interchangeably

In [None]:
# load in the csv
import pandas as pd

# Reads the GPT-coded reviews from a hard-coded absolute path.
# NOTE(review): '/content/...' looks like a Colab upload location — confirm the
# file has been uploaded there before running this cell.
coded_data = pd.read_csv('/content/app_reviews_coded_df.csv')

In [None]:
# function to create word count

def word_count(df, column_name, new_column_name='content_word_count'):
    """
    Calculates the word count for each row in a specified column of a Pandas DataFrame.

    Words are whitespace-separated tokens. Missing values (None/NaN) count as
    0 words rather than being stringified to 'nan' and counted as one word.
    The column is added to ``df`` in place, and the same object is returned.

    Args:
        df (pd.DataFrame): The input DataFrame.
        column_name (str): The name of the column containing text.
        new_column_name (str, optional): The name for the new word count column.
            Defaults to 'content_word_count'.

    Returns:
         pd.DataFrame: The DataFrame with an added column containing word counts.
    """
    def _count_words(text):
        # Treat missing text as empty instead of counting the literal 'nan'.
        if text is None or (isinstance(text, float) and text != text):
            return 0
        return len(str(text).split())

    df[new_column_name] = df[column_name].apply(_count_words)
    return df

# add a 'content_word_count' column computed from the 'content' column
coded_data = word_count(coded_data, 'content')

In [None]:
# rename Christian Mingle and Senior Match: swap the underscore labels for spaced names
# (no other app label is changed here)
coded_data['App'] = coded_data['App'].replace({'Christian_Mingle': 'Christian Mingle', 'Senior_Match': 'Senior Match'})

In [None]:
# create a column stating each dating app's genre

# apps grouped under the genre they belong to
apps_by_genre = {
    'Mainstream': ['Bumble', 'Tinder', 'Hinge'],
    'Queer Co-Ed': ['Taimi'],
    'Queer Primarily Sapphic': ['Her', 'Zoe'],
    'Queer Primarily Men': ['Grindr', 'Scruff'],
    'Exclusive': ['Millionaire Match', 'Luxy', 'League'],
    'Religious': ['JSwipe', 'Mutual', 'Christian Mingle', 'Muzz'],
    'Seniors': ['Ourtime', 'Senior Match'],
}

# define mapping: flatten the groups into an app name -> genre lookup
app_mapping = {
    app_name: genre
    for genre, members in apps_by_genre.items()
    for app_name in members
}

# Create the new column by looking up each row's 'App' label in app_mapping
# NOTE(review): labels absent from app_mapping would presumably become NaN — verify none are missing
coded_data['App_Type'] = coded_data['App'].map(app_mapping)

In [None]:
# preview the first rows of the coded data
coded_data.head()

In [None]:
## save a dataframe of the positive sentiment reviews

# keep only the rows flagged as positive sentiment (pos_sent_gpt == 1)
coded_data_pos = coded_data[coded_data['pos_sent_gpt'] == 1]

# number of positive sentiment reviews for each (app, app type) pair.
# The count column is named explicitly in reset_index, so this does not rely
# on the default 'count' column name that only pandas >= 2.0 produces.
# Only pairs that occur in coded_data_pos get a row here.
coded_data_pos_apps = (
    coded_data_pos[['App', 'App_Type']]
    .value_counts()
    .reset_index(name='pos_review_count')
)

# total number of reviews for each app (columns: 'App', 'review_count')
total_reviews = (
    coded_data['App']
    .value_counts()
    .rename_axis('App')
    .reset_index(name='review_count')
)

# attach each app's total review count to its positive count
coded_data_pos_apps = coded_data_pos_apps.merge(total_reviews, on='App')

# create a new column listing the proportion of positive sentiment reviews for each app and app type
coded_data_pos_apps['prop'] = coded_data_pos_apps['pos_review_count'] / coded_data_pos_apps['review_count']
coded_data_pos_apps

In [None]:
## save a dataframe of the negative sentiment reviews

# keep only the rows flagged as negative sentiment (neg_sent_gpt == 1)
coded_data_neg = coded_data[coded_data['neg_sent_gpt'] == 1]

# number of negative sentiment reviews for each (app, app type) pair.
# The count column is named explicitly in reset_index, so this does not rely
# on the default 'count' column name that only pandas >= 2.0 produces.
# Only pairs that occur in coded_data_neg get a row here.
coded_data_neg_apps = (
    coded_data_neg[['App', 'App_Type']]
    .value_counts()
    .reset_index(name='neg_review_count')
)

# total number of reviews for each app (columns: 'App', 'review_count')
total_reviews = (
    coded_data['App']
    .value_counts()
    .rename_axis('App')
    .reset_index(name='review_count')
)

# attach each app's total review count to its negative count
coded_data_neg_apps = coded_data_neg_apps.merge(total_reviews, on='App')

# create a new column listing the proportion of negative sentiment reviews for each app and app type
coded_data_neg_apps['prop'] = coded_data_neg_apps['neg_review_count'] / coded_data_neg_apps['review_count']
coded_data_neg_apps

In [None]:
## create bar plot for proportion of reviews with negative sentiment reviews across apps
import plotly.express as px

# assign each dating app genre a color
color_map = {'Mainstream': 'red',
             'Queer Primarily Sapphic': 'pink',
             'Queer Primarily Men': 'blue',
             'Queer Co-Ed': 'purple',
             'Exclusive': 'orange',
             'Religious': 'green',
             'Seniors': 'turquoise'}

fig1 = px.bar(coded_data_neg_apps, x="App", y="prop", color="App_Type",
             title="Proportion of Reviews With Negative Sentiment Reviews Per App",
              # fix the order app genres appear in legend for easy comparison between plots;
              # each label must match an App_Type value (and color_map key) exactly
             category_orders={'App_Type': ['Queer Primarily Sapphic', 'Queer Co-Ed', 'Exclusive', 'Religious', 'Seniors', 'Mainstream', 'Queer Primarily Men']},
             color_discrete_map=color_map,
             labels = {'App': 'App Name', 'prop': 'Proportion of Reviews', 'App_Type': 'Dating App Genre'})

# fix the order app names appear on x-axis for easy comparison between plots
fig1.update_layout(xaxis={'categoryorder':'array',
                         'categoryarray':['Zoe', 'Her', 'Taimi', 'Luxy', 'Millionaire Match', 'League', 'Christian Mingle', 'JSwipe', 'Mutual', 'Muzz', 'Ourtime', 'Senior Match', 'Tinder', 'Bumble', 'Hinge', 'Scruff', 'Grindr']})

# proportions are plotted on a fixed 0-1 y-axis
fig1.update_yaxes(range=[0, 1])

fig1.show()

## create bar plot for proportion of reviews with positive sentiment reviews across apps
fig = px.bar(coded_data_pos_apps, x="App", y="prop", color="App_Type",
             color_discrete_map=color_map,
             title="Proportion of Reviews With Positive Sentiment Reviews Per App",
                           # fix the order app genres appear in legend for easy comparison between plots
             category_orders={'App_Type': ['Queer Primarily Sapphic', 'Queer Co-Ed', 'Exclusive', 'Religious', 'Seniors', 'Mainstream', 'Queer Prmarily Men']},
             labels = {'App': 'App Name', 'prop': 'Proportion of Reviews', 'App_Type': 'Dating App Genre'})

# fix the order app names appear on x-axis for easy comparison between plots
fig.update_layout(xaxis={'categoryorder':'array',
                         'categoryarray':['Zoe', 'Her', 'Taimi', 'Luxy', 'Millionaire Match', 'League', 'Christian Mingle', 'JSwipe', 'Mutual', 'Muzz', 'Ourtime', 'Senior Match', 'Tinder', 'Bumble', 'Hinge', 'Scruff', 'Grindr']})

fig.update_yaxes(range=[0, 1])

fig.show()

**How much do people write when they love or hate an app?**

In [None]:
## create a line plot of the average word count for each rating

# average review length (in words) for each rating (# stars given), plus a copy
# rounded to two decimals that is used as the point label for readability
rating_wc_df = (
    coded_data
    .groupby('score')['content_word_count']
    .mean()
    .reset_index()
    .assign(average_word_count_rounded=lambda avg_df: avg_df['content_word_count'].round(2))
)

# line plot with a marker and a text label at every rating
fig = px.line(
    rating_wc_df,
    x="score",
    y="content_word_count",
    text='average_word_count_rounded',
    markers=True,
    title='App Reviews Average Word Count Per Rating',
)

# place the point labels to the top right of each marker
fig.update_traces(textposition='top right')

# x-axis: label it and show only whole-star ticks (no decimal ratings)
fig.update_xaxes(title_text="Rating (# of Stars)", tickmode='array', tickvals=[1, 2, 3, 4, 5])

# y-axis: label it and use a fixed range for readability
fig.update_yaxes(title_text="Average Word Count", range=[0, 37])

fig.show()

In [None]:
## create a line plot of the average word count for each rating per app type

# create a dataframe listing the average word count for each rating for each app type
rating_wc_app_df = coded_data.groupby(['App_Type', 'score'])['content_word_count'].mean().reset_index()

# create line plot (one line per app genre)
fig = px.line(rating_wc_app_df, x="score", y="content_word_count", color = "App_Type",
              color_discrete_map=color_map,
              title='App Reviews Average Word Count Per Rating', markers = True,
              # fix the order app genres appear in the legend; the names must match
              # the values in App_Type exactly ('Queer Primarily Men' was misspelled before)
              category_orders={'App_Type': ['Queer Primarily Sapphic', 'Queer Co-Ed', 'Exclusive', 'Religious', 'Seniors', 'Mainstream', 'Queer Primarily Men']},
              labels = {'App_Type': 'Dating App Genre'})

# text position setting carried over from the overall plot; this figure passes no `text` column
fig.update_traces(textposition='top right')

# remove decimal points on x axis
fig.update_xaxes(tickmode='array', tickvals=[1, 2, 3, 4, 5])

# label axes
fig.update_layout(
    xaxis_title="Rating (# of Stars)",
    yaxis_title="Average Word Count"
)

fig.show()

**What topics do most people write about in their reviews?**

In [None]:
## create a dataframe containing info about each code and the frequency of each code

# names of each code; the 0/1 flag for a code is stored in the '<code>_gpt' column of coded_data
code_names = ['emojis', 'pos_ux', 'neg_ux', 'pos_ppl', 'neg_ppl', 'price', 'fraud', 'ban']

# brief description of each code (same order as code_names)
definitions = ['The review contains emojis',
               'The review discusses positive user experience',
               'The review discusses negative user experience',
               'The review discusses positive social interactions with other app users',
               'The review discusses negative social interactions with other app users',
               'The review is unhappy with monetization of the app',
               'The review discusses encountering fraudulent accounts on the app',
               'The review discusses getting their accounts banned']

# frequency of each code: number of flagged reviews divided by the total number of reviews
total_review_count = len(coded_data)
proportions = [coded_data[f'{code}_gpt'].sum() / total_review_count for code in code_names]

# keep the individual proportions available under their original variable names
(prop_emojis, prop_pos_ux, prop_neg_ux, prop_pos_ppl,
 prop_neg_ppl, prop_price, prop_fraud, prop_ban) = proportions

# combine the columns together to create a dataframe
coded_data_prop = pd.DataFrame({'Code': code_names, 'Code Definition': definitions, 'Proportion': proportions})
coded_data_prop

In [None]:
## save a dataframe of reviews with price code

# number of reviews qualifying for the price code for each app type.
# The count column is named explicitly via reset_index(name=...) so the cell
# does not depend on the automatic 'count' label that only pandas >= 2.0 uses.
coded_data_price = (
    coded_data[coded_data['price_gpt'] == 1][['App_Type']]
    .value_counts()
    .reset_index(name='price_review_count')
)

# total number of reviews for each app type
total_type_reviews = (
    coded_data['App_Type']
    .value_counts()
    .rename_axis('App_Type')
    .reset_index(name='review_count')
)

# attach each app type's total review count to its price-code review count
coded_data_price = coded_data_price.merge(total_type_reviews, on='App_Type')

# proportion of each app type's reviews that qualify for the price code
coded_data_price['prop'] = coded_data_price['price_review_count'] / coded_data_price['review_count']

coded_data_price

In [None]:
## create bar plot for proportion of reviews qualifying for the price code across app genres

# create bar plot (each key appears once in `labels`; the original literal repeated 'App_Type')
fig = px.bar(coded_data_price, x="App_Type", y="prop", color="App_Type", text_auto = ".2f",
             color_discrete_map=color_map,
             title="Proportion of Reviews Unhappy With App Monetization",
             labels = {'App_Type': 'Dating App Genre', 'prop': 'Proportion of Reviews'})

# fix order app types appear on x axis for readability;
# 'Queer Primarily Men' is listed last so every genre in color_map has an explicit position
fig.update_layout(xaxis={'categoryorder':'array',
                         'categoryarray':['Queer Primarily Sapphic', 'Queer Co-Ed', 'Exclusive', 'Religious', 'Seniors', 'Mainstream', 'Queer Primarily Men']})

# remove legend to remove redundant information (the x axis already names each genre)
fig.update_layout(showlegend=False)

# fix range on y axis for readability
fig.update_yaxes(range=[0, 1])

fig.show()

In [None]:
## save a dataframe of reviews with negative people

# number of reviews qualifying for the negative people code for each app type.
# The count column is named explicitly via reset_index(name=...) so the cell
# does not depend on the automatic 'count' label that only pandas >= 2.0 uses.
coded_data_neg_ppl = (
    coded_data[coded_data['neg_ppl_gpt'] == 1][['App_Type']]
    .value_counts()
    .reset_index(name='neg_ppl_review_count')
)

# total number of reviews for each app type
total_type_reviews = (
    coded_data['App_Type']
    .value_counts()
    .rename_axis('App_Type')
    .reset_index(name='review_count')
)

# attach each app type's total review count to its negative-people review count
coded_data_neg_ppl = coded_data_neg_ppl.merge(total_type_reviews, on='App_Type')

# proportion of each app type's reviews that qualify for the negative people code
coded_data_neg_ppl['prop'] = coded_data_neg_ppl['neg_ppl_review_count'] / coded_data_neg_ppl['review_count']

coded_data_neg_ppl

In [None]:
## create bar plot for proportion of reviews qualifying for the negative people code across app genres

# create bar plot (each key appears once in `labels`; the original literal repeated 'App_Type')
fig = px.bar(coded_data_neg_ppl, x="App_Type", y="prop", color="App_Type", text_auto = ".2f",
             color_discrete_map=color_map,
             title="Proportion of Reviews About Negative Social Interactions With People From App",
             labels = {'App_Type': 'Dating App Genre', 'prop': 'Proportion of Reviews'})

# fix order app types appear on x axis for readability;
# 'Queer Primarily Men' is listed last so every genre in color_map has an explicit position
fig.update_layout(xaxis={'categoryorder':'array',
                         'categoryarray':['Queer Primarily Sapphic', 'Queer Co-Ed', 'Exclusive', 'Religious', 'Seniors', 'Mainstream', 'Queer Primarily Men']})

# remove legend to remove redundant information (the x axis already names each genre)
fig.update_layout(showlegend=False)

# fix range on y axis for readability
fig.update_yaxes(range=[0, 1])

fig.show()

# *Clustering Apps*

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

***Description + Ad Spread Coding***

Each app's marketing spread was manually coded using qualitative content analysis, producing an Excel spreadsheet. Here, that spreadsheet (loaded as a dataframe) is merged with the dataframe containing each app's description and number of downloads. The resulting dataframe is used for a KMeans clustering analysis to understand which apps have the most similar marketing styles.

The following code first cleans the dataframes in preparation for the merge.

ad_spreads_csv = "/content/ad_spreads_qual_coding_df.csv"  # manually coded ad-spread data
ad_spreads_df = pd.read_csv(ad_spreads_csv)

In [None]:
# Collapse each model-count column into a 0/1 indicator:
# 1 if the count is greater than zero, 0 otherwise.
# (Some column names end with a trailing space; they must match the CSV headers exactly.)
model_count_columns = [
    "# models smiling on title slide ",
    "# models not smiling on title slide ",
    "# of white models on title slide",
    "# of POC or racially ambiguous models on title slide",
    "# models fully clothed on title slide",
    "# models not fully clothed on title slide",
]
for count_column in model_count_columns:
    ad_spreads_df[count_column] = (ad_spreads_df[count_column] > 0).astype(int)

# Set the app name in the row labelled 6 to "SCRUFF" -- presumably so it matches the
# spelling used in app_marketing_df for the merge on "App Name" (confirm).
# .loc writes to the dataframe itself; the chained form df["App Name"][6] = ...
# raises SettingWithCopyWarning and has no effect under pandas Copy-on-Write.
ad_spreads_df.loc[6, "App Name"] = "SCRUFF"

In [None]:
# join the coded ad-spread data with the app description data on the shared app name
# (default inner join: only apps whose "App Name" appears in both dataframes are kept)
app_des_ad_merge = pd.merge(ad_spreads_df, app_marketing_df, on="App Name")

In [None]:
# save the merged dataframe to disk without the row index
app_des_ad_merge.to_csv(path_or_buf='app_des_ad_merge.csv', index=False)

In [None]:
# Column holding the free text of each app; it is passed to the TF-IDF vectorizer
# in the preprocessing step.
text_feature = "Description"
# Indicator (1 = yes, 0 = no) columns from the ad-spread coding; they are passed to
# StandardScaler in the preprocessing step. The '# models ...' columns were converted
# to 0/1 flags on ad_spreads_df before the merge.
# NOTE: several names end with a trailing space -- they must match the column headers exactly.
one_hot_features = [
       'Bright Colors (does the color pallete look more neon, bright, loud? 1 = yes, 0 = no)',
       'Muted Colors (does the color pallete look more softer and neutral? 1 = yes, 0 = no)',
       'Lowercase (is the text all lowercase or mostly lowercase, such as starting with a capitalized letter and only lower case following it? 1 = yes, 0 = no)',
       'Upper Case (is the text in all caps or mostly all caps? 1 = yes, 0 = no) ',
       'Title Case (is the text using more formal capitalization, such as only capitalizing key words as one would do with an academic book title? 1 = yes, 0 = no)',
       'Serifs (does the text used serifed font? 1 = yes, 0 = no)',
       'Sans Serif (does the text use font without serifs? 1 = yes, 0 = no) ',
       '# models smiling on title slide ',
       '# models not smiling on title slide ',
       '# of white models on title slide',
       '# of POC or racially ambiguous models on title slide',
       '# models fully clothed on title slide',
       '# models not fully clothed on title slide']

In [None]:
# TF-IDF features (at most 100 terms) from the description text, standardized
# indicator columns from the ad-spread coding; every other column is dropped
description_tfidf = TfidfVectorizer(max_features=100)
indicator_scaler = StandardScaler()

preprocessor = make_column_transformer(
    (description_tfidf, text_feature),
    (indicator_scaler, one_hot_features),
    remainder="drop",
)

Here, the within-cluster sum of squared distances (inertia) is evaluated for a range of cluster counts to determine the optimal number of clusters using the Elbow Method.

In [None]:
# KMeans cannot fit more clusters than there are rows, so cap the search at the
# number of apps in the merged dataframe (at most 16 cluster counts are tried)
max_clusters = min(16, len(app_des_ad_merge))
cluster_counts = range(1, max_clusters + 1)

# inertia (within-cluster sum of squared distances) for each candidate cluster count
inertias = []

for n_clusters in cluster_counts:
    des_ad_pipeline = make_pipeline(
        preprocessor,
        KMeans(n_clusters=n_clusters, random_state=112)
    )
    des_ad_pipeline.fit(app_des_ad_merge)

    kmeans = des_ad_pipeline.named_steps["kmeans"]
    inertias.append(kmeans.inertia_)

# Plot the elbow method graph
plt.figure(figsize=(17, 6))
plt.plot(cluster_counts, inertias, marker='o')
plt.title('Optimal Number of Clusters (Using Elbow Method)')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.xticks(cluster_counts)
plt.show()

The elbow graph shows that 4 is the optimal number of clusters for our data.

In [None]:
# fit the final 4-cluster model on the merged description + ad-spread data
# NOTE(review): the elbow search uses random_state=112 while this fit uses 42 -- confirm this is intended
final_kmeans = KMeans(n_clusters=4, random_state=42)
des_ad_pipeline = make_pipeline(preprocessor, final_kmeans).fit(app_des_ad_merge)

# fitted KMeans step (last step of the pipeline); its labels_ give each app's cluster
des_ad_model = des_ad_pipeline[-1]

The following printouts list the apps assigned to each cluster.

In [None]:
# list the apps assigned to cluster 0
cluster_0_apps = app_des_ad_merge.loc[des_ad_model.labels_ == 0, "App Name"]
for app_name in cluster_0_apps:
    print(app_name)
    print("--------")

In [None]:
# list the apps assigned to cluster 1
cluster_1_apps = app_des_ad_merge.loc[des_ad_model.labels_ == 1, "App Name"]
for app_name in cluster_1_apps:
    print(app_name)
    print("--------")

In [None]:
# list the apps assigned to cluster 2
cluster_2_apps = app_des_ad_merge.loc[des_ad_model.labels_ == 2, "App Name"]
for app_name in cluster_2_apps:
    print(app_name)
    print("--------")

In [None]:
# list the apps assigned to cluster 3
cluster_3_apps = app_des_ad_merge.loc[des_ad_model.labels_ == 3, "App Name"]
for app_name in cluster_3_apps:
    print(app_name)
    print("--------")

We chose a heatmap to visualize the presence of different features in different clusters. The following code cleans the dataframes and produces a heatmap.

In [None]:
# Copy every coded feature into a new column with a shorter, heatmap-friendly name.
# Keys are the new short names, values are the existing (verbose) column names.
short_to_original_column = {
    'Bright Colors (1 = yes, 0 = no)': 'Bright Colors (does the color pallete look more neon, bright, loud? 1 = yes, 0 = no)',
    'Muted Colors (1 = yes, 0 = no)': 'Muted Colors (does the color pallete look more softer and neutral? 1 = yes, 0 = no)',
    'Lowercase (1 = yes, 0 = no)': 'Lowercase (is the text all lowercase or mostly lowercase, such as starting with a capitalized letter and only lower case following it? 1 = yes, 0 = no)',
    'Upper Case (1 = yes, 0 = no)': 'Upper Case (is the text in all caps or mostly all caps? 1 = yes, 0 = no) ',
    'Title Case (1 = yes, 0 = no)': 'Title Case (is the text using more formal capitalization, such as only capitalizing key words as one would do with an academic book title? 1 = yes, 0 = no)',
    'Serifs (1 = yes, 0 = no)': 'Serifs (does the text used serifed font? 1 = yes, 0 = no)',
    'Sans Serif (1 = yes, 0 = no)': 'Sans Serif (does the text use font without serifs? 1 = yes, 0 = no) ',
    'Contains models smiling (1 = yes, 0 = no)': '# models smiling on title slide ',
    'Contains models not smiling (1 = yes, 0 = no)': '# models not smiling on title slide ',
    'Contains white models (1 = yes, 0 = no)': '# of white models on title slide',
    'Contains POC or racially ambiguous models (1 = yes, 0 = no)': '# of POC or racially ambiguous models on title slide',
    'Contains fully clothed models (1 = yes, 0 = no)': '# models fully clothed on title slide',
    'Contains not fully clothed models (1 = yes, 0 = no)': '# models not fully clothed on title slide',
}

for short_name, original_name in short_to_original_column.items():
    app_des_ad_merge[short_name] = app_des_ad_merge[original_name]

In [None]:
# Columns that should not appear in the heatmap: the download counts and the
# verbosely named feature columns.
heatmap_excluded_columns = [
    'Downloads',
    'Bright Colors (does the color pallete look more neon, bright, loud? 1 = yes, 0 = no)',
    'Muted Colors (does the color pallete look more softer and neutral? 1 = yes, 0 = no)',
    'Lowercase (is the text all lowercase or mostly lowercase, such as starting with a capitalized letter and only lower case following it? 1 = yes, 0 = no)',
    'Upper Case (is the text in all caps or mostly all caps? 1 = yes, 0 = no) ',
    'Title Case (is the text using more formal capitalization, such as only capitalizing key words as one would do with an academic book title? 1 = yes, 0 = no)',
    'Serifs (does the text used serifed font? 1 = yes, 0 = no)',
    'Sans Serif (does the text use font without serifs? 1 = yes, 0 = no) ',
    '# models smiling on title slide ',
    '# models not smiling on title slide ',
    '# of white models on title slide',
    '# of POC or racially ambiguous models on title slide',
    '# models fully clothed on title slide',
    '# models not fully clothed on title slide',
]

# Rebind instead of mutating in place, and ignore columns that are already gone,
# so re-running this cell does not raise a KeyError.
app_des_ad_merge = app_des_ad_merge.drop(columns=heatmap_excluded_columns, errors='ignore')

In [None]:
import warnings

import seaborn as sns
import matplotlib.pyplot as plt

# Manually assign clusters (app name -> cluster number)
cluster_mapping = {
    "Grindr": 1, "SCRUFF": 1,
    "Tinder": 2, "Bumble": 2, "HER": 2, "Zoe": 2, "Taimi": 2, "Christian Mingle": 2, "Muzz": 2, "Jswipe": 2, "Mutual": 2, "Ourtime": 2,
    "Hinge": 3,
    "Millionaire Match": 4, "Luxy": 4, "The League": 4, "Senior Match": 4
}

# Look up each app's cluster; apps missing from the mapping get NaN
cluster_labels = app_des_ad_merge["App Name"].map(cluster_mapping)

# Rows without an assigned cluster are removed below -- report them instead of
# dropping them silently, since a spelling mismatch in the mapping would otherwise go unnoticed
unmapped_apps = app_des_ad_merge.loc[cluster_labels.isna(), "App Name"].tolist()
if unmapped_apps:
    warnings.warn(f"Apps without a manual cluster assignment are excluded from the heatmap: {unmapped_apps}")

# Add the cluster column, remove rows without assigned clusters, and store the cluster as an integer.
# Building a new dataframe in one chain avoids assigning into the result of dropna,
# which triggers pandas' SettingWithCopyWarning.
app_des_ad_merge = (
    app_des_ad_merge
    .assign(Cluster=cluster_labels)
    .dropna(subset=["Cluster"])
    .astype({"Cluster": int})
)

# Group by cluster and calculate mean values of the numeric feature columns
cluster_means = app_des_ad_merge.groupby("Cluster").mean(numeric_only=True)

# Transpose the dataframe so that clusters are on the x-axis and features are on the y-axis
plt.figure(figsize=(10, 8))  # Adjust figure size for better readability
sns.heatmap(cluster_means.T, annot=True, cmap="coolwarm", linewidths=0.5)

plt.title("Feature Averages by Cluster", fontsize=14)
plt.xlabel("Cluster", fontsize=12)
plt.ylabel("Feature", fontsize=12)

plt.show()