# Import Packages

In [None]:
# reference: https://www.kaggle.com/code/akshayraman/dating-app-trends-2013-2022
# Mount Google Drive into the Colab runtime at /content/drive; the dataset read
# later in this notebook lives under this mount point (/content/drive/MyDrive/...).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import re
import tensorflow
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation


# Load Dataset

In [None]:
# Read the reviews CSV (its first column becomes the index), then discard
# exact duplicate rows and any row that contains a missing value.
df = (
    pd.read_csv(
        '/content/drive/MyDrive/DSO 560 NLP Team Project/DatingAppReviewsDataset.csv',
        index_col=0,
    )
    .drop_duplicates()
    .dropna()
)

## Market Share of Apps

Let us determine the market share of each app.

> **Note:** Actual market share should be calculated from figures such as the number of installs or daily active users. However, the number of reviews can be used as a rough estimate of app usage.

In [None]:
# Count the number of reviews for each app.
# rename_axis + reset_index(name=...) produces the columns "App" and
# "Number of Reviews" regardless of how value_counts() names its result
# (pandas >= 2.0 calls it "count", which broke the previous rename mapping).
user_reviews_per_app = (
    df['App']
    .value_counts()
    .rename_axis("App")
    .reset_index(name="Number of Reviews")
)


# plot
# Colour for each app, keyed by the app name as it appears in the "App" column.
app_color_map = {
    "Tinder": "#fe3c72",
    "Bumble": "#ffcb37",
    "Hinge": "#854DF8"
}

fig = px.pie(user_reviews_per_app,
             values='Number of Reviews',
             names='App',
             title='% Share - User Reviews',
             # Colour slices by app name, not by slice order, so every app keeps
             # its own colour whatever its rank in the counts.
             color='App',
             color_discrete_map=app_color_map,
             labels={"App": "Application"}
            )
fig.update_layout(
    font=dict(
        family='"Open Sans", verdana, arial, sans-serif',
        size=14
    ),
    title_x=0.5
)

fig.update_traces(textinfo="label+percent", textposition="inside")
fig.show()

## Market Share over the years

Let us examine how the market share has changed over the years.

In [None]:
# Count the number of reviews per app for every two-year period.
# Timestamps are stored as "day-month-year hour:minute" strings.
df['Date&Time'] = pd.to_datetime(df['Date&Time'], format="%d-%m-%Y %H:%M")

# Left-closed bins so each label matches the years it holds:
# [2013, 2015) -> 2013-2014, ..., [2021, 2023) -> 2021-2022.
# (With the default right-closed bins, 2013 was dropped and every label was
# shifted by one year.)
bins = [2013, 2015, 2017, 2019, 2021, 2023]
labels = ['2013-2014', '2015-2016', '2017-2018', '2019-2020', '2021-2022']
review_period = pd.cut(df['Date&Time'].dt.year, bins=bins, labels=labels, right=False)

# observed=True: only periods that actually contain reviews become groups.
data_gb_year = df.groupby(by=review_period, observed=True)

# plot
rows = 2
cols = 3

# (period label, reviews of that period) pairs, in period order
lst = list(data_gb_year)

subplot_titles = [period for period, _ in lst]

# One independent spec dict per cell; pie traces need 'domain' type subplots.
specs = [[{'type': 'domain'} for _ in range(cols)] for _ in range(rows)]

fig = make_subplots(
        rows=rows,
        cols=cols,
        subplot_titles=subplot_titles,
        specs=specs)

for i, (period, period_df) in enumerate(lst):
    # map the running index to a 1-based (row, col) grid position
    row = i // cols + 1
    col = i % cols + 1
    # review counts per app within this period
    d = (
        period_df['App']
        .value_counts()
        .rename_axis("App")
        .reset_index(name="Number of Reviews")
    )
    fig.add_trace(
        go.Pie(labels=d["App"],
               values=d["Number of Reviews"],
               showlegend=True,
               textposition='inside',
               textinfo='label+percent',
               # Look colours up by app name so an app has the same colour in
               # every subplot; apps missing from the map fall back to grey.
               marker_colors=[app_color_map.get(app, "#7f7f7f") for app in d["App"]]),
        row=row,
        col=col
    )

fig.update_layout(title="Market Share over the years", title_x=0.5
                 )
fig.show()