## Reproduction

In [6]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from plotnine import (
    ggplot, aes, geom_point, geom_segment, geom_text, geom_label,
    theme_minimal, theme, element_text, element_blank, element_rect, labs, scale_color_manual,
    coord_cartesian, scale_x_continuous, options
)
import plotly.express as px
import plotly.graph_objects as go
import warnings
# NOTE(review): with no category/module filter this silences every warning
# for the rest of the session, so deprecation or data-quality warnings raised
# by later cells will not be shown. Consider narrowing it once the noisy
# source is identified.
warnings.filterwarnings("ignore")


In [7]:
# Load the whole dataset, then keep only the rows whose dimension is "Age".
usage_all = pd.read_excel('../data/Social Media Usage_pivoted.xlsx')
# .copy() makes `usage_by_age` an independent frame rather than a slice of
# `usage_all`, so later cleaning steps can modify it without pandas'
# view-versus-copy ambiguity (SettingWithCopyWarning).
usage_by_age = usage_all[usage_all['Dimension'] == 'Age'].copy()
# Exploration helpers (uncomment while inspecting the raw data):
# usage_by_age.head()
# print('data types:', usage_by_age.dtypes)
# print(usage_by_age['Percentage'].unique())
## convert data
def convert_percentage(value):
    """Convert one raw ``Percentage`` cell into a number.

    Parameters
    ----------
    value
        The raw cell content: a number, a numeric string, the marker
        ``'<1'``, or anything else (including ``None``).

    Returns
    -------
    float or None
        ``0.0`` for the ``'<1'`` marker, ``float(value)`` when the value can
        be parsed as a number, and ``None`` when it cannot.
    """
    # Tolerate stray whitespace around string cells such as ' <1 '.
    if isinstance(value, str):
        value = value.strip()
    if value == '<1':
        return 0.0
    try:
        return float(value)
    except (TypeError, ValueError):
        # TypeError covers non-numeric objects such as None, which float()
        # rejects with a different exception than unparseable strings.
        return None
# --- Clean the age-level rows ---------------------------------------------
# `assign` returns a new frame, so the cleaned columns are never written into
# a filtered slice of `usage_all` (no SettingWithCopyWarning), and re-running
# the cell gives the same result.
usage_by_age = usage_by_age.assign(
    # Parse the raw percentage cells into numbers (unparseable -> missing).
    Percentage=usage_by_age['Percentage'].apply(convert_percentage),
    # Standardize platform names.
    Platform=usage_by_age['Platform'].replace({
        'Tik Tok': 'TikTok',
        'Twitter (X)': 'Twitter',
        'You Tube': 'YouTube'
    }),
    # Standardize the age group name.
    Category=usage_by_age['Category'].replace({'Ages 18-29': '18-29'}),
)

# --- One row per platform, one column per age group -------------------------
age_gap_diff = usage_by_age.pivot(index='Platform', columns='Category', values='Percentage')
# Gap between the youngest (18-29) and the oldest (65+) age group.
age_gap_diff['Youngest - oldest DIFF'] = age_gap_diff['18-29'] - age_gap_diff['65+']
age_gap_diff = age_gap_diff.reset_index()

# Drop "Be Real" from the comparison.
age_gap_diff = age_gap_diff[age_gap_diff['Platform'] != 'Be Real']

platforms = age_gap_diff['Platform'].unique()
age_groups = ['18-29', '30-49', '50-64', '65+']

# Age group colors chosen to match the target visualization.
age_colors = {
    '18-29': '#003366',  # Dark blue
    '30-49': '#336699',  # Mid blue
    '50-64': '#99CC99',  # Light green
    '65+': '#99CC33'     # Dark green
}

# --- Long format: one row per (platform, age group) -------------------------
df_long = age_gap_diff.melt(
    id_vars=['Platform', 'Youngest - oldest DIFF'],
    value_vars=age_groups,
    var_name='Age Group',
    value_name='Percentage'
)

# Keep the age groups in their natural order rather than alphabetical order.
df_long['Age Group'] = pd.Categorical(df_long['Age Group'], categories=age_groups, ordered=True)



In [8]:
# Base figure: one marker per (platform, age group) pair, colored by age group.
fig = px.scatter(
    data_frame=df_long,
    y='Platform',
    x='Percentage',
    color='Age Group',
    color_discrete_map=age_colors,
    hover_data=dict.fromkeys(['Percentage', 'Platform', 'Age Group'], True),
    labels={'Percentage': 'Percentage(%)'},
    size_max=10,  # upper bound for marker size
)

# For every platform, draw a faint horizontal bar spanning its lowest and
# highest available percentage.
for platform in df_long['Platform'].unique():
    # Non-missing percentages recorded for this platform.
    observed = df_long.loc[df_long['Platform'] == platform, 'Percentage'].dropna()
    # A span needs at least two values.
    if len(observed) < 2:
        continue
    span_trace = go.Scatter(
        x=[observed.min(), observed.max()],
        y=[platform] * 2,
        mode='lines',
        line={'color': 'gray', 'width': 7},
        opacity=0.15,
        showlegend=False,
    )
    fig.add_trace(span_trace)

# Add a percentage label above each point.
for _, row in df_long.iterrows():
    pct = row['Percentage']
    # Missing percentages have no point to label, and int(NaN) would raise
    # ValueError, so skip them.
    if pd.isna(pct):
        continue
    fig.add_annotation(
        x=pct,
        y=row['Platform'],
        text=f"{int(pct)}%",
        showarrow=False,
        yshift=10,  # Lift the label above the marker
        font=dict(size=12, color=age_colors[row['Age Group']])
    )

# Add the youngest-minus-oldest difference on the right side, on a gray background.
for _, row in age_gap_diff.iterrows():
    diff_value = row['Youngest - oldest DIFF']
    # The '+d' format spec writes the sign itself: "+35" for a positive gap,
    # "-5" for a negative one (a hard-coded "+" prefix would render "+-5").
    diff_text = 'N/A' if pd.isna(diff_value) else f"{int(diff_value):+d}"
    fig.add_annotation(
        x=105,  # to the right of the 100% mark
        y=row['Platform'],
        text=diff_text,
        showarrow=False,
        xanchor='left',
        bgcolor='lightgray',
        font=dict(size=10, color='black')
    )

# Titles, axis range/ticks, background, legend, figure size and margins.
fig.update_layout(
    width=1000,
    height=800,
    margin={'l': 100, 'r': 200, 't': 100, 'b': 100},
    plot_bgcolor='white',
    title='Age Gaps in Social Media Usage Across Platforms',
    xaxis_title='Percentage of U.S. adults in each age group who say they ever use',
    yaxis_title='',
    # The axis runs to 110 while ticks stop at 100.
    xaxis={'range': [0, 110], 'tickvals': np.arange(0, 101, 20)},
    legend_title_text='Age Groups',
    legend_traceorder='reversed',
)

# No gridlines on either axis.
fig.update_yaxes(showgrid=False)
fig.update_xaxes(showgrid=False)

# Render the figure.
fig.show()