In [1]:
# library
import numpy as np
import pandas as pd
import plotly.express as px

import json

# Political tendency -> news site name. The site name doubles as the key into
# `dataframes` and as the suffix of the site's JSON file name.
site_list = dict(
    right="foxnews",
    lean_right="washingtontimes",
    center="newsweek",
    lean_left="cnn",
    left="dailybeast",
)

## Functions

In [8]:
def open_json(file_path):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Args:
        file_path: Location of the file, passed straight to `pd.read_json`.

    Returns:
        The DataFrame produced by `pd.read_json(file_path, lines=True)`.
    """
    return pd.read_json(file_path, lines=True)

In [94]:
def graph_timeline(dataframes, site_list):
    """Plot the number of articles per month for each news source as a line chart.

    Args:
        dataframes: Mapping of site name -> DataFrame. Each frame needs a
            datetime-typed 'wayback_time' column (the `.dt` accessor is used)
            and a 'title' column; non-null titles are counted per month.
        site_list: Mapping of political tendency -> site name. Sites that are
            missing from `dataframes`, or whose frame is empty, are skipped
            with a printed notice instead of raising.

    Returns:
        None. The figure is displayed via `fig.show()`. If no site has data,
        a notice is printed and nothing is drawn.
    """
    monthly_frames = []
    for tendency, site in site_list.items():
        site_df = dataframes.get(site)
        if site_df is None or site_df.empty:
            # A site can be absent when its file failed to load upstream.
            print(f"graph_timeline: no data for '{site}', skipping")
            continue

        # Group on a standalone 'YYYY-MM' Series rather than writing a new
        # column, so the caller's DataFrame is left unmodified.
        year_month = site_df['wayback_time'].dt.strftime('%Y-%m').rename('year_month')
        monthly = site_df.groupby(year_month)['title'].count().reset_index()
        monthly['Source'] = site
        monthly['Tendency'] = tendency
        monthly_frames.append(monthly)

    if not monthly_frames:
        # pd.concat raises ValueError on an empty list; bail out cleanly.
        print("graph_timeline: nothing to plot")
        return

    combined_df = pd.concat(monthly_frames, ignore_index=True)

    fig = px.line(
        combined_df,
        x='year_month',
        y='title',
        color='Source',
        line_group='Source',
        # Keys must match the plotted columns for the labels to take effect.
        labels={'year_month': 'Month', 'title': 'Number of Articles', 'Tendency': 'Political Tendency'},
        title='Number of Articles per Month by Political Tendency',
        template='seaborn'
    )

    fig.update_layout(
        xaxis_title='Year Time Line (unit: month)',
        yaxis_title='Number of Articles',
        title_x=0.5,
        legend_title='News Source',
        # x > 1 places the legend to the right of the plotting area.
        legend=dict(yanchor="top", y=0.99, xanchor="right", x=1.2),
        width=1000,
    )

    fig.show()

In [92]:
import pandas as pd
import plotly.express as px

def graph_wing_ratio(dataframes, site_list):
    """Plot each news source's share of the total article count as a donut chart.

    Args:
        dataframes: Mapping of site name -> DataFrame; every row of a frame is
            counted as one article.
        site_list: Mapping of political tendency -> site name. Sites that are
            missing from `dataframes`, or whose frame is empty, are skipped
            with a printed notice instead of raising.

    Returns:
        None. The figure is displayed via `fig.show()`. If no site has data,
        a notice is printed and nothing is drawn.
    """
    rows = []
    for tendency, site in site_list.items():
        site_df = dataframes.get(site)
        if site_df is None or site_df.empty:
            # A site can be absent when its file failed to load upstream.
            print(f"graph_wing_ratio: no data for '{site}', skipping")
            continue
        rows.append({
            'Label': site + " (" + tendency + ")",  # slice label: source plus tendency
            'Tendency': tendency,
            'Counts': len(site_df),
        })

    if not rows:
        print("graph_wing_ratio: nothing to plot")
        return

    # Sort by label so row order (and therefore colour assignment) does not
    # depend on the iteration order of `site_list`.
    article_counts = (
        pd.DataFrame(rows)
        .sort_values(['Label', 'Tendency'])
        .reset_index(drop=True)
    )

    fig = px.pie(
        article_counts,
        values='Counts',
        names='Label',  # combined source/tendency label on each slice
        color='Tendency',  # colour slices by political tendency
        title='Number of Articles by Source and Political Tendency',
        template='seaborn',
        hole=0.3  # non-zero hole renders the pie as a donut
    )

    # Show percentage, label and raw count inside each slice.
    fig.update_traces(textposition='inside', textinfo='percent+label+value')
    fig.update_layout(
        legend_title='Political Tendency',
        legend=dict(yanchor="bottom", y=0.01, xanchor="right", x=1.15),
        width=800
    )

    fig.show()


## Main Code


In [60]:
# Load each site's articles into `dataframes`, keyed by site name.
# Loading is best-effort: a site whose file cannot be read is reported and
# left out of `dataframes` so the remaining sites still load.
dataframes = {}
for site_name in site_list.values():
    file_path = f"./data/articles_all/json/uncategorized/articles{site_name}.json"
    try:
        df = open_json(file_path)
        dataframes[site_name] = df
    except Exception as e:
        # Include the site and path so it is clear which load failed.
        print(f"Could not load '{site_name}' from {file_path}: {e}")

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x107fe5340>>
Traceback (most recent call last):
  File "/Users/weikuo/Documents/github-repositories/web-scrape-newswebsites/venv/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


In [95]:
# Draw the per-month article-count line chart, one line per news source.
graph_timeline(dataframes, site_list)

In [93]:
# Draw the donut chart of each source's share of all loaded articles.
graph_wing_ratio(dataframes, site_list)