In [None]:
import pandas as pd
import swifter

arxiv_word_counts = pd.read_json('../data/clean/arxiv_word_counts.json',lines=True)
arxiv_word_counts = arxiv_word_counts.drop(columns=['text'])

nyt_word_counts = pd.read_json('../data/clean/nyt_word_counts.json',lines=True)
nyt_word_counts = nyt_word_counts.drop(columns=['text'])

product_word_counts = pd.read_json('../data/clean/prod_hunt_word_counts.json',lines=True)
product_word_counts = product_word_counts.drop(columns=['text'])

dfs = {
    'arxiv': arxiv_word_counts,
    'nyt': nyt_word_counts,
    'producthunt': product_word_counts
}


# Make aesthetic

In [100]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def make_aesthetic(hex_color_list=None, 
	with_gridlines=False, 
	bold_title=False, 
	save_transparent=False, 
	font_scale=2, 
	latex2arial = True
	):
    """Make Seaborn look clean and add space between title and plot"""
    
    # Note: To make some parts of title bold and others not bold, we have to use
    # latex rendering. This should work: 
    # plt.title(r'$\mathbf{bolded\ title}$' + '\n' + 'And a non-bold subtitle')

    
    sns.set(style='white', context='paper', font_scale=font_scale)
    if not hex_color_list:
		    # 2024-11-28: Reordered color list
        hex_color_list = [
            "#2C3531",  # Dark charcoal gray with green undertone
            "#D41876",  # Telemagenta
            "#00A896",  # Persian green
            "#826AED",  # Medium slate blue
            "#F45B69",  # Vibrant pinkish-red
            "#E3B505",  # Saffron
            "#89DAFF",  # Pale azure
            "#342E37",  # Dark grayish-purple
            "#7DCD85",  # Emerald
            "#F7B2AD",  # Melon
            "#D4B2D8",  # Pink lavender
            "#020887",  # Phthalo blue
            "#E87461",  # Medium-bright orange
            "#7E6551",  # Coyote
            "#F18805"   # Tangerine
        ]
    
    sns.set_palette(sns.color_palette(hex_color_list))

    # Update on 
    # 2024-11-29: I realized I can automatically 
    # clean variable names so i dont have to manually replace underscore
    
    # Enhanced typography settings
    plt.rcParams.update({
        # font settings
        'font.family': 'Arial',
        'font.weight': 'regular',
        'axes.labelsize': 11 * font_scale,
        'axes.titlesize': 14 * font_scale,
        'xtick.labelsize': 10 * font_scale,
        'ytick.labelsize': 10 * font_scale,
        'legend.fontsize': 10 * font_scale,
        
        # spines/grids
        'axes.spines.right': False,
        'axes.spines.top': False,
        'axes.spines.left': True,
        'axes.spines.bottom': True,
        'axes.linewidth': 0.8,  # Thinner spines
        'axes.grid': with_gridlines,
        'grid.alpha': 0.2,       
        'grid.linestyle': ':', 
        'grid.linewidth': 0.5,
        
        # title
        'axes.titlelocation': 'left',
        'axes.titleweight': 'bold' if bold_title else 'regular',
        'axes.titlepad': 15 * (font_scale / 1),
        
        # fig
        'figure.facecolor': 'white',
        'axes.facecolor': 'white',
        'figure.constrained_layout.use': True,
        'figure.constrained_layout.h_pad': 0.2,
        'figure.constrained_layout.w_pad': 0.2,
        
        # legend
        'legend.frameon': True,
        'legend.framealpha': 0.95,
        'legend.facecolor': 'white',
        'legend.borderpad': 0.4,
        'legend.borderaxespad': 1.0,
        'legend.handlelength': 1.5,
        'legend.handleheight': 0.7,
        'legend.handletextpad': 0.5,
        
        # export
        'savefig.dpi': 300,
        'savefig.transparent': save_transparent,
        'savefig.bbox': 'tight',
        'savefig.pad_inches': 0.2,
        'figure.autolayout': False,
        
         # do this for the bold hack
        'mathtext.fontset': 'custom',
        'mathtext.rm': 'Arial',
        'mathtext.it': 'Arial:italic',
        'mathtext.bf': 'Arial:bold'

    })
    
    return hex_color_list

def clean_vars(s, how='title'):
    """
    Simple function to clean titles

    Params
    - s: The string to clean
    - how (default='title'): How to return string. Can be either ['title', 'lowercase', 'uppercase']

    Returns
    - cleaned string
    """
    assert how in ['title', 'lowercase', 'uppercase'], "Bad option!! see docs"
    s = re.sub('([a-z0-9])([A-Z])', r'\1 \2', s)
    s = s.replace('_', ' ')
    if how == 'title':
        return s.title()
    elif how=='lower':
        return s.lower()
    elif how=='upper':
    	return s.upper()



mypal = make_aesthetic()



# Process dfs

In [101]:
dfs = {
   'arxiv': arxiv_word_counts,
   'nyt': nyt_word_counts,
   'producthunt': product_word_counts
}

byday = {}

for name, df in dfs.items():
   print(f"Dataset: {name}")
   print(f"Number of rows: {len(df)}")
   print(f"Number of columns: {len(df.columns)}")
   print(f"Columns: {df.columns.tolist()}")
   
   # get only ai
   df = df.query("ai_binary ==1")
   
   # dt
   df['dt'] = pd.to_datetime(df['analysis_date'], format='%Y-%m-%d', errors='coerce')
   df['total_count'] = 1  # dummy col for groupby to get total number of rows per day
   
   df['ai_compound_sum'] = df['ai_compound_roles_sum'] + df['ai_compound_nouns_sum']
   df['ai_compound_binary'] = df['ai_compound_sum'].swifter.apply(lambda x: 1 if x > 0 else 0)
   
   # Convert count columns to int
   sum_cols = [col for col in df.columns if "sum" in col]
   for col in sum_cols:
       df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
       
   # Convert binary columns to int
   binary_cols = [col for col in df.columns if "binary" in col]
   for col in binary_cols:
       df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
       df[f"{col}_sum"] = df[col]
   
   # Group by date and aggregate
   df_grouped = df.groupby(by=['dt']).agg({
       **{col: 'sum' for col in sum_cols},  # Sum for count columns
       **{col: 'mean' for col in binary_cols},  # Mean for binary columns, 
         **{f"{col}_sum": 'sum' for col in binary_cols},  # Sum for binary columns
       'total_count': 'sum'  # Sum for the total_count column
   }).reset_index()
   
   
   # Store in byday dictionary
   byday[name] = df_grouped

Dataset: arxiv
Number of rows: 613178
Number of columns: 18
Columns: ['ai_word_counts', 'social_word_counts', 'ai_compound_roles_word_counts', 'ai_compound_nouns_word_counts', 'ai_sum', 'social_sum', 'ai_compound_roles_sum', 'ai_compound_nouns_sum', 'ai_binary', 'social_binary', 'ai_compound_roles_binary', 'ai_compound_nouns_binary', 'analysis_date', 'unique_idx', 'dt', 'total_count', 'ai_compound_sum', 'ai_compound_binary']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['dt'] = pd.to_datetime(df['analysis_date'], format='%Y-%m-%d', errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['total_count'] = 1  # dummy col for groupby to get total number of rows per day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ai_compound_sum'] = df['ai_compound

Pandas Apply:   0%|          | 0/99487 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ai_compound_binary'] = df['ai_compound_sum'].swifter.apply(lambda x: 1 if x > 0 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='

Dataset: nyt
Number of rows: 383122
Number of columns: 18
Columns: ['ai_word_counts', 'social_word_counts', 'ai_compound_roles_word_counts', 'ai_compound_nouns_word_counts', 'ai_sum', 'social_sum', 'ai_compound_roles_sum', 'ai_compound_nouns_sum', 'ai_binary', 'social_binary', 'ai_compound_roles_binary', 'ai_compound_nouns_binary', 'analysis_date', 'unique_idx', 'dt', 'total_count', 'ai_compound_sum', 'ai_compound_binary']


Pandas Apply:   0%|          | 0/2008 [00:00<?, ?it/s]

Dataset: producthunt
Number of rows: 263635
Number of columns: 18
Columns: ['ai_word_counts', 'social_word_counts', 'ai_compound_roles_word_counts', 'ai_compound_nouns_word_counts', 'ai_sum', 'social_sum', 'ai_compound_roles_sum', 'ai_compound_nouns_sum', 'ai_binary', 'social_binary', 'ai_compound_roles_binary', 'ai_compound_nouns_binary', 'analysis_date', 'unique_idx', 'dt', 'total_count', 'ai_compound_sum', 'ai_compound_binary']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ai_compound_binary'] = df['ai_compound_sum'].swifter.apply(lambda x: 1 if x > 0 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='

Pandas Apply:   0%|          | 0/43427 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ai_compound_binary'] = df['ai_compound_sum'].swifter.apply(lambda x: 1 if x > 0 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='

# Plot ts (individual)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import pandas as pd

def plot_ts(name, col, smoothing=False, period='daily', figsize=(12, 6)):
   """
   Plot time series for a given dataset and column.
   
   Parameters:
   -----------
   name : str
       Dataset name (key in byday dictionary)
   col : str
       Column name to plot
   smoothing : bool or int
       If True, applies 7-day rolling average. If int, uses that window size.
   period : str
       Aggregation period: 'daily', 'weekly', 'monthly', 'yearly'
   figsize : tuple
       Figure size (width, height)
   """
   
   clean_col = col.replace('_', ' ').title()
   if col == 'ai_compound_sum':
       clean_col = "Synthetic Social Learning Hits"
       
   
   source_dict = {
         'arxiv': 'arXiv',
         'nyt': 'NYT',
         'producthunt': 'Product Hunt'
   }
   clean_source = source_dict.get(name)
   if clean_col.startswith("Ai"):
         clean_col = clean_col.replace("Ai", "AI")
   
   if name not in byday:
       raise ValueError(f"Dataset '{name}' not found. Available: {list(byday.keys())}")
   
   df = byday[name].copy()
   
   if col not in df.columns:
       raise ValueError(f"Column '{col}' not found. Available: {df.columns.tolist()}")
   
   # Resample based on period
   if period == 'daily':
       df_plot = df.set_index('dt').resample('D')[col].sum().reset_index()
   elif period == 'weekly':
       df_plot = df.set_index('dt').resample('W')[col].mean().reset_index()
   elif period == 'monthly':
       df_plot = df.set_index('dt').resample('M')[col].mean().reset_index()
   elif period == 'yearly':
       df_plot = df.set_index('dt').resample('Y')[col].mean().reset_index()
   else:
       raise ValueError("Period must be 'daily', 'weekly', 'monthly', or 'yearly'")
   
   # Apply smoothing if requested
   if smoothing:
       window = 7 if smoothing is True else int(smoothing)
       df_plot[f'{col}_smooth'] = df_plot[col].rolling(window=window, center=True).mean()
       plot_col = f'{col}_smooth'
       line_label = f'{col} ({window}-period moving average)'
   else:
       plot_col = col
       line_label = col
   
   # Create plot
   plt.figure(figsize=figsize)
   plt.plot(df_plot['dt'], df_plot[plot_col])
   
   # Formatting
   plt.title(f'{clean_source} | {clean_col} ({period.title()})',  fontweight='bold')
   plt.xlabel('Date')
   plt.ylabel(col.replace('_', ' ').title())
   
   plt.xticks(rotation=45)
   plt.tight_layout()
   

   
   return df_plot

# Example usage:
for source in ['arxiv', 'producthunt', 'nyt']:
    for dv in ['social_sum', 'ai_compound_sum', 'social_binary']:
            for period in ['monthly', 'yearly']:
                print(f"Plotting {source} - {dv} ({period})")
                plot_ts(source, dv, smoothing=False, period=period)
                plt.show()




it has # grid plot

In [None]:
sorted(byday['arxiv'].columns)

# Grid plot of all time series

In [106]:
import matplotlib.pyplot as plt

def save_all_ts_grid(
    sources=('arxiv', 'producthunt', 'nyt'),
    dvs=('social_sum', 'social_binary', 'ai_compound_sum', 'ai_compound_binary'),
    periods=('monthly',),
    smoothing=False,                 # or e.g. 7 for a 7-day MA
    figsize=(25, 20),                # tweak as needed
    dpi=300,                         # high-resolution
    out_fn='all_time_series.png'
):
    """
    Saves a grid of time-series plots (source × period × DV) to `out_fn`.
    Assumes `byday` is in scope and `plot_ts()` is defined.
    """
    n_rows = len(sources) * len(periods)   # 3 × 2 = 6
    n_cols = len(dvs)                      # 3
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, sharex=False, sharey=False)
    
    color_map = {
        'arxiv': mypal[0],
        'producthunt': mypal[1],
        'nyt': mypal[2],
    }

    # Convenience: turn 2-D axes array into iterator that matches our loop order
    axes_iter = axes.reshape(-1, n_cols)

    for r, (source, period) in enumerate(
        [(s, p) for s in sources for p in periods]   # keeps the grid order stable
    ):
        for c, dv in enumerate(dvs):
            ax = axes_iter[r, c]

            # --- inline version of plot_ts but drawing on a supplied axis ---
            df = byday[source].copy()

            # Aggregate to the requested period
            if period == 'daily':
                df_plot = df.set_index('dt').resample('D')[dv].sum().reset_index()
            elif period == 'weekly':
                df_plot = df.set_index('dt').resample('W')[dv].mean().reset_index()
            elif period == 'monthly':
                df_plot = df.set_index('dt').resample('M')[dv].mean().reset_index()
            elif period == 'yearly':
                df_plot = df.set_index('dt').resample('Y')[dv].mean().reset_index()
            else:
                raise ValueError("period must be 'daily', 'weekly', 'monthly', or 'yearly'")

            # Optional smoothing
            plot_col = dv
            if smoothing:
                window = 7 if smoothing is True else int(smoothing)
                df_plot[f'{dv}_smooth'] = df_plot[dv].rolling(window, center=True).mean()
                plot_col = f'{dv}_smooth'

            # Draw
            ax.plot(df_plot['dt'], df_plot[plot_col], linewidth=1.8, color=color_map[source])
            clean_dv = dv.replace('_', ' ').title()
            ax.set_title(f'{source} | {clean_dv}\n({period})', pad=6)
            ax.tick_params(axis='x', rotation=45)
            ax.tick_params(axis='y')
            ax.grid(alpha=0.3)

            # Light y-label only on leftmost column
            if c == 0:
                ax.set_ylabel('Value', fontsize=8)

    fig.tight_layout(h_pad=2.0)
    fig.savefig(out_fn, dpi=dpi, bbox_inches='tight')
    plt.close(fig)
    print(f"✅  Saved grid to {out_fn}")

# -------------------------------------------------
# Example call (produces all 18 plots in one file):
save_all_ts_grid()


  fig.tight_layout(h_pad=2.0)


✅  Saved grid to all_time_series.png


In [ ]:
mypal = 