In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pandas import json_normalize
import pickle
import math
from math import sqrt
import numpy as np
import os
from datetime import datetime, timedelta
import re
from wordcloud import WordCloud, STOPWORDS
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter
from collections import Counter
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/matplotlib - confirm that is intended.
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline  


# Allow up to 200 characters per column when pandas displays a frame.
pd.set_option('max_colwidth', 200)
# Turn off Jedi-based tab completion in IPython.
%config Completer.use_jedi = False




In [2]:
def latexify(fig_width=None, fig_height=None, columns=2):
    """Set up matplotlib's RC params for LaTeX plotting.
    Call this before plotting a figure.

    Parameters
    ----------
    fig_width : float, optional, inches
        Defaults to 3.32 when ``columns == 1`` and 6.75 otherwise.
    fig_height : float,  optional, inches
        Defaults to ``fig_width`` scaled by the golden mean. Values above
        8.0 inches are reduced to 8.0 and a warning is printed.
    columns : {1, 2}
        Only used to choose the default width.

    Returns
    -------
    None
        The function updates ``matplotlib.rcParams`` as a side effect.

    Raises
    ------
    AssertionError
        If ``columns`` is neither 1 nor 2.
    """
    # Imported locally so the function does not depend on some other cell
    # having imported the top-level ``matplotlib`` package beforehand.
    import matplotlib

    # code adapted from http://www.scipy.org/Cookbook/Matplotlib/LaTeX_Examples

    # Width and max height in inches for IEEE journals taken from
    # computer.org/cms/Computer.org/Journal%20templates/transactions_art_guide.pdf

    assert(columns in [1,2])

    if fig_width is None:
        fig_width = 3.32 if columns==1 else 6.75 # width in inches

    if fig_height is None:
        golden_mean = (sqrt(5)-1.0)/2.0    # Aesthetic ratio
        fig_height = fig_width*golden_mean # height in inches

    MAX_HEIGHT_INCHES = 8.0
    if fig_height > MAX_HEIGHT_INCHES:
        # Interpolate the numbers: concatenating str and float raised a
        # TypeError here, so the clamp below was never reached.
        print(f"WARNING: fig_height too large: {fig_height} "
              f"so will reduce to {MAX_HEIGHT_INCHES} inches.")
        fig_height = MAX_HEIGHT_INCHES

    params = {'backend': 'ps',
              # A plain string is accepted by both old and current matplotlib;
              # current releases reject a list for this key.
              'text.latex.preamble': r'\usepackage{gensymb}',
              'axes.labelsize': 8, # fontsize for x and y labels (was 10)
              'axes.titlesize': 8,
              'font.size': 8, # was 10
              'legend.fontsize': 8, # was 10
              'xtick.labelsize': 8,
              'ytick.labelsize': 8,
              'text.usetex': True,
              'figure.figsize': [fig_width,fig_height],
              'font.family': 'serif'
    }

    matplotlib.rcParams.update(params)


def format_axes(ax,twinx=False,SPINE_COLOR="gray"):
    """Apply the shared spine and tick styling to ``ax`` and return it.

    Parameters
    ----------
    ax : axes object exposing ``spines``, ``xaxis`` and ``yaxis``
    twinx : bool
        When true only the top spine is hidden and the right spine is
        styled like the left and bottom ones; otherwise the right spine
        is hidden as well.
    SPINE_COLOR : color used for the visible spines and the tick marks
    """
    # Decide once which spines disappear and which get the thin coloured line.
    if twinx:
        hidden_sides = ('top',)
        styled_sides = ('left', 'bottom', 'right')
    else:
        hidden_sides = ('top', 'right')
        styled_sides = ('left', 'bottom')

    for side in hidden_sides:
        ax.spines[side].set_visible(False)

    for side in styled_sides:
        edge = ax.spines[side]
        edge.set_color(SPINE_COLOR)
        edge.set_linewidth(0.5)

    # Ticks only on the bottom and left edges, pointing outwards.
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    for one_axis in (ax.xaxis, ax.yaxis):
        one_axis.set_tick_params(direction='out', color=SPINE_COLOR)

    return ax

## Hindu & TOI, 2010-2020

In [3]:
# Read the article dataset and convert its "date" column to datetimes in one expression.
article_df = (
    pd.read_csv("../dataset/News_articles_dataset/News_articles_dataset.csv.gz")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
)

## TOI vs Hindu

In [4]:
def add_month_year_column(which_df):
    """Add string ``month`` and ``year`` columns built from ``which_df["date"]``.

    ``month`` holds ``"<year>-<month>"`` without zero padding (for example
    ``"2010-1"``) and ``year`` holds ``"<year>"``.

    Parameters
    ----------
    which_df : DataFrame with a ``"date"`` column whose values expose
        ``.year`` and ``.month``.

    Returns
    -------
    The same frame object that was passed in; the two columns are added to
    it in place.
    """
    dates = which_df["date"]

    # Iterate over the date column itself instead of a row-wise
    # ``DataFrame.apply``: the labels are identical, but this also works on
    # an empty frame (where ``apply(axis=1)`` hands back a DataFrame rather
    # than a Series) and avoids building a Series object for every row.

    #make month column from date
    which_df["month"] = [f'{day.year}-{day.month}' for day in dates]

    #make year column from date
    which_df["year"] = [f'{day.year}' for day in dates]

    return which_df


def get_article_count_df(article_df, start_date = datetime(2010,1,1).date(), end_date = datetime(2021,5,30).date()):
    '''
    count articles per day by applying group by date
    returns dataframe
    '''
    
    article_count = article_df.groupby(by="date")["heading"].count()
    
    #range of all date
    date_idx = pd.date_range(start_date, end_date)
    
    #get articles from specific range    
    article_count = article_count[start_date:end_date]

    #fill missing dates
    article_count = article_count.reindex(date_idx,fill_value=0)

    article_count =  article_count.reset_index()
    article_count.columns = ["date","count"]
    
    #add month & year columns
    article_count = add_month_year_column(article_count)
    return article_count

In [5]:
# Work on a copy of the loaded articles.
df = article_df.copy()

# Split the articles by outlet and print how many each one contributes.
hindu = df.loc[df["media"] == "Hindu"]
toi = df.loc[df["media"] == "TOI"]
print(len(hindu))
print(len(toi))

# Daily counts per outlet, then summed into 7-day bins keyed on "date".
# NOTE(review): get_article_count_df also returns string "month"/"year"
# columns, so .sum() is applied to them too; only "count" is meaningful in
# the weekly frames - confirm nothing downstream reads the other columns.
hindu_count = get_article_count_df(hindu).resample("7D",on="date").sum().reset_index()
toi_count = get_article_count_df(toi).resample("7D",on="date").sum().reset_index()

5628
11746


In [None]:
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
import matplotlib

# 4.5in x 1.8in figure using the LaTeX rc settings from latexify.
latexify(fig_width=4.5, fig_height=1.8)
fig, ax = plt.subplots()

# One line per outlet: weekly article counts against the start of each week.
plt1 = ax.plot(
    toi_count["date"], toi_count["count"],
    label="TOI", color="tab:red", linestyle="dashed", linewidth=1,
)
plt2 = ax.plot(
    hindu_count["date"], hindu_count["count"],
    label="The Hindu", color="tab:green", linewidth=1,
)

ax.set_ylabel("Weekly AQ Articles")
ax.set_xlabel("Week")
format_axes(ax)
# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.02, 0.9))

# Visible date window on the x-axis.
ax.set_xlim(datetime(2009,11,1), datetime(2021,11,1))

# Tick labels such as Nov'09, with one major tick every 24 months.
date_form = DateFormatter("%b'%y")
ax.xaxis.set_major_formatter(date_form)
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=24))
ax.tick_params(axis="x", rotation=0)


In [7]:
# Write the weekly-articles figure to a PDF with a tight bounding box.
# NOTE(review): assumes the ../figures directory already exists - confirm.
fig.savefig("../figures/fig2_weekly_AQ_articles_toi_hindu.pdf",bbox_inches="tight")