In [None]:
# Change the kernel's working directory to the parent directory.
# NOTE(review): presumably done so that the `src` package imported below
# resolves from the project root -- confirm.
# The path is relative, so re-running this cell moves up one more level.
%cd ..

In [None]:
# third party
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# local
from src.config_ import PATHS
from src.pd_helpers import count_per_week
from src.doc_analysis import load_lexisnexis_data, most_common

In [None]:
# Load the article data through the project helper. The cells below treat the
# result as a DataFrame-like object (groupby, column access, column assignment).
df = load_lexisnexis_data()

## Counts per column and source

In [None]:
# `Series.count()` already returns a single number (the non-null `body`
# values), so no additional `.sum()` is needed on top of it.
print("Total no. of articles: ", df.body.count())
# Non-null value count of every column, transposed so each source is a column.
df.groupby('source').count().T

## Most common bylines by source

In [None]:
# Display what the project helper `most_common` returns for the `byline` column.
# NOTE(review): presumably the most frequent bylines per source -- confirm
# against src.doc_analysis.
most_common(df, 'byline')

## Statistical overview of article length per source

In [None]:
# Summary statistics of `length` for each source, rounded to whole numbers
# and transposed so that the sources become the columns.
length_summary_funcs = ['min', 'max', 'mean', 'median', 'std']
(
    df.groupby('source')['length']
    .agg(length_summary_funcs)
    .round(0)
    .astype('int')
    .T
)

In [None]:
# Bucket `length` into 250-wide, right-closed bins covering (0, 2000].
# Values outside that range (including exactly 0) become NaN and are therefore
# left out of the counts below.
df['length_bin'] = pd.cut(df['length'], range(0, 2250, 250))
# `observed=False` is passed explicitly so that empty bins are kept (shown as
# zero-height bars) regardless of the pandas version's default for
# categorical groupers.
df_bin = df.groupby(['source', 'length_bin'], observed=False).length.count()
# One group of bars per bin, one bar per source.
ax = df_bin.unstack('source').plot.bar(figsize=(12, 5))
ax.set(
    xlabel='length (binned)',
    ylabel='number of articles',
    title='Article length distribution per source',
);

## Number of articles per section and source

In [None]:
# Target file for the exported table, inside the project's results location.
sections_html_path = PATHS.results / 'most_common_sections_per_source.html'

# Build the `section` overview with the project helper, write it out as HTML,
# and show it as the cell output.
table = most_common(df, 'section')
table.to_html(sections_html_path)
table

## Statistical overview of where in the paper the articles mostly appear

In [None]:
# Summary statistics of `page` for each source, rounded to one decimal and
# transposed so that the sources become the columns.
page_summary_funcs = ['min', 'max', 'mean', 'median', 'std']
(
    df.groupby('source')['page']
    .agg(page_summary_funcs)
    .round(1)
    .T
)

In [None]:
# Bucket `page` into 5-wide, right-closed bins covering (0, 45].
# Values outside that range (including exactly 0) become NaN and are therefore
# left out of the counts below.
df['page_bin'] = pd.cut(df['page'], range(0, 50, 5))
# `observed=False` is passed explicitly so that empty bins are kept (shown as
# zero-height bars) regardless of the pandas version's default for
# categorical groupers.
df_bin = df.groupby(['source', 'page_bin'], observed=False).page.count()
# One group of bars per bin, one bar per source.
ax = df_bin.unstack('source').plot.bar(figsize=(12, 5))
ax.set(
    xlabel='page (binned)',
    ylabel='number of articles',
    title='Page placement distribution per source',
);

## Overview of when articles appeared in 2017

In [None]:
# Per-week counts from the project helper, reshaped so that every source gets
# its own column.
df_agg = count_per_week(df, 'publication_date', 'source')
df_agg = df_agg.set_index('source', append=True).unstack('source')
# Relabel the rows 1..N. N is taken from the data instead of being hard-coded
# to 51, so a different number of weekly rows no longer raises a
# length-mismatch error.
# NOTE(review): assumes the rows are consecutive weeks starting at week 1 --
# confirm against count_per_week.
df_agg.index = range(1, len(df_agg) + 1)
# One bar chart per column (subplots=True).
df_agg.plot(kind='bar', figsize=(12, 15), subplots=True);

In [None]:
# Per-week counts from the project helper, reshaped so that every source gets
# its own column.
df_agg = count_per_week(df, 'publication_date', 'source')
df_agg = df_agg.set_index('source', append=True).unstack('source')
# Relabel the rows 1..N. N is taken from the data instead of being hard-coded
# to 51, so a different number of weekly rows no longer raises a
# length-mismatch error.
# NOTE(review): assumes the rows are consecutive weeks starting at week 1 --
# confirm against count_per_week.
df_agg.index = range(1, len(df_agg) + 1)
# All columns drawn as lines on one shared axes (subplots=False).
df_agg.plot(kind='line', figsize=(12, 5), subplots=False);