In [2]:
import sys
sys.path.append('..')
import configparser
import requests
from datetime import datetime
import json
import os
from src.utility import setup_logger, fetch_config
import pandas as pd
import time
from src.api_extarction import GuardianAPI

# Load the project configuration (unpacked here as the config object and its path)
# and read the Guardian API key from the 'guardian_api' section.
# NOTE(review): GuardianAPI.__init__ further down reads the 'guardian_api2' section
# instead -- confirm which section is the intended one.
config, config_path = fetch_config() 
GUARDIAN_API_KEY = config.get('guardian_api', 'api_key')
# Notebook-level logger.
logger = setup_logger(__name__)

In [3]:
import sys
sys.path.append('..')
import configparser
import requests
from datetime import datetime
import json
import os
from src.utility import setup_logger, fetch_config
import pandas as pd
import time

class GuardianAPI:
    """Small client for the Guardian Content API ``/search`` endpoint.

    ``__init__`` loads the project configuration, reads the API key from it and
    opens a ``requests.Session`` that is reused for every page request.
    """

    # HTTPS so the API key (sent as a query parameter) is not transmitted in clear text.
    SEARCH_URL = 'https://content.guardianapis.com/search'

    def __init__(self):
        self.config, self.config_path = fetch_config()
        # The key must only ever come from the config file; never paste it into the code.
        # NOTE(review): this reads the 'guardian_api2' section while the first notebook
        # cell reads 'guardian_api' -- confirm which one is intended.
        self.GUARDIAN_API_KEY = self.config.get('guardian_api2', 'api_key')
        self.logger = setup_logger(__name__)
        self.session = requests.Session()

    def guardian_search(self, search_query, from_date, to_date, max_retries=3, timeout=30):
        """Fetch every result page for ``search_query`` between two dates.

        Args:
            search_query: Free-text query, passed as the ``q`` parameter.
            from_date: Value for the ``from-date`` parameter (e.g. ``"2018-01-01"``).
            to_date: Value for the ``to-date`` parameter.
            max_retries: Number of retries allowed for a page that does not return
                HTTP 200. The wait starts at 5 seconds and doubles after each retry.
            timeout: Seconds to wait for each HTTP response before requests gives up.

        Returns:
            list: The concatenated ``response.results`` entries of all pages.

        Raises:
            Exception: If a page still fails after ``max_retries`` retries.
        """
        params = {
            # Pass the raw text: the session URL-encodes parameter values itself, so
            # pre-encoding spaces as '%20' would be encoded a second time ('%2520').
            'q': search_query,
            'from-date': from_date,
            'to-date': to_date,
            'page-size': 100,
            'api-key': self.GUARDIAN_API_KEY,
            'show-fields': 'headline,byline,sectionName,webPublicationDate'
        }
        all_results = []
        current_page = 1
        total_pages = 1  # Updated from the first successful response
        retry_count = 0
        retry_delay = 5  # Initial delay

        while current_page <= total_pages:
            params['page'] = current_page
            response = self.session.get(self.SEARCH_URL, params=params, timeout=timeout)

            if response.status_code == 200:
                payload = response.json()['response']
                all_results.extend(payload['results'])
                total_pages = payload['pages']
                self.logger.info(f"Processed page {current_page}/{total_pages}. Total results so far: {len(all_results)}.")
                current_page += 1
                retry_count = 0  # Reset retry count after a successful request
                retry_delay = 5  # Reset delay
            else:
                retry_count += 1
                if retry_count <= max_retries:
                    self.logger.warning(f"API request failed with status code {response.status_code}. Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                    retry_delay *= 2  # Exponential back-off
                else:
                    self.logger.error(f"API request failed with status code {response.status_code} after {max_retries} retries.")
                    raise Exception(f"API request failed with status code {response.status_code} after {max_retries} retries.")

        self.logger.info(f"Retrieved {len(all_results)} articles for search query: {search_query}")
        return all_results

    def results_to_dataframe(self, results, filter_query=None):
        """Convert API results to a DataFrame, optionally filtered by title text.

        Args:
            results: Iterable of result dicts as returned by ``guardian_search``.
            filter_query: If given, keep only results whose ``webTitle`` or
                ``fields.headline`` contains this text (case-insensitive).

        Returns:
            pandas.DataFrame with one row per kept result, or ``None`` when nothing
            is left or when the conversion fails (the error is logged, not raised).
        """
        try:
            if filter_query:
                needle = filter_query.lower()

                def _matches(result):
                    # `or ''` / `or {}` guard against keys that are present but None.
                    title = (result.get('webTitle') or '').lower()
                    headline = ((result.get('fields') or {}).get('headline') or '').lower()
                    return needle in title or needle in headline

                df = pd.DataFrame([result for result in results if _matches(result)])
                if df.empty:
                    self.logger.warning(f"No data found for '{filter_query}'.")
                    return None

                self.logger.info(f"Data filtered for '{filter_query}': {len(df)} rows")
                return df

            df = pd.DataFrame(results)
            if df.empty:
                self.logger.warning("No data was extracted from the API.")
                return None

            self.logger.info(f"No filter query provided. Returning the entire DataFrame with {len(df)} rows")
            return df

        except Exception as e:
            # Deliberate best-effort: log the failure and signal it with None.
            self.logger.error("An error occurred while converting results to a DataFrame.")
            self.logger.exception(e)
            return None

In [4]:
# Download every Guardian search result for the query from 2018-01-01 up to today,
# then keep only the results whose title/headline contains the query text.
guardianapi = GuardianAPI()
search_query = "Justin Trudeau"
from_date = "2018-01-01"
to_date = datetime.now().strftime("%Y-%m-%d")
articles = guardianapi.guardian_search(search_query, from_date, to_date)
# NOTE(review): results_to_dataframe returns None when nothing matches, in which case
# the cells below that use `df` will fail.
df = guardianapi.results_to_dataframe(articles, search_query)


In [5]:
# Only the last expression of a cell is rendered, so this describe() result is not
# displayed; the output shown for this cell is the value_counts table.
df.describe()
# Number of rows per (type, sectionName) combination.
df.value_counts(['type', 'sectionName'])

type      sectionName       
article   World news            47
          Opinion                9
          US news                3
          Environment            2
          Business               1
          Life and style         1
          Technology             1
          Television & radio     1
liveblog  Business               1
dtype: int64

In [6]:

def count_trudeau_articles(df, start_date='2018-01-01', end_date=None):
    """Count articles per calendar day and the mean number of articles per day.

    NOTE: a later cell redefines ``count_trudeau_articles`` with a DuckDB
    implementation whose count column is named ``No_of_articles``; whichever cell
    ran last is the one in effect.

    Args:
        df: DataFrame with a ``webPublicationDate`` column parseable by
            ``pd.to_datetime``.
        start_date: First day of the calendar (defaults to the original hard-coded
            ``'2018-01-01'``).
        end_date: Last day of the calendar; ``None`` means today.

    Returns:
        tuple: ``(article_counts, average_articles_per_day)`` where
        ``article_counts`` has one row per day with columns ``Date`` and
        ``No. of articles`` (0 for days without articles), and the average is the
        mean of that count column.
    """
    published = pd.to_datetime(df['webPublicationDate'])
    # Drop any timezone (keeping the wall-clock date) and truncate to midnight so the
    # lookup keys are Timestamps, the same type as the calendar built below. Mapping
    # Timestamps onto datetime.date keys only works through implicit index promotion.
    published_days = published.dt.tz_localize(None).dt.normalize()

    first_day = pd.Timestamp(start_date).normalize()
    if end_date is None:
        last_day = pd.Timestamp.now().normalize()
    else:
        last_day = pd.Timestamp(end_date).normalize()
    all_dates = pd.date_range(start=first_day, end=last_day, freq='D')

    article_counts = pd.DataFrame({'Date': all_dates})
    # Days missing from value_counts() map to NaN, i.e. zero articles.
    article_counts['No. of articles'] = (
        article_counts['Date'].map(published_days.value_counts()).fillna(0).astype(int)
    )
    average_articles_per_day = article_counts['No. of articles'].mean()
    return article_counts[['Date', 'No. of articles']], average_articles_per_day


def get_top_section(df):
    """Return the most frequent ``sectionName`` and how many rows it has.

    NOTE: a later cell redefines ``get_top_section`` with a DuckDB implementation
    that returns a DataFrame instead of a tuple.

    Args:
        df: DataFrame with a ``sectionName`` column.

    Returns:
        tuple: ``(top_section, top_section_count)``; ``(None, 0)`` when ``df`` has
        no (non-null) section values.
    """
    section_counts = df['sectionName'].value_counts()
    if section_counts.empty:
        # Indexing position 0 of an empty result would raise IndexError.
        return None, 0
    # value_counts() sorts descending, so the first entry is the largest section.
    return section_counts.index[0], section_counts.iloc[0]


# Run the pandas-based daily count and report the mean number of articles per day.
article_count, avg_count = count_trudeau_articles(df)
print(f"the average article per day is {avg_count:.2f}")

the average article per day is 0.03


In [7]:
import duckdb
# Prototype of the per-day article count written as DuckDB SQL (the same query is
# wrapped in a count_trudeau_articles function in a later cell).
# NOTE(review): `conn` is never closed in this cell.
conn = duckdb.connect()

# Using a direct SQL query to generate one row per date
# The query left-joins a 2018-01-01..today calendar to `df` (rows with type = 'article'
# only), counts publications per day, and repeats the overall daily average on every row.
# NOTE(review): `df` is not registered on the connection explicitly; presumably DuckDB
# resolves the name to the notebook's `df` DataFrame -- confirm.
result = conn.execute("""
    with date_series as(
        SELECT day::date as complete_date
        FROM generate_series('2018-01-01'::date, current_date, INTERVAL '1 day') as series(day)
    ),
    article_count as(
        select date_series.complete_date Date, coalesce(count(df.webPublicationDate), 0) as No_of_articles
        from date_series left join df 
        on date_series.complete_date = df.webPublicationDate::date and df.type = 'article'
        group by date_series.complete_date
    ),
    average_article as (
        select Date, No_of_articles, avg(No_of_articles) over() as avg_articles_per_day
        from article_count
    )
    select * from average_article order by Date
""").df()  

print(result.head())


        Date  No_of_articles  avg_articles_per_day
0 2018-01-01               0              0.028175
1 2018-01-02               0              0.028175
2 2018-01-03               0              0.028175
3 2018-01-04               0              0.028175
4 2018-01-05               0              0.028175


In [20]:
def count_trudeau_articles(df):
    """Count 'article' rows per day since 2018-01-01 using DuckDB SQL.

    This definition shadows the pandas-based function of the same name defined in an
    earlier cell; note its count column is ``No_of_articles``.

    Args:
        df: DataFrame with ``webPublicationDate`` and ``type`` columns. The SQL refers
            to it by the name ``df``, so the parameter name must not change.

    Returns:
        tuple: ``(result, avg_count)``. ``result`` has one row per day from
        2018-01-01 to the current date with columns ``Date``, ``No_of_articles`` and
        ``avg_articles_per_day`` (the same overall mean repeated on every row).
        ``avg_count`` is that mean, or the sentinel ``-999`` if the result is empty.
    """
    query = """
        with date_series as(
            SELECT day::date as complete_date
            FROM generate_series('2018-01-01'::date, current_date, INTERVAL '1 day') as series(day)
        ),
        article_count as(
            select date_series.complete_date Date, coalesce(count(df.webPublicationDate), 0) as No_of_articles
            from date_series left join df
            on date_series.complete_date = df.webPublicationDate::date and df.type = 'article'
            group by date_series.complete_date
        ),
        average_article as (
            select Date, No_of_articles, avg(No_of_articles) over() as avg_articles_per_day
            from article_count
        )
        select * from average_article order by Date
    """

    con = duckdb.connect()
    try:
        result = con.execute(query).fetchdf()
    finally:
        # Release the connection even when the query raises.
        con.close()

    if not result.empty and 'avg_articles_per_day' in result.columns:
        avg_count = result['avg_articles_per_day'].iloc[0]
    else:
        # Sentinel kept for backward compatibility with existing callers.
        avg_count = -999
    return result, avg_count

# Run the DuckDB-based daily count defined above and report the mean per day.
article_count, avg_count = count_trudeau_articles(df)
print(f"the average article per day is {avg_count:.2f}")

the average article per day is 0.03


In [9]:
# import duckdb

# def get_top_section_sql(df):
#     con = duckdb.connect()
#     query = """
#     SELECT sectionName, COUNT(sectionName) AS No_of_articles
#     FROM df
#     GROUP BY sectionName
#     ORDER BY No_of_articles DESC
    
#     """
#     result = con.execute(query).fetchdf()
#     con.close()
#     if not result.empty:
#         top_section = result['sectionName'].iloc[0]
#         top_section_count = result['No_of_articles'].iloc[0]
#     else:
#         top_section = None
#         top_section_count = 0
#     return top_section, top_section_count

# get_top_section_sql(df)

('World news', 47)

In [11]:
def get_top_section(df):
    """Return the number of 'article' rows per section, largest section first.

    This definition shadows the pandas-based ``get_top_section`` from an earlier cell
    and returns a DataFrame rather than a ``(section, count)`` tuple.

    Args:
        df: DataFrame with ``sectionName`` and ``type`` columns. The SQL refers to it
            by the name ``df``, so the parameter name must not change.

    Returns:
        pandas.DataFrame with columns ``sectionName`` and ``section_count``, ordered by
        count descending and then by section name so ties come back in a stable order.
    """
    query = """
        SELECT sectionName, count(*) as section_count
        FROM df
        WHERE type = 'article'
        GROUP BY sectionName
        ORDER BY section_count DESC, sectionName
    """
    con = duckdb.connect()
    try:
        return con.execute(query).fetchdf()
    finally:
        # Release the connection even when the query raises.
        con.close()

# Display the per-section counts (rows with type = 'article' only).
get_top_section(df)

Unnamed: 0,sectionName,section_count
0,World news,47
1,Opinion,9
2,US news,3
3,Environment,2
4,Television & radio,1
5,Life and style,1
6,Business,1
7,Technology,1


In [12]:
# Keep the per-section counts for the report cells below.
top_sections = get_top_section(df)

In [54]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd

def generate_section_report(df, output_path=None):
    """Write a bar chart of article counts per section to an HTML file.

    Args:
        df: DataFrame with ``sectionName`` and ``section_count`` columns.
        output_path: Destination HTML file. ``None`` keeps the location this function
            always wrote to, so existing calls behave as before.
    """
    if output_path is None:
        # NOTE(review): absolute, environment-specific path; pass output_path to
        # write somewhere else.
        output_path = '/workspaces/GuardianApiAnalysis/guardian_media_analysis/reports/section_report.html'

    # Create a Plotly Express bar chart
    fig = px.bar(df, x='sectionName', y='section_count', title='Articles by Section')

    # Make sure the target directory exists instead of failing on a fresh checkout.
    report_dir = os.path.dirname(output_path)
    if report_dir:
        os.makedirs(report_dir, exist_ok=True)

    # Save the plot as an HTML file
    fig.write_html(output_path)

# Recompute the per-section counts and write them out as an HTML bar chart.
top_sections = get_top_section(df)
generate_section_report(top_sections)

In [57]:
import os
import plotly.express as px

def generate_top_section_report(df, output_path):
    """Write an HTML page with a heading and a bar chart of articles per section.

    Args:
        df: DataFrame with ``sectionName`` and ``section_count`` columns.
        output_path: Destination HTML file; its directory is created if missing.
    """
    # Create the bar chart
    fig = px.bar(df, x='sectionName', y='section_count',
                 title='Number of Articles by Section',
                 labels={'sectionName': 'Section Name', 'section_count': 'Number of Articles'})

    # Let Plotly render the chart fragment (div + script). include_plotlyjs='cdn' loads
    # the plotly.js build that matches the installed Plotly, instead of the
    # 'plotly-latest' CDN alias, which is no longer updated.
    plot_fragment = fig.to_html(full_html=False, include_plotlyjs='cdn')

    # Assemble the page by joining lines rather than str.format, so braces inside the
    # generated JavaScript/JSON can never be mistaken for format fields.
    html_content = '\n'.join([
        '<html>',
        '    <head>',
        '        <meta charset="utf-8">',
        '        <title>Top Sections by Article Count</title>',
        '    </head>',
        '    <body>',
        '        <h1>Top Sections by Article Count</h1>',
        plot_fragment,
        '    </body>',
        '</html>',
        '',
    ])

    report_dir = os.path.dirname(output_path)
    if report_dir:
        os.makedirs(report_dir, exist_ok=True)

    # Save the HTML file (explicit encoding: section names may be non-ASCII)
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(html_content)

# Example usage
# Example usage
# NOTE(review): the output path is absolute and specific to one workspace layout.
top_section_df = get_top_section(df)
output_path = '/workspaces/GuardianApiAnalysis/guardian_media_analysis/reports/top_section_report.html'
generate_top_section_report(top_section_df, output_path)

In [13]:
# Check which columns the per-section summary exposes.
top_section_df = get_top_section(df)
top_section_df.columns

Index(['sectionName', 'section_count'], dtype='object')

In [79]:
! pip install nbformat>=4.2.0


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [56]:


import plotly.express as px

# Per-section article counts used by the chart below.
top_section_df = get_top_section(df)
def plot_articles_by_section(top_section_df):
    """Display a bar chart of ``section_count`` per ``sectionName``."""
    axis_labels = {'sectionName': 'Section Name', 'section_count': 'Number of Articles'}
    chart = px.bar(
        top_section_df,
        x='sectionName',
        y='section_count',
        title='Number of Articles by Section',
        labels=axis_labels,
    )
    chart.show()

# Render the per-section bar chart inline.
plot_articles_by_section(top_section_df)


In [32]:
# Recompute the daily counts (the name resolves to whichever count_trudeau_articles
# definition was executed last) and preview the first rows.
article_count, avg_count = count_trudeau_articles(df)
article_count.head()

Unnamed: 0,Date,No_of_articles,avg_articles_per_day
0,2018-01-01,0,0.028175
1,2018-01-02,0,0.028175
2,2018-01-03,0,0.028175
3,2018-01-04,0,0.028175
4,2018-01-05,0,0.028175


In [25]:
# Line chart of the number of articles per day.
fig = px.line(article_count, x='Date', y='No_of_articles', 
              title='Evolution of Article Count over Time',
              labels={'Date': 'Date', 'No_of_articles': 'Number of Articles'})
# Sets the same axis titles that the `labels` mapping above already provides.
fig.update_layout(xaxis_title='Date', yaxis_title='Number of Articles')
fig.show()

In [67]:
import duckdb
import plotly.graph_objects as go

def plot_article_count(article_count, group_by='month'):
    """Show a bar chart of article totals aggregated per day, week, month or year.

    Args:
        article_count: DataFrame with ``Date`` and ``No_of_articles`` columns. The SQL
            refers to it by the name ``article_count``, so the parameter name must
            not change.
        group_by: One of ``'month'``, ``'year'``, ``'week'`` or ``'day'``.

    Raises:
        ValueError: If ``group_by`` is not one of the allowed values.
    """
    # x-axis title and tick format for each allowed granularity.
    axis_settings = {
        'month': ('Month', '%b %Y'),
        'year': ('Year', '%Y'),
        'week': ('Week', '%Y-W%W'),
        'day': ('Day', '%Y-%m-%d'),
    }
    # Validate BEFORE building the SQL: group_by is interpolated into the query text,
    # so only the whitelisted literals above may ever reach the database.
    settings = axis_settings.get(group_by) if isinstance(group_by, str) else None
    if settings is None:
        raise ValueError("Invalid group_by value. Allowed values are 'month', 'year', 'week', or 'day'.")
    x_title, tick_format = settings

    query = f"""
        SELECT DATE_TRUNC('{group_by}', Date) AS group_date, SUM(No_of_articles) AS total_articles
        FROM article_count
        GROUP BY group_date
        ORDER BY group_date
    """
    con = duckdb.connect()
    try:
        grouped_data = con.execute(query).df()
    finally:
        # The connection was previously never closed.
        con.close()

    fig = go.Figure(data=[go.Bar(x=grouped_data['group_date'], y=grouped_data['total_articles'],
                                 name='Number of Articles', marker_color='#1f77b4', marker_opacity=0.8)])

    fig.update_layout(
        title='<b>Number of Articles about Justin Trudeau</b>',
        xaxis_title=x_title,
        yaxis_title='Number of Articles',
        xaxis=dict(
            tickformat=tick_format
        ),
        showlegend=True,
        plot_bgcolor='white',
        paper_bgcolor='white'
    )

    fig.show()

# Weekly totals of the daily article counts.
plot_article_count(article_count, group_by='week')

In [68]:
import numpy as np
def identify_unusual_events(article_count, threshold=2):
    """Select the days whose article count is far from the mean daily count.

    A day is kept when its ``No_of_articles`` lies more than ``threshold`` standard
    deviations above or below the mean of that column. The selected rows are
    returned sorted by ``No_of_articles``, highest first.
    """
    daily_counts = article_count['No_of_articles']
    centre = np.mean(daily_counts)
    spread = np.std(daily_counts)

    upper_limit = centre + threshold * spread
    lower_limit = centre - threshold * spread

    is_unusual = (daily_counts > upper_limit) | (daily_counts < lower_limit)
    return article_count[is_unusual].sort_values('No_of_articles', ascending=False)

# Identify unusual events
# Identify unusual events
# threshold=2 keeps days more than two standard deviations away from the mean count.
unusual_events = identify_unusual_events(article_count, threshold=2)

# Print the unusual events
print("Unusual events:")
print(unusual_events)

Unusual events:
           Date  No_of_articles  avg_articles_per_day
627  2019-09-20               3              0.028175
626  2019-09-19               3              0.028175
738  2020-01-09               2              0.028175
802  2020-03-13               2              0.028175
37   2018-02-07               2              0.028175
424  2019-03-01               2              0.028175
1284 2021-07-08               1              0.028175
805  2020-03-16               1              0.028175
812  2020-03-23               1              0.028175
886  2020-06-05               1              0.028175
914  2020-07-03               1              0.028175
917  2020-07-06               1              0.028175
921  2020-07-10               1              0.028175
1030 2020-10-27               1              0.028175
1059 2020-11-25               1              0.028175
1258 2021-06-12               1              0.028175
22   2018-01-23               1              0.028175
781  2020-02

In [62]:
import plotly.graph_objects as go

def create_box_plot(article_count):
    """Display a box plot of ``No_of_articles`` with per-point hover text.

    Each point's hover text shows its ``Date`` and its number of articles; points
    flagged as suspected outliers are drawn in a separate colour.
    """
    outlier_colour = 'rgba(219, 64, 82, 0.6)'

    # One hover label per row: the date plus that day's article count.
    hover_text = []
    for date, count in zip(article_count['Date'], article_count['No_of_articles']):
        hover_text.append(f"Date: {date}<br>Number of Articles: {count}")

    box_trace = go.Box(
        y=article_count['No_of_articles'],
        name='Number of Articles',
        boxpoints='suspectedoutliers',
        text=hover_text,
        hoverinfo='text',
        marker={
            'color': 'rgb(8,81,156)',
            'outliercolor': outlier_colour,
            'line': {'outliercolor': outlier_colour, 'outlierwidth': 2},
        },
    )

    fig = go.Figure(data=[box_trace])
    fig.update_layout(
        title='Box Plot of Number of Articles',
        yaxis_title='Number of Articles',
        showlegend=False,
    )
    fig.show()

# Create the box plot
# Box plot of the daily article counts.
create_box_plot(article_count)