In [2]:
import sys
sys.path.append('..')
import configparser
import requests
from datetime import datetime
import json
import os
from src.utility import setup_logger, fetch_config
import pandas as pd
import time
from src.api_extarction import GuardianAPI

config, config_path = fetch_config() 
GUARDIAN_API_KEY = config.get('guardian_api', 'api_key')
logger = setup_logger(__name__)

In [3]:
import sys
sys.path.append('..')
import configparser
import requests
from datetime import datetime
import json
import os
from src.utility import setup_logger, fetch_config
import pandas as pd
import time

class GuardianAPI:
    def __init__(self):
        self.config, self.config_path = fetch_config() 
        self.GUARDIAN_API_KEY = self.config.get('guardian_api2', 'api_key')
        # 'f155b662-7be4-4e65-8a36-110369468534'
        self.logger = setup_logger(__name__)
        self.session = requests.Session()  

    def guardian_search(self, search_query, from_date, to_date, max_retries=3):
        url = 'http://content.guardianapis.com/search'
        params = {
            'q': search_query.replace(' ', '%20'),
            'from-date': from_date,
            'to-date': to_date,
            'page-size': 100,
            'api-key': self.GUARDIAN_API_KEY,
            'show-fields': 'headline,byline,sectionName,webPublicationDate'
        }
        all_results = []
        current_page = 1
        total_pages = 1
        retry_count = 0
        retry_delay = 5  # Initial delay

        while current_page <= total_pages:
            params['page'] = current_page
            response = self.session.get(url, params=params)
           
            if response.status_code == 200:
                data = response.json()
                all_results.extend(data['response']['results'])
                total_pages = data['response']['pages']
                self.logger.info(f"Processed page {current_page}/{total_pages}. Total results so far: {len(all_results)}.")
                current_page += 1
                retry_count = 0  # Reset retry count after a successful request
                retry_delay = 5  # Reset delay
            else:
                retry_count += 1
                if retry_count <= max_retries:
                    self.logger.warning(f"API request failed with status code {response.status_code}. Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                    retry_delay *= 2  # Exponential back-off
                else:
                    self.logger.error(f"API request failed with status code {response.status_code} after {max_retries} retries.")
                    raise Exception(f"API request failed with status code {response.status_code} after {max_retries} retries.")

        self.logger.info(f"Retrieved {len(all_results)} articles for search query: {search_query}")
        return all_results
        
    def results_to_dataframe(self, results, filter_query=None):
        try:
            if filter_query:
                filtered_results = [
                    result for result in results
                    if filter_query.lower() in result.get('webTitle', '').lower()
                    or filter_query.lower() in result.get('fields', {}).get('headline', '').lower()
                ]
                
                df = pd.DataFrame(filtered_results)
                
                if df.empty:
                    self.logger.warning(f"No data found for '{filter_query}'.")
                    return None
                
                self.logger.info(f"Data filtered for '{filter_query}': {len(df)} rows")
                return df
            else:
                df = pd.DataFrame(results)
                if df.empty:
                    self.logger.warning("No data was extracted from the API.")
                    return None
                
                self.logger.info(f"No filter query provided. Returning the entire DataFrame with {len(df)} rows")
                return df
        
        except Exception as e:
            self.logger.error("An error occurred while converting results to a DataFrame.")
            self.logger.exception(e)
            return None

In [4]:
guardianapi = GuardianAPI()
search_query = "Justin Trudeau"
from_date = "2018-01-01"
to_date = datetime.now().strftime("%Y-%m-%d")
articles = guardianapi.guardian_search(search_query, from_date, to_date)
df = guardianapi.results_to_dataframe(articles, search_query)


In [5]:
df.describe()
df.value_counts(['type', 'sectionName'])

type      sectionName       
article   World news            47
          Opinion                9
          US news                3
          Environment            2
          Business               1
          Life and style         1
          Technology             1
          Television & radio     1
liveblog  Business               1
dtype: int64

In [6]:

def count_trudeau_articles(df):

    publication_dates = pd.to_datetime(df['webPublicationDate']).dt.date
    start_date = pd.to_datetime('2018-01-01').date()
    end_date = pd.to_datetime('today').date()
    all_dates = pd.date_range(start=start_date, end=end_date, freq='D')
    article_counts = pd.DataFrame({'Date': all_dates})
    article_counts['No. of articles'] = article_counts['Date'].map(publication_dates.value_counts()).fillna(0).astype(int)
    average_articles_per_day = article_counts['No. of articles'].mean()
    return article_counts[['Date', 'No. of articles']], average_articles_per_day


def get_top_section(df):
    section_counts = df['sectionName'].value_counts()
    top_section = section_counts.index[0]
    top_section_count = section_counts.iloc[0]
    return top_section, top_section_count


article_count, avg_count = count_trudeau_articles(df)
print(f"the average article per day is {avg_count:.2f}")

the average article per day is 0.03


In [7]:
import duckdb
conn = duckdb.connect()

# Using a direct SQL query to generate one row per date
result = conn.execute("""
    with date_series as(
        SELECT day::date as complete_date
        FROM generate_series('2018-01-01'::date, current_date, INTERVAL '1 day') as series(day)
    ),
    article_count as(
        select date_series.complete_date Date, coalesce(count(df.webPublicationDate), 0) as No_of_articles
        from date_series left join df 
        on date_series.complete_date = df.webPublicationDate::date and df.type = 'article'
        group by date_series.complete_date
    ),
    average_article as (
        select Date, No_of_articles, avg(No_of_articles) over() as avg_articles_per_day
        from article_count
    )
    select * from average_article order by Date
""").df()  

print(result.head())


        Date  No_of_articles  avg_articles_per_day
0 2018-01-01               0              0.028175
1 2018-01-02               0              0.028175
2 2018-01-03               0              0.028175
3 2018-01-04               0              0.028175
4 2018-01-05               0              0.028175


In [83]:
def count_trudeau_articles(df):
    con = duckdb.connect()
    query = """ 
        with date_series as(
            SELECT day::date as complete_date
            FROM generate_series('2018-01-01'::date, current_date, INTERVAL '1 day') as series(day)
        ),
        article_count as(
            select date_series.complete_date Date, coalesce(count(df.webPublicationDate), 0) as No_of_articles
            from date_series left join df 
            on date_series.complete_date = df.webPublicationDate::date and df.type = 'article'
            group by date_series.complete_date
        ),
        average_article as (
            select Date, No_of_articles, avg(No_of_articles) over() as avg_articles_per_day
            from article_count
        )
        select * from average_article order by Date
    """

    result = con.execute(query).fetchdf()
    con.close()
    if not result.empty and 'avg_articles_per_day' in result.columns:
        avg_count = result['avg_articles_per_day'].iloc[0]
    else:
        avg_count = -999
    return result, avg_count

article_count, avg_count = count_trudeau_articles(df)
#article_count

print(f"the average article per day is {avg_count:.2f}")


the average article per day is 0.03


In [81]:
def count_trudeau_articles_since_2018(df):
    con = duckdb.connect()
    query = """
        SELECT COUNT(*) AS total_articles
        FROM (
            SELECT CAST(webPublicationDate AS DATE) AS date, COUNT(*) AS article_count
            FROM df
            WHERE type = 'article'
              AND webPublicationDate >= '2018-01-01'
            GROUP BY CAST(webPublicationDate AS DATE)
        ) AS daily_counts
        WHERE article_count > 0
    """

    result = con.execute(query).fetchdf()
    con.close()
    if not result.empty and 'total_articles' in result.columns:
        total_count = result['total_articles'].iloc[0]
    else:
        total_count = 0
    return total_count

total_articles = count_trudeau_articles_since_2018(df)
print(f"Total articles about Justin Trudeau since 2018-01-01: {total_articles}")

Total articles about Justin Trudeau since 2018-01-01: 57


In [9]:
# import duckdb

# def get_top_section_sql(df):
#     con = duckdb.connect()
#     query = """
#     SELECT sectionName, COUNT(sectionName) AS No_of_articles
#     FROM df
#     GROUP BY sectionName
#     ORDER BY No_of_articles DESC
    
#     """
#     result = con.execute(query).fetchdf()
#     con.close()
#     if not result.empty:
#         top_section = result['sectionName'].iloc[0]
#         top_section_count = result['No_of_articles'].iloc[0]
#     else:
#         top_section = None
#         top_section_count = 0
#     return top_section, top_section_count

# get_top_section_sql(df)

('World news', 47)

In [11]:
def get_top_section(df):
    con = duckdb.connect()
    query = """
        SELECT sectionName, count(*) as section_count
        FROM df
        WHERE type = 'article'
        GROUP BY sectionName
        ORDER BY section_count DESC
    """
    result = con.execute(query).fetchdf()
    con.close()
    
    return result

get_top_section(df)

Unnamed: 0,sectionName,section_count
0,World news,47
1,Opinion,9
2,US news,3
3,Environment,2
4,Television & radio,1
5,Life and style,1
6,Business,1
7,Technology,1


In [12]:
top_sections = get_top_section(df)

In [54]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd

def generate_section_report(df):
    # Assuming 'df' is your DataFrame with the 'sectionName' and 'section_count'
    
    # Create a Plotly Express bar chart
    fig = px.bar(df, x='sectionName', y='section_count', title='Articles by Section')
    
    # Save the plot as an HTML file
    output_path = '/workspaces/GuardianApiAnalysis/guardian_media_analysis/reports/section_report.html'
    fig.write_html(output_path)

top_sections = get_top_section(df)
generate_section_report(top_sections)

In [57]:
import os
import plotly.express as px

def generate_top_section_report(df, output_path):
    # Create the bar chart
    fig = px.bar(df, x='sectionName', y='section_count', 
                 title='Number of Articles by Section',
                 labels={'sectionName': 'Section Name', 'section_count': 'Number of Articles'})

    # Create the HTML content
    html_content = '''
        <html>
            <head>
                <title>Top Sections by Article Count</title>
                <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
            </head>
            <body>
                <h1>Top Sections by Article Count</h1>
                <div id="plot"></div>
                <script>
                    var data = {};
                    Plotly.newPlot('plot', data);
                </script>
            </body>
        </html>
    '''.format(fig.to_json())

    # Save the HTML file
    with open(output_path, 'w') as file:
        file.write(html_content)

# Example usage
top_section_df = get_top_section(df)
output_path = '/workspaces/GuardianApiAnalysis/guardian_media_analysis/reports/top_section_report.html'
generate_top_section_report(top_section_df, output_path)

In [13]:
top_section_df = get_top_section(df)
top_section_df.columns

Index(['sectionName', 'section_count'], dtype='object')

In [79]:
! pip install nbformat>=4.2.0


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [56]:


import plotly.express as px

top_section_df = get_top_section(df)
def plot_articles_by_section(top_section_df):
    fig = px.bar(top_section_df, x='sectionName', y='section_count', 
                 title='Number of Articles by Section',
                 labels={'sectionName': 'Section Name', 'section_count': 'Number of Articles'})
    fig.show()

plot_articles_by_section(top_section_df)


In [32]:
article_count, avg_count = count_trudeau_articles(df)
article_count.head()

Unnamed: 0,Date,No_of_articles,avg_articles_per_day
0,2018-01-01,0,0.028175
1,2018-01-02,0,0.028175
2,2018-01-03,0,0.028175
3,2018-01-04,0,0.028175
4,2018-01-05,0,0.028175


In [25]:
fig = px.line(article_count, x='Date', y='No_of_articles', 
              title='Evolution of Article Count over Time',
              labels={'Date': 'Date', 'No_of_articles': 'Number of Articles'})
fig.update_layout(xaxis_title='Date', yaxis_title='Number of Articles')
fig.show()

In [67]:
import duckdb
import plotly.graph_objects as go

def plot_article_count(article_count, group_by='month'):
    con = duckdb.connect()

    query = f"""
        SELECT DATE_TRUNC('{group_by}', Date) AS group_date, SUM(No_of_articles) AS total_articles
        FROM article_count
        GROUP BY group_date
        ORDER BY group_date
    """
    grouped_data = con.execute(query).df()

    # Set the x-axis title and tick format based on the group_by parameter
    if group_by == 'month':
        x_title = 'Month'
        tick_format = '%b %Y'
    elif group_by == 'year':
        x_title = 'Year'
        tick_format = '%Y'
    elif group_by == 'week':
        x_title = 'Week'
        tick_format = '%Y-W%W'
    elif group_by == 'day':
        x_title = 'Day'
        tick_format = '%Y-%m-%d'
    else:
        raise ValueError("Invalid group_by value. Allowed values are 'month', 'year', 'week', or 'day'.")

    fig = go.Figure(data=[go.Bar(x=grouped_data['group_date'], y=grouped_data['total_articles'],
                                 name='Number of Articles', marker_color='#1f77b4', marker_opacity=0.8)])

    fig.update_layout(
        title='<b>Number of Articles about Justin Trudeau</b>',
        xaxis_title=x_title,
        yaxis_title='Number of Articles',
        xaxis=dict(
            tickformat=tick_format
        ),
        showlegend=True,
        plot_bgcolor='white',
        paper_bgcolor='white'
    )

    fig.show()

plot_article_count(article_count, group_by='month')

In [68]:
import numpy as np
def identify_unusual_events(article_count, threshold=2):
    # Calculate the mean and standard deviation of the number of articles per day
    mean_articles = np.mean(article_count['No_of_articles'])
    std_articles = np.std(article_count['No_of_articles'])

    # Identify unusual events based on the threshold
    unusual_events = article_count[(article_count['No_of_articles'] > mean_articles + threshold * std_articles) |
                                   (article_count['No_of_articles'] < mean_articles - threshold * std_articles)]

    # Sort the unusual events by the number of articles in descending order
    unusual_events = unusual_events.sort_values('No_of_articles', ascending=False)

    return unusual_events

# Identify unusual events
unusual_events = identify_unusual_events(article_count, threshold=2)

# Print the unusual events
print("Unusual events:")
print(unusual_events)

Unusual events:
           Date  No_of_articles  avg_articles_per_day
627  2019-09-20               3              0.028175
626  2019-09-19               3              0.028175
738  2020-01-09               2              0.028175
802  2020-03-13               2              0.028175
37   2018-02-07               2              0.028175
424  2019-03-01               2              0.028175
1284 2021-07-08               1              0.028175
805  2020-03-16               1              0.028175
812  2020-03-23               1              0.028175
886  2020-06-05               1              0.028175
914  2020-07-03               1              0.028175
917  2020-07-06               1              0.028175
921  2020-07-10               1              0.028175
1030 2020-10-27               1              0.028175
1059 2020-11-25               1              0.028175
1258 2021-06-12               1              0.028175
22   2018-01-23               1              0.028175
781  2020-02

In [62]:
import plotly.graph_objects as go

def create_box_plot(article_count):
    fig = go.Figure()

    # Create a text string with the date and number of articles for each data point
    hover_text = [f"Date: {date}<br>Number of Articles: {count}" for date, count in
                  zip(article_count['Date'], article_count['No_of_articles'])]

    fig.add_trace(go.Box(
        y=article_count['No_of_articles'],
        name='Number of Articles',
        boxpoints='suspectedoutliers',  # Display suspected outliers
        text=hover_text,  # Set the hover text
        hoverinfo='text',  # Display only the hover text
        marker=dict(
            color='rgb(8,81,156)',
            outliercolor='rgba(219, 64, 82, 0.6)',
            line=dict(outliercolor='rgba(219, 64, 82, 0.6)', outlierwidth=2)
        )
    ))

    fig.update_layout(
        title='Box Plot of Number of Articles',
        yaxis_title='Number of Articles',
        showlegend=False
    )

    fig.show()

# Create the box plot
create_box_plot(article_count)

In [76]:
def display_unusual_articles(df, unusual_dates):
    df['webPublicationDate'] = pd.to_datetime(df['webPublicationDate'])
    
    unusual_articles = df[(df['webPublicationDate'].dt.date.astype(str).isin(unusual_dates)) & (df['type'] == 'article')]
    
    for date in unusual_dates:
        print(f"Unusual Event Date: {date}")
        print("Article Headlines:")
        articles_on_date = unusual_articles[unusual_articles['webPublicationDate'].dt.date.astype(str) == date]
        
        if articles_on_date.empty:
            print("No articles found for this date.")
        else:
            for _, article in articles_on_date.iterrows():
                print(f"- {article['webTitle']}")
        
        print()

unusual_dates = ['2019-09-20', '2019-09-19', '2020-01-09', '2020-03-13', '2018-02-07', '2019-03-01']
display_unusual_articles(df, unusual_dates)

Unusual Event Date: 2019-09-20
Article Headlines:
- Justin Trudeau’s blackface can’t be wiped away | Letters
- US briefing: climate strike, Trump whistleblower and Justin Trudeau
- Justin Trudeau's brownface scandal is bad. But voting him out isn't the solution | Moustafa Bayoumi

Unusual Event Date: 2019-09-19
Article Headlines:
- Justin Trudeau brownface: Canada PM apologises after image emerges
- How will Justin Trudeau's blackface photos affect Canada's election?
- US briefing: Greta Thunberg, Justin Trudeau and a Trump whistleblower

Unusual Event Date: 2020-01-09
Article Headlines:
- Justin Trudeau: Canada 'will not rest' until it gets answers about plane crash
- Dear Justin Trudeau, a beard will only make it look like something has gone wrong in your life


Unusual Event Date: 2020-03-13
Article Headlines:
- Justin Trudeau announces sweeping steps to tackle coronavirus in Canada
- Justin Trudeau in self-isolation after wife Sophie tests positive for coronavirus

Unusual Event Da

In [105]:
#! pip install wordcloud
#! pip install matplotlib
! pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (11.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.5/11.5 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.4.0-py3-none-any.whl (17 kB)
Collecting joblib>=1.2.0
  Downloading joblib-1.4.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.2/301.2 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy>=1.6.0
  Downloading scipy-1.13.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (33.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.7/33.7 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully

In [102]:
def display_event_details(df, unusual_dates):
    df['webPublicationDate'] = pd.to_datetime(df['webPublicationDate'])
    
    for date in unusual_dates:
        start_date = pd.to_datetime(date)
        end_date = start_date
        
        event_articles = df[(df['webPublicationDate'].dt.date >= start_date) &
                            (df['webPublicationDate'].dt.date <= end_date) &
                            (df['type'] == 'article')]
        
        print(f"Event Date: {date}")
        print(f"Total Articles: {len(event_articles)}")
        print("Article Details:")
        
        for _, article in event_articles.iterrows():
            print(f"Date: {article['webPublicationDate'].date()}")
            print(f"Headline: {article['webTitle']}")
            print(f"Section: {article['sectionName']}")
            print(f"URL: {article['webUrl']}")
            print("---")
        
        print()

# Define the list of unusual dates
unusual_dates = ['2019-09-20', '2019-09-19', '2020-01-09', '2020-03-13', '2018-02-07', '2019-03-01']

# Analyze the events for all unusual dates
display_event_details(df, unusual_dates)

Event Date: 2019-09-20
Total Articles: 3
Article Details:
Date: 2019-09-20
Headline: Justin Trudeau’s blackface can’t be wiped away | Letters
Section: World news
URL: https://www.theguardian.com/world/2019/sep/20/justin-trudeau-blackface-cant-be-wiped-away
---
Date: 2019-09-20
Headline: US briefing: climate strike, Trump whistleblower and Justin Trudeau
Section: US news
URL: https://www.theguardian.com/us-news/2019/sep/20/us-briefing-climate-strike-trump-whistleblower-justin-trudeau
---
Date: 2019-09-20
Headline: Justin Trudeau's brownface scandal is bad. But voting him out isn't the solution | Moustafa Bayoumi
Section: Opinion
URL: https://www.theguardian.com/commentisfree/2019/sep/20/justin-trudeau-brownface-scandal
---

Event Date: 2019-09-19
Total Articles: 3
Article Details:
Date: 2019-09-19
Headline: Justin Trudeau brownface: Canada PM apologises after image emerges
Section: World news
URL: https://www.theguardian.com/world/2019/sep/19/justin-trudeau-brownface-canada-pm-apologise


Comparison of Timestamp with datetime.date is deprecated in order to match the standard library behavior. In a future version these will be considered non-comparable. Use 'ts == pd.Timestamp(date)' or 'ts.date() == date' instead.



In [109]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer

def create_word_bubble(df, date):
    # Filter articles by the specified date
    articles = df[(pd.to_datetime(df['webPublicationDate']).dt.date == pd.to_datetime(date).date()) &
                  (df['type'] == 'article')]
    
    # Concatenate the article titles
    text = ' '.join(articles['webTitle'])
    
    # Create a word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white', stopwords=None).generate(text)
    
    # Create a trace for the word bubble
    trace = go.Scatter(x=[random.random() for i in range(len(wordcloud.words_))],
                       y=[random.random() for i in range(len(wordcloud.words_))],
                       mode='text',
                       text=[word for word, freq in wordcloud.words_.items()],
                       marker={'opacity': 0.3},
                       textfont={'size': [freq*20 for word, freq in wordcloud.words_.items()]})
    
    # Create the plot layout
    layout = go.Layout(title=f'Word Bubble for {date}',
                       xaxis={'showgrid': False, 'showticklabels': False, 'zeroline': False},
                       yaxis={'showgrid': False, 'showticklabels': False, 'zeroline': False})
    
    # Create the figure
    fig = go.Figure(data=[trace], layout=layout)
    
    return fig

def display_keyword_context(df, date, keyword):
    # Filter articles by the specified date
    articles = df[(pd.to_datetime(df['webPublicationDate']).dt.date == pd.to_datetime(date).date()) &
                  (df['type'] == 'article')]
    
    # Concatenate the article titles
    text = ' '.join(articles['webTitle'])
    
    # Tokenize the text into sentences
    sentences = text.split('.')
    
    # Find the sentences containing the keyword
    keyword_sentences = [sentence for sentence in sentences if keyword.lower() in sentence.lower()]
    
    # Create a DataFrame with the keyword context
    context_df = pd.DataFrame({'Text': keyword_sentences})
    
    # Create a table trace for the keyword context
    table_trace = go.Table(header=dict(values=['Text']), cells=dict(values=[context_df['Text']]))
    
    # Create the plot layout
    layout = go.Layout(title=f'Keyword Context for {date}')
    
    # Create the figure
    fig = go.Figure(data=[table_trace], layout=layout)
    
    return fig

# Define the list of unusual dates
unusual_dates = ['2019-09-20', '2019-09-19', '2020-01-09', '2020-03-13', '2018-02-07', '2019-03-01']

# Create a figure for word bubbles
word_bubble_fig = make_subplots(rows=len(unusual_dates), cols=1, subplot_titles=[f'Word Bubble for {date}' for date in unusual_dates])

# Iterate over the unusual dates
for i, date in enumerate(unusual_dates):
    # Create the word bubble
    word_bubble = create_word_bubble(df, date)
    
    # Add the word bubble trace to the subplot
    word_bubble_fig.add_trace(word_bubble.data[0], row=i+1, col=1)

# Update the layout
word_bubble_fig.update_layout(height=len(unusual_dates) * 500, showlegend=False)

# Show the word bubble plot
word_bubble_fig.show()

# Create a figure for keyword context tables
context_fig = make_subplots(rows=len(unusual_dates), cols=1, subplot_titles=[f'Keyword Context for {date}' for date in unusual_dates])

# Iterate over the unusual dates
for i, date in enumerate(unusual_dates):
    # Display the keyword context
    context_table = display_keyword_context(df, date, 'Trudeau')
    
    # Add the keyword context table to the subplot
    context_fig.add_trace(context_table.data[0], row=i+1, col=1)

# Update the layout
context_fig.update_layout(height=len(unusual_dates) * 500, showlegend=False)

# Show the keyword context plot
context_fig.show()

ValueError: Trace type 'table' is not compatible with subplot type 'xy'
at grid position (1, 1)

See the docstring for the specs argument to plotly.subplots.make_subplots
for more information on subplot types

In [143]:
from src.transformation_analysis import  DataProcessor, DataVisualizer

data_processor = DataProcessor(df)
visualizer = DataVisualizer(df)

article_count, avg_count = data_processor.get_trudeau_articles_count()
total_articles = data_processor.get_total_article_count()

print(f"the article count Data => \n{article_count.head(10)}")
print(f"\n\nthe average article per day is {avg_count:.2f}")
print(f"\n\nTotal articles about Justin Trudeau since 2018-01-01: {total_articles}")

the article count Data => 
        Date  No_of_articles  avg_articles_per_day
0 2018-01-01               0              0.028163
1 2018-01-02               0              0.028163
2 2018-01-03               0              0.028163
3 2018-01-04               0              0.028163
4 2018-01-05               0              0.028163
5 2018-01-06               0              0.028163
6 2018-01-07               0              0.028163
7 2018-01-08               0              0.028163
8 2018-01-09               0              0.028163
9 2018-01-10               0              0.028163


the average article per day is 0.03


Total articles about Justin Trudeau since 2018-01-01: 57


In [144]:
top_sections = data_processor.get_top_section()
print(top_sections)
visualizer.plot_articles_by_section(top_sections)

          sectionName  section_count
0          World news             47
1             Opinion              9
2             US news              3
3         Environment              2
4            Business              1
5          Technology              1
6  Television & radio              1
7      Life and style              1


In [145]:
visualizer.plot_article_by_time(article_count,group_by='year')
visualizer.plot_article_by_time(article_count,group_by='month')
visualizer.plot_article_by_time(article_count, group_by='week')
visualizer.plot_article_by_time(article_count, group_by='day')

In [146]:
unusual_events = data_processor.get_unusual_events(article_count)
print(unusual_events.head(10))
visualizer.plot_unusual_events(article_count)

           Date  No_of_articles  avg_articles_per_day
627  2019-09-20               3              0.028163
626  2019-09-19               3              0.028163
738  2020-01-09               2              0.028163
802  2020-03-13               2              0.028163
37   2018-02-07               2              0.028163
424  2019-03-01               2              0.028163
1284 2021-07-08               1              0.028163
805  2020-03-16               1              0.028163
812  2020-03-23               1              0.028163
886  2020-06-05               1              0.028163


In [147]:
# unusual_dates = ['2019-09-20', '2019-09-19', '2020-01-09', '2020-03-13', '2018-02-07', '2019-03-01']
data_processor.get_articles_unusual_events(unusual_events['Date'].astype(str))

Unusual Event Date: 2019-09-20
Article Headlines:
- Justin Trudeau’s blackface can’t be wiped away | Letters
- US briefing: climate strike, Trump whistleblower and Justin Trudeau
- Justin Trudeau's brownface scandal is bad. But voting him out isn't the solution | Moustafa Bayoumi

Unusual Event Date: 2019-09-19
Article Headlines:
- Justin Trudeau brownface: Canada PM apologises after image emerges
- How will Justin Trudeau's blackface photos affect Canada's election?
- US briefing: Greta Thunberg, Justin Trudeau and a Trump whistleblower

Unusual Event Date: 2020-01-09
Article Headlines:
- Justin Trudeau: Canada 'will not rest' until it gets answers about plane crash
- Dear Justin Trudeau, a beard will only make it look like something has gone wrong in your life


Unusual Event Date: 2020-03-13
Article Headlines:
- Justin Trudeau announces sweeping steps to tackle coronavirus in Canada
- Justin Trudeau in self-isolation after wife Sophie tests positive for coronavirus

Unusual Event Da

In [None]:
import pandas as pd
import duckdb
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import plotly.graph_objects as go

def generate_word_cloud(date, df):
    try:
        articles = df[(df['webPublicationDate'].dt.date == pd.to_datetime(date).date()) & (df['type'] == 'article')]
        
        if not articles.empty:
            if 'bodyText' in articles.columns:
                text = ' '.join(articles['bodyText'].dropna().tolist())
                
                if text.strip():
                    # Remove stopwords and punctuation
                    wordcloud = WordCloud(width=800, height=800, background_color='white', stopwords=None, min_font_size=10).generate(text)
                    
                    plt.figure(figsize=(8, 8), facecolor=None)
                    plt.imshow(wordcloud)
                    plt.axis("off")
                    plt.tight_layout(pad=0)
                    plt.show()
                else:
                    print(f"No text found in the articles for the date: {date}")
            else:
                print(f"Column 'bodyText' not found in the DataFrame for the date: {date}")
        else:
            print(f"No articles found for the date: {date}")
    except Exception as e:
        print(f"An error occurred while generating the word cloud for the date: {date}")
        print(f"Error message: {str(e)}")

def display_context(date, df):
    con = duckdb.connect()
    
    query = """
        SELECT DISTINCT CONCAT('[...]', SUBSTRING(df."bodyText", MAX(0, STRPOS(LOWER(df."bodyText"), LOWER('Trudeau')) - 50), 100), '[...]') AS context
        FROM df
        WHERE df."webPublicationDate"::date = ?::date
        AND df."type" = 'article'
        AND STRPOS(LOWER(df."bodyText"), LOWER('Trudeau')) > 0
    """
    
    result = con.execute(query, [date]).fetchdf()
    
    if not result.empty:
        fig = go.Figure(data=[go.Table(
            header=dict(values=['Context'],
                        fill_color='paleturquoise',
                        align='left'),
            cells=dict(values=[result['context']],
                       fill_color='lavender',
                       align='left'))])
        
        fig.show()
    else:
        print(f"No context found for the date: {date}")

# Convert the 'webPublicationDate' column to datetime if needed
df['webPublicationDate'] = pd.to_datetime(df['webPublicationDate'])

# Example usage
unusual_dates = ['2019-09-20', '2019-09-19', '2020-01-09', '2020-03-13', '2018-02-07', '2019-03-01']

for date in unusual_dates:
    print(f"Unusual Event Date: {date}")
    generate_word_cloud(date, df)
    display_context(date, df)
    print()

Unusual Event Date: 2019-09-20
Column 'bodyText' not found in the DataFrame for the date: 2019-09-20


BinderException: Binder Error: Table "df" does not have a column named "bodyText"
LINE 6:         AND STRPOS(LOWER(df."bodyText"), LOWER('Trudeau')) > 0
                                 ^

In [148]:
unusual_dates = ['2019-09-20', '2019-09-19', '2020-01-09', '2020-03-13', '2018-02-07', '2019-03-01']
data_processor.get_articles_unusual_events(unusual_dates)

Unusual Event Date: 2019-09-20
Article Headlines:
- Justin Trudeau’s blackface can’t be wiped away | Letters
- US briefing: climate strike, Trump whistleblower and Justin Trudeau
- Justin Trudeau's brownface scandal is bad. But voting him out isn't the solution | Moustafa Bayoumi

Unusual Event Date: 2019-09-19
Article Headlines:
- Justin Trudeau brownface: Canada PM apologises after image emerges
- How will Justin Trudeau's blackface photos affect Canada's election?
- US briefing: Greta Thunberg, Justin Trudeau and a Trump whistleblower

Unusual Event Date: 2020-01-09
Article Headlines:
- Justin Trudeau: Canada 'will not rest' until it gets answers about plane crash
- Dear Justin Trudeau, a beard will only make it look like something has gone wrong in your life


Unusual Event Date: 2020-03-13
Article Headlines:
- Justin Trudeau announces sweeping steps to tackle coronavirus in Canada
- Justin Trudeau in self-isolation after wife Sophie tests positive for coronavirus

Unusual Event Da

In [153]:
data_processor.get_unusual_event_details(unusual_events['Date'].astype(str))

Event Date: 2019-09-20
Total Articles: 3
Article Details:
Date: 2019-09-20
Headline: Justin Trudeau’s blackface can’t be wiped away | Letters
Section: World news
URL: https://www.theguardian.com/world/2019/sep/20/justin-trudeau-blackface-cant-be-wiped-away
---
Date: 2019-09-20
Headline: US briefing: climate strike, Trump whistleblower and Justin Trudeau
Section: US news
URL: https://www.theguardian.com/us-news/2019/sep/20/us-briefing-climate-strike-trump-whistleblower-justin-trudeau
---
Date: 2019-09-20
Headline: Justin Trudeau's brownface scandal is bad. But voting him out isn't the solution | Moustafa Bayoumi
Section: Opinion
URL: https://www.theguardian.com/commentisfree/2019/sep/20/justin-trudeau-brownface-scandal
---

Event Date: 2019-09-19
Total Articles: 3
Article Details:
Date: 2019-09-19
Headline: Justin Trudeau brownface: Canada PM apologises after image emerges
Section: World news
URL: https://www.theguardian.com/world/2019/sep/19/justin-trudeau-brownface-canada-pm-apologise


Comparison of Timestamp with datetime.date is deprecated in order to match the standard library behavior. In a future version these will be considered non-comparable. Use 'ts == pd.Timestamp(date)' or 'ts.date() == date' instead.



In [157]:
data_processor.get_unusual_event_details(unusual_dates)

Event Date: 2019-09-20
Total Articles: 3
Article Details:
Date: 2019-09-20
Headline: Justin Trudeau’s blackface can’t be wiped away | Letters
Section: World news
URL: https://www.theguardian.com/world/2019/sep/20/justin-trudeau-blackface-cant-be-wiped-away
---
Date: 2019-09-20
Headline: US briefing: climate strike, Trump whistleblower and Justin Trudeau
Section: US news
URL: https://www.theguardian.com/us-news/2019/sep/20/us-briefing-climate-strike-trump-whistleblower-justin-trudeau
---
Date: 2019-09-20
Headline: Justin Trudeau's brownface scandal is bad. But voting him out isn't the solution | Moustafa Bayoumi
Section: Opinion
URL: https://www.theguardian.com/commentisfree/2019/sep/20/justin-trudeau-brownface-scandal
---

Event Date: 2019-09-19
Total Articles: 3
Article Details:
Date: 2019-09-19
Headline: Justin Trudeau brownface: Canada PM apologises after image emerges
Section: World news
URL: https://www.theguardian.com/world/2019/sep/19/justin-trudeau-brownface-canada-pm-apologise


Comparison of Timestamp with datetime.date is deprecated in order to match the standard library behavior. In a future version these will be considered non-comparable. Use 'ts == pd.Timestamp(date)' or 'ts.date() == date' instead.

