In [None]:
import glob
import os

import pandas as pd

# Process data

In [None]:
# function to process all individual csv annotations into one dataframe
def return_df_from_files(casualty_directory):
    """Combine every CSV annotation file in a directory into one DataFrame.

    Parameters
    ----------
    casualty_directory : str or path-like
        Directory holding the individual ``*.csv`` annotation files. A
        trailing path separator is optional.

    Returns
    -------
    pandas.DataFrame
        Rows of all CSV files, concatenated in sorted file-name order, with
        only the first row kept for each ``(article_id, sentence)`` pair.
        The per-file row index is preserved (labels may repeat).

    Raises
    ------
    ValueError
        If the directory contains no ``*.csv`` files.
    """
    # os.path.join copes with a missing trailing separator. Sorting makes the
    # concatenation order -- and therefore which duplicate survives
    # keep='first' -- reproducible, because glob returns files in arbitrary order.
    csv_paths = sorted(glob.glob(os.path.join(casualty_directory, '*.csv')))
    if not csv_paths:
        raise ValueError(f"No CSV annotation files found in {casualty_directory!r}")

    df = pd.concat([pd.read_csv(path) for path in csv_paths])

    # make sure we do not have any duplicate sentences for same articles
    return df.drop_duplicates(['article_id', 'sentence'], keep='first')
    
# Build the annotation frame from the individual CSV files under ./fatality_counts/.
file_annotations = return_df_from_files(casualty_directory='./fatality_counts/')

In [None]:
# Previously saved annotation summaries (articles and live feeds); these are
# combined with the per-file annotations loaded above.
annotations_articles = pd.read_csv('./fatality_counts/summary/annotations_articles_20231007-20231202.csv')
annotations_livefeeds = pd.read_csv('./fatality_counts/summary/annotations_livefeed_20231007-20231202.csv')

combined = (
    pd.concat([annotations_articles, annotations_livefeeds, file_annotations])
    # A sentence present in more than one source is kept once; the saved
    # summaries come first in the concat, so their copy is the one retained.
    .drop_duplicates(['article_title', 'sentence'], keep='first')
    # Parse the dates so rows can be restricted to the period of interest.
    .assign(article_date=lambda frame: pd.to_datetime(frame['article_date']))
    .loc[lambda frame: frame['article_date'] > '2023-10-06']
)

print(combined.shape)

In [None]:
# Count annotated rows per day and category, then spread the categories into
# one column each (one row per article_date).
summary = (
    combined
    .groupby(['article_date', 'category'])
    .agg({'article_id': 'count'})
    .reset_index()
    .pivot(index='article_date', columns='category', values='article_id')
    .reset_index()
)

# Make sure all three category columns exist and contain no NaN (the pivot
# leaves NaN for days on which a category never occurs).
# The filled column is assigned back explicitly: `summary[column].fillna(0,
# inplace=True)` acts on a temporary Series, which triggers a chained-assignment
# warning on pandas 2.x and leaves `summary` untouched under Copy-on-Write.
for column in ['both', 'israel', 'palestine']:
    if column not in summary:
        summary[column] = 0
    summary[column] = summary[column].fillna(0)

# Rows categorised as 'both' count towards each side's total.
summary['israel'] = summary['israel'] + summary['both']
summary['palestine'] = summary['palestine'] + summary['both']

summary.head(5)

In [None]:
# Weekly totals per side: sum the daily counts in 7-day bins, keep the bins
# that start before 2024-07-01, and capitalise the column names (they are
# used as the series names when plotting).
summary_grouped = (
    summary
    .groupby(pd.Grouper(key='article_date', freq='7D'))
    .agg({'israel': 'sum', 'palestine': 'sum'})
    .reset_index()
    .loc[lambda weekly: weekly['article_date'] < '2024-07-01']
    .rename(columns={'palestine': 'Palestine', 'israel': 'Israel'})
)

summary_grouped.head(5)

# Plot mentions

In [None]:
import plotly.express as px

# One line per side: columns 1 and 2 of summary_grouped are plotted against
# the start date of each 7-day bin.
fig = px.line(
    summary_grouped,
    x="article_date",
    y=summary_grouped.columns[1:3],
)

fig.update_layout(
    title="Mentions of Palestinian and Israeli Deaths by the BBC",
    template="presentation",
    plot_bgcolor="wheat",
    paper_bgcolor="wheat",
    width=1400,
    height=800,
    xaxis={"showgrid": True, "zeroline": False},
    yaxis={"showgrid": True, "zeroline": False},
    # Axis and legend titles are blanked; the figure title carries the description.
    xaxis_title="",
    yaxis_title="",
    legend_title="",
    # Fixed x-range wider than the data on both sides.
    xaxis_range=['2023-09-21', '2024-07-21'],
)
fig.show()

# Plot deaths

In [None]:
# Raw casualty inputs: a CSV for the Israeli series and a JSON file for the
# Palestinian series (both are joined on their date columns further down).
deaths_israel = pd.read_csv('./casualty_data/idf_casualties.csv')
deaths_palestine = pd.read_json('./casualty_data/casualties_daily.json')

In [None]:
# Convert both date columns to datetime so the two series can be merged and
# compared against date strings.
deaths_palestine = deaths_palestine.assign(
    report_date=pd.to_datetime(deaths_palestine['report_date'])
)
deaths_israel = deaths_israel.assign(
    date=pd.to_datetime(deaths_israel['date'])
)

In [None]:
# Put both cumulative series on one shared date column.
# The Israeli `date` column is renamed to `report_date` before merging so the
# outer join has a single key column. Merging with left_on/right_on instead
# keeps two key columns, and rows that exist only in the Israeli data end up
# with a missing `report_date` -- they would then be discarded by the date
# filter below, silently turning the outer join into a left join.
deaths_combined = (
    deaths_palestine[['report_date', 'killed_cum']]
    .merge(
        deaths_israel[['date', 'cum_num_deaths']].rename(columns={'date': 'report_date'}),
        on='report_date',
        how='outer',  # union of dates from both sources, ordered by the key
    )
    # Restrict to dates before 2024-07-01.
    .loc[lambda merged: merged['report_date'] < '2024-07-01']
    .rename(columns={'killed_cum': 'Palestine', 'cum_num_deaths': 'Israel'})
)

In [None]:
# Fill dates on which one series has no value with the next available value.
# `.bfill()` replaces `fillna(method='bfill')`, whose `method` keyword is
# deprecated since pandas 2.1 and no longer accepted by pandas 3.
# NOTE(review): for cumulative totals a forward fill (carry the last reported
# total forward) may be the intended behaviour -- back-filling assigns a later
# total to earlier dates. Left unchanged; confirm.
deaths_combined['Palestine'] = deaths_combined['Palestine'].bfill()
deaths_combined['Israel'] = deaths_combined['Israel'].bfill()

In [None]:
# One line per side: columns 1 and 2 of deaths_combined are plotted against
# report_date.
# The former `labels=['1','2']` argument was dropped: `labels` must be a dict
# mapping column names to display labels, so the list had no valid meaning.
fig = px.line(deaths_combined, x="report_date", y=deaths_combined.columns[1:3])

fig.update_layout(
    xaxis=dict(showgrid=True, zeroline=False),
    yaxis=dict(showgrid=True, zeroline=False),
    # Axis and legend titles are blanked; the figure title carries the description.
    xaxis_title="",
    yaxis_title="",
    legend_title="",
    title="Israeli and Palestinian Deaths (Cumulative)",
    template="presentation",
    plot_bgcolor="wheat",
    paper_bgcolor="wheat",
    width=1200,
    height=800,
    # Same fixed x-range as the mentions figure so the two charts line up.
    xaxis_range=['2023-09-21','2024-07-21'],
)
fig.show()