In [17]:
import pandas as pd
import matplotlib.pyplot as plt

# Directory that holds the input CSV files (relative path).
# Keep the trailing slash: the loading cell builds file paths by plain string
# concatenation (DATA_DIR + "<file>.csv").
DATA_DIR = '../data/'

In [None]:
# Load the three input tables from DATA_DIR.
# DATA_DIR is expected to end with a path separator, since the file name is
# appended directly to it.

# Search results (one row per search_id).
search_df = pd.read_csv(f"{DATA_DIR}best_search_results.csv")

# Timeline table; later joined to the search results on timeline_id.
timeline_df = pd.read_csv(f"{DATA_DIR}ACLED_Ukraine_events_timeline.csv")

# Event table; later joined on event_id_cnty.
events_df = pd.read_csv(f"{DATA_DIR}ACLED_Ukraine_2013-11-01-2024-12-16.csv")

In [60]:
# Derive helper columns on search_df.

# timeline_id: the first run of digits found in search_id, cast to int.
# - The pattern is a raw string so that "\d" is not parsed as an (invalid)
#   string escape sequence.
# - expand=False makes str.extract return a Series instead of a one-column
#   DataFrame.
# - astype(int) raises if any search_id contains no digits (the extract result
#   would be NaN); that failure is kept on purpose so bad ids surface early.
# This column is used below as the join key against timeline_df
# (presumably an integer column there as well — confirm).
search_df['timeline_id'] = (
    search_df['search_id'].str.extract(r'(\d+)', expand=False).astype(int)
)

# missing: boolean, True when before_image_id or after_image_id is null.
search_df['missing'] = (
    search_df['before_image_id'].isnull() | search_df['after_image_id'].isnull()
)

# full_clarity: boolean, True when both before_agg_clear and after_agg_clear
# are >= 0.95. No upper bound is checked (presumably the scores never exceed
# 1 — confirm against the data).
search_df['full_clarity'] = (
    (search_df['before_agg_clear'] >= 0.95)
    & (search_df['after_agg_clear'] >= 0.95)
)

In [63]:
# Enrich every search result with timeline and event attributes.
# Only the columns listed below are taken from each lookup table.
relevant_cols_timeline_df = [
    'timeline_id',
    'event_date',
    "any_event",
    "event_id_cnty",
]
relevant_cols_events_df = [
    'event_id_cnty',
    'event_type',
    'sub_event_type',
]

# Two chained left joins, so rows of search_df are never dropped:
#   1. search_df -> timeline_df on timeline_id
#   2. result    -> events_df   on event_id_cnty
merged_df = (
    search_df
    .merge(timeline_df[relevant_cols_timeline_df], how='left', on='timeline_id')
    .merge(events_df[relevant_cols_events_df], how='left', on='event_id_cnty')
)

In [None]:
# Parse event_date and derive a monthly period column used for grouping.
# (Both assignments are safe to re-run: to_datetime on an already-parsed
# column leaves it unchanged.)
merged_df['event_date'] = pd.to_datetime(merged_df['event_date'])
merged_df['month'] = merged_df['event_date'].dt.to_period('M')

# Monthly share of rows flagged as missing: the mean of the boolean column
# is the fraction of True values within each month.
missing_share = merged_df.groupby('month')['missing'].mean()

# Draw on an explicit, wide Axes so every month gets a readable tick label,
# and label the figure so it can be understood on its own.
fig, ax = plt.subplots(figsize=(20, 10))
missing_share.plot(kind='bar', ax=ax)
ax.set(
    title='Share of rows with a missing before/after image, by month',
    xlabel='Month of event_date',
    ylabel='Share of rows with missing image',
);

In [None]:
# Count rows per (month, any_event) pair and pivot the any_event values into
# columns, giving one bar per any_event value for each month.
# Requires the 'month' column on merged_df, which is derived from event_date
# in an earlier cell.
any_event = merged_df.groupby(['month', 'any_event']).size().unstack()

# Explicit Axes plus title/labels so the figure stands on its own.
fig, ax = plt.subplots(figsize=(20, 10))
any_event.plot(kind='bar', ax=ax)
ax.set(
    title='Number of merged rows per month, split by any_event',
    xlabel='Month of event_date',
    ylabel='Number of rows',
);

In [None]:
# Parse event_date on the timeline table and derive a monthly period column.
timeline_df['event_date'] = pd.to_datetime(timeline_df['event_date'])
timeline_df['month'] = timeline_df['event_date'].dt.to_period('M')

# Monthly sum of any_event. This equals the number of rows with any_event == 1
# only if the column holds 0/1 values — presumably the case; confirm.
any_event = timeline_df.groupby('month')['any_event'].sum()

# Explicit Axes plus title/labels so the figure stands on its own.
fig, ax = plt.subplots(figsize=(20, 10))
any_event.plot(kind='bar', ax=ax)
ax.set(
    title='Sum of any_event per month (timeline table)',
    xlabel='Month of event_date',
    ylabel='Sum of any_event',
);

In [None]:
# For each any_event value, count how many rows have missing True vs. False.
(
    merged_df
    .groupby(by='any_event')['missing']
    .value_counts()
)

In [None]:
# Histogram of before_agg_clear across all merged rows.
# Drawn on its own explicit Axes so it never lands on a previously created
# figure, and labelled so the plot can be read without the code.
fig, ax = plt.subplots()
merged_df['before_agg_clear'].hist(ax=ax)
ax.set(
    title='Distribution of before_agg_clear',
    xlabel='before_agg_clear',
    ylabel='Number of rows',
);

In [None]:
# Histogram of after_agg_clear across all merged rows.
# Drawn on its own explicit Axes so it never lands on a previously created
# figure, and labelled so the plot can be read without the code.
fig, ax = plt.subplots()
merged_df['after_agg_clear'].hist(ax=ax)
ax.set(
    title='Distribution of after_agg_clear',
    xlabel='after_agg_clear',
    ylabel='Number of rows',
);

In [None]:
# Restrict to rows where missing is False, then count sub_event_type values
# separately for each full_clarity group.
(
    merged_df.loc[~merged_df['missing']]
    .groupby('full_clarity')['sub_event_type']
    .value_counts()
)