In [1]:
!pip install altair -q --user

# GEG EDA

<INTRO STUFF>

## Getting data on Evergiven timeframe

In [1]:
import json
#set params
BEGIN = "2021-03-01"
END = "2021-04-30"

HEIGHT = 400
WIDTH = 400

bq_params = {"BEGIN": BEGIN, "END": END}
bq_params = json.dumps(bq_params, indent=4, sort_keys=True, default=str)

In [8]:
%%bigquery april_events --params {"BEGIN": "2021-03-01", "END": "2021-04-30"}
WITH geg_data AS ((
    SELECT 
    groupId, 
    name, 
    a.entity as stringVal, 
    a.numMentions, 
    a.avgSalience, 
    eventTime, 
    polarity, 
    magnitude, 
    score 
    from (
    	SELECT polarity, 
        magnitude, 
        score, 
        FARM_FINGERPRINT(url) groupId, 
        CONCAT('Entity',entity.type) name, 
        FORMAT_TIMESTAMP("%Y-%m-%dT%X%Ez", date, "UTC") eventTime, 
    	entity.mid mid FROM `gdelt-bq.gdeltv2.geg_gcnlapi`, UNNEST(entities) entity WHERE entity.mid is not null 
       and lang='en' and DATE(date) >= @BEGIN and DATE(date) <= @END
  ) b JOIN (
     SELECT APPROX_TOP_COUNT(entities.name, 1)[OFFSET(0)].value entity, 
        entities.mid mid, sum(entities.numMentions) as numMentions, 
        avg(entities.avgSalience) as avgSalience
      FROM `gdelt-bq.gdeltv2.geg_gcnlapi`, 
     unnest(entities) entities where entities.mid is not null and lang='en' 
     and DATE(date) >= @BEGIN and DATE(date) <= @END group by entities.mid
  ) a USING(mid)))

select * from geg_data


Query complete after 0.00s: 100%|██████████| 9/9 [00:00<00:00, 3853.09query/s]                        
Downloading: 100%|██████████| 70369292/70369292 [00:47<00:00, 1494310.07rows/s]


In [9]:
april_events

Unnamed: 0,groupId,name,stringVal,numMentions,avgSalience,eventTime,polarity,magnitude,score
0,7830976867991912493,EntityLOCATION,Gilbert Station Road,5,0.001257,2021-03-21T11:46:56+00:00,0.0,4.5,0.0
1,3100410252706223831,EntityORGANIZATION,UBS,8848,0.018045,2021-03-21T21:32:33+00:00,0.0,4.5,0.0
2,-2297817263941679196,EntityLOCATION,Los Angeles,205042,0.010001,2021-03-21T00:02:03+00:00,0.0,4.5,0.0
3,-5374852001300381844,EntityOTHER,Kolelas,1338,0.047422,2021-03-21T16:32:38+00:00,0.0,4.5,-0.3
4,-6754983596130560644,EntityOTHER,Kolelas,1338,0.047422,2021-03-21T17:01:45+00:00,0.0,4.5,-0.3
...,...,...,...,...,...,...,...,...,...
70369287,-1191005947168842791,EntityLOCATION,U.S.,4350763,0.016365,2021-03-09T01:18:31+00:00,0.0,7.5,-0.1
70369288,2970384450307155624,EntityLOCATION,U.S.,4350763,0.016365,2021-03-09T16:34:09+00:00,0.0,7.5,-0.4
70369289,4391024216151471084,EntityLOCATION,U.S.,4350763,0.016365,2021-03-09T07:18:18+00:00,0.0,8.0,0.0
70369290,-4657659261277881748,EntityLOCATION,U.S.,4350763,0.016365,2021-03-09T07:46:49+00:00,0.0,8.0,0.0


### Get a count of the entity values for this period

In [10]:
import altair as alt


def return_top_count_categories(data, group_field, depth):
    ### Get the top counts for stuff
    entity_count = data.groupby(group_field).count()
    top_n = entity_count.sort_values('groupId', ascending=False)[0:depth]
    other_count = entity_count.sort_values('groupId', ascending=False)['groupId'][depth+1::].sum()
    top_n = top_n.reset_index()
    if isinstance(group_field, list):
        top_n.append({'stringVal': 'Other', 'name': 'Other', 'eventTime': other_count}, ignore_index=True)
    else:
    	top_n = top_n.append({group_field: 'Other', 'eventTime': other_count}, ignore_index=True)
    return(top_n)                  
                         
def create_entity_count_charts(count_field, data, salience_filter=0, entity_term_filter=None, top_n=20):
    
    data = data[data['avgSalience'] >= salience_filter]
    if entity_term_filter:
        data=data[data[count_field].str.contains(entity_term_filter, case=False)]
    if top_n > 0:
    	entity_count = return_top_count_categories(data, count_field, top_n)
    else:
        entity_count = data.groupby(count_field).count()

    entity_count = entity_count.reset_index()
    entity_count = entity_count.sort_values('eventTime', ascending=False)
    entity_count['count'] = entity_count['eventTime']

    count_chart = alt.Chart(entity_count).mark_bar().encode(
        x={'field': f'{count_field}', 'type': 'ordinal', 'sort': 'y',},
        y={'field': 'count', 'type': 'quantitative'},
        tooltip=[count_field, 'count']
    ).properties(
        title=f"{count_field} Counts for March 2021",
        height=HEIGHT,
        width=WIDTH
    )
    
    return count_chart


# another one for the chart that uses colors for the categories

def create_entity_count_charts_color(data, salience_filter=0, entity_term_filter=None, top_n=20):
    
    data = data[data['avgSalience'] >= salience_filter]
    if entity_term_filter:
        data=data[data['stringVal'].str.contains(entity_term_filter, case=False)]
    if top_n > 0:
        entity_count = return_top_count_categories(data, ['stringVal', 'name'], top_n)
    else:
    	entity_count = data.groupby(['stringVal', 'name']).count()

    entity_count = entity_count.reset_index()
    entity_count = entity_count.sort_values('eventTime', ascending=False)
    entity_count['count'] = entity_count['eventTime']

    count_chart = alt.Chart(entity_count).mark_bar().encode(
        x={'field': 'stringVal', 'type':'ordinal', 'sort': 'y'},
        y={'field': 'count', 'type': 'quantitative'},
        tooltip=['name', 'stringVal', 'count'],
        color='name:N',
    ).properties(
        title=f"Combined Counts for March 2021",
        height=HEIGHT,
        width=WIDTH*2
    )
    
    return count_chart



In [11]:
## EverGiven is EntityOTHER - maybe we monitor that steam?

In [12]:
category_count_chart = create_entity_count_charts('name', april_events, salience_filter=0, entity_term_filter=None, top_n=100)
entity_count_chart = create_entity_count_charts('stringVal', april_events, salience_filter=0, entity_term_filter=None, top_n=50)
top_chart = create_entity_count_charts_color(april_events, salience_filter=0, entity_term_filter=None, top_n=50)
alt.data_transformers.disable_max_rows()

alt.vconcat(
    top_chart,
    alt.hconcat(category_count_chart, entity_count_chart),
)
    

## Timeseries Analysis

In [None]:
april_events[april_events.stringVal == 'Ever Given']

In [14]:
import datetime

import pandas as pd
def create_ts_chart_data(data, filter_val=["Ever Given", "Beverly Hills", "Donald Trump"]):
    if filter_val != None:
    	data = data[data.stringVal.isin(filter_val)]
    data['eventTime'] = pd.to_datetime(data['eventTime']).dt.date
    avg_pol = data.groupby(['eventTime', 'stringVal'])['polarity'].mean()
    avg_mag = data.groupby(['eventTime', 'stringVal'])['magnitude'].mean()
    avg_score = data.groupby(['eventTime', 'stringVal'])['score'].mean()
    count = data.groupby(['eventTime', 'stringVal'])['numMentions'].sum()
    return pd.DataFrame([count, avg_pol, avg_mag, avg_score]).transpose().reset_index()


ts_data = create_ts_chart_data(april_events, filter_val=None)


ts_data['eventTime'] = ts_data['eventTime'].astype('datetime64[ns]')
ts_data

Unnamed: 0,eventTime,stringVal,numMentions,polarity,magnitude,score
0,2021-03-01,#NationalWorkoutBuddyDay,5.0,0.0,5.3,0.10
1,2021-03-01,&nbsp,51.0,0.0,5.4,0.00
2,2021-03-01,'Homework,40.0,0.0,9.7,0.25
3,2021-03-01,'T' For Texas,132.0,0.0,60.6,0.20
4,2021-03-01,...And Call Me Conrad,6.0,0.0,64.0,0.00
...,...,...,...,...,...,...
11881467,2021-04-30,許書桓,4.0,0.0,9.5,-0.40
11881468,2021-04-30,連線商業銀行,6.0,0.0,1.6,0.00
11881469,2021-04-30,陳潤秋,1.0,0.0,2.7,-0.20
11881470,2021-04-30,韓志正,3.0,0.0,4.2,0.00


In [25]:
 ts_data[:1000]

Unnamed: 0,eventTime,stringVal,numMentions,polarity,magnitude,score
0,2021-03-01,#NationalWorkoutBuddyDay,5.0,0.0,5.300000,0.100000
1,2021-03-01,&nbsp,51.0,0.0,5.400000,0.000000
2,2021-03-01,'Homework,40.0,0.0,9.700000,0.250000
3,2021-03-01,'T' For Texas,132.0,0.0,60.600000,0.200000
4,2021-03-01,...And Call Me Conrad,6.0,0.0,64.000000,0.000000
...,...,...,...,...,...,...
995,2021-03-01,AAGES,4.0,0.0,22.950000,0.000000
996,2021-03-01,AAKASH AGARWAL,2.0,0.0,4.500000,0.500000
997,2021-03-01,AAL,52.0,0.0,39.700000,0.000000
998,2021-03-01,AAMI Community Series,198.0,0.0,15.400000,0.133333


In [58]:
## A little more pre-processing - get top 50 events partitioned by date

ts_data_top50 = ts_data.assign(rnk=ts_data
                                   .groupby('eventTime')['numMentions'].rank(method='min', ascending=False)).query('rnk <= 25')

In [59]:
ts_data_top50

Unnamed: 0,eventTime,stringVal,numMentions,polarity,magnitude,score,rnk
1552,2021-03-01,AP,3.320628e+09,0.0,13.157895,-0.157093,9.0
14110,2021-03-01,Australia,1.797192e+09,0.0,11.819173,-0.045149,16.0
27586,2021-03-01,COVID-19,3.386235e+10,0.0,11.823083,-0.064856,2.0
28290,2021-03-01,California,2.386322e+09,0.0,13.851965,-0.066189,12.0
28911,2021-03-01,Canada,2.014735e+09,0.0,13.858203,-0.045467,14.0
...,...,...,...,...,...,...,...
11841118,2021-04-30,Senate,1.365383e+09,0.0,17.029434,-0.172267,18.0
11856785,2021-04-30,Texas,1.245798e+09,0.0,16.734963,-0.081005,21.0
11865737,2021-04-30,U.S.,9.243631e+10,0.0,14.267872,-0.091457,1.0
11866067,2021-04-30,UK,1.175500e+10,0.0,14.513301,-0.078459,4.0


In [60]:
alt.data_transformers.disable_max_rows()

avg_magnitude = alt.Chart(ts_data_top50).mark_line().encode(
    x={"field": "eventTime", 'type': 'temporal'},
    y={'field': 'magnitude', 'type': 'quantitative'},
    tooltip=['eventTime','magnitude', 'stringVal'],
    color='stringVal:N',
).properties(
        title=f"Average Magnitude Score by Entity",
        height=HEIGHT,
        width=WIDTH
    )

num_mentions = alt.Chart(ts_data_top50).mark_line().encode(
    x={"field": "eventTime", 'type': 'temporal'},
    y={'field': 'numMentions', 'type': 'quantitative'},
    tooltip=['eventTime','numMentions', 'stringVal'],
    color='stringVal:N',
).properties(
        title=f"Number of Mentions by Entity",
        height=HEIGHT,
        width=WIDTH
    )

avg_polarity = alt.Chart(ts_data_top50).mark_line().encode(
    x={"field": "eventTime", 'type': 'temporal'},
    y={'field': 'polarity', 'type': 'quantitative'},
    tooltip=['eventTime','polarity', 'stringVal'],
    color='stringVal:N',
).properties(
        title=f"Average Polarity by Entity",
        height=HEIGHT,
        width=WIDTH
    )

avg_score = alt.Chart(ts_data_top50).mark_line().encode(
    x={"field": "eventTime", 'type': 'temporal'},
    y={'field': 'score', 'type': 'quantitative'},
    tooltip=['eventTime','score', 'stringVal'],
    color='stringVal:N',
).properties(
        title=f"Avgerage Score by Entity",
        height=HEIGHT,
        width=WIDTH
    )

alt.vconcat(
    alt.hconcat(num_mentions, avg_magnitude),
    alt.hconcat(avg_polarity, avg_score),
)

In [41]:
w=april_events[april_events['stringVal'].lower == "tom petty"]
w

AttributeError: 'Series' object has no attribute 'lower'

In [63]:
zz=april_events[april_events['stringVal'].str.contains("z")]

In [64]:
zz

Unnamed: 0,groupId,name,stringVal,numMentions,avgSalience,eventTime,polarity,magnitude,score
31,-3105079244623794583,EntityPERSON,Queen Elizabeth,6.0,0.880214,2021-03-01T07:16:47+00:00,0.0,2.0,0.2
33,6107597291347116114,EntityPERSON,Queen Elizabeth,6.0,0.880214,2021-03-01T14:33:18+00:00,0.0,2.0,0.2
35,-1480449247246516428,EntityPERSON,Queen Elizabeth,6.0,0.880214,2021-03-01T03:32:30+00:00,0.0,2.0,0.2
37,8228111562446977735,EntityPERSON,Queen Elizabeth,6.0,0.880214,2021-03-01T12:32:36+00:00,0.0,2.0,0.2
39,-5462915138951645714,EntityPERSON,Queen Elizabeth,6.0,0.880214,2021-03-01T08:02:55+00:00,0.0,2.0,0.2
...,...,...,...,...,...,...,...,...,...
4328367,-5085596850111108760,EntityORGANIZATION,World Health Organization,4.0,0.851978,2021-03-18T08:19:06+00:00,0.0,1.8,0.0
4328368,-35215210535195339,EntityORGANIZATION,World Health Organization,4.0,0.851978,2021-03-18T01:17:51+00:00,0.0,1.8,-0.2
4328369,2322684724690910539,EntityORGANIZATION,World Health Organization,4.0,0.851978,2021-03-18T22:32:15+00:00,0.0,1.8,0.0
4328370,8993312145740526654,EntityORGANIZATION,World Health Organization,4.0,0.851978,2021-03-18T17:01:52+00:00,0.0,1.8,-0.3


In [112]:
### Get the top counts for stuff

entity_count = zz.groupby("stringVal").count()
top_n = entity_count.sort_values('stringVal', ascending=False)[0:20]
other_count = entity_count.sort_values('groupId', ascending=False)['groupId'][21::].sum()
top_n = top_n.reset_index()
top_n = top_n.append({'stringVal': 'Other', 'groupId': other_count}, ignore_index=True)

In [145]:
#debug
data = zz
### Get the top counts for stuff
entity_count = data.groupby('stringVal').count()
top_n = entity_count.sort_values('groupId', ascending=False)[0:20]
other_count = entity_count.sort_values('groupId', ascending=False)['groupId'][20+1::].sum()
top_n = top_n.reset_index()

In [147]:
top_n

Unnamed: 0,stringVal,groupId,name,numMentions,avgSalience,eventTime,polarity,magnitude,score
0,World Health Organization,43845,43845,43845,43845,43845,43845,43845,43845
1,Amazon,38094,38094,38094,38094,38094,38094,38094,38094
2,Queen Elizabeth,12228,12228,12228,12228,12228,12228,12228,12228
3,Ted Cruz,5963,5963,5963,5963,5963,5963,5963,5963
4,Southern Azerbaijan,4478,4478,4478,4478,4478,4478,4478,4478
5,Jennifer Lopez,3578,3578,3578,3578,3578,3578,3578,3578
6,Alexandria Ocasio-Cortez,3462,3462,3462,3462,3462,3462,3462,3462
7,Riz Ahmed,2720,2720,2720,2720,2720,2720,2720,2720
8,Nazanin Zaghari-Ratcliffe,1808,1808,1808,1808,1808,1808,1808,1808
9,Arnold Schwarzenegger,1690,1690,1690,1690,1690,1690,1690,1690


AttributeError: 'DataFrame' object has no attribute 'type'