In [4]:
import pandas as pd 
import media_transform_load as md 
import os
import zipfile
# Companies under study, as (ticker symbol, company name) pairs. The names are
# kept next to the symbols purely for the reader; only the symbols are used.
_ticker_companies = (
    ("AMSC", "American Superconductor"),
    ("NP", "Neenah Paper / Neenah Inc."),
    ("EVR", "Evercore"),
    ("GOOGL", "Google (Alphabet Class A)"),
    ("GTXI", "GTx Inc."),
    ("HLF", "Herbalife"),
    ("MDRX", "Veradigm (formerly Allscripts)"),
    ("ORCL", "Oracle"),
    ("SPPI", "Spectrum Pharmaceuticals"),
    ("WFC", "Wells Fargo"),
)
# Ticker symbols in the order listed above.
tickers = [symbol for symbol, _company in _ticker_companies]

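Read data: load each zipped GKG file and keep only insider-trading-relevant articles about the target companies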

In [5]:
# Build one combined frame from every zipped GKG file in `gkg_dir`, keeping only
# rows that survive both the insider-trading filter and the company filter.
gkg_dir = "media_data/input/raw"
all_data = []

# os.listdir() returns names in arbitrary order; sorting makes the row order of
# the combined frame reproducible from run to run.
for fname in sorted(os.listdir(gkg_dir)):
    if not fname.endswith(".zip"):
        continue
    zip_path = os.path.join(gkg_dir, fname)
    print(f"Processing {fname}...")

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for file_in_zip in zip_ref.namelist():
            with zip_ref.open(file_in_zip) as f:
                # Load only relevant columns
                df = md.load_gkg_for_insider_trading(f)
                if df is None or df.empty:
                    continue
                # Parse fields
                df = md.parse_gkg_fields(df)
                # Filter for insider trading relevance
                df = md.filter_for_insider_trading(df)
                if df.empty:
                    continue
                # Filter for target companies
                df = md.filter_by_companies(df, tickers)
                if df.empty:
                    continue
                all_data.append(df)

# pd.concat() rejects an empty list with an opaque "No objects to concatenate";
# fail with a message that points at the likely cause instead.
if not all_data:
    raise ValueError(
        f"No matching articles found in any .zip file under {gkg_dir!r}; "
        "check the input directory and the ticker list."
    )
combined_df = pd.concat(all_data, ignore_index=True)

Processing 20150219144500.gkg.csv.zip...

Filtering for insider trading relevance...
  ✓ Found 937 potentially relevant articles
  ✓ Filtered from 3,618 total articles (25.9%)
  AMSC   (American Superconductor       ):     0 articles
  NP     (Neenah                        ):     0 articles
  EVR    (Evercore                      ):     1 articles
  GOOGL  (Google                        ):    23 articles
  GTXI   (GTx                           ):     0 articles
  HLF    (Herbalife                     ):     0 articles
  MDRX   (Veradigm                      ):     0 articles
  ORCL   (Oracle                        ):     0 articles
  SPPI   (Spectrum Pharmaceuticals      ):     0 articles
  WFC    (Wells Fargo                   ):     0 articles
Processing 20150221201500.gkg.csv.zip...

Filtering for insider trading relevance...
  ✓ Found 284 potentially relevant articles
  ✓ Filtered from 1,379 total articles (20.6%)
  AMSC   (American Superconductor       ):     0 articles
  NP     (

Inspect the columns and preview the first rows

In [8]:
# Only the last expression of a cell is rendered automatically, so the column
# names are printed explicitly; otherwise they would be silently discarded.
print(combined_df.columns.tolist())
# Rich preview of the first rows (rendered as the cell's output).
combined_df.head()

Unnamed: 0,GKGRECORDID,DATE,SourceCommonName,DocumentIdentifier,Themes,Persons,Organizations,V2Tone,GCAM,Quotations,...,Month,Day,Hour,OrgCount,PrimaryOrg,PersonCount,PrimaryPerson,HasFinancialTheme,Ticker,CompanyName
0,20150219144500-213,20150219144500,digitaljournal.com,http://www.digitaljournal.com/pr/2473413,TAX_FNCACT;TAX_FNCACT_MANAGERS;TAX_FNCACT_FUND...,brian ruby;lawrence calcano;michael g ricciard...,evercore private funds group;park hill group;h...,"3.47129506008011,4.40587449933244,0.9345794392...","wc:664,c1.2:3,c12.1:19,c12.10:83,c12.11:2,c12....",,...,2.0,19.0,14.0,8,evercore private funds group,5,brian ruby,True,EVR,Evercore
1,20150219144500-198,20150219144500,businessinsider.com,http://www.businessinsider.com/silicon-valley-...,MANMADE_DISASTER_IMPLIED;,,google;bmw;porsche;twitter;facebook,"0.454840805717999,3.57374918778428,3.118908382...","wc:1318,c1.1:3,c1.2:12,c1.3:3,c12.1:104,c12.10...",,...,2.0,19.0,14.0,5,google,0,,False,GOOGL,Google
2,20150219144500-223,20150219144500,yahoo.com,https://ca.finance.yahoo.com/news/u2-inspired-...,MEDIA_SOCIAL;TAX_WORLDFISH;TAX_WORLDFISH_TOP;P...,flickr peter neill;jeff bezos;rony abovitz,google;facebook;microsoft,"1.37741046831956,3.58126721763085,2.2038567493...","wc:332,c1.4:1,c12.1:27,c12.10:26,c12.12:6,c12....",,...,2.0,19.0,14.0,3,google,3,flickr peter neill,True,GOOGL,Google
3,20150219144500-575,20150219144500,hartlepoolmail.co.uk,http://www.hartlepoolmail.co.uk/news/business/...,SOC_INNOVATION;TAX_FNCACT;TAX_FNCACT_ADVERTISE...,henry faure walker;scott gill;ashley highfield...,google;mediaforce group;facebook,"5.05464480874317,5.60109289617486,0.5464480874...","wc:668,c1.1:2,c1.2:6,c1.3:8,c12.1:51,c12.10:61...",,...,2.0,19.0,14.0,3,google,4,henry faure walker,True,GOOGL,Google
4,20150219144500-592,20150219144500,searchenginewatch.com,http://searchenginewatch.com/sew/how-to/239599...,TAX_FNCACT;TAX_FNCACT_DESIGNER;ECON_WORLDCURRE...,dave davies,google,"0.859453993933266,2.42669362992922,1.567239635...","wc:1843,c1.1:1,c1.2:3,c1.3:2,c1.4:1,c12.1:168,...",197|182||ve always found Web design to be a fu...,...,2.0,19.0,14.0,1,google,1,dave davies,True,GOOGL,Google


Filter

Analyze

In [9]:
# Run the coverage analysis and build the per-company time series.
# NOTE(review): `combined_df` is rebound to the analysis result, so re-running
# this cell feeds the previous result back in — confirm
# md.analyze_company_coverage is safe to apply twice.
combined_df = md.analyze_company_coverage(combined_df)
ts = md.create_company_timeseries(combined_df)

print("\n" + "="*60)
print("✓ FILTERING COMPLETE!")
print("="*60)
# Report only frames that exist under the names shown. The unfiltered GKG data
# is never held in memory: `df` is just the last per-file frame left over from
# the loading loop (possibly None), so it is not reported as "full" data.
print("\nDataFrames available:")
print(f"  - combined_df: Filtered company data ({len(combined_df):,} rows)")
print(f"  - ts: Time series by company-date ({len(ts):,} rows)")


      Date Ticker CompanyName       Tone  Polarity      SourceCommonName
2021-02-05  GOOGL      Google -14.678899 16.513761            law360.com
2025-04-11  GOOGL      Google   3.991131 16.407982 securityinfowatch.com
2024-06-05  GOOGL      Google   4.924242 16.287879          albawaba.com
2019-04-09  GOOGL      Google -10.571429 16.285714     dailytrust.com.ng
2016-10-18    WFC Wells Fargo -10.128617 15.916399      counterpunch.org
2017-09-26    WFC Wells Fargo -10.968661 15.242165   wataugademocrat.com
2016-10-01    WFC Wells Fargo -13.043478 15.217391            iheart.com
2016-08-29  GOOGL      Google  -1.680672 15.126050   contacto-latino.com
2018-08-30  GOOGL      Google  -9.200969 15.012107            theiet.org
2018-09-12  GOOGL      Google  -8.000000 15.000000            eadt.co.uk

TIME SERIES DATA
✓ Created time series with 2794 date-company pairs

Sample:
         Date Ticker  ArticleCount      Tone  Polarity  WordCount
0  2015-02-19    EVR             1  3.471295  5.34045

Save for future use

In [10]:
# Persist both result tables as CSV (row index omitted) so later sessions can
# reload them without repeating the extraction above.
csv_outputs = (
    (combined_df, 'media_data/gkg_filtered_companies.csv'),
    (ts, 'media_data/gkg_company_timeseries.csv'),
)
for frame, csv_path in csv_outputs:
    frame.to_csv(csv_path, index=False)
print("\n✓ Saved filtered data to CSV files")


✓ Saved filtered data to CSV files
