# Why the selected oil articles differ from before

In [8]:
import numpy as np
import pandas as pd
from collections import Counter
from tqdm import tqdm

'''
old : topic allocation results from 2020-11-16
new : oil info from 2023
raw : raw news from 2023, including articles not related to oil
The old and new lengths are mismatched from July 2007 onward.
'''

# 200801 data: previous selection, current selection, and the unfiltered news.
old, new, raw = (
    pd.read_csv(path) for path in ('old.csv', 'new.csv', 'new_raw.csv')
)
# Sanity check: for this period the two selections differ in size.
assert len(old) != len(new)

# 200701 data: same three tables for the earlier period.
old07, new07, raw07 = (
    pd.read_csv(path) for path in ('old07.csv', 'new07.csv', 'new07_raw.csv')
)
# Sanity check: for this period the two selections have the same size.
assert len(old07) == len(new07)

# Energy tag list, each code prefixed with 'N2:' so it can be compared with
# the prefixed tags used elsewhere in this notebook.
tag = ['N2:' + code for code in pd.read_csv('energytag.csv')['energytag'].values]

# Tag description tables, indexed by their first column.
n2k = pd.read_excel('N2K_Template_2015-06-08.xls', index_col=0)
newcodes = pd.read_excel('4_NewsCodes_20170927.xls', index_col=0)

In [9]:
def print_def(tagName, df):
    """Print a tag's description and, when present, its topic definition.

    Output format is ``* <code> <description>`` followed, when a definition
    exists, by a second line ``---- <definition>``.

    Args:
        tagName: Tag string whose first three characters are a prefix (the
            callers in this file pass ``'N2:...'``); the prefix is sliced
            off before the lookup.
        df: DataFrame indexed by the bare tag code, with ``'Description'``
            and ``'Topic Definition'`` columns.

    Raises:
        KeyError: If the bare tag code is not in ``df``'s index, or a
            required column is absent. Callers rely on this to fall back to
            another lookup table.
    """
    code = tagName[3:]
    row = df.loc[code]
    description = row['Description']
    definition = row['Topic Definition']

    # Test for missing cells explicitly instead of catching TypeError from
    # string concatenation: pd.isna handles both NaN and None, and does not
    # depend on an assert that is stripped under ``python -O``.
    line = '* ' + code
    if not pd.isna(description):
        line += ' ' + str(description)
    if not pd.isna(definition):
        line += '\n---- ' + str(definition)
    print(line)
    
def get_topic_def(tags):
    """Print the definition of every tag in ``tags``.

    Each tag is looked up in ``newcodes`` first and in ``n2k`` second; a
    ``KeyError`` from ``print_def`` means "try the next table". A tag found
    in neither table is printed with a ``NOT FOUND`` marker.

    Args:
        tags: Iterable of tag strings carrying a three-character prefix.
    """
    for name in tags:
        # Preferred source: the newer code table.
        try:
            print_def(name, newcodes)
        except KeyError:
            pass
        else:
            continue

        # Fallback source: the older template.
        try:
            print_def(name, n2k)
        except KeyError:
            pass
        else:
            continue

        print('* '+name[3:]+'\n---- NOT FOUND')

In [10]:
def get_diff_tags(old, new, squeeze=True, rawNews=None, excludeTags=None):
    """Collect the tags of articles whose headline is in ``old`` but not ``new``.

    Args:
        old: DataFrame with a ``'headline'`` column (the earlier selection).
        new: DataFrame with a ``'headline'`` column (the later selection).
        squeeze: If True, return one flat list containing only tags that
            start with ``'N2:'``. If False, return one unfiltered list of
            tags per matching raw article.
        rawNews: DataFrame with ``'headline'`` and ``'subject'`` columns in
            which the differing headlines are looked up. Defaults to the
            module-level ``raw``.
        excludeTags: Iterable of tags removed from every article's tag set.
            Defaults to the module-level ``tag`` list.

    Returns:
        A list as described under ``squeeze``. When no headline differs,
        prints ``'Matching headlines'`` and returns an empty list, so callers
        can always iterate the result.
    """
    if rawNews is None:
        rawNews = raw
    if excludeTags is None:
        excludeTags = tag

    diffHeadlines = set(old['headline']).difference(new['headline'])
    if not diffHeadlines:
        print('Matching headlines')
        return []

    # Select with a boolean mask through .loc: feeding index labels to .iloc
    # is only correct when the frame has the default RangeIndex.
    diffSubjects = rawNews.loc[rawNews['headline'].isin(diffHeadlines), 'subject']

    # NOTE(security): eval() executes whatever is stored in the 'subject'
    # column. Presumably each cell is the repr of a list of tag strings; if
    # the CSV is not fully trusted, switch to ast.literal_eval.
    tagLists = [
        list(set(eval(subject)).difference(excludeTags))
        for subject in diffSubjects
    ]
    if not squeeze:
        return tagLists
    return [item for tags in tagLists for item in tags if item.startswith('N2:')]

## 200701

In [11]:
# 200701: list the most common N2 tags on the newly selected articles and
# print the definitions of those that are also energy tags.
print(f'Old length: {len(old07)}; New length: {len(new07)}')
# Index labels of the raw 2007 rows whose Id appears in the new selection.
idx = raw07[raw07['Id'].isin(new07['Id'])].index
# idx holds labels, so select with .loc; .iloc would read them as positions
# and only agrees when raw07 has the default RangeIndex.
subs = raw07['subject'].loc[idx]
# NOTE(security): eval() executes whatever is stored in the 'subject' column.
# Presumably each cell is the repr of a list of tag strings; if the CSV is
# not fully trusted, switch to ast.literal_eval.
oldTags = [item for sublist in subs for item in eval(sublist) if item.startswith('N2:')]
display(Counter(oldTags).most_common(15))
get_topic_def(set(oldTags).intersection(tag))

Old length: 6798; New length: 6798


[('N2:LEN', 6798),
 ('N2:RTRS', 6798),
 ('N2:CRU', 4132),
 ('N2:PROD', 3311),
 ('N2:US', 3133),
 ('N2:ENR', 2799),
 ('N2:ASIA', 2184),
 ('N2:EMRG', 2136),
 ('N2:NGS', 2089),
 ('N2:EUROPE', 1736),
 ('N2:ELG', 1452),
 ('N2:WEU', 1090),
 ('N2:DRV', 905),
 ('N2:NEWS', 888),
 ('N2:MEAST', 864)]

* MOG Gasoline
---- Production, transport, processing, trading / broking, demand for and use of gasoline / petrol.
* CO2 Carbon / Emissions Markets
---- Trading in permits to emit carbon dioxide and other greenhouse gases. Policy, allowances, regulations, monitoring and enforcement governing greenhouse gas emissions as well as related technology in energy intensive industries, including carbon capture and storage.
* CRU Crude Oil
---- Exploration, extraction, transport, processing, trading / broking, demand for and use of crude oil and gas condensate. Organisations associated with crude oil.
* PROD Refined Products
---- Production, transport, processing, trading / broking, demand for and use of refined oil products including fuel oil, naphtha, gasoline, petroleum, gas oil, jet kerosene and petrochemicals. News on refineries and on policy affecting petroleum products, such as new specifications for sulphur content.
* BIOF Biofuels
---- Production, transport, processing, trading / brokin

## 200801

In [12]:
# 200801: find tags on the articles dropped from the new selection that
# never occur anywhere in the raw 2007 news.
print(f'Old length: {len(old)}; New length: {len(new)}')
# get_diff_tags may return None when the headlines match; fall back to an
# empty list so the steps below can still iterate.
diffTags = get_diff_tags(old, new) or []

subs = raw07['subject']
# NOTE(security): eval() executes whatever is stored in the 'subject' column.
# Presumably each cell is the repr of a list of tag strings; if the CSV is
# not fully trusted, switch to ast.literal_eval.
oldTagsRaw = [item for sublist in subs for item in eval(sublist) if item.startswith('N2:')]
# Build the set once: membership tests against the list were a linear scan
# per tag, which made this filter quadratic.
oldTagSet = set(oldTagsRaw)
# The loop variable is not called `tag`, to avoid confusion with the
# module-level energy tag list.
subsetTags = [diffTag for diffTag in tqdm(diffTags) if diffTag not in oldTagSet]
display(Counter(subsetTags).most_common(15))

get_topic_def(set(diffTags).difference(oldTagSet))

Old length: 8038; New length: 7628


100%|████████████████████████████████████| 10896/10896 [01:15<00:00, 143.98it/s]


[('N2:RFOD', 432),
 ('N2:NCYC', 431),
 ('N2:FDRT', 422),
 ('N2:CYCS', 266),
 ('N2:SHOP', 210),
 ('N2:FINS', 145),
 ('N2:DEPT', 142),
 ('N2:BSVC', 127),
 ('N2:INDS', 124),
 ('N2:CEEU', 91),
 ('N2:RETS', 86),
 ('N2:SEEU', 81),
 ('N2:CYCP', 76),
 ('N2:INVS', 69),
 ('N2:RETA', 66)]

* PHON Integrated Telecommunications Services (TRBC)
---- Providers of telecommunication services other than wireless only. Includes integrated providers of both fixed-line and wireless services offering voice, data and high-density data transmission services, as well as Voice over Internet Protocol (VoIP) services.
* OFCE Office Equipment (TRBC)
---- Manufacturers of office technology equipment such as photocopiers, facsimile, calculators, cash registers, bar code readers, electronic tags, coin and currency counting devices, as well as scientific and analytical equipment and precision instruments.
* MTAL Metals & Mining (TRBC)
---- Miners and processors of precious metals and minerals, steel, aluminum and specialty metals and minerals.
* AERO Aerospace & Defense (TRBC)
---- Producers of commercial, military and private aircraft and spacecraft. Includes manufacturers of military equipment, vehicles, explosive ordnance, guidance systems, artillery, ammunition and other related weaponry.