In [2]:
import pandas as pd
from datetime import datetime
from transformers import AutoTokenizer, set_seed, pipeline
from datasets import Dataset
from tqdm.auto import tqdm
from collections import Counter, defaultdict
import networkx as nx
from transformers.pipelines.pt_utils import KeyDataset
from networkx.algorithms.dag import descendants, ancestors
import sys
import os
sys.path.append(os.path.abspath('../../modules'))
from experiment_1.RoBERTaEntity import RoBERTaEntity

In [3]:
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
tokenizer.add_special_tokens({"additional_special_tokens": ["__NE_FROM__", "__NE_TO__", "__NE_OTHER__"]})

3

In [4]:
df = pd.read_json("mining_processed.json", convert_dates=["date"])
df = df[df['date'] >= datetime(2021, 1, 1)]

In [6]:
ds = Dataset.from_pandas(df)
def tokenize(examples):
    tokens = tokenizer(examples['masked_sentence'], truncation=False, padding='max_length')
    if len(tokens["input_ids"]) <= 512:
        return examples
    else:
        print(examples['masked_sentence'])
ds = ds.filter(tokenize, batched=False)
ds

Filter:   0%|          | 0/85971 [00:00<?, ? examples/s]

Dataset({
    features: ['title', 'date', 'link', 'sentence', 'from', 'to', 'masked_sentence'],
    num_rows: 85971
})

In [8]:
set_seed(42)
model = RoBERTaEntity.from_pretrained("../../CaseStudyModel", local_files_only=True, num_labels=5) #, id2label=id2label, label2id=label2id
model.resize_token_embeddings(len(tokenizer))
model = model.to("cuda")
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
The model 'RoBERTaEntity' is not supported for text-classification. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequenceClassification', 'ErnieForSequenceClassification', 'ErnieMForSequenceClassification', 'EsmForSequenceClassification', 'FalconForSequenceClassification', 'FlaubertFor

In [9]:
KeyDataset(ds, "masked_sentence")

<transformers.pipelines.pt_utils.KeyDataset at 0x259e98cb790>

In [10]:
labels = []
for out in tqdm(pipe(KeyDataset(ds, "masked_sentence"), batch_size=512), total=len(ds)):
    if out['score'] >= 0.8:
        labels.append(out['label'])
    else:
        labels.append(None)
    #labels.extend([x['label'] for x in out])

  0%|          | 0/85971 [00:00<?, ?it/s]

In [11]:
df = ds.to_pandas()

In [12]:
df['result'] = labels

In [13]:
scm_result = defaultdict(lambda: [])
for i, row in df.iterrows():
    if row['from'] == "Tesla Inc":
        row['from'] = "Tesla"
    if row['to'] == "Tesla Inc":
        row['to'] = "Tesla"
    if 'Contemporary Amperex Technology' in row['from']:
        row['from'] = "CATL"
    if 'Contemporary Amperex Technology' in row['to']:
        row['to'] = "CATL"
    if row['from'] == row['to']:
        continue
    if row['result'] is None:
        continue
    if row['result'] not in ['B_supplies_A', 'A_supplies_B']:
       continue
    swap = row['from'] > row['to']
    if swap:
        if row['result'] == 'B_supplies_A':
            scm_result[(row['to'], row['from'])].append('A_supplies_B')
        else:
            scm_result[(row['to'], row['from'])].append('B_supplies_A')
    else:
        if row['result'] == 'B_supplies_A':
            scm_result[(row['from'], row['to'])].append('B_supplies_A')
        else:
            scm_result[(row['from'], row['to'])].append('A_supplies_B')

In [14]:
supplier_buyer_result = []
for key, value in scm_result.items():
    if 'A_supplies_B' in set(value):
        supplier_buyer_result.append(key)
    elif 'B_supplies_A' in set(value):
        supplier_buyer_result.append((key[1], key[0]))

In [15]:
G = nx.DiGraph(supplier_buyer_result)
largest_strongly_connected_component = max(nx.strongly_connected_components(G), key=len)
largest_strong_subgraph = G.subgraph(largest_strongly_connected_component)
largest_weakly_connected_component = max(nx.weakly_connected_components(G), key=len)
largest_weak_subgraph = G.subgraph(largest_weakly_connected_component)
size_strong = len(largest_strong_subgraph.nodes())
size_weak = len(largest_weak_subgraph.nodes())
graphml_file_path = 'RoBERTA_mining.graphml'
nx.write_graphml(largest_weak_subgraph, graphml_file_path)
selected_node = "Tesla"
ancestor_nodes = list(ancestors(G, selected_node))
descendant_nodes = list(descendants(G, selected_node))
mini_graph = G.subgraph(ancestor_nodes + descendant_nodes + [selected_node])
nx.write_graphml(mini_graph, "RoBERTA_tesla_mining.graphml")