#### Dataset Creation

In [1]:
from utils import set_seed
from data_loading.models_dataset import ArchiMateDataset, EcoreDataset, OntoUMLDataset

set_seed(42)

# Keyword arguments shared by all three dataset constructors.
config_params = dict(
    # reload=True,
    min_enr=-1,
    min_edges=10,
    # language='en',
)

# Keep every loaded dataset under its own name instead of rebinding `dataset`
# three times (which silently discarded the first two).
archimate_dataset = ArchiMateDataset('eamodelset', **config_params)
ecore_dataset = EcoreDataset('modelset', **config_params)
ontouml_dataset = OntoUMLDataset('ontouml', **config_params)

# Dataset consumed by the following cells; this is the same final binding as before.
dataset = ontouml_dataset

Loading eamodelset from pickle
Loaded eamodelset with 558 graphs
Loaded eamodelset with 558 graphs
Graphs: 558
Loading modelset from pickle
Loaded modelset with 2539 graphs
Loaded modelset with 2539 graphs
Loading ontouml from pickle
Loaded ontouml with 175 graphs
Loaded ontouml with 175 graphs
Graphs: 175


In [2]:
# Display the `summary` attribute of the currently bound dataset.
dataset.summary

{'num_graphs': 175,
 'num_edges': 20220,
 'num_nodes': 15890,
 'average_nodes': '90.80',
 'average_edges': '115.54',
 'average_n2e_ratio': '0.83'}

In [3]:
from data_loading.graph_dataset import GraphNodeDataset
import utils

utils.set_seed(42)

# Options forwarded to GraphNodeDataset; commented keys are optional toggles.
graph_data_params = dict(
    reload=True,
    test_ratio=0.2,
    # add_negative_train_samples=True,
    # neg_sampling_ratio=1,
    distance=1,
    random_embed_dim=128,
    use_attributes=False,
    use_edge_label=True,
    use_edge_types=True,
    use_node_types=True,
    node_cls_label='stereotype',
    # use_special_tokens=True,
    # task_type='graph_cls',
    # use_embeddings=True,
    # embed_model_name='bert-base-cased',
    # ckpt='results/eamodelset/lp/10_att_0_nt_0/checkpoint-177600',
    limit=-1,
)

print("Loading graph dataset")
graph_node_dataset = GraphNodeDataset(dataset, **graph_data_params)
print("Loaded graph dataset")

# Reuse the values from graph_data_params so the text extraction cannot drift
# from the settings the dataset was built with.
texts = graph_node_dataset.get_node_classification_texts(
    distance=graph_data_params['distance'],
    label=graph_data_params['node_cls_label'],
)

Loading graph dataset
Number of duplicate graphs:  1


Creating node graphs:   0%|          | 0/175 [00:00<?, ?it/s]

Embedding graphs:   0%|          | 0/174 [00:00<?, ?it/s]

Re-Loading graphs:   0%|          | 0/174 [00:00<?, ?it/s]

['' 'abstract' 'abstract individual' 'activity' 'agent' 'atomic event'
 'being present at ' 'belief' 'bringsabout' 'category' 'causal'
 'characterization' 'collective' 'commitment' 'comparative'
 'complex event' 'complexaction' 'complexevent' 'componentof' 'constitute'
 'cr' 'crd' 'creation' 'cru' 'crud' 'datatype' 'derivation' 'disposition'
 'endurant' 'enumeration' 'event' 'externaldependence' 'formal' 'goal'
 'has part' 'historicaldependence' 'historicalrole' 'historicalrolemixin'
 'humanagent' 'induces' 'instantiation' 'institutionalagent' 'intention'
 'internal' 'kind' 'manifestation' 'material' 'material relation'
 'mediation' 'memberof' 'mentalmode' 'mixin' 'mode' 'natural'
 'nonperceivablequality' 'normative description' 'object' 'organization'
 'part-of' 'participation' 'participational' 'partof' 'phase' 'phasemixin'
 'pos-state' 'post state' 'pre-state' 'presentat' 'processual role'
 'proposition' 'quale' 'quality' 'quality dimension' 'quality structure'
 'quantity' 'r' 'rela

Validating node classes:   0%|          | 0/174 [00:00<?, ?it/s]

Train classes: {0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 44, 45, 46, 47, 48, 49, 51, 52, 53, 54, 55, 56, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96}
Test classes: {0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 39, 40, 42, 43, 44, 45, 46, 48, 49, 50, 51, 52, 54, 55, 57, 59, 60, 61, 62, 63, 66, 67, 68, 69, 71, 73, 74, 75, 76, 77, 78, 79, 82, 83, 84, 85, 86, 87, 88, 90, 91, 92, 93}
Number of classes in training set: 86
Number of classes in test set: 74
Edge Classes:  [None]
Graphs saved
Node label: stereotype
Train Node classes: {59: 183, 86: 1209, 77: 913, 30: 377, 71: 214, 11: 397, 22: 47, 60: 16, 0: 2635, 44: 1144, 76: 556, 62: 255, 48: 1092, 9: 414, 18: 394, 32: 294, 78: 316, 52: 314, 88: 15, 12: 121, 46: 465, 31: 63,

Getting node classification data:   0%|          | 0/174 [00:00<?, ?it/s]

Tokenizing data
['Source Rock  stereotype: kind Lithologic Unit qua shale', 'Oil  stereotype: quantity Hidrocarbon\nOil\nOil\nOil', 'stereotype: event Acumulation', 'Migration  stereotype: pos-state', 'Hidrocarbon', 'Generation  stereotype: pos-state', 'Trap', 'Oil + Porous rock  stereotype: pre-state', 'Shale  stereotype: quantity Siliciclastic Rock', 'Structure']
['Seal  stereotype: kind Lithologic Unit qua shale\nSeal', 'stereotype: quantity Siliciclastic Rock', 'Sandstone  stereotype: quantity Siliciclastic Rock', 'stereotype: quantity Shale', 'Boundary Type', 'Value', 'stereotype: quantity Siliciclastic Rock', 'Lithologic Unit qua sandstone  stereotype: kind Lithological unit\nLithologic Unit qua sandstone\nLithologic Unit qua sandstone\nLithologic Unit qua sandstone  stereotype: constitute', 'Business Need  stereotype: material is Demanded By', 'End Product  Value Object\nEnd Product  has Objective Value\nEnd Product  satisfies\nEnd Product  has Objective Value\nEnd Product  stoc

In [None]:
from tokenization.utils import get_tokenizer


tokenizer = get_tokenizer('bert-base-uncased')

# Only the first fold is inspected. next(iter(...)) raises StopIteration when no fold
# is produced, instead of silently leaving `data` undefined or stale from an earlier cell.
data = next(iter(graph_node_dataset.get_kfold_lm_graph_classification_data(tokenizer)))

In [1]:
def get_data(fname):
    """Parse a file of ``Label: <label>, Text: <text>`` lines.

    Empty lines are skipped. Each line is split on the FIRST ``", Text: "`` only,
    so a text that itself contains that separator is kept intact.

    Args:
        fname: Path of the file to read.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists of strings.

    Raises:
        IndexError: If a non-empty line lacks the ``", Text: "`` separator or the
            ``"Label: "`` prefix; the offending line is printed first.
    """
    separator = ", Text: "
    label_prefix = "Label: "
    texts, labels = [], []
    with open(fname) as f:
        for line in f.read().split('\n'):
            if not line:
                continue
            try:
                # maxsplit=1: everything after the first separator is the text.
                head, text = line.split(separator, 1)[0], line.split(separator, 1)[1]
                label = head.split(label_prefix, 1)[1]
            except IndexError:
                # Show the malformed line before propagating the error.
                print(line)
                raise
            texts.append(text)
            labels.append(label)
    return texts, labels

# Parse both splits from the "Label: ..., Text: ..." files (paths are relative to the working directory).
X_train, y_train = get_data('train.txt')
X_test, y_test = get_data('test.txt')

In [2]:
# Concatenate both splits (train first, then test) into a single corpus and show the sizes.
X, y = X_train + X_test, y_train + y_test
len(X), len(y)

(1606, 1606)

In [5]:
from collections import Counter


# Split sizes, followed by the per-label counts of the train and test splits.
print(len(X_train), len(X_test), len(y_train), len(y_test))
print(Counter(y_train))
print(Counter(y_test))

1284 322 1284 322
Counter({'statemachine': 93, 'gpl': 84, 'class-diagram': 74, 'modelling': 64, 'simple-pl': 59, 'iot': 51, 'workflow': 48, 'relational': 47, 'transformation': 45, 'petrinet': 44, 'metamodelling': 37, 'robots': 34, 'webapp': 33, 'features': 32, 'education': 29, 'library': 24, 'constraints': 24, 'graphicaleditor': 23, 'visualization': 21, 'components': 20, 'expressions': 19, 'types': 18, 'entities': 17, 'services': 16, 'trace': 14, 'company': 14, 'mvc': 14, 'publication': 13, 'enterprisearchitecture': 13, 'architecture': 12, 'forms': 12, 'calculator': 12, 'drones': 12, 'metrics': 11, 'testing': 11, 'hotels': 11, 'purchases': 11, 'app': 11, 'relationships': 10, 'embedded': 10, 'modelmanagement': 9, 'families': 9, 'cloud': 9, 'projectplanning': 9, 'html': 8, 'graph': 8, 'automata': 7, 'rental': 7, 'interaction': 7, 'gui': 6, 'textprocessing': 5, 'termrewriting': 5, 'softwarerepository': 5, 'configuration': 5, 'requirements': 5, 'tournament': 5, 'activities': 5, 'railway': 

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import balanced_accuracy_score, classification_report


# TF-IDF features fed into a linear-kernel SVM.
pipeline = make_pipeline(TfidfVectorizer(), SVC(kernel='linear'), verbose=True)

print("Fitting SVM classifier")
# Train the model
pipeline.fit(X_train, y_train)

print("Predicting")
# Predict on the test set
y_pred = pipeline.predict(X_test)

# Print classification report.
# zero_division=0 keeps the reported 0.00 for classes that are never predicted but
# suppresses the repeated UndefinedMetricWarning noise in the cell output.
print(classification_report(y_test, y_pred, zero_division=0))
balanced_accuracy_score(y_test, y_pred)

Fitting SVM classifier
[Pipeline] ... (step 1 of 2) Processing tfidfvectorizer, total=   0.4s
[Pipeline] ............... (step 2 of 2) Processing svc, total=   1.2s
Predicting
                        precision    recall  f1-score   support

            activities       0.00      0.00      0.00         4
                   app       0.50      0.50      0.50         2
          architecture       1.00      0.50      0.67         2
              automata       1.00      0.25      0.40         4
          bibliography       1.00      1.00      1.00         1
            calculator       1.00      0.50      0.67         2
         class-diagram       0.88      1.00      0.93        21
               company       1.00      1.00      1.00         3
            components       0.75      1.00      0.86         3
           constraints       0.75      1.00      0.86         6
                drones       0.50      1.00      0.67         1
             education       1.00      1.00      1.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0.7139958968906337

In [9]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from transformers import (
    Trainer, 
    TrainingArguments
)
from transformers import (
    AutoModelForSequenceClassification, 
    AutoTokenizer
)
from data_loading.encoding import EncodingDataset
from settings import device
from sklearn.preprocessing import LabelEncoder
from trainers.metrics import compute_metrics


class BertTrainer:
    """Fine-tune a Hugging Face sequence-classification model on labelled texts.

    Args:
        model_name: Name or path passed to ``AutoModelForSequenceClassification``.
        ckpt: Optional checkpoint; when given, the tokenizer is loaded from it
            instead of from ``model_name``.
        max_length: Maximum sequence length forwarded to ``EncodingDataset``.
    """

    def __init__(
        self,
        model_name,
        ckpt=None,
        max_length=512
    ):
        self.model_name = model_name
        self.ckpt = ckpt
        self.max_length = max_length


    def train(
        self,
        texts,
        labels,
        test_ratio=0.2,
        kfold=False,
        num_train_epochs=15,
        train_batch_size=2,
        eval_batch_size=128,
        weight_decay=0.01,
        logging_steps=50,
        eval_steps=50,
        save_steps=50,
        learning_rate=5e-5,
        warmup_steps=500,
        output_dir='./results',
        logs_dir='./logs',
        seed=42
    ):
        """Train and evaluate on a single split or on stratified folds.

        Labels are integer-encoded with ``LabelEncoder``. With ``kfold`` disabled a
        single ``train_test_split`` (``test_size=test_ratio``) is used; otherwise
        ``round(1 / test_ratio)`` stratified folds (at least 2) are trained one
        after the other. Evaluation results of every run are printed.

        Args:
            texts: Input texts.
            labels: One label per text.
            test_ratio: Fraction of the data held out for evaluation; also
                determines the number of folds in k-fold mode.
            kfold: Enables stratified k-fold cross-validation when ``> 0``.
            seed: Random state of the split / fold shuffling.
            num_train_epochs, train_batch_size, eval_batch_size, weight_decay,
            logging_steps, eval_steps, save_steps, learning_rate, warmup_steps,
            output_dir, logs_dir: Forwarded to ``TrainingArguments``.
        """
        # Local import so the class does not rely on `Counter` leaking in from
        # another notebook cell.
        from collections import Counter

        def train_fold(X_train, y_train, X_test, y_test):
            """Fine-tune a fresh model on one train/test partition and print its metrics."""
            print(f'Train: {len(X_train)}, Test: {len(X_test)}')
            print("Class distribution in train: ", Counter(y_train))
            print("Class distribution in test: ", Counter(y_test))

            tokenizer = AutoTokenizer.from_pretrained(self.ckpt or self.model_name)
            # NOTE(review): the model weights always come from `model_name`, even when
            # `ckpt` is set (only the tokenizer uses it) — confirm this is intended.
            model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=num_classes)
            model.to(device)

            train_ds = EncodingDataset(tokenizer, X_train, y_train, max_length=self.max_length)
            test_ds = EncodingDataset(tokenizer, X_test, y_test, max_length=self.max_length)

            training_args = TrainingArguments(
                output_dir=output_dir,
                num_train_epochs=num_train_epochs,
                eval_strategy="steps",
                per_device_train_batch_size=train_batch_size,
                per_device_eval_batch_size=eval_batch_size,
                warmup_steps=warmup_steps,
                weight_decay=weight_decay,
                learning_rate=learning_rate,
                logging_dir=logs_dir,
                logging_steps=logging_steps,
                eval_steps=eval_steps,
                save_steps=save_steps,
                save_total_limit=2,
                load_best_model_at_end=True,
                fp16=True
            )

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_ds,
                eval_dataset=test_ds,
                compute_metrics=compute_metrics
            )

            trainer.train()
            results = trainer.evaluate()
            print(results)


        y = LabelEncoder().fit_transform(labels)
        num_classes = len(set(y))
        if kfold > 0:
            # `test_ratio` is an argument of this method (there is no self.test_ratio).
            n_splits = max(2, round(1 / test_ratio))
            splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
            # Stratify on the encoded labels; X is only a placeholder for the sample count.
            placeholder = np.zeros(len(y))
            for fold, (train_idx, test_idx) in enumerate(splitter.split(placeholder, y)):
                X_train, y_train = [texts[j] for j in train_idx], [y[j] for j in train_idx]
                X_test, y_test = [texts[j] for j in test_idx], [y[j] for j in test_idx]
                print("Fold number: ", fold+1)
                train_fold(X_train, y_train, X_test, y_test)
        else:
            X_train, X_test, y_train, y_test = train_test_split(texts, y, test_size=test_ratio, random_state=seed)
            train_fold(X_train, y_train, X_test, y_test)

In [10]:
# Fine-tune bert-base-uncased on the combined corpus; `train` performs its own split using test_ratio.
bert_trainer = BertTrainer('bert-base-uncased')
bert_trainer.train(X, y, test_ratio=0.2)

Train: 1284, Test: 322
Class distribution in train:  Counter({23: 92, 51: 89, 6: 75, 34: 66, 49: 62, 64: 48, 38: 46, 58: 45, 43: 43, 30: 41, 13: 39, 32: 36, 25: 31, 63: 30, 21: 26, 47: 25, 60: 24, 17: 23, 11: 22, 31: 22, 18: 20, 62: 19, 16: 17, 8: 16, 36: 15, 57: 14, 9: 14, 48: 13, 33: 13, 40: 13, 5: 12, 53: 11, 27: 11, 44: 11, 3: 10, 19: 10, 35: 10, 41: 10, 2: 9, 22: 9, 7: 9, 1: 9, 39: 9, 24: 9, 28: 8, 29: 8, 0: 8, 12: 8, 45: 7, 14: 7, 55: 6, 26: 6, 46: 6, 42: 6, 56: 5, 54: 5, 10: 4, 20: 4, 61: 4, 52: 4, 4: 3, 50: 3, 37: 2, 59: 1, 15: 1})
Class distribution in test:  Counter({51: 26, 6: 20, 43: 17, 30: 16, 49: 15, 47: 14, 64: 12, 58: 12, 63: 11, 34: 11, 32: 10, 21: 10, 38: 10, 9: 9, 11: 8, 29: 6, 25: 6, 13: 6, 23: 6, 44: 5, 57: 5, 12: 5, 19: 5, 2: 5, 62: 5, 48: 4, 16: 4, 14: 4, 22: 4, 27: 4, 1: 4, 31: 4, 40: 3, 41: 3, 28: 2, 33: 2, 37: 2, 50: 2, 26: 2, 18: 2, 36: 2, 5: 2, 53: 2, 52: 2, 24: 2, 10: 1, 35: 1, 3: 1, 39: 1, 46: 1, 8: 1, 17: 1, 4: 1, 20: 1, 0: 1, 42: 1})


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Encoding Dataset created with 1284 samples
Encoding Dataset created with 322 samples




Step,Training Loss,Validation Loss,Accuracy,Balanced Accuracy
50,4.1737,4.152552,0.037267,0.022186
100,4.0296,3.937557,0.090062,0.048008
150,3.8071,3.693329,0.177019,0.066659
200,3.5356,3.399808,0.251553,0.126848
250,3.1988,3.04043,0.403727,0.209241
300,2.8645,2.775781,0.406832,0.228524
350,2.5558,2.431179,0.543478,0.364319
400,2.1713,2.107372,0.583851,0.392884
450,1.9094,1.88831,0.618012,0.446488
500,1.6489,1.677718,0.664596,0.510141




{'eval_loss': 0.740541398525238, 'eval_accuracy': 0.860248447204969, 'eval_balanced_accuracy': 0.796237088949799, 'eval_runtime': 0.7368, 'eval_samples_per_second': 436.997, 'eval_steps_per_second': 1.357, 'epoch': 15.0}




In [10]:
from gensim.models import Word2Vec

data = graph_node_dataset.get_link_prediction_texts(label='type')

# Flatten the text lists of every entry whose key does not end in "classes".
# A nested comprehension avoids the repeated list copies of `sum(lists, [])`.
texts = [
    text
    for key, split_texts in data.items()
    if not key.endswith("classes")
    for text in split_texts
]

# Whitespace-tokenise each text.
sentences = [text.split() for text in texts]

Getting edge_cls data:   0%|          | 0/543 [00:00<?, ?it/s]

Train Texts:  ['<node_begin>entity 2<node_end> <edge_begin><edge_end> <node_begin>aggregate 2<node_end><edge_begin><edge_end><node_begin>entity 1<node_end> <edge_begin><edge_end> <node_begin>aggregate 1<node_end>\n<node_begin>entity 1<node_end> <edge_begin><edge_end> <node_begin>entity 3<node_end>', '<node_begin>aggregate 3<node_end> <edge_begin><edge_end> <node_begin>aggregate 5<node_end>\n<node_begin>aggregate 3<node_end> <edge_begin><edge_end> <node_begin>aggregate 2<node_end><edge_begin><edge_end><node_begin>service 1<node_end>', '<node_begin>entity 6<node_end><edge_begin><edge_end><node_begin>aggregate 6<node_end>', '<node_begin>Bounded context 1<node_end> <edge_begin><edge_end> <node_begin>entity 6<node_end>\n<node_begin>Bounded context 1<node_end> <edge_begin><edge_end> <node_begin>core domain<node_end>\n<node_begin>Bounded context 1<node_end> <edge_begin><edge_end> <node_begin>entity 4<node_end>\n<node_begin>Bounded context 1<node_end> <edge_begin><edge_end> <node_begin>The ker

In [13]:
# Number of whitespace-tokenised texts.
len(sentences)

70471

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vectorizer on the flattened texts.
# NOTE(review): the name `model` is rebound to a neural network in a later cell.
model = TfidfVectorizer()
model.fit(texts)

In [17]:
# TF-IDF representation of the first text only (first row of the transformed matrix).
t = model.transform(texts)[0]

In [None]:
from scipy.sparse import csr_matrix
# Inspect the concrete type of the transformed row and whether it is a CSR matrix.
type(t), isinstance(t, csr_matrix)

(scipy.sparse._csr.csr_matrix, True)

In [21]:
import numpy as np


# Check whether the transformed row is a dense NumPy array.
isinstance(t, np.ndarray)

False

In [6]:
from data_loading.graph_dataset import GraphNodeDataset

# NOTE(review): the recorded output of this cell reports 558 graphs and ArchiMate
# element types, while the first cell leaves `dataset` bound to the 175-graph
# OntoUML dataset. `dataset` was therefore rebound before this cell ran —
# confirm which dataset is intended and make the binding explicit.
graph_data_params = {
    'reload': True,
    'test_ratio': 0.2,
    'distance': 1,
    'random_embed_dim': 1,
    'use_attributes': True,
    'use_node_types': True,
    'use_edge_label': True,
    'use_edge_types': True,
    'use_special_tokens': True,
    # Optional toggles:
    # 'use_embeddings': True,
    # 'embed_model_name': 'bert-base-cased',
    # 'ckpt': 'results/eamodelset/lp/10_att_0_nt_0/checkpoint-177600',
    'node_cls_label': 'type',
}

print("Loading graph dataset")
graph_node_dataset = GraphNodeDataset(dataset, **graph_data_params)
print("Loaded graph dataset")

# Node-classification texts, using the method's default arguments.
data = graph_node_dataset.get_node_classification_texts()

Loading graph dataset


Embedding node graphs:   0%|          | 0/558 [00:00<?, ?it/s]

Creating graphs:   0%|          | 0/558 [00:00<?, ?it/s]

['AndJunction' 'ApplicationCollaboration' 'ApplicationComponent'
 'ApplicationEvent' 'ApplicationFunction' 'ApplicationInteraction'
 'ApplicationInterface' 'ApplicationProcess' 'ApplicationService'
 'Artifact' 'Assessment' 'BusinessActor' 'BusinessCollaboration'
 'BusinessEvent' 'BusinessFunction' 'BusinessInteraction'
 'BusinessInterface' 'BusinessObject' 'BusinessProcess' 'BusinessRole'
 'BusinessService' 'Capability' 'CommunicationNetwork' 'Constraint'
 'Contract' 'CourseOfAction' 'DataObject' 'Deliverable' 'Device'
 'DistributionNetwork' 'Driver' 'Equipment' 'Facility' 'Gap' 'Goal'
 'Grouping' 'ImplementationEvent' 'Junction' 'Location' 'Material'
 'Meaning' 'Node' 'OrJunction' 'Outcome' 'Path' 'Plateau' 'Principle'
 'Product' 'Representation' 'Requirement' 'Resource' 'Stakeholder'
 'SystemSoftware' 'TechnologyCollaboration' 'TechnologyEvent'
 'TechnologyFunction' 'TechnologyInteraction' 'TechnologyInterface'
 'TechnologyProcess' 'TechnologyService' 'Value' 'ValueStream'
 'WorkPack

Validating node classes:   0%|          | 0/558 [00:00<?, ?it/s]

Train classes: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}
Test classes: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}
Number of classes in training set: 64
Number of classes in test set: 64
['application' 'business' 'implementation_migration' 'motivation' 'other'
 'strategy' 'technology' None]
Setting num_nodes_ layer 7


Validating node classes:   0%|          | 0/558 [00:00<?, ?it/s]

Train classes: {0, 1, 2, 3, 4, 5, 6, 7}
Test classes: {0, 1, 2, 3, 4, 5, 6, 7}
Number of classes in training set: 8
Number of classes in test set: 8
Edge Classes:  ['Access' 'Aggregation' 'Assignment' 'Association' 'Composition' 'Flow'
 'Influence' 'Realization' 'Serving' 'Specialization' 'Triggering']
Node label: type
Train Node classes: {6: 708, 26: 2661, 4: 2366, 7: 1484, 2: 3827, 11: 1376, 8: 1821, 41: 1248, 21: 1632, 40: 138, 35: 2259, 17: 3385, 49: 1308, 60: 187, 3: 251, 20: 1324, 14: 1490, 25: 574, 18: 2667, 12: 199, 9: 880, 61: 152, 47: 486, 28: 427, 10: 566, 51: 461, 43: 370, 50: 195, 23: 229, 38: 171, 30: 416, 46: 789, 62: 316, 19: 1139, 52: 1167, 54: 52, 59: 860, 57: 282, 55: 258, 37: 205, 48: 365, 16: 2113, 34: 662, 15: 157, 13: 480, 22: 280, 24: 112, 5: 86, 0: 64, 44: 132, 58: 146, 1: 161, 31: 102, 29: 36, 42: 63, 53: 159, 32: 84, 45: 163, 27: 135, 39: 41, 36: 39, 63: 87, 33: 87, 56: 26}
Test Node classes: {4: 612, 7: 374, 2: 954, 11: 393, 26: 677, 21: 436, 17: 889, 35: 60

Getting node classification data:   0%|          | 0/558 [00:00<?, ?it/s]

Tokenizing data
['<node_begin>interface prototype<node_end> <edge_begin><edge_end> <node_begin>type:ApplicationFunction method 3<node_end>\n<node_begin>interface prototype<node_end> <edge_begin>type:Association<edge_end> <node_begin>method 2<node_end>\n<node_begin>interface prototype<node_end> <edge_begin>type:Association<edge_end> <node_begin>type:ApplicationFunction method 1<node_end>\n<node_begin>interface prototype<node_end> <edge_begin>type:Association<edge_end> <node_begin>clone<node_end>\n<node_begin>interface prototype<node_end> <edge_begin>type:Serving<edge_end> <node_begin>class client<node_end>', '<node_begin>instance A<node_end> <edge_begin>type:Association<edge_end> <node_begin>type:ApplicationComponent subclass prototype A<node_end>', '<node_begin>construct algoritm<node_end> <edge_begin>type:Serving<edge_end> <node_begin>type:ApplicationProcess construct some object<node_end>\n<node_begin>construct algoritm<node_end> <edge_begin>type:Realization<edge_end> <node_begin>typ

In [None]:
# Flatten the text lists of all "train..." entries, skipping the "...classes" entries.
# A nested comprehension avoids the repeated list copies of `sum(lists, [])`.
texts = [
    text
    for key, split_texts in data.items()
    if key.startswith('train') and not key.endswith('classes')
    for text in split_texts
]

46019

In [14]:
class A:
    """Minimal base class that stores the given name."""

    def __init__(self, name: str):
        self.name = name


class B(A):
    """Subclass that always initialises the base class with the name 'XYZ'."""

    def __init__(self):
        super().__init__('XYZ')


# Sanity check: the name set through super().__init__ is visible on the subclass instance.
b = B()
b.name


'XYZ'

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# Node texts and their classes for both splits.
X_train, X_test = data['train_nodes'], data['test_nodes']
y_train, y_test = data['train_node_classes'], data['test_node_classes']

# TF-IDF features fed into a linear-kernel SVM.
pipeline = make_pipeline(TfidfVectorizer(), SVC(kernel='linear'), verbose=True)

print("Fitting SVM classifier")
# Train the model
pipeline.fit(X_train, y_train)

print("Predicting")
# Predict on the test set
y_pred = pipeline.predict(X_test)

# Print classification report.
# zero_division=0 reports 0.00 for never-predicted classes without emitting
# an UndefinedMetricWarning for each averaged metric.
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
from gensim.models import Word2Vec
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

import torch.nn as nn
import torch.optim as optim

# Step 1: Train a Word2Vec model
# The corpus is the whitespace-tokenised node texts of BOTH splits, so the word
# vectors are learned with the test-set texts included.
# NOTE(review): no seed is passed and workers=4 — repeated runs may not produce
# identical vectors; confirm against the gensim documentation if reproducibility matters.
sentences = [text.split() for text in data['train_nodes'] + data['test_nodes']]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Step 2: Use the embeddings to transform the dataset
def get_embeddings(texts, model):
    """Represent each text by the mean of its known word vectors.

    Args:
        texts: Iterable of strings; each one is split on whitespace.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` by word) and
            ``vector_size``.

    Returns:
        ``np.array`` with one entry per text: the element-wise mean of the vectors
        of the words found in ``model.wv``, or a zero vector of length
        ``model.vector_size`` when none of the words is known.
    """
    def embed_text(text):
        known_vectors = [model.wv[token] for token in text.split() if token in model.wv]
        if not known_vectors:
            # No known word: fall back to an all-zero vector.
            return np.zeros(model.vector_size)
        return np.mean(known_vectors, axis=0)

    return np.array([embed_text(text) for text in texts])

# Encode labels: the encoder is fitted on the training classes and reused for the test classes.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(data['train_node_classes'])
y_test = label_encoder.transform(data['test_node_classes'])

# Mean word-vector features for both splits.
X_train_embeddings = get_embeddings(data['train_nodes'], word2vec_model)
X_test_embeddings = get_embeddings(data['test_nodes'], word2vec_model)

# Step 3: Train an SVM classifier using the embeddings
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_embeddings, y_train)

y_pred_svm = svm_classifier.predict(X_test_embeddings)
print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))

# Step 4: Train a neural network classifier using the embeddings
class SimpleNN(nn.Module):
    """Two-layer perceptron: Linear(input_dim, 128) -> ReLU -> Linear(128, output_dim)."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)

    def forward(self, x):
        """Return the output of the second linear layer (no final activation) for ``x``."""
        hidden = nn.functional.relu(self.fc1(x))
        return self.fc2(hidden)

input_dim = X_train_embeddings.shape[1]
output_dim = len(np.unique(y_train))

model = SimpleNN(input_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train_embeddings, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_embeddings, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Train the neural network
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        # Weight by batch size so a smaller final batch does not skew the mean.
        running_loss += loss.item() * X_batch.size(0)
    # Report the mean loss over the whole epoch rather than the loss of the
    # last (randomly composed) batch only.
    epoch_loss = running_loss / max(len(train_dataset), 1)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}")

# Evaluate the neural network
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    # Predicted class = index of the largest logit in each row.
    _, y_pred_nn = torch.max(outputs, 1)
    y_pred_nn = y_pred_nn.numpy()
    print("Neural Network Classification Report:")
    print(classification_report(y_test, y_pred_nn))

In [3]:
from settings import W2V_CONFIG
from gensim.models import Word2Vec
import torch
from typing import List, Union
from embeddings.common import Embedder
from sklearn.feature_extraction.text import TfidfVectorizer


class TFIDFEmbedder(Embedder):
    """Embedder backed by a ``TfidfVectorizer`` that is fitted at construction time."""

    def __init__(self, texts: List[str]):
        print("TFIDFEmbedder: Training TF-IDF model")
        self.model = TfidfVectorizer().fit(texts)
        print("TFIDFEmbedder: Model trained")

    @property
    def embedding_dim(self) -> int:
        """Number of features produced by the fitted vectorizer."""
        feature_names = self.model.get_feature_names_out()
        return len(feature_names)

    def embed(self, text: Union[str, List[str]]):
        """Return the dense TF-IDF vector of ``text`` as a tensor.

        A single string is wrapped in a list before transforming.
        NOTE(review): for a list input only the first row of the transformed
        matrix is returned — confirm that this is the intended contract.
        """
        documents = [text] if isinstance(text, str) else text
        tfidf_rows = self.model.transform(documents).toarray()
        return torch.tensor(tfidf_rows[0])

class Word2VecEmbedder(Embedder):
    """Embedder that averages Word2Vec word vectors trained at construction time."""

    def __init__(self, texts: List[Union[str, List[str]]]):
        """Train a Word2Vec model (configured by ``W2V_CONFIG``) on ``texts``.

        Args:
            texts: Corpus as raw strings and/or pre-tokenised word lists.
        """
        print("Word2VecEmbedder: Training Word2Vec model")
        # Word2Vec iterates over each sentence to obtain its tokens. A raw string
        # would be iterated character by character, producing a vocabulary of single
        # characters that `embed` (which looks up whitespace-separated words) could
        # never match. Tokenise strings first; token lists are passed through as-is.
        sentences = [text.split() if isinstance(text, str) else text for text in texts]
        self.model = Word2Vec(sentences, **W2V_CONFIG)
        print("Word2VecEmbedder: Word2Vec model trained")

    @property
    def embedding_dim(self) -> int:
        """Dimensionality of the trained word vectors."""
        return self.model.vector_size

    def embed(self, text: Union[str, List[str]]):
        """Return the mean vector of the known words in ``text``.

        Args:
            text: A string (split on whitespace) or a list of tokens.

        Returns:
            A 1-D tensor of length ``embedding_dim``; all zeros when none of the
            words is in the vocabulary.
        """
        tokens = text.split() if isinstance(text, str) else text
        known_words = [word for word in tokens if word in self.model.wv]
        if not known_words:
            return torch.zeros(self.embedding_dim)
        # Indexing the keyed vectors with a list yields one stacked 2-D array, which
        # avoids the slow tensor construction from a Python list of ndarrays.
        return torch.tensor(self.model.wv[known_words]).mean(dim=0)

In [None]:
# Train one embedder of each kind on the same `texts` corpus.
w2v_embedder = Word2VecEmbedder(texts)
tfidf_embedder = TFIDFEmbedder(texts)

In [20]:
import networkx as nx

# Small undirected toy graph used to try out node/edge masking.
G = nx.Graph()

# Nodes 1..6 carry the labels 'A'..'F'.
for node_id, node_label in enumerate('ABCDEF', start=1):
    G.add_node(node_id, label=node_label)

# (source, target, edge label), inserted in this order.
labelled_edges = [
    (1, 2, '1'),
    (1, 3, '2'),
    (1, 6, '5'),
    (2, 3, '6'),
    (2, 5, '8'),
    (2, 6, '9'),
    (3, 4, '10'),
    (3, 5, '11'),
    (4, 5, '13'),
    (4, 6, '14'),
    (5, 6, '15'),
]
for source, target, edge_label in labelled_edges:
    G.add_edge(source, target, label=edge_label)


In [21]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the nodes as the "masked" (test) set.
all_nodes = list(G.nodes)
train_nodes, test_nodes = train_test_split(all_nodes, test_size=0.2, shuffle=True, random_state=42)

print(train_nodes, test_nodes)

# 'masked' is 0 for training nodes and 1 for test nodes.
masked_flags = {node: 0 for node in train_nodes}
masked_flags.update({node: 1 for node in test_nodes})
nx.set_node_attributes(G, masked_flags, 'masked')
G.nodes(data=True)

[6, 3, 5, 4] [1, 2]


NodeDataView({1: {'label': 'A', 'masked': 1}, 2: {'label': 'B', 'masked': 1}, 3: {'label': 'C', 'masked': 0}, 4: {'label': 'D', 'masked': 0}, 5: {'label': 'E', 'masked': 0}, 6: {'label': 'F', 'masked': 0}})

In [22]:
from torch_geometric.transforms import RandomLinkSplit
import torch
from data_loading.data import GraphData
import numpy as np

# PyG-style edge indices must lie in [0, num_nodes). G's node ids start at 1, so
# using them directly puts id 6 out of range for num_nodes=6. Map node ids to
# contiguous positions for the split and translate back afterwards.
node_ids = list(G.nodes)
node_position = {node: position for position, node in enumerate(node_ids)}
edge_index = np.array([[node_position[u], node_position[v]] for u, v in G.edges()]).T

# Hold out 20% of the edges for testing (no validation split); negative samples are
# added and positive/negative edges are stored separately (split_labels=True).
transform = RandomLinkSplit(
    num_val=0,
    num_test=0.2,
    add_negative_train_samples=True,
    neg_sampling_ratio=1,
    split_labels=True
)

train_data, _, test_data = transform(GraphData(
    edge_index=torch.tensor(edge_index),
    num_nodes=G.number_of_nodes()
))


def _to_graph_edges(edge_label_index):
    """Translate a (2, num_edges) index tensor back to (u, v) pairs of G's node ids."""
    return [(node_ids[u], node_ids[v]) for u, v in edge_label_index.T.tolist()]


# Positive training edges stay visible; positive test edges are flagged as masked.
nx.set_edge_attributes(G, {edge: False for edge in _to_graph_edges(train_data.pos_edge_label_index)}, 'masked')
nx.set_edge_attributes(G, {edge: True for edge in _to_graph_edges(test_data.pos_edge_label_index)}, 'masked')
G.edges(data=True)

EdgeDataView([(1, 2, {'label': '1', 'masked': False}), (1, 3, {'label': '2', 'masked': False}), (1, 6, {'label': '5', 'masked': False}), (2, 3, {'label': '6', 'masked': False}), (2, 5, {'label': '8', 'masked': False}), (2, 6, {'label': '9', 'masked': True}), (3, 4, {'label': '10', 'masked': False}), (3, 5, {'label': '11', 'masked': True}), (4, 5, {'label': '13', 'masked': False}), (4, 6, {'label': '14', 'masked': False}), (5, 6, {'label': '15', 'masked': False})])