### Импорты

In [14]:
import sqlite3
import pandas as pd
import nltk
nltk.download('framenet_v17')
from nltk.corpus import framenet as fn
import re

[nltk_data] Downloading package framenet_v17 to
[nltk_data]     /Users/annsmklv/nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!


### База данных

In [2]:
# Open the SQLite database file that every table in this notebook is written to.
# NOTE(review): the file name is spelled 'multilinual' — possibly a typo for 'multilingual';
# confirm with whatever else opens this file before renaming it.
con = sqlite3.connect('multilinual_FrameNet.db')  
# One shared cursor, reused by all the cells below.
cur = con.cursor()  

### Berkeley FrameNet

Данные для BFN будем брать из nltk

#### 1. Фреймы

In [8]:
# One row per Berkeley FrameNet frame: (integer id, name, space-joined FE names, definition).
data = [
    (
        int(bfn_frame.ID),
        bfn_frame.name,
        " ".join(bfn_frame.FE),  # iterating frame.FE yields the strings that get joined
        bfn_frame.definition,
    )
    for bfn_frame in fn.frames()
]

In [22]:
# Preview the first three frame rows as a DataFrame (display only, nothing is written to the DB).
example = data[:3]
bfn_frames = pd.DataFrame({
    column: [row[position] for row in example]
    for position, column in enumerate(('id', 'frame', 'frame_elements', 'definition'))
})
bfn_frames


Unnamed: 0,id,frame,frame_elements,definition
0,2031,Abandonment,Agent Theme Place Time Manner Duration Explana...,An Agent leaves behind a Theme effectively ren...
1,262,Abounding_with,Theme Location Degree Depictive Time,A Location is filled or covered with the Theme...
2,830,Absorb_heat,Entity Container Heat_source Place Medium Mann...,An Entity (generally food) is exposed to a Hea...


Загружаем в таблицу

In [9]:
# Table of Berkeley FrameNet frames, keyed by the frame id.
# NOTE: this is a plain CREATE TABLE, so the cell raises sqlite3.OperationalError when the
# table already exists in the database file (e.g. on a second run).
create_bfn_sql = """
CREATE TABLE bfn (
    id INT, 
    frame TEXT, 
    frame_elements TEXT,
    definition TEXT,
    PRIMARY KEY (id)
)
"""
insert_bfn_sql = "INSERT INTO bfn VALUES (?, ?, ?, ?)"

cur.execute(create_bfn_sql)
cur.executemany(insert_bfn_sql, data)  # one row per tuple collected in `data`
con.commit()

#### 2. Лексические единицы

In [23]:
# Collect one row per BFN lexical unit: (LU id, lemma, part of speech, id of the owning frame).
lexical_units = []
for frame in fn.frames():
    frame_id = frame.ID
    for lu_name, lu in frame.lexUnit.items():
        # LU names look like "<lemma>.<pos>". Split on the LAST dot only, so that a lemma
        # which itself contains a dot cannot break the two-value unpacking (a plain
        # split('.') would raise ValueError for such a name).
        word, pos = lu_name.rsplit('.', 1)
        # The LU id is used directly instead of being bound to a variable named `id`,
        # which would shadow the builtin for the rest of the kernel session.
        lexical_units.append((lu.ID, word, pos, frame_id))

In [25]:
# Preview the first three lexical-unit rows as a DataFrame (display only).
example = lexical_units[:3]
example_lus = pd.DataFrame({
    column: [row[position] for row in example]
    for position, column in enumerate(('id', 'word', 'pos', 'frame_id'))
})
example_lus


Unnamed: 0,id,word,pos,frame_id
0,14839,abandon,v,2031
1,14841,leave,v,2031
2,14842,abandonment,n,2031


Создаем таблицу и добавляем туда наши данные

In [24]:
# Table of BFN lexical units; frame_id holds the id of the frame each unit belongs to.
# NOTE: plain CREATE TABLE — raises sqlite3.OperationalError if bfn_lu already exists.
create_bfn_lu_sql = """
CREATE TABLE bfn_lu (
    id INT, 
    word TEXT,
    pos TEXT,
    frame_id INT, 
    PRIMARY KEY (id)
)
"""
insert_bfn_lu_sql = "INSERT INTO bfn_lu VALUES (?, ?, ?, ?)"

cur.execute(create_bfn_lu_sql)
cur.executemany(insert_bfn_lu_sql, lexical_units)
con.commit()

#### 3. Типы отношений и отношения между фреймами 

In [3]:
# One row per frame-relation type:
# (integer id, name, super-frame label, sub-frame label, definition).
# The definition is left as an empty string at this stage.
# A comprehension is used so that no module-level loop variable named `type` is left
# behind to shadow the builtin in every later cell.
types = [
    (int(rel_type.ID), rel_type.name, rel_type.superFrameName, rel_type.subFrameName, '')
    for rel_type in fn.frame_relation_types()
]

In [8]:
# Preview the first three relation types (the fifth tuple field, the definition, is not shown).
example = types[:3]
example_types = pd.DataFrame({
    column: [row[position] for row in example]
    for position, column in enumerate(('id', 'name', 'superframe', 'subframe'))
})
example_types

Unnamed: 0,id,name,superframe,subframe
0,10,Causative_of,Causative,Inchoative/state
1,9,Inchoative_of,Inchoative,Stative
2,1,Inheritance,Parent,Child


In [4]:
# Table of frame-relation types, keyed by the relation-type id.
# NOTE: plain CREATE TABLE — raises sqlite3.OperationalError if relations already exists.
create_relations_sql = """
CREATE TABLE relations (
    id INT, 
    type TEXT,
    superframe TEXT,
    subframe TEXT,
    definition TEXT, 
    PRIMARY KEY (id)
)
"""
insert_relations_sql = "INSERT INTO relations VALUES (?, ?, ?, ?, ?)"

cur.execute(create_relations_sql)
cur.executemany(insert_relations_sql, types)
con.commit()

Так как соединяем по id, создадим словарь, в котором ключами будут названия типов отношений, а значениями — их id.

In [5]:
# Lookup from relation-type name to relation-type id, built from the rows in `types`
# (field 1 is the name, field 0 is the id).
ids = {row[1]: row[0] for row in types}

In [6]:
# One row per frame-to-frame relation: (super-frame id, sub-frame id, relation-type id).
# The relation type is translated from its name to its id through `ids`.
bfn_relations = [
    (relation.superFrame.ID, relation.subFrame.ID, ids[relation.type.name])
    for relation in fn.frame_relations()
]

In [9]:
# Preview the first three frame-to-frame relations (display only).
example = bfn_relations[:3]
example_rel = pd.DataFrame({
    column: [row[position] for row in example]
    for position, column in enumerate(('parent_id', 'child_id', 'relation_id'))
})
example_rel

Unnamed: 0,parent_id,child_id,relation_id
0,262,1904,1
1,1602,1603,1
2,124,236,1


In [7]:
# Frame-to-frame relations in BFN: (super-frame id, sub-frame id, relation-type id).
# relation_id receives the integer ids looked up through `ids`, i.e. the same values that
# are stored in relations.id (declared INT). It is therefore declared INT here as well;
# declaring it TEXT would store the join key as text in this table only.
# NOTE: plain CREATE TABLE — raises sqlite3.OperationalError if BFN_relations already exists.
cur.execute("""
CREATE TABLE BFN_relations (
    parent_id INT, 
    child_id INT, 
    relation_id INT
)
""")
cur.executemany("INSERT INTO BFN_relations VALUES (?, ?, ?)", bfn_relations)
con.commit()

### Ресурсы

Информация была собрана нами вручную.

In [10]:
# Hand-collected description of each FrameNet-style resource. Field order per tuple:
# (resource, theme, language, online URL, offline URL or '-', library, size, number of LUs,
#  publications URL). None marks a number that is not known; '' marks an empty text field.
meta = [
    (
        'DiCoEnviro',
        'Экология',
        'Французский, английский, португальский, китайский',
        'https://olst.ling.umontreal.ca/dicoenviro/framed/index.php',
        '-',
        '',
        203,
        None,
        '',
    ),
    (
        'DiCoInfo',
        'Информация, интернет',
        'Французский, английский, арабский',
        'https://olst.ling.umontreal.ca/dicoinfo/framed/',
        '-',
        '',
        None,
        None,
        'https://olst.ling.umontreal.ca/dicoinfo/framed/',
    ),
    (
        'SweFN',
        'Без определённой тематики',
        'Шведский',
        'https://spraakbanken.gu.se/karp/#?mode=DEFAULT&resources=swefn&lang=eng&advanced=false',
        'https://spraakbanken.gu.se/en/resources?s=SweFN&language=All',
        '',
        1195,
        39210,
        'https://spraakbanken.gu.se/en/resources/swefn',
    ),
    (
        'FrameNet Brasil',
        'Без определённой тематики',
        'Бразильский португальский, английский (вероятно, ещё)',
        'https://webtool.framenetbr.ufjf.br/index.php/webtool/report/frame/main',
        '-',
        '',
        None,
        None,
        '',
    ),
    (
        'German FrameNet',
        'Без определённой тематики',
        'Немецкий',
        'https://framenet-constructicon.hhu.de/framenet/',
        'https://www.coli.uni-saarland.de/projects/salsa/corpus/',
        '',
        1285,
        13905,
        'https://framenet-constructicon.hhu.de/project/publications',
    ),
    (
        'Spanish FrameNet',
        'Без определённой тематики',
        'Испанский',
        'http://gemini.uab.es/SFN',
        '-',
        '',
        None,
        None,
        '',
    ),
    (
        'BiFrameNet',
        'Без определённой тематики',
        'Английский',
        'https://cse.hkust.edu.hk/~hltc/BiFrameNet/ontology/index.html',
        '-',
        '',
        None,
        None,
        '',
    ),
    (
        'GFOL',
        'Без определённой тематики',
        'Немецкий',
        'https://coerll.utexas.edu/frames/frame-index',
        '-',
        '',
        29,
        None,
        '',
    ),
]

In [12]:
# Preview the first five resource descriptions (display only).
example = meta[:5]
example_meta = pd.DataFrame({
    column: [row[position] for row in example]
    for position, column in enumerate((
        'resource',
        'theme',
        'language',
        'online',
        'offline',
        'library',
        'size',
        'number_of_LUs',
        'publications',
    ))
})
example_meta

Unnamed: 0,resource,theme,language,online,offline,library,size,number_of_LUs,publications
0,DiCoEnviro,Экология,"Французский, английский, португальский, китайский",https://olst.ling.umontreal.ca/dicoenviro/fram...,-,,203.0,,
1,DiCoInfo,"Информация, интернет","Французский, английский, арабский",https://olst.ling.umontreal.ca/dicoinfo/framed/,-,,,,https://olst.ling.umontreal.ca/dicoinfo/framed/
2,SweFN,Без определённой тематики,Шведский,https://spraakbanken.gu.se/karp/#?mode=DEFAULT...,https://spraakbanken.gu.se/en/resources?s=SweF...,,1195.0,39210.0,https://spraakbanken.gu.se/en/resources/swefn
3,FrameNet Brasil,Без определённой тематики,"Бразильский португальский, английский (вероятн...",https://webtool.framenetbr.ufjf.br/index.php/w...,-,,,,
4,German FrameNet,Без определённой тематики,Немецкий,https://framenet-constructicon.hhu.de/framenet/,https://www.coli.uni-saarland.de/projects/sals...,,1285.0,13905.0,https://framenet-constructicon.hhu.de/project/...


In [11]:
# Table describing the resources themselves; columns follow the field order of `meta`.
# NOTE: plain CREATE TABLE — raises sqlite3.OperationalError if resources already exists.
create_resources_sql = """
CREATE TABLE resources (
    resource TEXT,
    theme TEXT,
    language TEXT, 
    online_availability TEXT,
    offline_availability TEXT,
    library TEXT,
    size INT,
    number_of_LUs INT,
    publication TEXT
)
"""
insert_resources_sql = "INSERT INTO resources VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)"

cur.execute(create_resources_sql)
cur.executemany(insert_resources_sql, meta)
con.commit()

### SweFN

Данные из SweFN будем доставать парсингом xml-файла

#### 1. Фреймы

In [13]:
import xml.etree.ElementTree as ET

# Parse the SweFN lexicon file and collect every Sense element found under
# Lexicon/LexicalEntry; the following cells read frames from these elements.
tree = ET.parse('./swefn-full/swefn.xml')
root = tree.getroot()
lexical_entries = list(root.iterfind('Lexicon/LexicalEntry/Sense'))

In [15]:
# Build one row per SweFN Sense: (running index used as id, frame name, space-joined
# frame-element names, definition). The unused `id = i` assignment was dropped: it only
# shadowed the builtin `id`, the row already uses `i` directly.
data = []
for i, le in enumerate(lexical_entries):
    # The frame name is the second "--"-separated segment of the Sense id attribute.
    frame = le.attrib['id'].split('--')[1]
    frame_elements = []
    definition = ''
    for feat in le.findall('feat'):
        att = feat.attrib['att']
        if att in ('coreElement', 'peripheralElement'):
            # Core and peripheral elements are collected into the same list, in document order.
            frame_elements.append(feat.attrib['val'])
        elif att == 'definition':
            # Remove anything that looks like an inline <...> tag from the definition text.
            definition = re.sub('<.*?>', '', feat.attrib['val'])
    data.append((i, frame, " ".join(frame_elements), definition))

In [19]:
# Table of SweFN frames; id is the running index assigned while reading the XML.
# NOTE: plain CREATE TABLE — raises sqlite3.OperationalError if swefn already exists.
create_swefn_sql = """
CREATE TABLE swefn (
    id INT, 
    frame TEXT, 
    frame_elements TEXT,
    definition TEXT,
    PRIMARY KEY (id)
)
"""
insert_swefn_sql = "INSERT INTO swefn VALUES (?, ?, ?, ?)"

cur.execute(create_swefn_sql)
cur.executemany(insert_swefn_sql, data)
con.commit()

#### 2. Лексические единицы

In [25]:
import xml.etree.ElementTree as ET

# Parse the SweFN example-sentence file and collect every token element found under
# corpus/text/sentence. This rebinds `tree` and `root` to the examples file.
tree = ET.parse('./swefn-full/swefn-ex.xml')
root = tree.getroot()
tokens = list(root.iterfind('corpus/text/sentence/token'))

In [None]:
import xml.etree.ElementTree as ET

# NOTE(review): this cell repeats the preceding one (same file, same path expression) and
# has no execution count; it re-parses the examples file and rebinds tree/root/tokens.
tree = ET.parse('./swefn-full/swefn-ex.xml')
root = tree.getroot()
tokens = list(root.iterfind('corpus/text/sentence/token'))

### Связи между фреймами в BFN и фреймами в SweFN

In [20]:
import xml.etree.ElementTree as ET

# Re-parse the SweFN lexicon: `tree` and `root` were rebound to the examples file in the
# lexical-unit section above, so they are pointed back at swefn.xml here.
tree = ET.parse('./swefn-full/swefn.xml')
root = tree.getroot()
lexical_entries = list(root.iterfind('Lexicon/LexicalEntry/Sense'))

Большая часть фреймов в шведском фреймнете является копиями фреймов из BFN, поэтому мы вытаскиваем атрибут BFNID. Для фреймов, у которых такого атрибута нет, мы будем проставлять связи вручную.

In [21]:
# Pair every SweFN frame with the value of each of its BFNID feats; a frame without any
# BFNID feat gets a single (frame, None) placeholder instead.
# NOTE(review): a Sense carrying several BFNID feats contributes several tuples, so positions
# in `collocations` would stop lining up with positions in `lexical_entries`. The later cell
# that uses the list position as the SweFN frame id assumes one tuple per Sense — confirm.
collocations = []
for sense in lexical_entries:
    swefn_frame = sense.attrib['id'].split('--')[1]
    bfn_refs = [
        feat.attrib['val']
        for feat in sense.findall('feat')
        if feat.attrib['att'] == "BFNID"
    ]
    if bfn_refs:
        collocations.extend((swefn_frame, ref) for ref in bfn_refs)
    else:
        collocations.append((swefn_frame, None))

In [22]:
# Read back (id, frame name) for every BFN frame stored in the database.
query = """
SELECT id, frame
FROM bfn
"""
# execute() returns the cursor itself, so the rows can be fetched in the same expression.
results = cur.execute(query).fetchall()

In [23]:
# BFN frame name -> BFN frame id, from the rows fetched above.
bfn = {frame_name: frame_id for frame_id, frame_name in results}

# One row per entry of `collocations`: (BFN frame id, position in `collocations`, match kind).
# The BFNID value is looked up among the BFN frame names; a hit is recorded as "ExactMatch",
# anything else (including the None placeholder) as "??" with no BFN id.
bfn_swefn = [
    (bfn[bfn_ref], position, "ExactMatch") if bfn_ref in bfn else (None, position, "??")
    for position, (_, bfn_ref) in enumerate(collocations)
]

In [24]:
# Table linking BFN frames to SweFN frames, with the kind of match for each pair.
# NOTE: plain CREATE TABLE — raises sqlite3.OperationalError if bfn_swefn_match already exists.
create_match_sql = """
CREATE TABLE bfn_swefn_match (
    bfn_frame_id INT, 
    swefn_frame_id INT, 
    match TEXT
)
"""
insert_match_sql = "INSERT INTO bfn_swefn_match VALUES (?, ?, ?)"

cur.execute(create_match_sql)
cur.executemany(insert_match_sql, bfn_swefn)
con.commit()

### Надо доделать

##### SweFN lexical units

In [11]:
import xml.etree.ElementTree as ET

# Load the tokens of the SweFN example sentences (same file and path expression as the
# cell in the "SweFN / 2." section above); `tree` and `root` are rebound to this file.
tree = ET.parse('./swefn-full/swefn-ex.xml')
root = tree.getroot()
tokens = list(root.iterfind('corpus/text/sentence/token'))

In [14]:
# Read back (id, frame name) for every SweFN frame stored in the database.
query = """
SELECT id, frame
FROM swefn
"""
cur.execute(query)
results = cur.fetchall()

# SweFN frame name -> SweFN frame id. The loop below looks frames up in `res`, which was
# never defined anywhere in the notebook (NameError on a fresh kernel), so it is built here
# from the query result.
res = {frame_name: frame_id for frame_id, frame_name in results}

# One row per annotated token: (token position as id, lemma, part of speech, frame id).
lus = []
for i, token in enumerate(tokens):
    token_dict = token.attrib
    # Tokens whose swefn attribute is just '|' are skipped — presumably these carry no
    # frame annotation; TODO confirm against the corpus format.
    if token_dict['swefn'] == '|':
        continue
    # NOTE(review): all '|' characters are removed, so a value listing several frames
    # would be concatenated into one string — confirm that cannot occur.
    frame = token_dict['swefn'].replace('|', '')
    word = token_dict['lemma'].replace('|', '')
    pos = token_dict['pos']
    # Known frames are stored by id; an unknown frame falls back to its raw name, which
    # then lands in the INT column frame_id (unchanged from the original fallback).
    lus.append((i, word, pos, res.get(frame, frame)))

# NOTE: plain CREATE TABLE — raises sqlite3.OperationalError if swefn_lu already exists.
cur.execute("""
CREATE TABLE swefn_lu (
    id INT, 
    word TEXT,
    pos TEXT,
    frame_id INT, 
    PRIMARY KEY (id)
)
""")
cur.executemany("INSERT INTO swefn_lu VALUES (?, ?, ?, ?)", lus)
con.commit()

In [22]:
# This cell repeats the insert already performed by the previous cell. swefn_lu.id is the
# PRIMARY KEY, so a plain INSERT of rows whose ids are already stored raises
# sqlite3.IntegrityError. OR IGNORE skips rows that already exist, which makes the cell a
# harmless no-op after the previous cell and safe to re-run.
cur.executemany("INSERT OR IGNORE INTO swefn_lu VALUES (?, ?, ?, ?)", lus)
con.commit()

##### SweFN relations

In [6]:
# Empty table for frame-to-frame relations inside SweFN; no rows are inserted here yet.
# relation_id is meant to reference relations.id, which is declared INT, so it is declared
# INT here too instead of TEXT to keep the join key's type consistent.
# NOTE: plain CREATE TABLE — raises sqlite3.OperationalError if SweFN_relations already exists.
cur.execute("""
CREATE TABLE SweFN_relations (
    parent_id INT, 
    child_id INT, 
    relation_id INT
)
""")

<sqlite3.Cursor at 0x7ff63fc5f340>

In [10]:
# Commit whatever is still pending on the shared connection.
con.commit()