#<p><center> **COMMONSENSE KNOWLEDGE BASES: MERONYMIC RELATIONSHIPS ACQUISITION AND THEIR VERIFICATION WITH LARGE LANGUAGE MODELS**

---

<p><center> Master thesis - MS in Language Analysis and Processing
<p><center> Julia Fidalgo Mariño
<p><center> Supervised by German Rigau</center></p>  
     


---


#<p><center> **COMMONSENSE KNOWLEDGE BASES (CSKB)**

---


In [None]:
# Mount Google Drive at /content/drive; the relative "drive/My Drive/..." paths used
# by the cells below are read from and written to this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
!pip install pandas
!pip install matplotlib

In [None]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

import pandas as pd
import csv
import matplotlib.pyplot as plt

[nltk_data] Downloading package wordnet to /root/nltk_data...


###**1. FILES AND FORMAT**

*   hasPartKB
*   VisualGenome
*   TransOMCS
*   Ascent++
*   Quasimodo
*   Uncommonsense
*   AristoTupleKB
*   ConceptNet

These CSKBs need to be processed in order to obtain the following format:

```
Word1 Word2 Offset1 Offset2
```

Each CSKB file has to be processed differently. The objective is to keep only those relationships labelled *PartOf*, *HasA*, *MemberOf* or similar.

###**1.1. HasPartKB**

In [None]:
def process_haspart(input_file):
  """Convert a space-delimited hasPart dump into the common five-column CSV.

  The eight input columns are renamed, a constant 'Resource' column ("HP") is
  added, and only Word1, Word2, Offset1, Offset2 and Resource are kept. The
  result is written to haspart-ord.csv on Drive and printed.

  Args:
    input_file: path of the space-delimited input file. Its first line is
      consumed by pandas as the header row and then renamed.
  """
  source_names = ['Relation', 'Word1', 'Word2', 'Score', 'Synset1', 'Offset1', 'Synset2', 'Offset2']
  final_layout = ['Word1', 'Word2', 'Offset1', 'Offset2', 'Resource']

  triples = pd.read_csv(input_file, delimiter=" ")
  triples.columns = source_names  # rename columns
  triples.insert(loc=0, column='Resource', value="HP")  # tag every row with its resource

  ordered = triples.reindex(columns=final_layout)  # keep and reorder the shared columns
  ordered.to_csv("drive/My Drive/Colab/TFM/haspart-ord.csv", index=False)
  print(ordered)

# Run the HasPartKB conversion on the dump stored on Drive.
input_file="drive/My Drive/Colab/TFM/adimen-haspart.txt"
process_haspart(input_file)

                 Word1         Word2     Offset1     Offset2 Resource
0               animal         belly  00015388-n  05556943-n       HP
1               anuran         belly  01639765-n  05556943-n       HP
2                aphid       abdomen  02252226-n  05556943-n       HP
3             arachnid       abdomen  01769347-n  05556943-n       HP
4            arthropod       abdomen  01767661-n  05556943-n       HP
...                ...           ...         ...         ...      ...
34888           animal        zygote  00015388-n  05431926-n       HP
34889              egg        zygote  01460457-n  05431926-n       HP
34890    female_'s_egg        zygote  01460457-n  05431926-n       HP
34891           specie        zygote  08110373-n  05431926-n       HP
34892  large_intestine  fermentation  05535484-n  13575433-n       HP

[34893 rows x 5 columns]


###**1.2. VisualGenome**

In [None]:
def process_visualgen(input_file):
  """Extract the 'have' triples of VisualGenome into the common five-column CSV.

  Only rows whose relation is 'have.v.01' or 'have.v.02' are kept. The words are
  not available in this resource, so Word1/Word2 are written empty. The result
  is printed and written to visualg-ord.csv on Drive.

  Args:
    input_file: path to a '.csv' (comma-separated) or '.txt' (space-separated)
      file with seven columns; its first line is consumed as the header row.

  Raises:
    ValueError: if the file extension is neither '.csv' nor '.txt'.
  """
  output_file="drive/My Drive/Colab/TFM/visualg-ord.csv"

  if input_file.endswith('.csv'):
    df = pd.read_csv(input_file)
  elif input_file.endswith('.txt'):
    df = pd.read_csv(input_file, delimiter=" ")
  else:
    # Previously this branch only printed and the function then failed on the
    # unbound `df`; fail right away with an explicit error instead.
    raise ValueError(f"File {input_file} is not compatible")

  df.columns =['Relation', 'Offset-rel', 'Synset1', 'Offset1', 'Synset2', 'Offset2', 'Times']
  have_rows = df[df['Relation'].isin(['have.v.01', 'have.v.02'])] #labels
  df_ord = have_rows.reindex(columns=['Offset1','Offset2'])
  df_ord.insert(loc=0,column='Word1',value=None) #add columns
  df_ord.insert(loc=1,column='Word2',value=None)
  df_ord.insert(loc=4,column='Resource',value="VG")

  print(df_ord)
  df_ord.to_csv(output_file, index = False)

# Run the VisualGenome conversion on the triplets file stored on Drive.
input_file = "drive/My Drive/Colab/TFM/visualgenometriplets.txt"
process_visualgen(input_file)

       Word1 Word2     Offset1     Offset2 Resource
149613  None  None  02667576-n  03017428-n       VG
149614  None  None  02667576-n  03046257-n       VG
149615  None  None  02667576-n  04587648-n       VG
149616  None  None  05556943-n  02784732-n       VG
149617  None  None  05556943-n  00928077-n       VG
...      ...   ...         ...         ...      ...
180971  None  None  04468005-n  06807198-n       VG
180972  None  None  04522168-n  02430045-n       VG
180973  None  None  04522168-n  01621127-n       VG
180974  None  None  04587648-n  07944408-n       VG
180975  None  None  10787470-n  06793426-n       VG

[31363 rows x 5 columns]


###**1.3. TransOMCS**

In [None]:
def process_trans(input_file):
  """Extract the PartOf/HasA triples of TransOMCS into the common five-column CSV.

  The rows are oriented so that Word1 is the whole and Word2 the part, as in the
  other resources of this notebook (e.g. HasPartKB 'animal, belly'):
    * 'head HasA tail'   -> Word1 = head, Word2 = tail
    * 'head PartOf tail' -> Word1 = tail, Word2 = head
  The offsets are left empty (written as the string 'None'). The result is
  printed and written to transomcs-ord.csv on Drive.

  Args:
    input_file: path to a '.csv' (comma-separated) or '.txt' (tab-separated)
      file with four columns, assumed to be head, relation, tail, score; its
      first line is consumed as the header row.

  Raises:
    ValueError: if the file extension is neither '.csv' nor '.txt'.
  """
  output_file="drive/My Drive/Colab/TFM/transomcs-ord.csv"

  if input_file.endswith('.csv'):
    df = pd.read_csv(input_file)
  elif input_file.endswith('.txt'):
    df = pd.read_csv(input_file, delimiter="\t")
  else:
    # Previously this branch only printed and the function then failed on the
    # unbound `df`; fail right away with an explicit error instead.
    raise ValueError(f"File {input_file} is not compatible")

  df.columns =['Head', 'Relation', 'Tail', 'Score']
  df_rel = df[df['Relation'].isin(['PartOf', 'HasA'])].copy()

  # PartOf and HasA are inverse relations, so they cannot share one column
  # mapping: mapping both head -> Word2 left every HasA row as (part, whole).
  is_has_a = df_rel['Relation'] == 'HasA'
  df_rel['Word1'] = df_rel['Head'].where(is_has_a, df_rel['Tail'])
  df_rel['Word2'] = df_rel['Tail'].where(is_has_a, df_rel['Head'])

  df_rel['Offset1'] = None
  df_rel['Offset2'] = None
  df_rel['Resource'] = "T"

  df_ord = df_rel.reindex(columns=['Word1','Word2','Offset1','Offset2', 'Resource']).astype(str)
  print(df_ord)
  df_ord.to_csv(output_file, index = False)

# Run the TransOMCS conversion on the full dump stored on Drive.
trans_file = "drive/My Drive/Colab/TFM/TransOMCS_full.txt"
process_trans(trans_file)

            Word1           Word2 Offset1 Offset2 Resource
3771        right          people    None    None        T
3772      feeling          animal    None    None        T
3773      opinion        everyone    None    None        T
3774      bedroom           house    None    None        T
3775         name      everything    None    None        T
...           ...             ...     ...     ...      ...
18350511        i  osteoarthritis    None    None        T
18354221   belton          belton    None    None        T
18398143   nuance          nuance    None    None        T
18435327   fealty          fealty    None    None        T
18481527        i           missy    None    None        T

[1920447 rows x 5 columns]


###**1.4. Ascent++**

In [None]:
def process_ascent(input_file):
  """Extract the PartOf/MadeOf/HasA triples of Ascent++ into the common five-column CSV.

  'primary_subject' becomes Word1 and 'object' becomes Word2; the offsets are
  left empty. The result is printed and written to ascent-ord.csv on Drive.

  Args:
    input_file: path to the comma-separated Ascent++ file. It must contain the
      columns relation, primary_subject, object, subject_type, head, tail,
      subject, predicate, saliency, typicality and facets.
  """
  # Read the file given by the caller; the previous version ignored the
  # parameter and silently depended on the notebook global `asc_file`.
  df=pd.read_csv(input_file, sep=',')
  #print(len(df))
  #print(df['relation'].unique().tolist())

  df.insert(loc=0, column='Offset1', value=None)
  df.insert(loc=1, column='Offset2', value=None)
  df.insert(loc=2, column='Resource', value="A++")
  # NOTE(review): all three relations are mapped primary_subject -> Word1. If
  # 'PartOf' reads part -> whole here, those rows presumably end up reversed
  # with respect to 'HasA' — confirm the intended direction.
  df_part=df[(df['relation']=='PartOf')|(df['relation']=='MadeOf')|(df['relation']=='HasA')]
  df_part = df_part.drop(['subject_type','head','tail','subject','predicate','saliency','typicality','facets'], axis = 1) #delete columns
  df_part2 = df_part.rename(columns={'primary_subject':'Word1','object':'Word2'}) #change name of the original columns
  df_ord = df_part2.reindex(columns=['Word1','Word2','Offset1','Offset2', 'Resource'])
  print(df_ord)

  df_ord.to_csv("drive/My Drive/Colab/TFM/ascent-ord.csv", index = False)

# Run the Ascent++ conversion on the CSV stored on Drive.
asc_file = "drive/My Drive/Colab/TFM/ascentpp.csv"
process_ascent(asc_file)

            Word1                  Word2 Offset1 Offset2 Resource
25       aardvark   elaborate structures    None    None      A++
30       aardvark             thick skin    None    None      A++
60            aba              liability    None    None      A++
61            aba               failures    None    None      A++
93            aba        410,000 members    None    None      A++
...           ...                    ...     ...     ...      ...
2054457      year               24 hours    None    None      A++
2054568      year                meeting    None    None      A++
2054618      year           legal status    None    None      A++
2054750    yogurt       limited lifetime    None    None      A++
2054880  zucchini  the most antioxidants    None    None      A++

[106714 rows x 5 columns]


###**1.5. Quasimodo**

In [None]:
def process_quasimodo(input_file):
  """Extract the 'have'/'has_body_part' triples of Quasimodo into the common five-column CSV.

  'subject' becomes Word1 and 'object' becomes Word2; the offsets are left
  empty (written as the string 'None'). The result is printed and written to
  quasimodo-ord.csv on Drive.

  Args:
    input_file: path to a '.csv' (comma), '.txt' (space) or '.tsv' (tab)
      file containing the columns subject, predicate, object, modality,
      is_negative, score, typicality and saliency.

  Raises:
    ValueError: if the file extension is not one of the supported ones.
  """
  output_file="drive/My Drive/Colab/TFM/quasimodo-ord.csv"

  if input_file.endswith('.csv'):
    df = pd.read_csv(input_file)
  elif input_file.endswith('.txt'):
    df = pd.read_csv(input_file, delimiter=" ")
  elif input_file.endswith('.tsv'):
    df = pd.read_csv(input_file, delimiter="\t")
  else:
    # Previously this branch only printed and the function then failed on the
    # unbound `df`; fail right away with an explicit error instead.
    raise ValueError(f"File {input_file} is not compatible")

  df.insert(loc=0, column='Offset1', value=None)
  df.insert(loc=1, column='Offset2', value=None)
  df.insert(loc=2, column='Resource', value="Q")
  df_part=df[(df['predicate']=='have')|(df['predicate']=='has_body_part')]
  df_part = df_part.drop(['modality','is_negative','score','typicality','saliency'], axis = 1)
  df_part2 = df_part.rename(columns={'subject':'Word1','object':'Word2'})
  df_ord = df_part2.reindex(columns=['Word1','Word2','Offset1','Offset2','Resource']).astype(str)
  print(df_ord)

  df_ord.to_csv(output_file, index = False)

# Run the Quasimodo conversion on the TSV stored on Drive.
qua_file = "drive/My Drive/Colab/TFM/quasimodo.tsv"
process_quasimodo(qua_file)

                        Word1             Word2 Offset1 Offset2 Resource
1                         man            nipple    None    None        Q
7                        fish             blood    None    None        Q
15                       baby              hair    None    None        Q
27             red blood cell           nucleus    None    None        Q
28                    toddler              hair    None    None        Q
...                       ...               ...     ...     ...      ...
6274216  two blue eyed parent  brown eyed child    None    None        Q
6274231   leopard geckos tail              tail    None    None        Q
6274293           banana skin             skins    None    None        Q
6274299            hip thrust           muscles    None    None        Q
6274499                 woman          clothing    None    None        Q

[145133 rows x 5 columns]


###**1.6. Uncommonsense**

In [None]:
def process_uncommon(input_file):
  """Extract the 'HasA' triples of Uncommonsense into the common five-column CSV.

  'head' becomes Word1 and 'tail' becomes Word2; the offsets are left empty
  (written as the string 'None'). The result is printed and written to
  uncommon-ord.csv on Drive.

  Args:
    input_file: path to a '.csv' (comma), '.txt' (space) or '.tsv' (tab)
      file containing at least the columns head, relation and tail.

  Raises:
    ValueError: if the file extension is not one of the supported ones.
  """

  if input_file.endswith('.csv'):
    df = pd.read_csv(input_file)
  elif input_file.endswith('.txt'):
    df = pd.read_csv(input_file, delimiter=" ")
  elif input_file.endswith('.tsv'):
    df = pd.read_csv(input_file, delimiter="\t")
  else:
    # Previously this branch only printed and the function then failed on the
    # unbound `df`; fail right away with an explicit error instead.
    raise ValueError(f"File {input_file} is not compatible")

  df.insert(loc=0, column='Offset1', value=None)
  df.insert(loc=1, column='Offset2', value=None)
  df.insert(loc=2, column='Resource', value="Un")
  df_part=df[(df['relation']=='HasA')]
  #print(df_part.head(10))
  df_part2 = df_part.rename(columns={'head':'Word1','tail':'Word2'})
  df_ord = df_part2.reindex(columns=['Word1','Word2','Offset1','Offset2','Resource']).astype(str)
  print(df_ord)

  df_ord.to_csv("drive/My Drive/Colab/TFM/uncommon-ord.csv", index = False)

# Run the Uncommonsense conversion on the TSV stored on Drive.
un_file = "drive/My Drive/Colab/TFM/uncommon-neg.tsv"
process_uncommon(un_file)

               Word1                   Word2 Offset1 Offset2 Resource
22        disability               institute    None    None       Un
170      grandfather  the graves of soldiers    None    None       Un
429          hearing    significant activity    None    None       Un
443           osprey               the range    None    None       Un
524      oil company     omega-9 fatty acids    None    None       Un
...              ...                     ...     ...     ...      ...
7999846      cruiser              long wicks    None    None       Un
7999857      country     long history of use    None    None       Un
7999941       cornea               much spin    None    None       Un
7999976    polyester      a half-load option    None    None       Un
7999983       bidder            clear appeal    None    None       Un

[117552 rows x 5 columns]


###**1.7. AristoTupleKB**

In [None]:
# Selection 1: keep the AristoTupleKB tuples whose predicate is 'has-part'.
# Arg1 is stored as Word1 and Arg2 as Word2; the offsets stay empty for now.
ar_file = "drive/My Drive/Colab/TFM/AristoCombined.tsv"
df6 = pd.read_csv(ar_file, delimiter='\t')
#print(df6.head(10))
df6.insert(loc=0, column='Offset1', value=None)
df6.insert(loc=1, column='Offset2', value=None)
df6.insert(loc=2, column='Resource', value="Ar")
df6_part=df6[(df6['Pred']=='has-part')] #selection 1
df6_part2 = df6_part.rename(columns={'Arg1':'Word1','Arg2':'Word2'})
df6_ord = df6_part2.reindex(columns=['Word1','Word2','Offset1','Offset2','Resource']).astype(str)
#print(df6_ord)

# Intermediate file; it is read back when both selections are merged.
df6_ord.to_csv("drive/My Drive/Colab/TFM/Aristo-ord2.csv", index = False)

In [None]:
# Selection 2: keep the AristoTupleKB tuples whose predicate is 'is-part-of'.
# The arguments are swapped (Arg1 -> Word2, Arg2 -> Word1) so that the rows have
# the same orientation as the 'has-part' selection.
ar_file = "drive/My Drive/Colab/TFM/AristoCombined.tsv"
df6 = pd.read_csv(ar_file, delimiter='\t')
#print(df6.head(10))
df6.insert(loc=0, column='Offset1', value=None)
df6.insert(loc=1, column='Offset2', value=None)
df6.insert(loc=2, column='Resource', value="Ar")
df6_part2=df6[(df6['Pred']=='is-part-of')] #selection 2
df6_part2 = df6_part2.rename(columns={'Arg1':'Word2','Arg2':'Word1'}) #different order
df6_ord2 = df6_part2.reindex(columns=['Word1','Word2','Offset1','Offset2','Resource']).astype(str)
#print(df6_ord2)

df6_ord2.to_csv("drive/My Drive/Colab/TFM/Aristo-ord3.csv", index = False)
# Reload both intermediate files from disk; df61 (has-part) and df62 (is-part-of)
# are concatenated in the next cell.
df61 = pd.read_csv("drive/My Drive/Colab/TFM/Aristo-ord2.csv")
df62 = pd.read_csv("drive/My Drive/Colab/TFM/Aristo-ord3.csv")

In [None]:
# Stack both Aristo selections (has-part first, then is-part-of) into a single
# file and report the total number of rows.
combined = pd.concat([df61, df62], ignore_index=True).astype(str)
combined.to_csv("drive/My Drive/Colab/TFM/aristo-ord.csv", index=False)
print(len(combined))
#print(combined.head(4))

40025


In [None]:
# Row counts of each selection: 'has-part' (df61) and 'is-part-of' (df62).
print(len(df61))
print(len(df62))

32587
7438


###**1.8. ConceptNet**

###*1.8.1. API*

In [None]:
import requests
from multiprocessing import Pool

def query_conceptnet(offset):
    """Fetch one page of English PartOf/MadeOf edges from the ConceptNet web API.

    Args:
        offset: index of the first edge of the requested page (each call asks
            for up to 1000 edges).

    Returns:
        A set of (Word1, Word2) label tuples oriented whole -> part, like the
        other resources of this notebook: 'PartOf' edges (start is the part)
        are stored as (end, start), every other edge as (start, end).
        An empty set is returned when the server does not answer with HTTP 200.

    Raises:
        requests.exceptions.RequestException: on connection problems or when
            the server does not answer within the timeout.
    """
    base_url = "http://api.conceptnet.io"
    endpoint = "/query"

    query = {
        "limit": 1000,
        "offset": offset,
        "language": "en",
        "start": "/c/en",
        "end": "/c/en",
        "rel": ["/r/PartOf", "/r/MadeOf"]
    }

    # Without a timeout a stalled connection would block this worker (and the
    # pool.map call waiting for it) forever.
    response = requests.get(f"{base_url}{endpoint}", params=query, timeout=30)
    if response.status_code != 200:
        return set()

    relation_pairs = set()
    for edge in response.json()["edges"]:
        start_label = edge["start"]["label"]
        end_label = edge["end"]["label"]
        if edge.get("rel", {}).get("@id") == "/r/PartOf":
            # "start PartOf end": start is the part, so swap it into Word2.
            relation_pairs.add((end_label, start_label))
        else:
            relation_pairs.add((start_label, end_label))
    return relation_pairs

def get_english_relations():
    """Download pages of ConceptNet edges in parallel and merge them.

    Returns:
        The union of the pair sets returned by query_conceptnet for every
        offset in range(0, 10000, 100), fetched with four worker processes.
    """
    # NOTE(review): query_conceptnet asks for up to 1000 edges per call while the
    # offsets advance by 100, so consecutive pages presumably overlap (the set
    # removes the duplicates) — confirm whether a step of 1000 was intended.
    page_offsets = range(0, 10000, 100)
    with Pool(processes=4) as workers:
        pages = workers.map(query_conceptnet, page_offsets)

    merged = set()
    for page in pages:
        merged |= page
    return merged

def save_to_csv(relation_pairs, filename):
    """Write word pairs to a two-column CSV file with a 'Word1,Word2' header.

    Args:
        relation_pairs: iterable of (word1, word2) string tuples.
        filename: path of the CSV file to create (overwritten if it exists).
    """
    # The labels contain non-ASCII characters, so the encoding is fixed to
    # UTF-8 (what pandas.read_csv expects by default) instead of depending on
    # the platform default.
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Word1", "Word2"])
        # Sets have no stable iteration order; sorting makes the file
        # identical from one run to the next.
        writer.writerows(sorted(relation_pairs))

# Entry point of the cell: download the ConceptNet pairs and store them on Drive.
if __name__ == "__main__":
    relation_pairs = get_english_relations()
    csv_filename = "drive/My Drive/Colab/TFM/conceptnet-rel.csv"
    save_to_csv(relation_pairs, csv_filename)
    print("ok")

ok


###*1.8.2. File and format*

In [None]:
def process_conceptnet(input_file):
  """Bring the downloaded ConceptNet pairs into the common five-column CSV.

  Empty Offset1/Offset2 columns and a constant 'Resource' column ("C") are
  added, every value is converted to a string, and the table is printed and
  written to conceptnet-ord.csv on Drive.

  Args:
    input_file: path to a comma-separated file with Word1 and Word2 columns.
  """
  final_layout = ['Word1','Word2','Offset1','Offset2','Resource']
  new_columns = [('Offset1', None), ('Offset2', None), ('Resource', "C")]

  pairs = pd.read_csv(input_file, delimiter=',')
  for position, (name, filler) in enumerate(new_columns):
    pairs.insert(loc=position, column=name, value=filler)

  ordered = pairs.reindex(columns=final_layout).astype(str)
  print(ordered)

  ordered.to_csv("drive/My Drive/Colab/TFM/conceptnet-ord.csv", index = False)

# Run the ConceptNet conversion on the pairs downloaded through the API.
input_file = "drive/My Drive/Colab/TFM/conceptnet-rel.csv"
process_conceptnet(input_file)

                Word1                 Word2 Offset1 Offset2 Resource
0        inflammation  inflammatory disease    None    None        C
1            šiauliai             samogitia    None    None        C
2                byte                  word    None    None        C
3     corpus callosum             forebrain    None    None        C
4     Hottentot bread       elephant's foot    None    None        C
...               ...                   ...     ...     ...      ...
9131          Bishkek            Kyrgyzstan    None    None        C
9132            isère           rhône alpes    None    None        C
9133    Libyan dirham          Libyan dinar    None    None        C
9134        east asia                taipei    None    None        C
9135         syllable                  word    None    None        C

[9136 rows x 5 columns]


###**2. WORDNET**

For those CSKBs whose four columns are not all filled in, access to WordNet is needed in order to obtain the offsets or synsets for every element in the file.

In [None]:
import os

###**2.1. Get offset from a word**

In [None]:
def get_synset_offset(word):
    """Return the WordNet offset of the first synset of a word.

    Args:
        word: the word to look up. Non-string values are converted with str().

    Returns:
        A string made of the zero-padded 8-digit offset, a dash and the
        part-of-speech tag of the first synset (e.g. '00015388-n'), or None
        when the word is missing or WordNet has no synset for it.
    """
    # pandas hands over empty cells as float NaN (or None). Converting them with
    # str() would look up the literal words 'nan'/'None' in WordNet and could
    # attach an unrelated offset to a row that has no word at all.
    if word is None or (isinstance(word, float) and word != word):
        return None

    synsets = wn.synsets(str(word))
    if not synsets:
        return None
    first = synsets[0]
    return f"{first.offset():08d}-{first.pos()}"

In [None]:
# Resources that only provide words: their offsets are looked up in WordNet.
files = ['drive/My Drive/Colab/TFM/transomcs-ord.csv',
        'drive/My Drive/Colab/TFM/ascent-ord.csv',
         'drive/My Drive/Colab/TFM/quasimodo-ord.csv',
         'drive/My Drive/Colab/TFM/uncommon-ord.csv',
         'drive/My Drive/Colab/TFM/aristo-ord.csv',
         'drive/My Drive/Colab/TFM/conceptnet-ord.csv'
         ]

for f in files:
    df=pd.read_csv(f)
    df['Offset1']=df['Word1'].apply(get_synset_offset) #for each word, look up its offset and store it in the row
    df['Offset2']=df['Word2'].apply(get_synset_offset)
    # The completed table is saved next to the original with a "-full" suffix.
    nuevo=os.path.splitext(f)[0] + "-full.csv"
    df.to_csv(nuevo, index=False)

###**2.2. Get word from an offset**

In [None]:
def get_word_from_offset(offset):
    """Return the first lemma name of the synset identified by an offset string.

    Args:
        offset: string made of the numeric offset, a dash and the
            part-of-speech tag (e.g. '02667576-n').

    Returns:
        The first lemma name of the synset, or None when the offset is missing
        or no synset is returned for it.
    """
    # Empty cells arrive from pandas as float NaN (or None); slicing them would
    # raise a TypeError, so they are reported as "no word", mirroring
    # get_synset_offset.
    if offset is None or (isinstance(offset, float) and offset != offset):
        return None

    pos=offset[-1]  # last character is the part-of-speech tag
    synset_id=int(offset[:-2])  # everything before the "-<pos>" suffix

    synset=wn.synset_from_pos_and_offset(pos, synset_id)
    if synset:
        return synset.lemma_names()[0]
    else:
        return None

In [None]:
# VisualGenome only provides offsets: look up the word for each of them.
# NOTE(review): `input` shadows the Python builtin of the same name for the rest
# of the notebook session — consider renaming it.
input="drive/My Drive/Colab/TFM/visualg-ord.csv"
df=pd.read_csv(input)
df['Word1']=df['Offset1'].apply(get_word_from_offset) #add word to each offset
df['Word2']=df['Offset2'].apply(get_word_from_offset)
#print(df)

df.to_csv("drive/My Drive/Colab/TFM/visualg-ord-full.csv", index = False)

###**3. CLEANING**

In [None]:
def clean_file(file):
  """Remove incomplete and multi-word rows from a CSV file, in place.

  Rows with a missing word or offset are dropped, as are rows in which either
  word contains a space. The file is overwritten with the remaining rows and
  their count is printed.

  Args:
    file: path to a CSV with Word1, Word2, Offset1 and Offset2 columns.
  """
  required = ['Offset1', 'Offset2', 'Word1', 'Word2']
  table = pd.read_csv(file).dropna(subset=required)

  # keep only the rows in which both entries are single words
  single_first = ~table['Word1'].str.contains(' ')
  single_second = ~table['Word2'].str.contains(' ')
  table = table[single_first & single_second]

  table.to_csv(file, index=False)
  print(len(table))
# Clean every per-resource file in place; each call prints the number of rows kept.
files = ['drive/My Drive/Colab/TFM/transomcs-ord-full.csv',
        'drive/My Drive/Colab/TFM/ascent-ord-full.csv',
         'drive/My Drive/Colab/TFM/quasimodo-ord-full.csv',
         'drive/My Drive/Colab/TFM/uncommon-ord-full.csv',
         'drive/My Drive/Colab/TFM/aristo-ord-full.csv',
         'drive/My Drive/Colab/TFM/conceptnet-ord-full.csv',
         'drive/My Drive/Colab/TFM/visualg-ord-full.csv',
         'drive/My Drive/Colab/TFM/haspart-ord.csv'
         ]
for file in files:
    # clean_file has no return statement, so this assignment just sets `file` to None.
    file=clean_file(file)

print(f"ok")

1616507
40751
27096
19889
27744
3369
31363
34893
ok


In [None]:
#save files in one document
def combine(files, outputfile):
    """Concatenate several CSV files into one and save the result.

    The total row count and the first and last five rows are printed.

    Args:
        files: list of paths of the CSV files to stack, in order.
        outputfile: path of the comma-separated file to write.
    """
    frames = [pd.read_csv(path) for path in files]
    merged = pd.concat(frames, ignore_index=True)  # stack the tables with a fresh index

    print(len(merged))
    print(merged.head(5))
    print(merged.tail(5))
    merged.to_csv(outputfile, sep=',', index=False)

# Merge the eight cleaned per-resource files into the single CSKB.csv file.
files = ['drive/My Drive/Colab/TFM/transomcs-ord-full.csv',
        'drive/My Drive/Colab/TFM/ascent-ord-full.csv',
         'drive/My Drive/Colab/TFM/quasimodo-ord-full.csv',
         'drive/My Drive/Colab/TFM/uncommon-ord-full.csv',
         'drive/My Drive/Colab/TFM/aristo-ord-full.csv',
         'drive/My Drive/Colab/TFM/conceptnet-ord-full.csv',
         'drive/My Drive/Colab/TFM/visualg-ord-full.csv',
         'drive/My Drive/Colab/TFM/haspart-ord.csv'
         ]
outputfile = "drive/My Drive/Colab/TFM/CSKB.csv"
combine(files, outputfile)

print(f"ok")

1801612
     Word1   Word2     Offset1     Offset2 Resource
0    right  people  05174653-n  07942152-n        T
1  feeling  animal  00026192-n  00015388-n        T
2  bedroom   house  02821627-n  03544360-n        T
3  meaning    word  06601327-n  06286395-n        T
4  feeling  people  00026192-n  07942152-n        T
                   Word1         Word2     Offset1     Offset2 Resource
1801607           animal        zygote  00015388-n  05431926-n       HP
1801608              egg        zygote  01460457-n  05431926-n       HP
1801609    female_'s_egg        zygote  01460457-n  05431926-n       HP
1801610           specie        zygote  08110373-n  05431926-n       HP
1801611  large_intestine  fermentation  05535484-n  13575433-n       HP
ok


###**4. APPEARANCES**



###**4.1. Times that a relationship appears**


In [None]:
def appearances(input_file):
    """Count how often each relationship occurs and list the resources containing it.

    Rows are grouped by (Word1, Word2, Offset1, Offset2). For every group the
    distinct resources are joined alphabetically with ', ' and the number of
    rows is stored in 'Appearances'. The table is written to appearances-2.csv
    on Drive and printed.

    Args:
        input_file: path to a CSV with Word1, Word2, Offset1, Offset2 and
            Resource columns.
    """
    keys = ['Word1', 'Word2', 'Offset1', 'Offset2']
    by_relation = pd.read_csv(input_file).groupby(keys)

    grouped = by_relation['Resource'].agg(lambda names: ', '.join(sorted(names.unique()))).reset_index()
    # group sizes come out in the same (sorted) key order as the rows above
    grouped['Appearances'] = by_relation.size().values

    grouped.to_csv("drive/My Drive/Colab/TFM/appearances-2.csv", index=False)
    print(grouped)

# Count the appearances of every relationship of the merged knowledge base.
input_file = "drive/My Drive/Colab/TFM/CSKB.csv"
appearances(input_file)

                             Word1      Word2     Offset1     Offset2  \
0        -PRON-_be_skeletal_muscle   creatine  05289861-n  14825243-n   
1                               10       arms  13746512-n  04566257-n   
2                             1000       feet  13750844-n  05563266-n   
3                            10000       feet  13751265-n  05563266-n   
4                               11       nose  13746672-n  05598147-n   
...                            ...        ...         ...         ...   
1732403                     zygote    vacuole  05431926-n  05447087-n   
1732404                   zygotene   prophase  13575109-n  13542474-n   
1732405                    zygotic       post  02882275-a  08624385-n   
1732406                     zymase  phosphate  15109586-n  14982265-n   
1732407                    zymurgy   magazine  06080361-n  06595351-n   

        Resource  Appearances  
0             HP            1  
1              Q            1  
2              Q           

###**4.2. Groups**

In [None]:
# For each value of 'Appearances', print how many relationships have that value.
df = pd.read_csv('drive/My Drive/Colab/TFM/appearances-2.csv')

count = df['Appearances'].value_counts().sort_index() #count groups
for numero, cantidad in count.items():
    # numero = value of 'Appearances', cantidad = number of relationships with it
    print(f"{numero}: {cantidad}")

1: 1671832
2: 54009
3: 5135
4: 1014
5: 272
6: 101
7: 33
8: 9
9: 2
14: 1


###**4.3. Random samples**
150 random samples to evaluate manually.

In [None]:
# Draw a fixed number of random relationships from each 'Appearances' group.
# NOTE(review): `random` and `counts` are not used anywhere in this cell.
import random

df = pd.read_csv('drive/My Drive/Colab/TFM/appearances-2.csv')
counts = df['Appearances'].value_counts()

muestras=150
# Appearances value -> number of rows to sample from that group (the values add up to 150).
selection={1:30, 2:20, 3:30, 4:20, 5:20, 6:17, 7:10, 8:3}
muestrasrandom= []

for numero, cantidad_muestras in selection.items():
    subset = df[df['Appearances'] == numero] #rows with exactly this number of appearances
    # Groups with fewer rows than requested are skipped without any message.
    if len(subset) >= cantidad_muestras:
        # random_state=1 keeps the sample identical between runs; note that this
        # reassigns `muestras`, which held the integer 150 above.
        muestras = subset.sample(n=cantidad_muestras, random_state=1)
        muestrasrandom.extend(muestras.values)

In [None]:
# Print every sampled row (one array per relationship).
for m in muestrasrandom:
    print(m)

Random samples:
['martyr' 'wheelchair' '10296618-n' '04576002-n' 'T' 1]
['win' 'hound' '07354731-n' '02087551-n' 'T' 1]
['touch' 'bill' '07409592-n' '06536853-n' 'T' 1]
['check' 'type' '13381734-n' '05840188-n' 'T' 1]
['dip' 'baby' '13904506-n' '09827683-n' 'T' 1]
['actor' 'doubles' '09765278-n' '00483605-n' 'Q' 1]
['aversion' 'alcohol' '07502669-n' '07884567-n' 'T' 1]
['call' 'driver' '06272803-n' '10034906-n' 'T' 1]
['balance' 'plan' '14002279-n' '05898568-n' 'T' 1]
['party' 'draft' '08256968-n' '13377268-n' 'T' 1]
['phaeton' 'i' '04459122-n' '14641397-n' 'T' 1]
['dessert' 'space' '07609840-n' '00028651-n' 'T' 1]
['move' 'bird' '00165942-n' '01503061-n' 'T' 1]
['agoraphobia' 'woman' '14381840-n' '10787470-n' 'T' 1]
['place' 'osmosis' '08664443-n' '13528100-n' 'T' 1]
['world' 'galaxy' '09466280-n' '08270938-n' 'T' 1]
['signpost' 'banner' '06794374-n' '02788021-n' 'VG' 1]
['occupation' 'armchair' '00582388-n' '02738535-n' 'T' 1]
['code' 'zone' '06667317-n' '08688247-n' 'T' 1]
['omaha' 

In [None]:
#reorder and save: rows with the most appearances first
df_final=pd.DataFrame(muestrasrandom, columns=df.columns)
df_final_ord=df_final.sort_values(by='Appearances', ascending=False)
# The file is tab-separated although its name ends in ".csv".
df_final_ord.to_csv('drive/My Drive/Colab/TFM/PRUEBA/relations-2.csv', sep='\t', index=False)
print(len(df_final_ord))

150
