In [3]:
from xml.dom import minidom
import re

In [4]:
# Relative paths to the lattice XML files: the merged lattice analysed below
# and a smaller test lattice.
path_to_lattice = "argmicrotext_contexts/merged_lattice.lat.xml"
path_to_testLattice = "argmicrotext_contexts/testBasic.lat.xml"

# Parse the test lattice into a minidom document.
# NOTE(review): get_nodes() parses its own file into a local `doc`, so this
# module-level `doc` does not appear to be read by the code visible here.
doc = minidom.parse(path_to_testLattice)

In [64]:
def show_lattice(lattice):
    """
    Print every concept of a lattice in a human-readable layout.

    lattice is a dictionary shaped like
    {ccpt_id : {'extents' : [], 'intents' : [], 'fathers' : [], 'children' : []}}.
    For each concept the id is printed first, followed by one line per
    field, then a blank separator. Nothing is returned.
    """
    labelled_fields = (
        ("EXTENTS : ", 'extents'),
        ("INTENTS : ", 'intents'),
        ("fathers : ", 'fathers'),
        ("children : ", 'children'),
    )
    for ccpt_id in lattice:
        concept = lattice[ccpt_id]
        print("CONCEPT :  " + str(ccpt_id))
        # fields are printed one at a time, in this fixed order
        for label, key in labelled_fields:
            print(label + str(concept[key]))
        print("\n")

def print_descr(descr):
    """
    Print a single concept description.

    descr is a dictionary with the keys 'extents', 'intents', 'fathers'
    and 'children'; each value is printed on its own labelled line,
    followed by blank lines as a separator. Nothing is returned.
    """
    print("EXTENTS : ", descr['extents'], sep="")
    print("INTENTS : ", descr['intents'], sep="")
    print("fathers : ", descr['fathers'], sep="")
    print("children : ", descr['children'], sep="")
    print("\n\n")
        
def reduce_intents(lattice):
    """
    Return a new lattice in which each concept keeps only the intents that
    none of its direct fathers carries.

    lattice : {ccpt_id : {'extents' : [], 'intents' : [], 'fathers' : [],
                          'children' : []}}
              Every id listed in a 'fathers' list must be a key of lattice
              (KeyError otherwise). Intents must be hashable.

    The input lattice is not modified. The 'extents', 'fathers' and
    'children' lists of the result are the same list objects as in the
    input (they are not copied).

    The reduced intents keep the order they have in the input, without
    duplicates, so the result is identical from one run to the next
    (a plain set difference would yield an arbitrary order).
    """
    output_lattice = {}
    for (ccpt_id, descr) in lattice.items():
        # gather everything carried by the direct fathers
        inherited = set()
        for father_id in descr['fathers']:
            inherited.update(lattice[father_id]['intents'])

        # dict.fromkeys drops duplicates while keeping first-seen order
        new_intents = [x for x in dict.fromkeys(descr['intents'])
                       if x not in inherited]

        output_lattice[ccpt_id] = { 'extents' : descr['extents'],
                                    'intents' : new_intents,
                                    'fathers' : descr['fathers'],
                                    'children' : descr['children']
                                  }
    return output_lattice

def reduce_extents(lattice):
    """
    Return a new lattice in which each concept keeps only the extents that
    none of its direct children carries.

    lattice : {ccpt_id : {'extents' : [], 'intents' : [], 'fathers' : [],
                          'children' : []}}
              Every id listed in a 'children' list must be a key of lattice
              (KeyError otherwise). Extents must be hashable.

    The input lattice is not modified. The 'intents', 'fathers' and
    'children' lists of the result are the same list objects as in the
    input (they are not copied). Concepts are inserted in the result in
    the reverse of the input key order.

    The reduced extents keep the order they have in the input, without
    duplicates, so the result is identical from one run to the next
    (a plain set difference would yield an arbitrary order).
    """
    output_lattice = {}
    for ccpt_id in reversed(list(lattice.keys())):
        descr = lattice[ccpt_id]

        # gather everything carried by the direct children
        covered = set()
        for child_id in descr['children']:
            covered.update(lattice[child_id]['extents'])

        # dict.fromkeys drops duplicates while keeping first-seen order
        new_extents = [x for x in dict.fromkeys(descr['extents'])
                       if x not in covered]

        output_lattice[ccpt_id] = { 'extents' : new_extents,
                                    'intents' : descr['intents'],
                                    'fathers' : descr['fathers'],
                                    'children' : descr['children']
                                  }
    return output_lattice

def write_lattice(lattice, reduced_lattice, output_file):
    """
    Write one text line per concept of lattice to the file output_file.

    Each line has the form
    #concept_id int="..." int_h="..." ext="..." ext_h="..." pere="..." fils="..."
    where every quoted value is a ';'-separated list:
      int / ext   : intents / extents taken from lattice
      int_h / ext_h : intents / extents taken from reduced_lattice
      pere / fils : fathers / children taken from lattice
    reduced_lattice must contain every concept id of lattice.
    The file is overwritten if it already exists.
    """
    def field(label, values):
        # label="v1;v2;..."
        return label + '="' + ';'.join(map(str, values)) + '"'

    with open(output_file, 'w') as handle:
        rows = []
        for ccpt_id in lattice:
            parts = [
                "#" + str(ccpt_id),
                field('int', lattice[ccpt_id]["intents"]),
                field('int_h', reduced_lattice[ccpt_id]["intents"]),
                field('ext', lattice[ccpt_id]["extents"]),
                field('ext_h', reduced_lattice[ccpt_id]["extents"]),
                field('pere', lattice[ccpt_id]["fathers"]),
                field('fils', lattice[ccpt_id]["children"]),
            ]
            rows.append(' '.join(parts) + "\n")
        # everything is written in one go once all lines are built
        handle.writelines(rows)

        
def get_non_empty(lattice):
    """
    Filter a lattice down to the concepts whose 'extents' and 'intents'
    lists are both non-empty.

    Returns a new dictionary; the kept descriptions are the same objects
    as in the input lattice.
    """
    return {
        ccpt_id: descr
        for (ccpt_id, descr) in lattice.items()
        # drop the concept as soon as one of the two lists is empty
        if not (descr['extents'] == [] or descr['intents'] == [])
    }

def get_concept(lattice, ccpt_id):
    """
    Look up the description stored under ccpt_id in lattice.

    Raises KeyError when the id is not a key of lattice.
    """
    descr = lattice[ccpt_id]
    return descr
    
def get_non_empty_extent(lattice):
    """
    Filter a lattice down to the concepts whose 'extents' list is
    non-empty (intents are not looked at).

    Returns a new dictionary; the kept descriptions are the same objects
    as in the input lattice.
    """
    kept = {}
    for ccpt_id in lattice:
        descr = lattice[ccpt_id]
        # skip concepts that hold no object
        if descr['extents'] == []:
            continue
        kept[ccpt_id] = descr
    return kept


def print_concept_from_object(obj_id, lattice, reduced):
    """
    Print every concept whose reduced extent contains the given object.

    obj_id  : object identifier; it is converted with str() before being
              searched for in the reduced 'extents' lists
    lattice : full lattice {ccpt_id : {'extents', 'intents', 'fathers',
              'children'}}
    reduced : reduced lattice with the same keys as lattice

    For each matching concept the reduced extents/intents ("... H" lines)
    are printed first, then the full extents and intents, then fathers and
    children. Nothing is returned.
    """
    for (ccpt_id, descr) in lattice.items():
        red_descr = reduced[ccpt_id]
        if str(obj_id) in red_descr["extents"]:
            print("CONCEPT :  " + str(ccpt_id))
            print("EXTENTS H : " + str(red_descr['extents']))
            print("INTENTS H : " + str(red_descr['intents']))
            # the label now matches the field actually printed
            # (the two labels used to be swapped)
            print("EXTENT : "+ str(descr['extents']))
            print("INTENT : "+ str(descr['intents']))
            print("fathers : " + str(descr['fathers']))
            print("children : " + str(descr['children']))
            print("\n")
        
        
def get_nodes(path2xmlcontext):
    """
    Parse a lattice XML file and return its concepts as a dictionary
    node_id : {
                extents : []    object names
                intents : []    attribute names
                fathers : []    ids (int) of the parent concepts
                children : []   ids (int) of the concepts listing this one
                                as a father
            }

    The information is extracted with regular expressions applied to the
    pretty-printed XML:
      - <OBJ id="N">name</OBJ> / <ATT id="N">name</ATT> give the id -> name
        tables (names must match \\w+),
      - each <NOD id="N"> element is one concept; inside it, OBJ id="N",
        ATT id="N" and PARENT id="N" give its extents, intents and fathers.

    Raises KeyError when a node refers to an object/attribute id that has
    no name entry, or to a father id that is not a node of the file.
    """
    doc = minidom.parse(path2xmlcontext)

    # init regex patterns
    regexp_ext = r'OBJ id="(\d*)"'
    regexp_int = r'ATT id="(\d*)"'
    regexp_father = r'PARENT id="(\d*)"'
    regexp_objid = r'<OBJ id=\"(\d+)\">(\w+)</OBJ>'
    regexp_attrid = r'<ATT id=\"(\d+)\">(\w+)</ATT>'
    regexp_ctxid = r'<NOD id="(\d*)">'

    # set objects and attributes dictionaries (id -> name);
    # the document is serialised once and both tables are read from it
    full_xml = doc.toprettyxml()
    obj_dict = dict(re.findall(regexp_objid, full_xml))
    attr_dict = dict(re.findall(regexp_attrid, full_xml))

    complete_lattice = {}

    # first pass: one entry per node, children still empty
    for node in doc.getElementsByTagName('NOD'):
        cur_node_xml = node.toprettyxml()

        # define id
        ctx_id = re.search(regexp_ctxid, cur_node_xml).group(1)
        extents = [obj_dict[x] for x in re.findall(regexp_ext, cur_node_xml)]
        intents = [attr_dict[x] for x in re.findall(regexp_int, cur_node_xml)]
        fathers = [int(x) for x in re.findall(regexp_father, cur_node_xml)]

        complete_lattice[int(ctx_id)] = { 'extents' : extents,
                                          'intents' : intents,
                                          'fathers' : fathers,
                                          'children' : []
                                        }

    # second pass: register each concept as a child of all its fathers
    # (done afterwards because a father may appear later in the file)
    for (ccpt_id, descr) in complete_lattice.items():
        for father in descr['fathers']:
            complete_lattice[father]['children'].append(ccpt_id)

    return complete_lattice

In [30]:
# Load the merged lattice, then reduce it in two steps:
# intents first (against each concept's fathers), then extents
# (against each concept's children). `lattice` keeps the full version.
lattice = get_nodes(path_to_lattice)
reduced = reduce_intents(lattice)
reduced = reduce_extents(reduced)

# 1 - Y a-t-il des structures communes à certains textes, et qui n'appartiennent à aucun autre texte ?
=> Concepts des textes (ens. T) possédant des structures (ens. S) similaires, et où S n'apparaît dans aucun texte n'appartenant pas à T

**On cherche** : Extent et intent hérités non vides

In [45]:
# Keep the concepts whose reduced extent AND reduced intent are both non-empty
non_empty = get_non_empty(reduced)
show_lattice(non_empty)

CONCEPT :  1879
EXTENTS : ['12']
INTENTS : ['r_70']
fathers : [1474, 725, 944, 1595, 1102, 1468, 1766, 1441]
children : [1950]


CONCEPT :  1843
EXTENTS : ['71', '83']
INTENTS : ['a_12', 'a_80']
fathers : [728, 1707, 935, 1282, 738, 930, 1477, 931, 1154, 737, 1185, 1283]
children : [2031]




### Interprétation des résultats : 

1. **Concept 1879** : Les objets 12 et 14 partagent une structure RST strictement commune qu'aucun autre texte ne contient   
    * Textes   
    
        - B013  (12)   
        - B015 (14)  
        
    * Pattern :   
    
        v 0 CC  
        v 1 _  
        v 2 _  
        v 3 _  
        v 4 _  
        e 0 1 reason_r  
        e 0 3 reason_r  
        e 1 2 concession_r  
        e 3 4 joint_m  
       
    
2. **Concept 1843** : Les objets 71 et 83 partagent une structure ARG strictement commune qu'aucun autre texte ne contient  
    * **Textes**   
    
        - D10  (71)   
        - D22 (83)  
        
    * **Pattern** :   (ils partagent a_12 et a_80 ; a_12 fait partie de a_80, mais comme a_12 apparaît aussi dans l'intent hérité, a_12 n'apparaît lui non plus nulle part ailleurs que dans ces deux textes) 
    
        v 0 CC  
        v 1 _  
        v 2 _  
        v 3 _  
        v 4 _  
        e 0 1 sup  
        e 1 2 add  
        e 2 3 reb  
        e 3 4 und  
 

# 2 - Y a-t-il des objets qui partagent des structures ARG et RST ? Quelles interprétations différentes ? 
=> 

**On cherche** : Extents hérités non vides  
Pour chaque concept, regarder quels sont les objets et les attributs

In [51]:
# Concepts whose reduced extent is non-empty
concepts_having_objects = get_non_empty_extent(reduced)

La majorité des concepts ont bottom comme fils, ce qui signifie qu'ils ont un seul objet qui partage beaucoup d'attributs.

**On cherche** :  ceux qui n'ont pas BOTTOM comme fils  
concept bottom : 2031  
(ie les concepts qui sont partagés par plus d'un objet ?)

In [67]:
# Show, with their full (non-reduced) description, the concepts that hold
# objects and whose only child is not the bottom concept (id 2031).
for ccpt_id in concepts_having_objects:
    if lattice[ccpt_id]["children"] == [2031]:
        continue
    print(ccpt_id)
    print_descr(lattice[ccpt_id])

1889
EXTENTS : ['25', '5']
INTENTS : ['a_13', 'a_21', 'a_22', 'a_23', 'a_34', 'a_35', 'a_5', 'a_56', 'a_57', 'r_0', 'r_20', 'r_57', 'r_60', 'r_61', 'r_65', 'r_67', 'r_68', 'r_69']
fathers : [1782, 1707, 1767, 1626, 1483, 1786, 1783, 1787]
children : [1954]



1879
EXTENTS : ['12', '14']
INTENTS : ['a_13', 'a_21', 'a_22', 'a_25', 'a_34', 'a_43', 'a_5', 'a_56', 'a_7', 'a_76', 'r_0', 'r_119', 'r_120', 'r_228', 'r_57', 'r_65', 'r_67', 'r_68', 'r_70']
fathers : [1474, 725, 944, 1595, 1102, 1468, 1766, 1441]
children : [1950]



1750
EXTENTS : ['20', '23']
INTENTS : ['a_0', 'a_56', 'a_57', 'a_60', 'a_61', 'r_28', 'r_57', 'r_65', 'r_80', 'r_81']
fathers : [1553, 1198, 1254, 1259, 1558]
children : [1875]



1600
EXTENTS : ['12', '14', '25', '29', '5', '56', '87']
INTENTS : ['a_13', 'a_21', 'a_22', 'a_34', 'a_5', 'a_56', 'r_0', 'r_57', 'r_65', 'r_67', 'r_68']
fathers : [1346, 1351, 1155, 1156]
children : [1766, 1767]



1409
EXTENTS : ['30', '33', '56']
INTENTS : ['a_56', 'a_57', 'a_58', 'r_123

#####################################################################  #####################################################################  #####################################################################  #####################################################################  #####################################################################  #####################################################################  #####################################################################
# DRAFT BELOW

In [26]:
# Print the concept(s) whose reduced extent contains object 14
print_concept_from_object(14, lattice, reduced)
# Concepts whose reduced extent is non-empty
non_empty_extent = get_non_empty_extent(reduced)

CONCEPT :  1950
EXTENTS H : ['14']
INTENTS H : []
INTENT : ['14']
EXTENT : ['a_13', 'a_21', 'a_22', 'a_25', 'a_34', 'a_43', 'a_5', 'a_56', 'a_7', 'a_76', 'r_0', 'r_119', 'r_12', 'r_120', 'r_161', 'r_162', 'r_163', 'r_196', 'r_197', 'r_228', 'r_267', 'r_57', 'r_65', 'r_67', 'r_68', 'r_70']
fathers : [1311, 1353, 1640, 1571, 1574, 1572, 1879, 1354, 911]
children : [2031]




In [None]:
# Dump every concept whose reduced extent is non-empty
print("=======================")
print("NON EMPTY EXTENT")
show_lattice(non_empty_extent)

In [60]:
# Inspect concept 2031 (referred to as the bottom concept in the notes above)
get_concept(lattice, 2031)

{'extents': [],
 'intents': ['a_0',
  'a_1',
  'a_10',
  'a_11',
  'a_12',
  'a_13',
  'a_14',
  'a_15',
  'a_16',
  'a_17',
  'a_18',
  'a_19',
  'a_2',
  'a_20',
  'a_21',
  'a_22',
  'a_23',
  'a_24',
  'a_25',
  'a_26',
  'a_27',
  'a_28',
  'a_29',
  'a_3',
  'a_30',
  'a_31',
  'a_32',
  'a_33',
  'a_34',
  'a_35',
  'a_36',
  'a_37',
  'a_38',
  'a_39',
  'a_4',
  'a_40',
  'a_41',
  'a_42',
  'a_43',
  'a_44',
  'a_45',
  'a_46',
  'a_47',
  'a_48',
  'a_49',
  'a_5',
  'a_50',
  'a_51',
  'a_52',
  'a_53',
  'a_54',
  'a_55',
  'a_56',
  'a_57',
  'a_58',
  'a_59',
  'a_6',
  'a_60',
  'a_61',
  'a_62',
  'a_63',
  'a_64',
  'a_65',
  'a_66',
  'a_67',
  'a_68',
  'a_69',
  'a_7',
  'a_70',
  'a_71',
  'a_72',
  'a_73',
  'a_74',
  'a_75',
  'a_76',
  'a_77',
  'a_78',
  'a_79',
  'a_8',
  'a_80',
  'a_81',
  'a_82',
  'a_83',
  'a_84',
  'a_85',
  'a_86',
  'a_87',
  'a_88',
  'a_89',
  'a_9',
  'a_90',
  'a_91',
  'a_92',
  'a_93',
  'a_94',
  'a_95',
  'a_96',
  'a_97',
  '