In [23]:
 import json
import zipfile
import os
import sys
import io
from ast import literal_eval
import pandas as pd
from itertools import chain
from itertools import groupby
import unicodedata
import numpy as np
import collections

from string import punctuation
translator = str.maketrans('','',punctuation) 

In [24]:
class repository: 
    def __init__(self, policy, title, chapter, section, article):
        self.policy = policy
        self.title = title
        self.chapter = chapter
        self.section = section
        self.article = article
        
        
    def __repr__(self): #how to print the repository to the console
        return 'policy:' + self.policy + ' title:' + self.title + ' chapter:' + self.chapter + ' section:' + self.section + ' article:' + self.article          
   
    @classmethod
    def from_repository_name(cls, rep_str):                #2nd initializer that creates a repository object directly from a repository string e.g 'EU_32008R1099_Title_0_Chapter_0_Section_0_Article_03.txt'
        folder_parts = rep_str.split('_')                  #split the string at '_' into parts 
        policy = folder_parts[0] + '_' + folder_parts[1]   #we only want to split at every 2nd '_', so merge the 1. and 2., 3. and 4. again 
        if folder_parts[2] in  ['front', 'Whereas']:       #exeption for the 'whereas' and 'front'
            title = 'front'
            chapter = 'None'
            section = 'None'
            article = 'None'
        else:
            title = folder_parts[2] + '_' + folder_parts[3]
            chapter = folder_parts[4] + '_' + folder_parts[5]
            section = folder_parts[6] + '_' + folder_parts[7]
            article = folder_parts[8] + '_' + folder_parts[9]
        
        return cls(policy,title, chapter, section, article)  #return a repository with the previously defined attributes
    
    def match(self, other):            #checks if the search-criteria defined in repository 'other' is matching the the current repository                                                
        self_value_set = set([x for x in list(self.__dict__.values()) if x != 'None']) #creates a set of all the attributes ignoring 'None'    
        other_value_set = set([x for x in list(other.__dict__.values()) if x != 'None'])
        

        return set(other_value_set).issubset(self_value_set) #returns True if the attributes of the search-criteria is a subset of the attributes of the current directory (=match)
    
class token:
    def __init__(self, start, stop, text, rep, tag_count = 0):
        self.start = start
        self.stop = stop
        self.text = text
        self.rep = rep
        self.tag_count = tag_count
        
    def __repr__(self):
        return 'start:' + str(self.start) + ' stop:' + str(self.stop) + ' text:' + self.text + ' tag_count:' + str(self.tag_count)
        
    
class tag:
    def __init__(self, layer_, type_, tag_, start, stop, text, tokens, rep):
        self.layer_ = layer_
        self.type_ = type_
        self.tag_ = tag_
        self.start = start
        self.stop = stop
        self.text = text
        self.tokens = tokens
        self.rep = rep
        
        
    def __eq__(self, other): 
        if not isinstance(other, tag):
            # don't attempt to compare against unrelated types
            return NotImplemented

        return self.layer_ == other.layer_ and self.type_ == other.type_ and self.tag_ == other.tag_ and self.start == other.start and self.stop == other.stop
    
    def __repr__(self): #for debugging purpose, defines how object is printed
        return "layer:" + self.layer_ + " type:" + self.type_ + " tag:" + self.tag_ + " start:" + str(self.start) + " stop:" + str(self.stop) + " text:" + self.text + '\n'

    def get_start(self):
        return self.start
    
class Evaluator:
    
    def __init__(self, df):
        self.df = df
        
        
    def _get_iterator_all(self): #returns a iterator of all the tags present in the dataframe
        
        return chain.from_iterable(self.df['Tags'])
        
        
    def _get_iterator_conditional_rep(self, conditional_rep, column): #returns a list for all the tags that match the attributes defined in the 'conditional_rep' 
  
        iterator = chain.from_iterable(self.df[column])
        return [x for x in iterator if type(x)!= str and x.rep.match(conditional_rep) == True]
        
        
    def get_tag_list(self, conditional_rep, item, value):
       
        iterator = self._get_iterator_conditional_rep(conditional_rep, 'Tags')
        
        if item == 'None' and value == 'None':
            return [x for x in iterator if type(x)!= str ]
        if item == 'layer':
            return [x for x in iterator if type(x)!= str and x.layer_ == value]
        if item == 'type':
            return [x for x in iterator if type(x)!= str and x.type_ == value]
        if item == 'tag':
            return [x for x in iterator if type(x)!= str and x.tag_ == value]
        
    def get_tag_frequency(self, conditional_rep, item, value):
        return len(self.get_tag_list(conditional_rep, item, value))    

    def get_span_distro(self, conditional_rep, item, value, return_format, level = 'character'):
        
        tag_list = self.get_tag_list(conditional_rep, item, value)
        
        if level == 'character':
            len_list = [(x.stop - x.start) for x in tag_list]
        if level == 'token':
            len_list = [len(x.tokens) for x in tag_list]
            
        len_dict = collections.Counter(len_list)
        
        if return_format == 'dict':
            return dict(sorted(len_dict.items(), key=lambda item: item[1], reverse=True))
            
        
        if return_format == 'list':
            distro_list = []
            for i in range(1,max(len_list)+1):
                distro_list.append(len_list.count(i))
            return distro_list
                      
     
    def get_tag_count(self, conditional_rep, item, value):
        return len(self.get_tag_list(conditional_rep, item, value))
    
    def get_token_list_from_repository(self, conditional_rep):
        token_iterator = self._get_iterator_conditional_rep(conditional_rep, 'Tokens')
        return list(token_iterator)
    
    def get_token_count_from_repository(self, conditional_rep):
        return len(self.get_token_list_from_repository(conditional_rep))
        
       
    
    
    def get_tokens_from_tag_list(self, conditional_rep, item, value):
        tag_list = self.get_tag_list(conditional_rep, item, value)
        return list(chain.from_iterable([x.tokens for x in tag_list]))
    
    
    def get_token_count_from_tag_list(self, conditional_rep, item, value):
        return len(self.get_tokens_from_tag_list(conditional_rep, item, value)) 
    
    
    def most_frequent_labeled_tokens(self, conditional_rep, item, value):
        tag_list = self.get_tag_list(conditional_rep, item, value) #get the taglist of all the tag that match search criteria
        token_iterator = chain.from_iterable([x.tokens for x in tag_list]) #retrieve all the tokens from the tag list and create a iterator (since the list contains sublists)
        token_counter_dict = collections.Counter([x.text for x in token_iterator]) #get a list the text of the token, count the different elements and create a dict
        return dict(sorted(token_counter_dict.items(), key=lambda item: item[1], reverse=True))  #sort the dict by counts
    
    
    def get_label_count_per_token_distro(self, conditional_rep, return_format):
        token_iterator = self._get_iterator_conditional_rep(conditional_rep, 'Tokens')
        token_counter_dict = collections.Counter([x.tag_count for x in token_iterator]) #get a list the text of the token, count the different elements and create a dict
        label_counter_list = [x.tag_count for x in token_iterator]
        
        if return_format == 'dict':
            return dict(sorted(token_counter_dict.items(), key=lambda item: item[1], reverse=True))
        
        if return_format == 'list':
            distro_list = []
            for i in range(0,max(label_counter_list)+1):
                distro_list.append(label_counter_list.count(i))
            return distro_list
         

In [25]:
stat_df = pd.DataFrame(columns = ['Policy', 'Text','Tags', 'Tokens']) #create the initial dataframe
tagsets = ['Policydesigncharacteristics','Technologyandapplicationspecificity','Instrumenttypes']

In [26]:
from definitions import ROOT_DIR


data_path = os.path.join(ROOT_DIR,'data')

annotator_path = os.path.join(data_path , 'annotator_data/annotation')  
path = os.path.join(data_path , 'annotator_data/curation')

subdirs = [o for o in os.listdir(path) if os.path.isdir(path)] #identify all the subdirs which correspond to different articles
os.chdir(path)

In [27]:
for subdir in subdirs:
   
    try:
        archive = zipfile.ZipFile(os.path.join(subdir, str(os.listdir(subdir)[0])), 'r') #decode compressed json in zip file
        json_file_byte = archive.read('CURATION_USER.json')      #this is a binary
        
        json_file_byte_decode = json_file_byte.decode('utf8')    #decode to json
    
        data = json.loads(json_file_byte_decode)
    
    except:
        stat_df = stat_df.append(pd.Series([subdir, 'error', 'error', 'error'], index=stat_df.columns ), ignore_index=True) #append error column if cannot read data

        
    else:
        taglist = []      #create epty list holding all the tags of the aragraph (= article)
        rep = repository.from_repository_name(subdir[0:-4])  #directory name in string format
        sentence = data['_referenced_fss']['1']['sofaString'].lower()  #raw text of the paragraph (all in lower case)
        all_tokens_json = data['_views']['_InitialView']['Token']
        all_tokens_json[0]['begin'] = 0                            #the first token is missing the beginning
        token_list = [token(x['begin'], x['end'], sentence[x['begin']:x['end']], rep) for x in all_tokens_json]
        sentence_normalized = normalize_and_replace_text(sentence) 
        
        for category in data['_views']['_InitialView']:  #loop trough the custom layers
  
            if category in tagsets:
                    
                for annotation in data['_views']['_InitialView'][category]: #loop trough all the tags
                    
                    type_ = list(annotation.keys())[-1]     #this part handles empty annotations. The last entry of the dict usually contains the type and the tag.
                                                            #for empty tags, the last entry is a integer
                    if type(type_) != str:                  
                        type_ = ''
                    
                    tag_ = list(annotation.values())[-1]
                    if type(tag_) != str:
                        tag_ = ''
                        
                    start = annotation['begin']
                    stop = annotation['end']
                    tag_tokens = [x for x in token_list if  x.start >= start and x.stop <= stop]
                    
                    for tag_token in tag_tokens:
                        tag_token.tag_count +=1
                    
                    taglist.append(tag(category, type_, tag_ , start ,stop , sentence[start:stop], tag_tokens, rep))
                    
        stat_df = stat_df.append(pd.Series([subdir[0:-4], sentence ,taglist, token_list], index=stat_df.columns ), ignore_index=True)     
stat_df['tokens cleaned'] = stat_df['Text'].apply(clean_text)  

In [35]:
new_index = np.arange(0, len(stat_df)-1)
stat_df.reindex(new_index)

stat_df.head()

Unnamed: 0,Policy,Text,Tags,Tokens,tokens cleaned
0,EU_32006L0032_Title_0_Chapter_1_Section_0_Arti...,article 3\r\ndefinitions\r\nfor the purposes o...,[layer:Instrumenttypes type:InstrumentType tag...,"[start:0 stop:7 text:article tag_count:0, star...","[article, #, definitions, for, the, purposes, ..."
1,EU_32009L0028_Title_0_Chapter_0_Section_0_Arti...,article 7\r\njoint projects between member sta...,[layer:Policydesigncharacteristics type:Actor ...,"[start:0 stop:7 text:article tag_count:0, star...","[article, #, joint, projects, between, member,..."
2,EU_32019L0944_Title_0_Chapter_6_Section_3_Arti...,article 47\r\nindependence of the transmission...,[layer:Instrumenttypes type:InstrumentType tag...,"[start:0 stop:7 text:article tag_count:0, star...","[article, #, independence, of, the, transmissi..."
3,EU_32019R0631_Title_0_Chapter_0_Section_0_Arti...,article 14\r\nadjustment of m0 and tm0 values\...,[layer:Instrumenttypes type:InstrumentType tag...,"[start:0 stop:7 text:article tag_count:0, star...","[article, #, adjustment, of, m0, and, tm0, val..."
4,EU_32018R1999_Title_0_Chapter_2_Section_0_Arti...,article 3\r\nintegrated national energy and cl...,[layer:Instrumenttypes type:InstrumentType tag...,"[start:0 stop:7 text:article tag_count:0, star...","[article, #, integrated, national, energy, and..."


In [160]:
stat_df['Policy'][0]

'EU_32009L0028_Title_0_Chapter_0_Section_0_Article_07'

In [161]:
hello = stat_df['Tags'][0]
hello

[class:Policydesigncharacteristics type:Actor tag:Addressee_default start:34 stop:47 text:Member States,
 class:Policydesigncharacteristics type:Actor tag:Addressee_default start:66 stop:79 text:Member States,
 class:Policydesigncharacteristics type:Actor tag:Addressee_sector start:247 stop:264 text:private operators,
 class:Policydesigncharacteristics type:Actor tag:Addressee_monitored start:272 stop:285 text:Member States,
 class:Policydesigncharacteristics type:Actor tag:Authority_monitoring start:303 stop:313 text:Commission,
 class:Policydesigncharacteristics type:Resource tag:Resource_Other start:345 stop:658 text:electricity, heating or cooling from renewable energy sources produced by any joint project in their territory, that became operational after 25 June 2009, or by the increased capacity of an installation that was refurbished after that date, which is to be regarded as counting towards the national overall target,
 class:Policydesigncharacteristics type:Objective tag:Obj

In [162]:
test_eval = Evaluator(stat_df)
test_dir = repository('EU_32009L0028', 'None', 'Chapter_0', 'None', 'Article_07')
test_eval.get_tag_list(test_dir, 'None', 'None')




[class:Policydesigncharacteristics type:Actor tag:Addressee_default start:34 stop:47 text:Member States,
 class:Policydesigncharacteristics type:Actor tag:Addressee_default start:66 stop:79 text:Member States,
 class:Policydesigncharacteristics type:Actor tag:Addressee_sector start:247 stop:264 text:private operators,
 class:Policydesigncharacteristics type:Actor tag:Addressee_monitored start:272 stop:285 text:Member States,
 class:Policydesigncharacteristics type:Actor tag:Authority_monitoring start:303 stop:313 text:Commission,
 class:Policydesigncharacteristics type:Resource tag:Resource_Other start:345 stop:658 text:electricity, heating or cooling from renewable energy sources produced by any joint project in their territory, that became operational after 25 June 2009, or by the increased capacity of an installation that was refurbished after that date, which is to be regarded as counting towards the national overall target,
 class:Policydesigncharacteristics type:Objective tag:Obj

In [163]:
[x.tag_count for x in stat_df['Tokens'][0] if x.tag_count == 5 ]
   

[5, 5, 5, 5, 5]

In [164]:
test_eval.get_tag_list(test_dir, 'None', 'None')

[class:Policydesigncharacteristics type:Actor tag:Addressee_default start:34 stop:47 text:Member States,
 class:Policydesigncharacteristics type:Actor tag:Addressee_default start:66 stop:79 text:Member States,
 class:Policydesigncharacteristics type:Actor tag:Addressee_sector start:247 stop:264 text:private operators,
 class:Policydesigncharacteristics type:Actor tag:Addressee_monitored start:272 stop:285 text:Member States,
 class:Policydesigncharacteristics type:Actor tag:Authority_monitoring start:303 stop:313 text:Commission,
 class:Policydesigncharacteristics type:Resource tag:Resource_Other start:345 stop:658 text:electricity, heating or cooling from renewable energy sources produced by any joint project in their territory, that became operational after 25 June 2009, or by the increased capacity of an installation that was refurbished after that date, which is to be regarded as counting towards the national overall target,
 class:Policydesigncharacteristics type:Objective tag:Obj

In [165]:
test_eval.get_tag_count(test_dir, 'None', 'None')

37

In [166]:
test_eval.get_span_distro(test_dir, 'None', 'None', 'dict')

{12: 9,
 13: 3,
 23: 3,
 24: 3,
 75: 2,
 92: 2,
 61: 2,
 17: 1,
 10: 1,
 313: 1,
 20: 1,
 196: 1,
 100: 1,
 136: 1,
 22: 1,
 36: 1,
 91: 1,
 64: 1,
 33: 1,
 58: 1}

In [167]:
test_eval.get_tokens_from_tag_list(test_dir, 'None', 'None')

[start:34 stop:40 text:Member tag_count:1,
 start:41 stop:47 text:States tag_count:1,
 start:66 stop:72 text:Member tag_count:1,
 start:73 stop:79 text:States tag_count:1,
 start:247 stop:254 text:private tag_count:1,
 start:255 stop:264 text:operators tag_count:1,
 start:272 stop:278 text:Member tag_count:1,
 start:279 stop:285 text:States tag_count:1,
 start:303 stop:313 text:Commission tag_count:1,
 start:345 stop:356 text:electricity tag_count:3,
 start:356 stop:357 text:, tag_count:3,
 start:358 stop:365 text:heating tag_count:3,
 start:366 stop:368 text:or tag_count:3,
 start:369 stop:376 text:cooling tag_count:3,
 start:377 stop:381 text:from tag_count:3,
 start:382 stop:391 text:renewable tag_count:4,
 start:392 stop:398 text:energy tag_count:4,
 start:399 stop:406 text:sources tag_count:4,
 start:407 stop:415 text:produced tag_count:2,
 start:416 stop:418 text:by tag_count:2,
 start:419 stop:422 text:any tag_count:2,
 start:423 stop:428 text:joint tag_count:2,
 start:429 stop:

In [168]:
test_eval.get_token_count_from_tag_list(test_dir, 'None', 'None')

275

In [169]:
test_eval.most_frequent_labeled_tokens(test_dir, 'None', 'None')

{'or': 23,
 'the': 15,
 'installation': 13,
 'electricity': 12,
 'heating': 12,
 'cooling': 12,
 'from': 12,
 'renewable': 11,
 'energy': 11,
 'sources': 11,
 'Member': 10,
 'produced': 9,
 ',': 8,
 'State': 7,
 'by': 6,
 'national': 6,
 'overall': 6,
 'target': 6,
 'of': 5,
 'States': 3,
 'joint': 3,
 'project': 3,
 'that': 3,
 'which': 3,
 'is': 3,
 'to': 3,
 'be': 3,
 'regarded': 3,
 'as': 3,
 'counting': 3,
 'towards': 3,
 'any': 2,
 'after': 2,
 'refurbished': 2,
 'specify': 2,
 'proportion': 2,
 'amount': 2,
 'extend': 2,
 'beyond': 2,
 '2020': 2,
 'private': 1,
 'operators': 1,
 'Commission': 1,
 'in': 1,
 'their': 1,
 'territory': 1,
 'became': 1,
 'operational': 1,
 '25': 1,
 '\xa0': 1,
 'June': 1,
 '2009': 1,
 'increased': 1,
 'capacity': 1,
 'an': 1,
 'was': 1,
 'date': 1,
 'measuring': 1,
 'compliance': 1,
 'describe': 1,
 'proposed': 1,
 'identify': 1,
 'another': 1,
 'not': 1,
 'may': 1,
 'production': 1}

In [170]:
test_eval.get_label_count_per_token_distro(test_dir, 'dict')

{0: 174, 1: 81, 2: 39, 3: 17, 4: 10, 5: 5}

In [171]:
test_eval.get_label_count_per_token_distro1(test_dir, 'None', 'None','dict')

{1: 81, 2: 78, 3: 51, 4: 40, 5: 25}

In [172]:
import matplotlib.pyplot as plt
plt.plot(test_eval.get_span_distro(hello,'None', 'None', 'list'))
plt.show()


AttributeError: 'list' object has no attribute '__dict__'

In [30]:
test_eval.get_span_distro(hello,'class', 'Instrumenttypes', 'list')

NameError: name 'hello' is not defined

In [31]:
test_eval.most_frequent_spans(hello,'tag', 'Tech_Other')

AttributeError: 'Evaluator' object has no attribute 'most_frequent_spans'

In [32]:
testdir = repository('EU_32008R1099', 'None', 'None', 'None', 'Article_06.txt')
test_eval = Evaluator(stat_df)
test_l = test_eval.get_tag_list(hello,'None', 'None')


NameError: name 'hello' is not defined

In [12]:
 test = "Article 6\r\nQuality assessment and reports\r\n1.   Member States shall ensure the quality of the data transmitted.\r\n2.   Every reasonable effort shall be undertaken to ensure coherence between energy data declared in accordance with Annex B and data declared in accordance with Commission Decision 2005/166/EC of 10 February 2005 laying down the rules for implementing Decision No 280/2004/EC of the European Parliament and of the Council concerning a mechanism for monitoring Community greenhouse gas emissions and for implementing the Kyoto Protocol (12).\r\n3.   For the purposes of this Regulation, the following quality assessment dimensions shall apply to the data to be transmitted:\r\n(a)\r\n‘relevance’ shall refer to the degree to which statistics meet current and potential needs of the users;\r\n(b)\r\n‘accuracy’ shall refer to the closeness of estimates to the unknown true values;\r\n(c)\r\n‘timeliness’ shall refer to the delay between the availability of the information and the event or phenomenon it describes;\r\n(d)\r\n‘punctuality’ shall refer to the delay between the date of the release of the data and the target date when it should have been delivered;\r\n(e)\r\n‘accessibility’ and ‘clarity’ shall refer to the conditions and modalities by which users can obtain, use and interpret data;\r\n(f)\r\n‘comparability’ shall refer to the measurement of the impact of differences in applied statistical concepts and measurement tools and procedures where statistics are compared between geographical areas, sectoral domains or over time;\r\n(g)\r\n‘coherence’ shall refer to the adequacy of the data to be reliably combined in different ways and for various uses.\r\n4.   Every five years, Member States shall provide the Commission (Eurostat) with a report on the quality of the data transmitted as well as on any methodological changes that have been made.\r\n5.   Within six months of receipt of a request from the Commission (Eurostat), and in order to allow it to assess the quality of the data transmitted, Member States shall send to the Commission (Eurostat) a report containing any relevant information concerning the implementation of this Regulation.\r\n"

In [43]:
test[343:348]

'rules'

In [26]:
search_dir = repository('None', 'None', 'None', 'section1', 'article1')
testdir_else = repository('None', 'None', 'chapter1', 'section1', 'article1')
search_dir_list = list(search_dir.__dict__.values())
testdir_else_list = list(testdir_else.__dict__.values())
print(search_dir_list)
print(testdir_else_list)
#print([x for x in search_dir_list if x in testdir_else_list and x  != 'None'])
set([x for x in search_dir_list if x != 'None']).issubset(set(testdir_else_list)) 
        
        

['None', 'None', 'None', 'section1', 'article1']
['None', 'None', 'chapter1', 'section1', 'article1']


True

In [437]:
list(test_dir.__dict__.values())

NameError: name 'test_dir' is not defined

In [21]:
stat_df.loc[stat_df['Policy'] == 'EU_32019R0631_Title_0_Chapter_0_Section_0_Article_16.','Tags'].iloc[0]

IndexError: single positional indexer is out-of-bounds

In [31]:
text = stat_df.loc[stat_df['Policy'] == 'EU_32008R1099_Title_0_Chapter_0_Section_0_Article_06','Text'].iloc[0]

In [35]:
text[832:841]

's of esti'

In [94]:
subset = stat_df['Tags'][0:2]
for set_ in subset:
    for entry in set_:
        print(entry.class_)




Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity
Technologyandapplicationspecificity


In [97]:
[x for b in subset for x in b if x.class_ == 'Technologyandapplicationspecificity'] 

[<__main__.tag at 0x7ffa858349d0>,
 <__main__.tag at 0x7ffac3078910>,
 <__main__.tag at 0x7ffac3078310>,
 <__main__.tag at 0x7ffac3078790>,
 <__main__.tag at 0x7ffac3078b90>,
 <__main__.tag at 0x7ffac3078490>,
 <__main__.tag at 0x7ffac30786d0>,
 <__main__.tag at 0x7ffac3078c90>,
 <__main__.tag at 0x7ffac3078b50>,
 <__main__.tag at 0x7ffac3078a90>,
 <__main__.tag at 0x7ffac3078890>,
 <__main__.tag at 0x7ffac30787d0>,
 <__main__.tag at 0x7ffac3078810>,
 <__main__.tag at 0x7ffac3078ed0>,
 <__main__.tag at 0x7ffa85834f50>,
 <__main__.tag at 0x7ffa85834f90>,
 <__main__.tag at 0x7ffa85834ed0>,
 <__main__.tag at 0x7ffa85834d50>,
 <__main__.tag at 0x7ffa85834610>,
 <__main__.tag at 0x7ffa85834e90>,
 <__main__.tag at 0x7ffa85834910>,
 <__main__.tag at 0x7ffa85834650>,
 <__main__.tag at 0x7ffa858348d0>,
 <__main__.tag at 0x7ffa85834590>,
 <__main__.tag at 0x7ffa85834950>,
 <__main__.tag at 0x7ffa85834990>]

In [110]:
stat_df['']

NameError: name '__main__' is not defined

In [51]:
len([x for x in chain.from_iterable(stat_df['Tags']) if type(x)!= str and x.class_ == 'Technologyandapplicationspecificity']) 

1344

In [20]:
list_ = []
for element in data['_views']['_InitialView']['Token']:
    
    
    

[{'sofa': 1, 'end': 7},
 {'sofa': 1, 'begin': 7, 'end': 8},
 {'sofa': 1, 'begin': 8, 'end': 9},
 {'sofa': 1, 'begin': 11, 'end': 16},
 {'sofa': 1, 'begin': 17, 'end': 25},
 {'sofa': 1, 'begin': 26, 'end': 33},
 {'sofa': 1, 'begin': 34, 'end': 40},
 {'sofa': 1, 'begin': 41, 'end': 47},
 {'sofa': 1, 'begin': 49, 'end': 50},
 {'sofa': 1, 'begin': 50, 'end': 51},
 {'sofa': 1, 'begin': 51, 'end': 54},
 {'sofa': 1, 'begin': 54, 'end': 57},
 {'sofa': 1, 'begin': 58, 'end': 60},
 {'sofa': 1, 'begin': 61, 'end': 65},
 {'sofa': 1, 'begin': 66, 'end': 72},
 {'sofa': 1, 'begin': 73, 'end': 79},
 {'sofa': 1, 'begin': 80, 'end': 83},
 {'sofa': 1, 'begin': 84, 'end': 93},
 {'sofa': 1, 'begin': 94, 'end': 96},
 {'sofa': 1, 'begin': 97, 'end': 100},
 {'sofa': 1, 'begin': 101, 'end': 106},
 {'sofa': 1, 'begin': 107, 'end': 109},
 {'sofa': 1, 'begin': 110, 'end': 115},
 {'sofa': 1, 'begin': 116, 'end': 124},
 {'sofa': 1, 'begin': 125, 'end': 133},
 {'sofa': 1, 'begin': 134, 'end': 136},
 {'sofa': 1, 'beg

In [23]:
[x['begin'] for x in  data['_views']['_InitialView']['Token']]

KeyError: 'begin'

In [21]:
[x['begin'] for x in  data['_views']['_InitialView']['Token'] if int(x['begin']) < 50]

KeyError: 'begin'

In [72]:
list(data['_views']['_InitialView']['Technologyandapplicationspecificity'][0].values())

[1, 57, 88, 'Tech_LowCarbon']

In [74]:
test_df = pd.DataFrame(columns = ['text'], index = np.arange(0,2))
test_df['text'].loc[0] = 'hello peter ist mee'
test_df['text'].loc[1] = 'why I am su sotun'

In [75]:
test_df.head()

Unnamed: 0,text
0,hello peter ist mee
1,why I am su sotun


In [81]:
test_df['text'] = test_df['text'].apply(clean_text)

In [85]:

len(list(chain.from_iterable(test_df['text'])))

9

In [38]:
class Negator(object):
    def __eq__(self,other):
        return not other

thing = Negator()
print(thing == None)    #True
print(thing is None)    #False

True
False


In [199]:
hello = 1 if 1 < 2

SyntaxError: invalid syntax (<ipython-input-199-833455e983d3>, line 1)

In [75]:
all_tokens = data['_views']['_InitialView']['Token']

In [76]:
all_tokens

[{'sofa': 1, 'end': 7, 'begin': 0},
 {'sofa': 1, 'begin': 7, 'end': 8},
 {'sofa': 1, 'begin': 8, 'end': 10},
 {'sofa': 1, 'begin': 12, 'end': 21},
 {'sofa': 1, 'begin': 22, 'end': 24},
 {'sofa': 1, 'begin': 25, 'end': 28},
 {'sofa': 1, 'begin': 29, 'end': 35},
 {'sofa': 1, 'begin': 36, 'end': 42},
 {'sofa': 1, 'begin': 44, 'end': 45},
 {'sofa': 1, 'begin': 45, 'end': 46},
 {'sofa': 1, 'begin': 46, 'end': 49},
 {'sofa': 1, 'begin': 49, 'end': 53},
 {'sofa': 1, 'begin': 54, 'end': 60},
 {'sofa': 1, 'begin': 61, 'end': 66},
 {'sofa': 1, 'begin': 67, 'end': 72},
 {'sofa': 1, 'begin': 73, 'end': 79},
 {'sofa': 1, 'begin': 80, 'end': 81},
 {'sofa': 1, 'begin': 82, 'end': 88},
 {'sofa': 1, 'begin': 89, 'end': 91},
 {'sofa': 1, 'begin': 92, 'end': 95},
 {'sofa': 1, 'begin': 96, 'end': 106},
 {'sofa': 1, 'begin': 107, 'end': 109},
 {'sofa': 1, 'begin': 110, 'end': 118},
 {'sofa': 1, 'begin': 119, 'end': 121},
 {'sofa': 1, 'begin': 122, 'end': 125},
 {'sofa': 1, 'begin': 126, 'end': 135},
 {'sof

In [77]:
all_tokens[0]['begin'] = 0
start = 34
end = 47

In [78]:
[x for x in all_tokens if  x['begin'] >= start and x['end'] <= end]

[{'sofa': 1, 'begin': 36, 'end': 42},
 {'sofa': 1, 'begin': 44, 'end': 45},
 {'sofa': 1, 'begin': 45, 'end': 46}]

In [80]:
ls = [token(x['begin'], x['end'], data['_referenced_fss']['1']['sofaString'][x['begin']:x['end']]) for x in all_tokens if  x['begin'] >= start and x['end'] <= end]

In [86]:
ls[2].text

'.'

In [44]:
[x for x in all_tokens if x['begin'] > 34]

    

[{'sofa': 1, 'begin': 41, 'end': 47},
 {'sofa': 1, 'begin': 49, 'end': 50},
 {'sofa': 1, 'begin': 50, 'end': 51}]

In [57]:
data['_referenced_fss']['1']['sofaString'][34:47]

'Member States\r'

In [51]:
data['_views']['_InitialView']['Policydesigncharacteristics']

[{'sofa': 1, 'begin': 34, 'end': 47, 'Actor': 'Addressee_default'},
 {'sofa': 1, 'begin': 66, 'end': 79, 'Actor': 'Addressee_default'},
 {'sofa': 1, 'begin': 247, 'end': 264, 'Actor': 'Addressee_sector'},
 {'sofa': 1, 'begin': 272, 'end': 285, 'Actor': 'Addressee_monitored'},
 {'sofa': 1, 'begin': 303, 'end': 313, 'Actor': 'Authority_monitoring'},
 {'sofa': 1, 'begin': 345, 'end': 658, 'Resource': 'Resource_Other'},
 {'sofa': 1, 'begin': 635, 'end': 658, 'Objective': 'Objective_QualIntention'},
 {'sofa': 1, 'begin': 670, 'end': 682, 'Actor': 'Addressee_resource'},
 {'sofa': 1, 'begin': 703, 'end': 723, 'Compliance': 'Form_monitoring'},
 {'sofa': 1, 'begin': 828, 'end': 903, 'Compliance': 'Form_monitoring'},
 {'sofa': 1, 'begin': 911, 'end': 1107, 'Compliance': 'Form_monitoring'},
 {'sofa': 1, 'begin': 911, 'end': 1011, 'Compliance': 'Form_monitoring'},
 {'sofa': 1, 'begin': 947, 'end': 1083, 'Resource': 'Resource_Other'},
 {'sofa': 1,
  'begin': 1060,
  'end': 1083,
  'Objective': 'Obj

In [58]:
text = 'hello'
text[:2]

'he'

In [61]:
token

{'sofa': 1, 'begin': 8, 'end': 9}

__main__.token

In [40]:
token_1 = token(2,3,'hello')
token_2 = token(3,4,'peter')

In [41]:
list_l = []
list_l.append(token_1)
list_l.append(token_2)


In [42]:
list_l

[start:2 stop:3 text:hello, start:3 stop:4 text:peter]

In [43]:
list_l[0].text = 'sick'

In [44]:
token_1

start:2 stop:3 text:sick