In [1]:
#load data

In [2]:
import os

LABELS_JSON_FILENAME = "labels.json"
DOCUMENTS_DOCX_FOLDER = 'Training Set for Hugo'
TEXTFILES_FOLDER = os.path.join(os.getcwd(), 'documents_txt')
RAW_DATA_JSON_FILE = 'raw_data.json'
PREPROCESSED_JSON_FILE = 'raw_data.json'

In [3]:
from csv import DictReader

def read_labels(filename:str = "labels.csv"):
    data = []

    with open(filename, 'rt') as f:
        reader = DictReader(f, delimiter='\t')
        for line in reader:
            data.append(line)
            
    print(f"{len(data)} lines loaded")
    return data
            
def process_labels(labels):
    for line in labels:
        if not line['Document'].endswith(".docx"):
            line["Document"] = line["Document"].strip() + ".docx"
        
        line['Dimension 1st'] = str(int(line['Dimension 1st']))
        line['Dimension 2nd'] = str(int(line['Dimension 2nd']))
    

In [4]:
label_names = {
    "1": "Technical",
    "2": "Performative",
    "3": "Procedural",
    "4": "Moral"
}

In [5]:
import json

labels = read_labels()
process_labels(labels)
with open(LABELS_JSON_FILENAME, 'wt') as out:
    json.dump(labels, out)
    
!ls

1281 lines loaded
'Coding for Hugo.xlsx'	 labels.csv		  raw_data.json
 documents_txt		 labels.json		 'Training Set for Hugo'
 dpc2vec.ipynb		 Machine_Learning.ipynb
 experiments_results	 pycaret.ipynb


In [6]:
import json

with open(LABELS_JSON_FILENAME, 'rt') as f:
    labels = json.load(f)
    
print(len(labels))
print(labels[:5])

1281
[{'Document': 'ACER 2019 1.docx', 'Dimension 1st': '1', 'Dimension 2nd': '1', '': ''}, {'Document': 'ACER 2019 2.docx', 'Dimension 1st': '1', 'Dimension 2nd': '2', '': ''}, {'Document': 'ACER 2019 3.docx', 'Dimension 1st': '2', 'Dimension 2nd': '3', '': ''}, {'Document': 'ACER 2019 4.docx', 'Dimension 1st': '1', 'Dimension 2nd': '1', '': ''}, {'Document': 'ACER 2019 5.docx', 'Dimension 1st': '3', 'Dimension 2nd': '3', '': ''}]


In [7]:
import docx
from tqdm import tqdm_notebook

documents_docx_list = [os.path.join(DOCUMENTS_DOCX_FOLDER, filename) for filename in os.listdir(DOCUMENTS_DOCX_FOLDER)]

def getText(filename):
    """
    Extract text from a docx file
    """
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return '\n'.join(fullText)

def create_textfilename(docx_file_name):
    """
    Generate a filename of the txt file based on the filename of the docx file
    """
    pathlist = list(os.path.split(docx_file_name))
    pathlist[-2] = TEXTFILES_FOLDER
    newpath = os.path.join(*pathlist)
    filename, file_extension = os.path.splitext(newpath)
    newfilenm = filename + '.txt'
    return newfilenm

def read_texts():
    """
    Read the text from all docx files
    Returns a dict with keys = docx filenames and values are texts
    Also stores all texts as .txt
    """
    texts = {}
    for filename in tqdm_notebook(documents_docx_list):
        if '~$' in filename:
            continue
        text = getText(filename)
        textfilenm = create_textfilename(filename)
        filenm_short = os.path.splitext(os.path.split(textfilenm)[-1])[0]
        texts[filenm_short] = text
        
        with open(textfilenm, 'wt') as out:
            out.write(text)
        
    return texts
        
texts = read_texts()        
print(len(texts.keys()))
print(list(texts.items())[:3])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for filename in tqdm_notebook(documents_docx_list):


HBox(children=(FloatProgress(value=0.0, max=1307.0), HTML(value='')))


1281
[('ECSEL JU 2019 6', 'Advanced Metrology & Characterisation for 3D CMOS\n\nAs nano-electronics technology is moving beyond the boundaries of (strained) silicon in planar or finFETs, new 3D device architectures and new materials bring major metrology and characterization challenges which cannot be met by pushing the present techniques to their limits. 3DAM “3D Advanced Metrology and materials for advanced devices” is an EU-ECSELfunded pathfinding and assessment project focusing on innovations and progress in metrology and characterization related to the latest generation of 3D front-end of line (FEOL) and back-end of line (BEOL) structures (fins, nanowires, TSVs) as well as 2D materials:\n\n• Dimensional metrology: 3D-SPM, CD-SEM, OCD\n• Structural analysis: Electron Tomography, PL & CL, SHG, X-ray NanoCT\n• Compositional/dopant analysis: SIMS, APT, STEM-EDX and EELS, IRR, Raman, HRXRD\n• Carrier distribution and mobility: 3D-SSRM, micro-multi-point probes, THz spectroscopy\n• Str

In [8]:
from tqdm import tqdm_notebook

for label_line in tqdm_notebook(labels):
    document_name = label_line["Document"]
    document_id = os.path.splitext(document_name)[0]
    label_line['id'] = document_id
    
    text = texts[document_id]
    label_line['text'] = text

raw_data = {line['id']: line for line in labels}
with open(RAW_DATA_JSON_FILE, 'wt') as out:
    json.dump(raw_data, out)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for label_line in tqdm_notebook(labels):


HBox(children=(FloatProgress(value=0.0, max=1281.0), HTML(value='')))




In [9]:
institutions = set()
for key in raw_data.keys():
    institution = key.split(' ')[0].lower()
    institutions.add(institution)
    
print(institutions)
#TODO: add more institutions

{'cedefop', 'eurojust', 'eea', 'enisa', 'ecdc', 'satcen', 'f4e', 'eba', 'bbi', 'eiopa', 'ema', 'fch', 'cdt', 'europol', 'easo', 'efca', 'clean', 'frontex', 's2r', 'eit', 'etf', 'ecsel', 'eda', 'imi', 'acer', 'echa', 'cepol', 'fra', 'berec', 'era', 'emcdda', 'gsa', 'efsa', 'eu-osha', 'emsa', 'cvpo', 'sesar', 'eu-lisa', 'esma', 'easa', 'euipo', 'eige', 'sbr', 'eurofound'}


In [10]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re
stopWords = set(stopwords.words('english'))


remove_digits = str.maketrans(' ', ' ', string.digits)
remove_punct = str.maketrans(' ', ' ', string.punctuation)
# remove_stopwords = str.maketrans(' ', ' ', stopWords)



lemmatizer = WordNetLemmatizer()

def process_word(word:str):
    word = word.lower()
    word = lemmatizer.lemmatize(word)
    
    return word

# def remove_word(word):
#     if word in stopWords:
#         return True
#     elif word.startswith('\\u'):
#         return True
#     elif word in "!@#$%*()_+-=?/<,>.":
#         return True
#     return False

def preprocess_text(text):
    text = text.lower()
    text = text.translate(remove_punct)
    text = text.translate(remove_digits)
#     text = text.translate(remove_stopwords)
    text = re.sub(r'\u200b', ' ', text)
    text = re.sub(r'\n', ' ', text)
    wordlist = word_tokenize(text)
    wordlist = [lemmatizer.lemmatize(word) for word in wordlist if not (word in stopWords or word in institutions)]
    
    processed_text = ' '.join(wordlist)
    
    return processed_text
#     input(wordlist)
    


In [11]:
for identifier, data_line in tqdm_notebook(raw_data.items()):
    text = data_line['text']
    processed_text = preprocess_text(text)
    raw_data[identifier]['processed_text']=processed_text
    
print(list(raw_data.items())[0])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for identifier, data_line in tqdm_notebook(raw_data.items()):


HBox(children=(FloatProgress(value=0.0, max=1281.0), HTML(value='')))


('ACER 2019 1', {'Document': 'ACER 2019 1.docx', 'Dimension 1st': '1', 'Dimension 2nd': '1', '': '', 'id': 'ACER 2019 1', 'text': "ACER analyses national methodologies for electricity transmission tariffs in the EU\n\n23/12/2019\nThe European Union Agency for the Cooperation of Energy Regulators (ACER) released today a report reviewing the status of methodologies for electricity transmission tariffs in the European Union Member States. \n\nThis report\u200b is a contribution towards the Clean Energy Package objective to increase transparency and comparability in tariff-setting. The report also presents the findings of ACER's monitoring of transmission charges paid by producers. The status review of the distribution tariff structures and the identification of best tariff practices will be subject to future ACER reports. Find below the main findings in different areas.\n\nMethodology setting\n\nIn all but three jurisdictions/countries the corresponding National Regulatory Authority (NRA

In [12]:
list(raw_data.items())[10][1].keys()

dict_keys(['Document', 'Dimension 1st', 'Dimension 2nd', '', 'id', 'text', 'processed_text'])

In [13]:
import pandas as pd

pandas_dict = {
    'Document':[], 
    'Dimension 1st':[], 
    'Dimension 2nd':[], 
    'id':[], 
    'text':[], 
    'processed_text':[]
}

for i_d, dd in tqdm_notebook(raw_data.items()):
    pandas_dict["Document"].append(dd['Document'])
    pandas_dict['Dimension 1st'].append(dd['Dimension 1st'])
    pandas_dict["Dimension 2nd"].append(dd['Dimension 2nd'])
    pandas_dict["id"].append(dd['id'])
    pandas_dict["text"].append(dd['text'])
    pandas_dict["processed_text"].append(dd['processed_text'])

data_df = pd.DataFrame.from_dict(pandas_dict)   

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i_d, dd in tqdm_notebook(raw_data.items()):


HBox(children=(FloatProgress(value=0.0, max=1281.0), HTML(value='')))




In [14]:
data_df.head()

Unnamed: 0,Document,Dimension 1st,Dimension 2nd,id,text,processed_text
0,ACER 2019 1.docx,1,1,ACER 2019 1,ACER analyses national methodologies for elect...,analysis national methodology electricity tran...
1,ACER 2019 2.docx,1,2,ACER 2019 2,Capacity mechanisms: ACER publishes technical ...,capacity mechanism publishes technical guidanc...
2,ACER 2019 3.docx,2,3,ACER 2019 3,Jochen Penker and Jurijs Spiridonovs elected C...,jochen penker jurijs spiridonovs elected chair...
3,ACER 2019 4.docx,1,1,ACER 2019 4,ACER publishes its annual monitoring report on...,publishes annual monitoring report intertransm...
4,ACER 2019 5.docx,3,3,ACER 2019 5,ACER to launch three public consultations on i...,launch three public consultation implementatio...


In [24]:
import pycaret.nlp as pcnlp

In [21]:
mystopwords = list(set(list(stopWords) + list(institutions)))

In [22]:
nlp1 = pcnlp.setup(data=data_df, target = 'text', custom_stopwords = mystopwords)

Description,Value
session_id,8372
# Documents,1281
Vocab Size,10187
Custom Stopwords,True


In [50]:
m1 = pcnlp.create_model("lda", num_topics=10)
lda_data = assign_model(m1)

In [51]:
m2 = pcnlp.create_model("nmf", num_topics=10)
nmf_data = assign_model(m2)

In [52]:
pcnlp.evaluate_model(m1)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Frequency Plot', 'freque…

In [53]:
lda_data.drop(['id', 'Dimension 2nd', 'Dominant_Topic','Perc_Dominant_Topic'], axis=1, inplace=True)
lda_data.head()

Unnamed: 0,Document,Dimension 1st,text,processed_text,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9
0,ACER 2019 1.docx,1,analyse cooperation_energy_regulator release t...,analysis national methodology electricity tran...,0.000134,0.01452,0.489396,0.006608,0.000256,9.7e-05,6.7e-05,0.375955,0.097181,0.015785
1,ACER 2019 2.docx,1,publish technical guidance co emission_limit m...,capacity mechanism publishes technical guidanc...,0.000199,0.081519,0.558031,0.063816,0.000379,0.000143,0.0001,0.131481,0.154975,0.009358
2,ACER 2019 3.docx,2,cooperation_energy_regulator elect unanimously...,jochen penker jurijs spiridonovs elected chair...,0.011486,0.002228,0.146636,0.007109,0.000308,0.000117,8.1e-05,0.249889,0.065237,0.51691
3,ACER 2019 4.docx,1,publish annual monitoring report operator comp...,publishes annual monitoring report intertransm...,0.000341,0.004891,0.281995,0.014042,0.00065,0.000246,0.000171,0.633542,0.001033,0.063091
4,ACER 2019 5.docx,3,launch public_consultation implementation ener...,launch three public consultation implementatio...,0.000244,0.034972,0.570525,0.091693,0.000466,0.000176,0.000123,0.167475,0.075043,0.059282


In [54]:
import pycaret.classification as pcclass

In [55]:
pce_1 = pcclass.setup(data=lda_data, target="Dimension 1st", session_id=5, train_size=.85)

 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,5
1,Target Type,Multiclass
2,Label Encoded,
3,Original Data,"(1281, 14)"
4,Missing Values,False
5,Numeric Features,10
6,Categorical Features,3
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [56]:
pcclass.compare_models()

IntProgress(value=0, description='Processing: ', max=170)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Gradient Boosting Classifier,0.6333,0.0,0.4143,0.6016,0.5891,0.3161
1,Logistic Regression,0.6094,0.0,0.3553,0.5498,0.5325,0.2259
2,Ridge Classifier,0.6057,0.0,0.3497,0.5268,0.5202,0.2182
3,Ada Boost Classifier,0.5993,0.0,0.4121,0.5773,0.569,0.2821
4,SVM - Linear Kernel,0.5928,0.0,0.3766,0.5206,0.5336,0.2486
5,Random Forest Classifier,0.5818,0.0,0.3722,0.5531,0.5387,0.2218
6,K Neighbors Classifier,0.5138,0.0,0.3604,0.4955,0.4984,0.1647
7,Decision Tree Classifier,0.4642,0.0,0.402,0.556,0.4926,0.1936
8,Quadratic Discriminant Analysis,0.1958,0.0,0.2516,0.1482,0.1222,0.0024
9,Naive Bayes,0.0588,0.0,0.2549,0.0682,0.0127,0.0031


LinAlgError: SVD did not converge

In [60]:
from pycaret.classification import *

In [61]:
dir()

['DOCUMENTS_DOCX_FOLDER',
 'DictReader',
 'In',
 'LABELS_JSON_FILENAME',
 'Out',
 'PREPROCESSED_JSON_FILE',
 'RAW_DATA_JSON_FILE',
 'TEXTFILES_FOLDER',
 'WordNetLemmatizer',
 'X',
 'X_test',
 'X_train',
 '_',
 '_12',
 '_14',
 '_30',
 '_34',
 '_41',
 '_46',
 '_53',
 '_59',
 '__',
 '___',
 '__builtin__',
 '__builtins__',
 '__doc__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_dh',
 '_exit_code',
 '_i',
 '_i1',
 '_i10',
 '_i11',
 '_i12',
 '_i13',
 '_i14',
 '_i15',
 '_i16',
 '_i17',
 '_i18',
 '_i19',
 '_i2',
 '_i20',
 '_i21',
 '_i22',
 '_i23',
 '_i24',
 '_i25',
 '_i26',
 '_i27',
 '_i28',
 '_i29',
 '_i3',
 '_i30',
 '_i31',
 '_i32',
 '_i33',
 '_i34',
 '_i35',
 '_i36',
 '_i37',
 '_i38',
 '_i39',
 '_i4',
 '_i40',
 '_i41',
 '_i42',
 '_i43',
 '_i44',
 '_i45',
 '_i46',
 '_i47',
 '_i48',
 '_i49',
 '_i5',
 '_i50',
 '_i51',
 '_i52',
 '_i53',
 '_i54',
 '_i55',
 '_i56',
 '_i57',
 '_i58',
 '_i59',
 '_i6',
 '_i60',
 '_i61',
 '_i7',
 '_i8',
 '_i9',
 '_ih',
 '_ii',
 '_iii',
 '_oh',
 'assig