# Organisational Reputation and ML

## Constants and preparations

### Constants

In [1]:
import os

LABELS_JSON_FILENAME = "labels2.json"
# DOCUMENTS_DOCX_FOLDER = 'Training Set for Hugo'
DOCUMENTS_DOCX_FOLDER = '/home/hugo/MEGA/work/Agency_Classification/Teaching_sample_2019_34_agencies/Teaching sample 2019 34 agencies'
TEXTFILES_FOLDER = os.path.join(os.getcwd(), 'documents_txt')
RAW_DATA_JSON_FILE = 'raw_data.json'
PREPROCESSED_JSON_FILE = 'raw_data.json'

### Preparations

In [2]:
if not os.path.exists(TEXTFILES_FOLDER):
    os.makedirs(TEXTFILES_FOLDER)

## Read the labels

In [3]:
!ls

 agency_counts.csv			 label_agency_heatmap.png
 agency_label_crosstab.csv		 label_counts.csv
 agency_label_crosstab_percentages.csv	 labels2.json
 all_data.json				 labels.csv
 analyze.ipynb				 labels.json
'Coding for Hugo.xlsx'			 Machine_Learning.ipynb
 confidence_pivot.csv			 make_new_labels_file.ipynb
 confidence_pivot_only_agencies.csv	 pycaret.ipynb
 confidence_pivot_only_labels.csv	 raw_data.json
 data_with_labels.csv			 regulatory_vs_non_regulatory.csv
 documents_txt				 regulatory_vs_non_regulatory.png
 dpc2vec.ipynb				 results
 evaluation_pivot.csv			 SVM_trained.pk
 evaluation_pivot_only_agencies.csv	'Training Set for Hugo'
 evaluation_pivot_only_labels.csv	 vectorizer.pk
 experiments_results			 year_label_heatmap.png


In [4]:
from csv import DictReader

def read_labels(filename:str = "labels.csv"):
    data = []

    with open(filename, 'rt') as f:
        reader = DictReader(f, delimiter='\t')
        for line in reader:
            line['Dimension 1st'] = line['Dimension 1']
            line['Dimension 2nd'] = line['Dimension 2']
            
            del line['Dimension 1']
            del line['Dimension 2']
            
            data.append(line)
            
    print(f"{len(data)} lines loaded")
    return data
            
def process_labels(labels):
    for line in labels:
        if not line['Document'].endswith(".docx"):
            line["Document"] = line["Document"].strip() + ".docx"
        
        line['Dimension 1st'] = str(int(line['Dimension 1st']))
        line['Dimension 2nd'] = str(int(line['Dimension 2nd']))
    

In [5]:
label_names = {
    "1": "Technical",
    "2": "Performative",
    "3": "Procedural",
    "4": "Moral"
}

In [6]:
import os

labels_file = "labels_FINAL.csv"
labels_folder = "/home/hugo/MEGA/work/Agency_Classification"
labels_file_path = os.path.join(labels_folder, labels_file)
assert os.path.exists(labels_file_path)

In [7]:
import json

labels = read_labels(labels_file_path)
labels

1302 lines loaded


[OrderedDict([('Document', 'ACER 2019 1.docx'),
              ('Dimension 1st', '1'),
              ('Dimension 2nd', '1')]),
 OrderedDict([('Document', 'ACER 2019 2.docx'),
              ('Dimension 1st', '1'),
              ('Dimension 2nd', '2')]),
 OrderedDict([('Document', 'ACER 2019 3.docx'),
              ('Dimension 1st', '2'),
              ('Dimension 2nd', '3')]),
 OrderedDict([('Document', 'ACER 2019 4.docx'),
              ('Dimension 1st', '1'),
              ('Dimension 2nd', '1')]),
 OrderedDict([('Document', 'ACER 2019 5.docx'),
              ('Dimension 1st', '3'),
              ('Dimension 2nd', '3')]),
 OrderedDict([('Document', 'ACER 2019 6.docx'),
              ('Dimension 1st', '3'),
              ('Dimension 2nd', '3')]),
 OrderedDict([('Document', 'ACER 2019 7.docx'),
              ('Dimension 1st', '2'),
              ('Dimension 2nd', '1')]),
 OrderedDict([('Document', 'ACER 2019 8.docx'),
              ('Dimension 1st', '2'),
              ('Dimension 2nd', 

In [8]:
import json

labels = read_labels(labels_file_path)

process_labels(labels)
with open(LABELS_JSON_FILENAME, 'wt') as out:
    json.dump(labels, out)

    assert os.path.exists(LABELS_JSON_FILENAME), "The file must be created."
!ls

1302 lines loaded
 agency_counts.csv			 label_agency_heatmap.png
 agency_label_crosstab.csv		 label_counts.csv
 agency_label_crosstab_percentages.csv	 labels2.json
 all_data.json				 labels.csv
 analyze.ipynb				 labels.json
'Coding for Hugo.xlsx'			 Machine_Learning.ipynb
 confidence_pivot.csv			 make_new_labels_file.ipynb
 confidence_pivot_only_agencies.csv	 pycaret.ipynb
 confidence_pivot_only_labels.csv	 raw_data.json
 data_with_labels.csv			 regulatory_vs_non_regulatory.csv
 documents_txt				 regulatory_vs_non_regulatory.png
 dpc2vec.ipynb				 results
 evaluation_pivot.csv			 SVM_trained.pk
 evaluation_pivot_only_agencies.csv	'Training Set for Hugo'
 evaluation_pivot_only_labels.csv	 vectorizer.pk
 experiments_results			 year_label_heatmap.png


In [9]:
import json

with open(LABELS_JSON_FILENAME, 'rt') as f:
    labels = json.load(f)
    
print(len(labels))
print(labels[:5])

1302
[{'Document': 'ACER 2019 1.docx', 'Dimension 1st': '1', 'Dimension 2nd': '1'}, {'Document': 'ACER 2019 2.docx', 'Dimension 1st': '1', 'Dimension 2nd': '2'}, {'Document': 'ACER 2019 3.docx', 'Dimension 1st': '2', 'Dimension 2nd': '3'}, {'Document': 'ACER 2019 4.docx', 'Dimension 1st': '1', 'Dimension 2nd': '1'}, {'Document': 'ACER 2019 5.docx', 'Dimension 1st': '3', 'Dimension 2nd': '3'}]


## Convert texts to txt

In [10]:
import docx
from tqdm import tqdm_notebook

documents_docx_list = [os.path.join(DOCUMENTS_DOCX_FOLDER, filename) for filename in os.listdir(DOCUMENTS_DOCX_FOLDER)]

def getText(filename):
    """
    Extract text from a docx file
    """
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return '\n'.join(fullText)

def create_textfilename(docx_file_name):
    """
    Generate a filename of the txt file based on the filename of the docx file
    """
    pathlist = list(os.path.split(docx_file_name))
    pathlist[-2] = TEXTFILES_FOLDER
    newpath = os.path.join(*pathlist)
    filename, file_extension = os.path.splitext(newpath)
    newfilenm = filename + '.txt'
    return newfilenm

def read_texts():
    """
    Read the text from all docx files
    Returns a dict with keys = docx filenames and values are texts
    Also stores all texts as .txt
    """
    texts = {}
    for filename in tqdm_notebook(documents_docx_list):
        if '~$' in filename:
            continue
        text = getText(filename)
        textfilenm = create_textfilename(filename)
        filenm_short = os.path.splitext(os.path.split(textfilenm)[-1])[0]
        texts[filenm_short] = text
        
        with open(textfilenm, 'wt') as out:
            out.write(text)
        
    return texts
        
texts = read_texts()        
print(len(texts.keys()))
print(list(texts.items())[:3])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/1302 [00:00<?, ?it/s]

1302
[('ESMA 2019 93', 'ESMA CONSULTS ON THE COMMERCIAL TERMS FOR PROVIDING CLIENT CLEARING SERVICES UNDER EMIR\n03 October 2019\xa0\nPOST TRADING\nThe European Securities and Markets Authority (ESMA), the EU’s securities markets regulator, has today published a\xa0\xa0(CP) on draft technical advice to the European Commission (EC) on specifying the conditions under which commercial terms are to be considered fair, reasonable, non-discriminatory and transparent (FRANDT) where clearing service providers offer clearing services to clients.\nDuring the implementation of EMIR’s clearing obligation, several counterparties have experienced issues around access to clearing. In response to this access issue EMIR Refit has introduced a number of measures to address it, including the FRANDT requirements.\nThe aim of FRANDT for client clearing services\nThe FRANDT requirements are part of a broader set of regulatory efforts to enhance access to clearing. The proposal on how to specify the conditio

## Combine text and labels

In [11]:
from tqdm import tqdm_notebook

for label_line in tqdm_notebook(labels):
    document_name = label_line["Document"]
    document_id = os.path.splitext(document_name)[0]
    label_line['id'] = document_id
    
    text = texts[document_id]
    label_line['text'] = text

raw_data = {line['id']: line for line in labels}
with open(RAW_DATA_JSON_FILE, 'wt') as out:
    json.dump(raw_data, out)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/1302 [00:00<?, ?it/s]

## Preprocess Texts

In [12]:
institutions = set()
for key in raw_data.keys():
    institution = key.split(' ')[0].lower()
    institutions.add(institution)
    
print(institutions)
#TODO: add more institutions

{'cdt', 'emsa', 'eit', 'eurofound', 'efsa', 'eba', 'easo', 'eige', 'euipo', 'efca', 'etf', 'acer', 'easa', 'fra', 'cvpo', 'emcdda', 'ema', 'cepol', 'frontex', 'eu-lisa', 'sbr', 'eu-osha', 'echa', 'eea', 'esma', 'eiopa', 'era', 'ecdc', 'europol', 'eurojust', 'enisa', 'eda', 'cedefop', 'berec'}


In [13]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re
stopWords = set(stopwords.words('english'))


remove_digits = str.maketrans(' ', ' ', string.digits)
remove_punct = str.maketrans(' ', ' ', string.punctuation)
# remove_stopwords = str.maketrans(' ', ' ', stopWords)



lemmatizer = WordNetLemmatizer()

def process_word(word:str):
    word = word.lower()
    word = lemmatizer.lemmatize(word)
    
    return word

# def remove_word(word):
#     if word in stopWords:
#         return True
#     elif word.startswith('\\u'):
#         return True
#     elif word in "!@#$%*()_+-=?/<,>.":
#         return True
#     return False

def preprocess_text(text):
    text = text.lower()
    text = text.translate(remove_punct)
    text = text.translate(remove_digits)
#     text = text.translate(remove_stopwords)
    text = re.sub(r'\u200b', ' ', text)
    text = re.sub(r'\n', ' ', text)
    wordlist = word_tokenize(text)
    wordlist = [lemmatizer.lemmatize(word) for word in wordlist if not (word in stopWords or word in institutions)]
    
    processed_text = ' '.join(wordlist)
    
    return processed_text
#     input(wordlist)
    


In [14]:
for identifier, data_line in tqdm_notebook(raw_data.items()):
    text = data_line['text']
    processed_text = preprocess_text(text)
    raw_data[identifier]['processed_text']=processed_text
    
print(list(raw_data.items())[0])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


  0%|          | 0/1302 [00:00<?, ?it/s]

('ACER 2019 1', {'Document': 'ACER 2019 1.docx', 'Dimension 1st': '1', 'Dimension 2nd': '1', 'id': 'ACER 2019 1', 'text': "ACER analyses national methodologies for electricity transmission tariffs in the EU\n\n23/12/2019\nThe European Union Agency for the Cooperation of Energy Regulators (ACER) released today a report reviewing the status of methodologies for electricity transmission tariffs in the European Union Member States. \n\nThis report\u200b is a contribution towards the Clean Energy Package objective to increase transparency and comparability in tariff-setting. The report also presents the findings of ACER's monitoring of transmission charges paid by producers. The status review of the distribution tariff structures and the identification of best tariff practices will be subject to future ACER reports. Find below the main findings in different areas.\n\nMethodology setting\n\nIn all but three jurisdictions/countries the corresponding National Regulatory Authority (NRA) directl

In [15]:
import json

with open(PREPROCESSED_JSON_FILE, 'wt') as out:
    json.dump(raw_data, out)
    assert os.path.exists(PREPROCESSED_JSON_FILE), "Json file must be created."

In [16]:
!ls

 agency_counts.csv			 label_agency_heatmap.png
 agency_label_crosstab.csv		 label_counts.csv
 agency_label_crosstab_percentages.csv	 labels2.json
 all_data.json				 labels.csv
 analyze.ipynb				 labels.json
'Coding for Hugo.xlsx'			 Machine_Learning.ipynb
 confidence_pivot.csv			 make_new_labels_file.ipynb
 confidence_pivot_only_agencies.csv	 pycaret.ipynb
 confidence_pivot_only_labels.csv	 raw_data.json
 data_with_labels.csv			 regulatory_vs_non_regulatory.csv
 documents_txt				 regulatory_vs_non_regulatory.png
 dpc2vec.ipynb				 results
 evaluation_pivot.csv			 SVM_trained.pk
 evaluation_pivot_only_agencies.csv	'Training Set for Hugo'
 evaluation_pivot_only_labels.csv	 vectorizer.pk
 experiments_results			 year_label_heatmap.png


## Make DTM

In [17]:
import numpy as np
texts = []
y_1 = []
y_2 = []

for identifier, data_line in tqdm_notebook(raw_data.items()):
    texts.append(data_line.get('processed_text'))
    y_1.append(data_line.get("Dimension 1st"))
    y_2.append(data_line.get("Dimension 2nd"))

y_1 = np.array(y_1, dtype = np.int16)
y_2 = np.array(y_2, dtype = np.int16)
texts = np.array(texts)

y1_all = y_1


y_1

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/1302 [00:00<?, ?it/s]

array([1, 1, 2, ..., 3, 1, 2], dtype=int16)

In [18]:
from collections import Counter

In [19]:
count = Counter(y_1)
count

Counter({1: 328, 2: 547, 3: 302, 4: 125})

In [20]:
np.array(list(count.values()))/sum(list(count.values()))

array([0.25192012, 0.42012289, 0.23195084, 0.09600614])

#### Shuffle

In [21]:
# np.random.seed(1)
# random_array = np.random.rand(len(texts))

# indices = np.argsort(random_array)

# texts_ = texts[indices]
# y_1 = y_1[indices]
# y_2 = y_2[indices]
# print(len(texts))

### Separate Train-test

In [22]:
from sklearn.model_selection import train_test_split

texts_train, texts_test, y1_train, y1_test, y2_train, y2_test = train_test_split(texts, y_1, y_2, train_size=0.90, random_state=42)

assert len(texts_train) == len(y1_train) == len(y2_train)
assert len(texts_test) == len(y1_test) == len(y2_test)


In [23]:
y_1

array([1, 1, 2, ..., 3, 1, 2], dtype=int16)

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(sublinear_tf = True)
X_train = vectorizer.fit_transform(texts_train)

print(X_train.shape)

(1171, 14744)


#### Store vectorizer

In [25]:
import pickle

with open('vectorizer.pk', 'wb') as fin:
    pickle.dump(vectorizer, fin)

In [26]:
ls

 agency_counts.csv                       [0m[01;35mlabel_agency_heatmap.png[0m
 agency_label_crosstab.csv               label_counts.csv
 agency_label_crosstab_percentages.csv   labels2.json
 all_data.json                           labels.csv
 analyze.ipynb                           labels.json
'Coding for Hugo.xlsx'                   Machine_Learning.ipynb
 confidence_pivot.csv                    make_new_labels_file.ipynb
 confidence_pivot_only_agencies.csv      pycaret.ipynb
 confidence_pivot_only_labels.csv        raw_data.json
 data_with_labels.csv                    regulatory_vs_non_regulatory.csv
 [01;34mdocuments_txt[0m/                          [01;35mregulatory_vs_non_regulatory.png[0m
 dpc2vec.ipynb                           [01;34mresults[0m/
 evaluation_pivot.csv                    SVM_trained.pk
 evaluation_pivot_only_agencies.csv     [01;34m'Training Set for Hugo'[0m/
 evaluation_pivot_only_labels.csv        vectorizer.pk
 [01;34mexperiments_r

## Test different models

Experiments:

* SVM 1st dim and SVM 2nd dim
* randm forest 1st dim and 2nd dim
* logreg 1st and 2nd
* multilabel classification

In [27]:
display_cols_svm = ['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_C', 'param_kernel', 'params', 'mean_test_score', 'std_test_score', 'rank_test_score', 'mean_train_score', 'std_train_score']

results_folder = "experiments_results"

In [28]:
if not os.path.exists(results_folder):
    os.makedirs(results_folder)

In [29]:
def make_results_filename(classifier, dim, comment = ""):
    return f"{results_folder}/{classifier}_{dim}_{comment}.csv"

In [30]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

X_test = vectorizer.transform(texts_test)

def evaluate_on_test(X_train, X_test, y_train, y_test, best_params, algorithm):
    print(best_params)
    print("-"*80, end = '\n\n')
    
    final_model = algorithm(**best_params)
    final_model.fit(X= X_train, y = y_train)
    y_pred = final_model.predict(X_test)
    
    print(classification_report(y_test, y_pred))
    print("-"*80,end = '\n\n')
    print(confusion_matrix(y1_test, y_pred))



### SVM

#### Define experiment

In [31]:
import pandas as pd 
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

def svc_experiment(X,y):
    svc = SVC()
    parameters = {'kernel':('linear', 'rbf'), 'C':[10, 100, 1000, 1e4, 1e5, 1e6]} #0.1, 1, 10,

    clf = GridSearchCV(svc, parameters, n_jobs=3, scoring='precision_weighted', cv=10, verbose=10, return_train_score=True)
    clf.fit(X, y)
    
    svm_results = clf.cv_results_
    
    return clf, svm_results


#### 1st dim

In [32]:
svm_y1_cv_model, svm_y1_results_json = svc_experiment(X= X_train, y = y1_train)

svm_y1_results = pd.DataFrame(svm_y1_results_json)

Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    7.9s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:   19.6s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:   28.5s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:   51.8s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  1.4min
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:  2.1min
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:  2.5min
[Parallel(n_jobs=3)]: Done  79 tasks      | elapsed:  2.9min
[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed:  3.3min
[Parallel(n_jobs=3)]: Done 107 tasks      | elapsed:  3.8min
[Parallel(n_jobs=3)]: Done 120 out of 120 | elapsed:  4.2min finished


In [33]:
svm_y1_results[display_cols_svm].sort_values(by="mean_test_score", ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
0,3.772714,0.340485,0.262736,0.04773,10.0,linear,"{'C': 10, 'kernel': 'linear'}",0.712857,0.031661,1,0.994698,0.001213
3,3.752733,0.129603,0.271445,0.051081,100.0,rbf,"{'C': 100, 'kernel': 'rbf'}",0.70967,0.038082,2,0.999337,0.000434
1,4.904482,0.753127,0.349438,0.131618,10.0,rbf,"{'C': 10, 'kernel': 'rbf'}",0.709237,0.03145,3,0.997544,0.000463
5,3.820874,0.171119,0.24682,0.033724,1000.0,rbf,"{'C': 1000, 'kernel': 'rbf'}",0.70874,0.039833,4,1.0,0.0
7,3.643537,0.212249,0.223371,0.023101,10000.0,rbf,"{'C': 10000.0, 'kernel': 'rbf'}",0.70874,0.039833,4,1.0,0.0
9,3.763347,0.101259,0.278425,0.052246,100000.0,rbf,"{'C': 100000.0, 'kernel': 'rbf'}",0.70874,0.039833,4,1.0,0.0
11,3.586352,0.206843,0.233183,0.016701,1000000.0,rbf,"{'C': 1000000.0, 'kernel': 'rbf'}",0.70874,0.039833,4,1.0,0.0
2,4.520019,0.724525,0.270387,0.055505,100.0,linear,"{'C': 100, 'kernel': 'linear'}",0.695526,0.030588,8,0.999243,0.000379
4,3.538601,0.09937,0.23367,0.025096,1000.0,linear,"{'C': 1000, 'kernel': 'linear'}",0.680395,0.019466,9,1.0,0.0
6,3.521207,0.161984,0.24943,0.041298,10000.0,linear,"{'C': 10000.0, 'kernel': 'linear'}",0.680395,0.019466,9,1.0,0.0


In [34]:
outfile = make_results_filename('svm', 1)
print(outfile)

svm_y1_results.to_csv(outfile, sep='\t')

!ls experiments_results

experiments_results/svm_1_.csv
LR_1_.csv  LR_2_.csv  RF_1_.csv  RF_2_.csv  svm_1_.csv	svm_2_.csv


In [35]:
svm_y1_cv_model.best_params_

{'C': 10, 'kernel': 'linear'}

In [36]:


# X_test = vectorizer.transform(texts_test)

# cv_model.best_params_

# final_model = SVC(C = cv_model.best_params_['C'], kernel = cv_model.best_params_['kernel'])
# final_model.fit(X= X_train, y = y1_train)
# y1_pred = final_model.predict(X_test)




In [37]:
# from sklearn.metrics import classification_report
# print(classification_report(y1_test, y1_pred))

In [38]:
# from sklearn.metrics import confusion_matrix
# confusion_matrix(y1_test, y1_pred)

In [39]:
# from sklearn.metrics import classification_report
# from sklearn.metrics import confusion_matrix

# def evaluate_on_test(X_train, X_test, y_train, y_test, best_params):
#     print(best_params)
#     print("-"*80, end = '\n\n')
    
#     final_model = SVC(**best_params)
#     final_model.fit(X= X_train, y = y_train)
#     y_pred = final_model.predict(X_test)
    
#     print(classification_report(y_test, y_pred))
#     print("-"*80,end = '\n\n')
#     print(confusion_matrix(y1_test, y1_pred))



In [40]:
evaluate_on_test(X_train, X_test, y1_train, y1_test, svm_y1_cv_model.best_params_, algorithm=SVC)

{'C': 10, 'kernel': 'linear'}
--------------------------------------------------------------------------------

              precision    recall  f1-score   support

           1       0.78      0.85      0.82        34
           2       0.70      0.81      0.75        52
           3       0.81      0.60      0.69        35
           4       0.62      0.50      0.56        10

    accuracy                           0.74       131
   macro avg       0.73      0.69      0.70       131
weighted avg       0.74      0.74      0.74       131

--------------------------------------------------------------------------------

[[29  3  1  1]
 [ 5 42  4  1]
 [ 2 11 21  1]
 [ 1  4  0  5]]


### Train and store the svm 

In [41]:
X_all = vectorizer.transform(texts)
X_all.shape

(1302, 14744)

In [42]:
y1_all.shape

(1302,)

In [43]:
from sklearn.svm import SVC

svc = SVC(probability=True, **svm_y1_cv_model.best_params_)
svc.fit(X_all, y1_all)

SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [44]:
import pickle
with open("SVM_trained.pk", "wb") as out:
    pickle.dump(svc, out)

In [45]:
ls

 agency_counts.csv                       [0m[01;35mlabel_agency_heatmap.png[0m
 agency_label_crosstab.csv               label_counts.csv
 agency_label_crosstab_percentages.csv   labels2.json
 all_data.json                           labels.csv
 analyze.ipynb                           labels.json
'Coding for Hugo.xlsx'                   Machine_Learning.ipynb
 confidence_pivot.csv                    make_new_labels_file.ipynb
 confidence_pivot_only_agencies.csv      pycaret.ipynb
 confidence_pivot_only_labels.csv        raw_data.json
 data_with_labels.csv                    regulatory_vs_non_regulatory.csv
 [01;34mdocuments_txt[0m/                          [01;35mregulatory_vs_non_regulatory.png[0m
 dpc2vec.ipynb                           [01;34mresults[0m/
 evaluation_pivot.csv                    SVM_trained.pk
 evaluation_pivot_only_agencies.csv     [01;34m'Training Set for Hugo'[0m/
 evaluation_pivot_only_labels.csv        vectorizer.pk
 [01;34mexperiments_r

#### 2nd dim

In [46]:
svm_y2_cv_model, svm_y2_results_json = svc_experiment(X_train, y2_train)
svm_y2_results = pd.DataFrame(svm_y2_results_json)


Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    5.9s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:   16.8s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:   23.3s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:   40.4s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   51.7s
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  1.4min
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:  1.8min
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:  2.1min
[Parallel(n_jobs=3)]: Done  79 tasks      | elapsed:  2.6min
[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed:  2.9min
[Parallel(n_jobs=3)]: Done 107 tasks      | elapsed:  3.4min
[Parallel(n_jobs=3)]: Done 120 out of 120 | elapsed:  3.8min finished


In [47]:
svm_y2_results[display_cols_svm]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
0,3.475412,0.207652,0.228845,0.018839,10.0,linear,"{'C': 10, 'kernel': 'linear'}",0.462075,0.032599,7,0.990263,0.001338
1,3.646675,0.261539,0.233317,0.027422,10.0,rbf,"{'C': 10, 'kernel': 'rbf'}",0.491324,0.042698,1,0.996025,0.001018
2,3.268432,0.102768,0.229195,0.01777,100.0,linear,"{'C': 100, 'kernel': 'linear'}",0.456692,0.031812,8,0.997916,0.000569
3,3.469876,0.138782,0.239674,0.028013,100.0,rbf,"{'C': 100, 'kernel': 'rbf'}",0.481956,0.044471,2,0.999243,0.000378
4,3.297794,0.122908,0.216316,0.013126,1000.0,linear,"{'C': 1000, 'kernel': 'linear'}",0.447109,0.034886,9,1.0,0.0
5,3.484751,0.122626,0.238524,0.020667,1000.0,rbf,"{'C': 1000, 'kernel': 'rbf'}",0.481255,0.042433,3,1.0,0.0
6,3.332941,0.250279,0.224217,0.020084,10000.0,linear,"{'C': 10000.0, 'kernel': 'linear'}",0.447109,0.034886,9,1.0,0.0
7,3.554806,0.232158,0.240781,0.036928,10000.0,rbf,"{'C': 10000.0, 'kernel': 'rbf'}",0.481255,0.042433,3,1.0,0.0
8,3.305616,0.089385,0.227324,0.024552,100000.0,linear,"{'C': 100000.0, 'kernel': 'linear'}",0.447109,0.034886,9,1.0,0.0
9,3.42113,0.158349,0.230041,0.017818,100000.0,rbf,"{'C': 100000.0, 'kernel': 'rbf'}",0.481255,0.042433,3,1.0,0.0


In [48]:
outfile = make_results_filename('svm', 2)
print(outfile)

svm_y2_results.to_csv(outfile, sep='\t')

!ls experiments_results

experiments_results/svm_2_.csv
LR_1_.csv  LR_2_.csv  RF_1_.csv  RF_2_.csv  svm_1_.csv	svm_2_.csv


In [49]:
evaluate_on_test(X_train, X_test, y2_train, y2_test, svm_y2_cv_model.best_params_, algorithm=SVC)

{'C': 10, 'kernel': 'rbf'}
--------------------------------------------------------------------------------

              precision    recall  f1-score   support

           1       0.50      0.50      0.50        28
           2       0.42      0.57      0.48        44
           3       0.64      0.64      0.64        42
           4       0.00      0.00      0.00        17

    accuracy                           0.50       131
   macro avg       0.39      0.43      0.41       131
weighted avg       0.45      0.50      0.47       131

--------------------------------------------------------------------------------

[[22  8  4  0]
 [ 2 32 18  0]
 [ 3 12 20  0]
 [ 1  8  0  1]]


### Random Forest

In [50]:
display_cols_RF = ['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_n_estimators', 'param_max_depth', 'params', 'mean_test_score', 'std_test_score', 'rank_test_score', 'mean_train_score', 'std_train_score']


#### Define Experiment

In [51]:
from sklearn.ensemble import RandomForestClassifier

def RF_experiment(X,y):
    rf = RandomForestClassifier()
    parameters = {'n_estimators':[10,20,50, 100, 120, 150, 200, 250, 300, 1000], 'max_depth': [10, 20, 50, None]}

    clf = GridSearchCV(rf, parameters, n_jobs=3, scoring='f1_weighted', cv=10, verbose=10, return_train_score=True)
    clf.fit(X, y)
    
    svm_results = pd.DataFrame(clf.cv_results_)
    
    return clf, svm_results

#### 1st dim

In [52]:
RF_y1_cv_model, RF_y1_results = RF_experiment(X_train, y1_train)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    0.5s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    0.8s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:    1.5s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:    2.6s
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:    5.2s
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    8.7s
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:   13.8s


KeyboardInterrupt: 

In [None]:
RF_y1_results[display_cols_RF]

In [None]:
outfile = make_results_filename('RF', 1)
print(outfile)

RF_y1_results.to_csv(outfile, sep='\t')

!ls experiments_results

In [None]:
RF_y1_cv_model.best_params_

In [None]:
evaluate_on_test(X_train, X_test, y1_train, y1_test, RF_y1_cv_model.best_params_, algorithm=RandomForestClassifier)

In [None]:
# RF_y1_cv_model.best_params_

In [None]:

# X_test = vectorizer.transform(texts_test)

# RF_y1_cv_model.best_params_

# RF_y1_final_model = RandomForestClassifier(
#     max_depth = RF_y1_cv_model.best_params_['max_depth'], 
#     n_estimators = RF_y1_cv_model.best_params_['n_estimators'])

# RF_y1_final_model.fit(X= X_train, y = y1_train)
# RF_y1_pred = RF_y1_final_model.predict(X_test)


In [None]:
# from sklearn.metrics import classification_report
# print(classification_report(y1_test, RF_y1_pred))

In [None]:
# from sklearn.metrics import confusion_matrix
# confusion_matrix(y1_test, RF_y1_pred)

#### 2nd dim

In [None]:
RF_y2_cv_model, RF_y2_results = RF_experiment(X_train, y2_train)

In [None]:
RF_y2_results[display_cols_RF]

In [None]:
outfile = make_results_filename('RF', 2)
print(outfile)

RF_y2_results.to_csv(outfile, sep='\t')

!ls experiments_results

In [None]:
RF_y1_cv_model.best_params_

In [None]:
evaluate_on_test(
    X_train, 
    X_test, 
    y2_train, 
    y2_test, 
    RF_y2_cv_model.best_params_,
    algorithm= RandomForestClassifier)

In [None]:

# X_test = vectorizer.transform(texts_test)

# RF_y2_cv_model.best_params_

# RF_y2_final_model = RandomForestClassifier(
#     max_depth = RF_y2_cv_model.best_params_['max_depth'], 
#     n_estimators = RF_y2_cv_model.best_params_['n_estimators'])

# RF_y2_final_model.fit(X= X_train, y = y2_train)
# RF_y2_pred = RF_y2_final_model.predict(X_test)


In [None]:
# from sklearn.metrics import classification_report
# print(classification_report(y2_test, RF_y2_pred))

In [None]:
# from sklearn.metrics import confusion_matrix
# confusion_matrix(y2_test, RF_y2_pred)

### Logistic Regression

In [None]:
display_cols_LR = ['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_penalty', 'param_C', 'params', 'mean_test_score', 'std_test_score', 'rank_test_score', 'mean_train_score', 'std_train_score']


#### Define Experiment

In [None]:
from sklearn.linear_model import LogisticRegression

def RF_experiment(X,y):
    lr = LogisticRegression()
    parameters = {'penalty':['l1', 'l2'], 'C':[0.1, 1, 10, 20, 100, 200, 1000, 2000, 1e4, 1e5]}

    clf = GridSearchCV(lr, parameters, n_jobs=3, scoring='f1_weighted', cv=10, verbose=10, return_train_score=True)
    clf.fit(X, y)
    
    svm_results = pd.DataFrame(clf.cv_results_)
    
    return clf, svm_results

#### 1st dim

In [None]:
LR_y1_cv_model, LR_y1_results = RF_experiment(X_train, y1_train)

In [None]:
LR_y1_results[display_cols_LR]

In [None]:
outfile = make_results_filename('LR', 1)
print(outfile)

LR_y1_results.to_csv(outfile, sep='\t')

!ls experiments_results

In [None]:
evaluate_on_test(
    X_train, 
    X_test, 
    y1_train, 
    y1_test, 
    LR_y1_cv_model.best_params_,
    algorithm= LogisticRegression)

In [None]:


# X_test = vectorizer.transform(texts_test)

# LR_cv_model.best_params_

# final_model = LR(C = cv_model.best_params_['C'], kernel = cv_model.best_params_['kernel'])
# final_model.fit(X= X_train, y = y1_train)
# y1_pred = final_model.predict(X_test)



#### 2nd dim

In [None]:
LR_y2_cv_model, LR_y2_results = RF_experiment(X_train, y2_train)

In [None]:
LR_y2_results[display_cols_LR]

In [None]:
outfile = make_results_filename('LR', 2)
print(outfile)

LR_y1_results.to_csv(outfile, sep='\t')

!ls experiments_results

In [None]:
evaluate_on_test(
    X_train, 
    X_test, 
    y2_train, 
    y2_test, 
    LR_y2_cv_model.best_params_,
    algorithm= LogisticRegression)

## Label analysis

In this section I will explore how often a model trained on the first demention predicts a label from the second demention. But first I will explore if labels "correlate".

In [None]:
y_1

In [None]:
y_2

In [None]:
from collections import Counter

In [None]:
Counter(y_1)

In [None]:
Counter(y_2)

In [None]:
conf_matrix = np.zeros((4,4), dtype = np.int16)
conf_matrix

In [None]:
for first, second in zip(y_1, y_2):
    conf_matrix[first-1, second-1] += 1
    
conf_matrix

In [None]:
from collections import defaultdict

d = defaultdict(lambda :0)

for first, second in zip(y_1, y_2):
    d[f"{first}_{second}"] += 1


sorted(d.items(), key=lambda x:x[0])

### Analyze different classifiers

First write the evaluation function

In [None]:
from collections import defaultdict


def evaluate_together(y_1_true, y_2_true, y_1_pred):
    assert len(y_1_true) == len(y_2_true) == len(y_1_pred)
#     results = defaultdict(lambda :defaultdict(lambda : ))
    results = {
        "first_degree": 0,
        "second_degree": 0,
        "false": 0
    }
    total = 0
    for true_1, true_2, pred in zip(y_1_true, y_2_true, y_1_pred):
        if pred == true_1:
            results["first_degree"] += 1
        elif pred == true_2:
            results["second_degree"] += 1
        else:
            results["false"] += 1
            
        total += 1
        
    assert total == len(y_1_true) == len(y_2_true) == len(y_1_pred) == total
    
    return results, total
    
def return_defdict():
    return defaultdict(int)
    
def evaluate_together2(y_1_true, y_2_true, y_1_pred):
    assert len(y_1_true) == len(y_2_true) == len(y_1_pred)
    results = defaultdict(return_defdict)
#     results = {
#         "first_degree": 0,
#         "second_degree": 0,
#         "false": 0
#     }
    total = 0
    for true_1, true_2, pred in zip(y_1_true, y_2_true, y_1_pred):
        if pred == true_1:
            results[true_1]["first_degree"] += 1
        elif pred == true_2:
            results[true_1]["second_degree"] += 1
        else:
            results[true_1]["false"] += 1
            
        total += 1
        
    assert total == len(y_1_true) == len(y_2_true) == len(y_1_pred) == total
    
    return results, total
        

#### SVM

In [None]:
Counter(y1_test)

In [None]:
svc = SVC(**svm_y1_cv_model.best_params_)
svc.fit(X_train, y1_train)

y_pred = svc.predict(X_test)

results, total = evaluate_together(y1_test, y2_test, y_pred)
print(results)
print('2md degree accuracy:\t', (results['first_degree'] + results['second_degree'])/total)

In [None]:

results2, total2 = evaluate_together2(y1_test, y2_test, y_pred)
results2

In [None]:
for label, res in sorted(results2.items(), key= lambda x:x[0]):
    fd = res.get('first_degree',0)
    sd = res.get('second_degree', 0)
    errors = res.get('false',0)
    
#     print(fd,sd,errors)
    first_degree_precision = fd / (fd+sd+errors)
    second_degree_precision = (fd + sd)/ (fd + sd + errors)
    print("-",label)
    print('\tfirst degree:\t', first_degree_precision)
    print('\tsecond degree:\t', second_degree_precision)
#     print(label, res)

#### RF

In [None]:
rf = RandomForestClassifier(**RF_y1_cv_model.best_params_)
rf.fit(X_train, y1_train)

y_pred = rf.predict(X_test)

results, total = evaluate_together(y1_test, y2_test, y_pred)
print(results)
print('2md degree accuracy:\t', (results['first_degree'] + results['second_degree'])/total)

print('-'*50)

results2, total2 = evaluate_together2(y1_test, y2_test, y_pred)
print(results2)

## SMOTE

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

X_test = vectorizer.transform(texts_test)

def evaluate_on_test_SMOTE(X_train, X_test, y_train, y_test, best_params, algorithm):
    sm = SMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X_train, y_train)
    print(best_params)
    print("-"*80, end = '\n\n')
    
    final_model = algorithm(**best_params)
    final_model.fit(X= X_res, y = y_res)
    y_pred = final_model.predict(X_test)
    
    print(classification_report(y_test, y_pred))
    print("-"*80,end = '\n\n')
    print(confusion_matrix(y1_test, y_pred))
    
    

In [None]:



grid = GridSearchCV(model, params, ...)
grid.fit(X, y)

In [None]:
import pandas as pd 
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE 
from imblearn.pipeline import Pipeline


def svc_SMOTE_experiment(X,y):
        
    sm = SMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X, y)
    
    model = Pipeline([
        ('sampling', SMOTE()),
        ('classification', SVC())
    ])
    

    parameters = {'classification__kernel':('linear', 'rbf'), 'classification__C':[100, 1000, 1e4, 1e5, 1e6]} #0.1, 1, 10,
    
    clf = GridSearchCV(model, parameters, n_jobs=3, scoring='f1_weighted', cv=10, verbose=10, return_train_score=True)
    clf.fit(X_res, y_res)
    
    svm_results = clf.cv_results_
    
    return clf, svm_results


In [None]:
svm_y1_cv_model, svm_y1_results_json = svc_SMOTE_experiment(X= X_train, y = y1_train)

svm_y1_results = pd.DataFrame(svm_y1_results_json)

In [None]:
outfile = make_results_filename('svm', 1)
print(outfile)

svm_y1_results.to_csv(outfile, sep='\t')

In [None]:
svm_y1_results[display_cols_svm]

In [None]:
evaluate_on_test_SMOTE(X_train, X_test, y1_train, y1_test, svm_y1_cv_model.best_params_, algorithm=SVC)

## Doc2Vec



In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
train_tagged = [TaggedDocument(text,[label]) for text, label in zip(texts_train, y1_train)]
# train.apply(
#     lambda r: TaggedDocument(words=tokenize_text(r['narrative']), tags=[r.Product]), axis=1)
test_tagged = [TaggedDocument(text,[label]) for text, label in zip(texts_test, y1_test)]
#                 test.apply(
#     lambda r: TaggedDocument(words=tokenize_text(r['narrative']), tags=[r.Product]), axis=1)

In [None]:
import multiprocessing

cores = multiprocessing.cpu_count() -1

In [None]:
model_dbow = Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, sample = 0, workers=cores)


In [None]:
model_dbow.build_vocab([x for x in tqdm_notebook(train_tagged)])

In [None]:
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged)]), total_examples=len(train_tagged), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

## Multilabel Classification

