In [32]:
import sys
sys.path.append("..")

from services.segments_database import do_query, insert_trainingdata_from_chapter
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import f1_score, classification_report, confusion_matrix, accuracy_score, roc_auc_score, recall_score, roc_curve, precision_score
import scipy
import numpy as np
import matplotlib.pyplot as plt
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import StratifiedKFold



## Config

In [33]:
# Upper bound (exclusive) of the topic_1_id values looped over when sampling new chapters.
number_of_categories = 40
# Maximum number of randomly drawn chapters per category (used as the SQL LIMIT).
entries_per_category = 50
# Number of rows taken from the top of the security-sorted frame and inserted as new training data.
entries_trainingdata = 50
# Training rows with round <= current_round are loaded; new rows are inserted as current_round + 1.
current_round = 10
# Category value handed to insert_trainingdata_from_chapter.
current_category = 'zk'
# Compared against each chapter's topic_1_id when inserting training data.
# NOTE(review): presumably the topic id that corresponds to the 'zk' category -- confirm.
zk_category_id = 4
# Text columns that are selected as model input and TF-IDF vectorised.
relevant_columns = ['header_preprocessed', 'parent_preprocessed', 'grandparent_preprocessed', 'words']
# Seed shared by the random forests and the stratified cross-validation split.
random_state = 0

# Functions

In [34]:
def print_tree(model, feature_names, filename):
    """Print a fitted tree and export it to ``<filename>.dot`` in Graphviz format.

    model: the fitted tree to export (passed straight to ``export_graphviz``).
    feature_names: feature labels shown in the exported nodes.
    filename: output path without the ``.dot`` extension.
    """
    print(model)
    dot_path = '{}.dot'.format(filename)
    export_graphviz(
        model,
        out_file=dot_path,
        feature_names=feature_names,
        class_names=["0", "1"],
        rounded=True,
        proportion=False,
        precision=2,
        filled=True,
    )

## Get Trainingdata

In [35]:
# Labelled training rows ('yes'/'no') up to and including the current round. The words of
# each row are collapsed into one space-separated string ('' when there are none).
query = '''
    SELECT *, 
    IF(length(GROUP_CONCAT(word.word SEPARATOR ' ')) > 0, GROUP_CONCAT(word.word SEPARATOR ' '), '')  as words 
    FROM trainingdata 
    LEFT JOIN word ON trainingdata.id = word.chapter_id
    WHERE (label = 'yes' 
    OR label = 'no')
    and round {operator} {round_limit}
    GROUP BY trainingdata.id
'''.format(operator='<=', round_limit=current_round)

In [36]:
# Materialise the query result. The column names are passed to the constructor so that an
# empty result still yields a frame with the expected (empty) columns; assigning
# df.columns afterwards would fail with a length mismatch in that case.
res = do_query(query)
df = pd.DataFrame(res.fetchall(), columns=list(res.keys()))
print(df.shape)

(934, 33)


In [37]:
df.head()

Unnamed: 0,id,text,chapter_idx,chapter_number,header,header_preprocessed,parent_header,parent_preprocessed,grandparent_header,grandparent_preprocessed,...,round,suggested_label,createdAt,updatedAt,id.1,word,tfidf,score,chapter_id,words
0,1418,\nDienstleistungsauftrag.\n,21,21,Auftragsart\n,auftragsart,,,,,...,1,0,2019-05-06 13:14:39,2019-05-10 11:58:56,23108,dienstleistungsauftrag,1.0,296.0,1418,dienstleistungsauftrag
1,1702,"\nAngebote sind einfach und vollständig, recht...",36,,Einreichung Angebot\n,einreichung angebot,,,,,...,2,0,2019-06-02 10:49:09,2019-06-02 12:50:03,27722,anbietende,0.101942,316.0,1702,anbietende anbieterin angabe angebot ausgefüll...
2,2267,"\nFreitag, 19. Januar 2018\n",0,,\n,,,,,,...,1,0,2019-05-06 13:14:38,2019-05-06 15:33:20,37781,19,0.463409,1020.0,2267,19 2018 freitag januar
3,2369,\n1 Referenz des Anbieters in den letzten 5 Ja...,41,,TECHNISCHE LEISTUNGSFÄHIGKEIT (Firmenreferenze...,technische leistungsfähigkeit firmenreferenzen,,,,,...,6,0,2019-06-02 12:13:33,2019-06-02 14:14:02,39346,1,0.211744,11969.0,2369,1 5 anbieters art aufgabe drei erfüllt jahr ko...
4,2388,\nDie Zuschlagskriterien werden mit Noten zwis...,49,9.3.5,Bewertung der Zuschlagskriterien\n,bewertung zuschlagskriterium,,,,,...,7,1,2019-06-02 12:27:51,2019-06-02 14:31:01,39740,0,0.248047,4432.0,2388,0 1 3 angabe ausreichenden beitrag bewerten be...


## Ensemble Class
source: https://github.com/msahamed/handle_imabalnce_class/blob/master/imbalance_datasets_machine_learning.ipynb

In [38]:
class Create_ensemble(object):
    """Stratified k-fold evaluation of one or more base classifiers.

    n_splits: number of cross-validation folds.
    base_models: list of estimators exposing ``fit`` and ``predict``.
    """

    def __init__(self, n_splits, base_models):
        self.n_splits = n_splits
        self.base_models = base_models

    def predict(self, X, y):
        """Cross-validate every base model and collect out-of-fold predictions.

        X: feature matrix; sparse matrices (anything with ``toarray``) are densified.
        y: target labels, one per row of X.

        Returns a tuple ``(train_pred, recall_scores, f1_scores)``:
          * train_pred    -- array (n_samples, n_models) of out-of-fold predictions
          * recall_scores -- array (n_models, n_splits) of macro recall per fold
          * f1_scores     -- array (n_models, n_splits) of macro F1 per fold

        NOTE: every model is refitted once per fold and not refitted on the full data
        afterwards, so after this call each entry of ``base_models`` is fitted on the
        training part of the last fold only.
        """
        # Accept both sparse and already-dense input.
        X = np.array(X.toarray() if hasattr(X, "toarray") else X)
        y = np.array(y)

        n_models = len(self.base_models)
        train_pred = np.zeros((X.shape[0], n_models))
        f1_scores = np.zeros((n_models, self.n_splits))
        recall_scores = np.zeros((n_models, self.n_splits))

        for i, clf in enumerate(self.base_models):

            # Use the configured fold count; a hard-coded 5 would not match the shape of
            # the score arrays whenever n_splits != 5.
            folds = StratifiedKFold(
                n_splits=self.n_splits, shuffle=True, random_state=random_state
            ).split(X, y)

            for j, (train_idx, valid_idx) in enumerate(folds):

                X_train = X[train_idx]
                Y_train = y[train_idx]
                X_valid = X[valid_idx]
                Y_valid = y[valid_idx]

                clf.fit(X_train, Y_train)

                valid_pred = clf.predict(X_valid)
                recall = recall_score(Y_valid, valid_pred, average='macro')
                f1 = f1_score(Y_valid, valid_pred, average='macro')

                recall_scores[i][j] = recall
                f1_scores[i][j] = f1

                # Each sample is in exactly one validation fold, so this fills column i once.
                train_pred[valid_idx, i] = valid_pred

                print( "Model- {} and CV- {} recall: {}, f1_score: {}".format(i, j, recall, f1))

        return train_pred, recall_scores, f1_scores

## Create Train Data and make TF-IDF

### create X

In [39]:
X_train = df[relevant_columns]
X_train.tail()

Unnamed: 0,header_preprocessed,parent_preprocessed,grandparent_preprocessed,words
929,111,,,1 2 a baustelleneinrichtung cd ef haus nr plan
930,prozessanforderung,,,0 0074 0158 abbildung ausgewählt beschreibung ...
931,angebotssprache,angebotsabgabe,administrative vorgabe,angebot beilage deutsch einreichen englisch fr...
932,objekt sporzentrum kerenzerberg,,,09 243 3 armaturen einbauten galerie heizgrupp...
933,montage befestigung heizungsdämmung,,,abzukleben aluminium befestigen d dichtigkeits...


### create y

In [40]:
# Encode the string labels as integers (yes -> 1, no -> 0) with an explicit mapping.
# The cast to int64 guarantees a numeric target (instead of an object column) and fails
# loudly if a label other than 'yes'/'no' slips in.
y_train = df['label'].map({'yes': 1, 'no': 0}).astype('int64')
y_train[0:5]

0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64

### tf-idf

train data

In [41]:
# One independent TF-IDF vectorizer per text column.
words_vec, header_vec, parent_vec, grandparent_vec = (TfidfVectorizer() for _ in range(4))

# Fit each vectorizer on its own column of the training data.
words_tfidf, header_tfidf, parent_tfidf, grandparent_tfidf = (
    vectorizer.fit_transform(X_train[column])
    for vectorizer, column in (
        (words_vec, 'words'),
        (header_vec, 'header_preprocessed'),
        (parent_vec, 'parent_preprocessed'),
        (grandparent_vec, 'grandparent_preprocessed'),
    )
)

# Column-wise concatenation of the four sparse blocks.
X_train_union = scipy.sparse.hstack(
    [words_tfidf, header_tfidf, parent_tfidf, grandparent_tfidf]
)

# Feature names in the same order as the stacked columns.
X_train_features = (
    words_vec.get_feature_names()
    + header_vec.get_feature_names()
    + parent_vec.get_feature_names()
    + grandparent_vec.get_feature_names()
)

print(len(X_train_features))
print(X_train_union.shape)

7656
(934, 7656)


## RandomForest Classifier

In [42]:
# Six balanced random forests that differ only in max_depth (None, 15, 10, 5, 2, 1).
classifier0, classifier1, classifier2, classifier3, classifier4, classifier5 = (
    RandomForestClassifier(
        n_estimators=1000,
        random_state=random_state,
        class_weight="balanced",
        max_depth=depth,
    )
    for depth in (None, 15, 10, 5, 2, 1)
)

## Cross Validation

In [43]:
# Only classifier2 is cross-validated. To compare all forests use instead:
# base_models = [classifier0, classifier1, classifier2, classifier3, classifier4, classifier5]
n_splits = 5
base_models = [classifier2]
lgb_stack = Create_ensemble(n_splits=n_splits, base_models=base_models)

In [44]:
y_train_pred, recall_scores, f1_scores = lgb_stack.predict(X_train_union, y_train)

Model- 0 and CV- 0 recall: 0.7143256464011181, f1_score: 0.7239759213037733
Model- 0 and CV- 1 recall: 0.7683047029005914, f1_score: 0.7868541033434651
Model- 0 and CV- 2 recall: 0.7551393973528584, f1_score: 0.772644376899696
Model- 0 and CV- 3 recall: 0.769804822043628, f1_score: 0.797459165154265
Model- 0 and CV- 4 recall: 0.6940298507462687, f1_score: 0.7060931899641576


## Cross Validation Report

In [45]:
# One report per column of the out-of-fold prediction matrix (one column per base model).
df_cv = pd.DataFrame(y_train_pred)
for column in df_cv:
    model_pred = df_cv[column]
    macro_f1 = f1_score(y_train, model_pred, average='macro')
    macro_recall = recall_score(y_train, model_pred, average='macro')
    report = classification_report(y_train, model_pred)
    matrix = confusion_matrix(y_train, model_pred)

    print('1. The F-1 score of the model {}\n'.format(macro_f1))
    print('2. The recall score of the model {}\n'.format(macro_recall))
    print('3. Classification report \n {} \n'.format(report))
    print('4. Confusion matrix \n {} \n'.format(matrix))
    print('===============')

1. The F-1 score of the model 0.7569688917732486

2. The recall score of the model 0.7403852147353986

3. Classification report 
               precision    recall  f1-score   support

           0       0.84      0.92      0.88       671
           1       0.73      0.56      0.64       263

   micro avg       0.82      0.82      0.82       934
   macro avg       0.79      0.74      0.76       934
weighted avg       0.81      0.82      0.81       934
 

4. Confusion matrix 
 [[616  55]
 [115 148]] 



In [46]:
recall_scores

array([[0.71432565, 0.7683047 , 0.7551394 , 0.76980482, 0.69402985]])

In [47]:
#print_tree(lgb_stack.base_models[0].estimators_[50], X_test_features, 'tree_0')
#print_tree(lgb_stack.base_models[1].estimators_[50], X_test_features, 'tree_1')

### print feature importances

In [48]:
# Print the strongest features of every base model.
for i, model in enumerate(lgb_stack.base_models):
    print('model {}'.format(i))
    importances = model.feature_importances_
    # Feature indices ordered from most to least important.
    indices = np.argsort(importances)[::-1]

    # Show at most ten features; the bound avoids an IndexError on smaller feature sets.
    for f in range(min(10, len(indices))):
        print("%d. feature: %s %d (%f)" % (f + 1, X_train_features[indices[f]], indices[f], importances[indices[f]]))

    print('===')

model 0
1. feature: zuschlagskriterium 7027 (0.048777)
2. feature: zk 7003 (0.020426)
3. feature: gewichtung 2607 (0.018157)
4. feature: zuschlagskriterium 5805 (0.015792)
5. feature: angebot 550 (0.015432)
6. feature: punkt 4147 (0.015337)
7. feature: zuschlagskriterium 7456 (0.014869)
8. feature: schlüsselperson 4504 (0.013951)
9. feature: qualität 4178 (0.012623)
10. feature: preis 3983 (0.012380)
===


## Select Model

In [49]:
final_model = lgb_stack.base_models[0]

# Evaluate model against 1000-Trainingdata

In [50]:
query = '''
    SELECT *, 
    IF(length(GROUP_CONCAT(word.word SEPARATOR ' ')) > 0, GROUP_CONCAT(word.word SEPARATOR ' '), '')  as words
    FROM trainingdata 
    LEFT JOIN word ON trainingdata.id = word.chapter_id
    WHERE (label = 'yes' 
    OR label = 'no') and round = 1000
    GROUP BY trainingdata.id
'''

In [51]:
# Materialise the query result. The column names are passed to the constructor so that an
# empty result still yields a frame with the expected (empty) columns; assigning
# df.columns afterwards would fail with a length mismatch in that case.
res = do_query(query)
df = pd.DataFrame(res.fetchall(), columns=list(res.keys()))
df.shape

(1000, 33)

In [52]:
df.head()

Unnamed: 0,id,text,chapter_idx,chapter_number,header,header_preprocessed,parent_header,parent_preprocessed,grandparent_header,grandparent_preprocessed,...,round,suggested_label,createdAt,updatedAt,id.1,word,tfidf,score,chapter_id,words
0,101,\nWeitere spezifische Grundlagen sind vorhande...,26,5.2,Spezifische Grundlagen\n,spezifisch grundlage,Grundlagen\n,grundlage,,,...,1000,0,2019-05-13 10:22:07,2019-05-13 12:28:07,2074.0,ausarbeitung,0.38571,471.0,101.0,ausarbeitung generalplanerangebotes grundlage ...
1,251,\nFolgende Schritte erfolgen bis zum Zuschlags...,26,7.1,Evaluationsphasen\n,evaluationsphase,Evaluation\n,evaluation,,,...,1000,0,2019-05-13 10:22:08,2019-05-13 12:28:10,4544.0,10,0.136392,2510.0,251.0,10 2 3 4 5 6 7 aktivität angebot ausschreibung...
2,509,\n2.6.1 \nAuf das vorliegende Vergabeverfahren...,36,2.6,Ausschreibungsbedingungen\n,ausschreibungsbedingung,,,,,...,1000,0,2019-05-13 10:22:04,2019-05-13 12:28:17,,,,,,
3,779,\n3.1.1 \n.110 02 Schweizerische Südostbahn AG...,7,3.1,"Bauherr, Besteller, Eigentümer.\n",bauherr besteller eigentümer,"100\tORGANISATION BAUHERR, LAGE, ZWECKBESTIMMU...",100 organisation bauherr lage zweckbestimmung,,,...,1000,0,2019-05-13 10:22:05,2019-05-13 12:28:19,,,,,,
4,800,\nDie Gasleitung wird im Bereich der Zentralst...,55,55.0,Gas\n,gas,,,,,...,1000,0,2019-05-13 10:22:03,2019-05-13 12:28:21,13165.0,110,0.25307,790.0,800.0,110 380 bereich erstellen gasleitung grabenlän...


In [53]:
X = df[relevant_columns]
X.tail()


Unnamed: 0,header_preprocessed,parent_preprocessed,grandparent_preprocessed,words
995,ts mitte steuerung treppenabgang,,,abdeckung ausführung ausgehen aussen bedienste...
996,angebotsblatt,,,abfallsammelfahrzeug anbieter anbieters aufbau...
997,montage befestigung heizungsdämmung,,,abzukleben aluminium befestigen d dichtigkeits...
998,total heizgruppe bodenheizung garderobe übrig ...,,,fr heizgruppe konvektor lichtschacht she
999,unterlage 4 fach inkl revisionspläne unterlage...,,,1 a anlageteil bauleitung bewegen ca klappen l...


In [54]:
# Encode the string labels as integers (yes -> 1, no -> 0) with an explicit mapping.
# The cast to int64 guarantees a numeric target (instead of an object column) and fails
# loudly if a label other than 'yes'/'no' slips in.
y_test = df['label'].map({'yes': 1, 'no': 0}).astype('int64')

In [55]:
# Vectorise each text column with its already-fitted vectorizer (transform only).
words_tfidf, header_tfidf, parent_tfidf, grandparent_tfidf = (
    vectorizer.transform(X[column])
    for vectorizer, column in (
        (words_vec, 'words'),
        (header_vec, 'header_preprocessed'),
        (parent_vec, 'parent_preprocessed'),
        (grandparent_vec, 'grandparent_preprocessed'),
    )
)

# Column-wise concatenation in the same block order as the training matrix.
X_test_union = scipy.sparse.hstack(
    [words_tfidf, header_tfidf, parent_tfidf, grandparent_tfidf]
)

print(X_test_union.shape)

(1000, 7656)


In [56]:
y_test_pred = final_model.predict(X_test_union)

In [57]:
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test,y_test_pred))  
print(accuracy_score(y_test, y_test_pred))  

[[960   9]
 [ 16  15]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       969
           1       0.62      0.48      0.55        31

   micro avg       0.97      0.97      0.97      1000
   macro avg       0.80      0.74      0.77      1000
weighted avg       0.97      0.97      0.97      1000

0.975


In [58]:
# select second class only
y_test_pred_proba = final_model.predict_proba(X_test_union)[:,1]

In [59]:
roc_value = roc_auc_score(y_test, y_test_pred_proba)
roc_value

0.8506275175605047

## Print Classifications

In [72]:
classifications = pd.concat([df.iloc[:,0], X, y_test, pd.Series(y_test_pred, name="pred"), pd.Series(y_test_pred_proba, name="pred_proba")], axis=1)
classifications.head()

Unnamed: 0,id,header_preprocessed,parent_preprocessed,grandparent_preprocessed,words,label,pred,pred_proba
0,101,spezifisch grundlage,grundlage,,ausarbeitung generalplanerangebotes grundlage ...,0,0,0.38106
1,251,evaluationsphase,evaluation,,10 2 3 4 5 6 7 aktivität angebot ausschreibung...,0,0,0.413371
2,509,ausschreibungsbedingung,,,,1,0,0.377987
3,779,bauherr besteller eigentümer,100 organisation bauherr lage zweckbestimmung,,,0,0,0.377259
4,800,gas,,,110 380 bereich erstellen gasleitung grabenlän...,0,0,0.37791


In [73]:
# Map every (true label, prediction) pair to its confusion-matrix cell.
confusion_names = {(0, 0): 'TN', (1, 1): 'TP', (0, 1): 'FP', (1, 0): 'FN'}
# One entry per row: a pair outside the table becomes None instead of being skipped,
# so the list always stays aligned with the rows of `classifications`.
confusion_type = [
    confusion_names.get((label, pred))
    for label, pred in zip(classifications['label'], classifications['pred'])
]
# assign() overwrites an existing 'confusion_type' column, so re-running this cell does
# not append a duplicate column.
classifications = classifications.assign(confusion_type=confusion_type)
classifications.head()

Unnamed: 0,id,header_preprocessed,parent_preprocessed,grandparent_preprocessed,words,label,pred,pred_proba,confusion_type
0,101,spezifisch grundlage,grundlage,,ausarbeitung generalplanerangebotes grundlage ...,0,0,0.38106,TN
1,251,evaluationsphase,evaluation,,10 2 3 4 5 6 7 aktivität angebot ausschreibung...,0,0,0.413371,TN
2,509,ausschreibungsbedingung,,,,1,0,0.377987,FN
3,779,bauherr besteller eigentümer,100 organisation bauherr lage zweckbestimmung,,,0,0,0.377259,TN
4,800,gas,,,110 380 bereich erstellen gasleitung grabenlän...,0,0,0.37791,TN


In [82]:
classifications[classifications.confusion_type == 'FN']['id'].tolist()

[509,
 24218,
 28405,
 84357,
 114657,
 119280,
 130241,
 168440,
 199103,
 254381,
 272765,
 279384,
 302283,
 308760,
 319491,
 320288]

In [None]:
classifications.to_csv('predictions_1000.csv')

# select next trainingdata

In [None]:
# Draw up to `entries_per_category` random chapters (words joined into one string) for
# every topic id in [0, number_of_categories).
results = []
query = '''
  SELECT *, GROUP_CONCAT(word.word SEPARATOR ' ') as words FROM chapter 
    INNER JOIN word ON chapter.id = word.chapter_id
  WHERE topic_1_id = {0}
  GROUP BY chapter.id
  ORDER BY RAND()
  LIMIT {1};
'''


for i in range(0, number_of_categories):
    print('getting data for category {}'.format(i))
    res = do_query(query.format(i, entries_per_category))
    rows = res.fetchall()
    if not rows:
        # A category without chapters would otherwise fail on the column assignment
        # below (0 columns vs. len(res.keys()) names) and abort the whole loop.
        print('no chapters found for category {}'.format(i))
        continue
    df = pd.DataFrame(rows)
    df.columns = res.keys()
    results.append(df)

df = pd.concat(results, ignore_index=True)
df.head()

Calculate the TF-IDF features using the vectorizers fitted on the training data

In [None]:
next_X = df[relevant_columns]

# Vectorise each text column with its already-fitted vectorizer (transform only).
words_tfidf, header_tfidf, parent_tfidf, grandparent_tfidf = (
    vectorizer.transform(df[column])
    for vectorizer, column in (
        (words_vec, 'words'),
        (header_vec, 'header_preprocessed'),
        (parent_vec, 'parent_preprocessed'),
        (grandparent_vec, 'grandparent_preprocessed'),
    )
)

# Column-wise concatenation of the four sparse blocks.
X_next_union = scipy.sparse.hstack(
    [words_tfidf, header_tfidf, parent_tfidf, grandparent_tfidf]
)

# Feature names in the same order as the stacked columns.
union_features = (
    words_vec.get_feature_names()
    + header_vec.get_feature_names()
    + parent_vec.get_feature_names()
    + grandparent_vec.get_feature_names()
)

print(len(union_features))
print(X_next_union.shape)

In [None]:
# Probability of the second class for every candidate chapter.
y_next_pred_proba = final_model.predict_proba(X_next_union)[:, 1]
# Rescaled distance from 0.5: 0 when the probability is exactly 0.5, 1 when it is 0 or 1.
distance_from_half = np.abs(y_next_pred_proba - 0.5)
security = distance_from_half * 2
security

In [None]:
df['security'] = security
df = df.sort_values(by='security')
df.head()

## Evaluate

In [None]:
print('mean: {0}'.format(df['security'].mean()))
print('median: {0}'.format(df['security'].median()))

In [None]:
# Mean (column 'security') and median of the security score per topic, highest median first.
grouped_sec = (
    df.groupby('topic_1_id')['security']
    .agg(['mean', 'median'])
    .rename(columns={'mean': 'security'})
    .sort_values('median', ascending=False)
)
grouped_sec

## Insert Trainingdata

In [None]:
# Insert the first `entries_trainingdata` rows of df as training data for the next round.
next_batch = df[0:entries_trainingdata]
for counter, (_, d) in enumerate(next_batch.iterrows(), start=1):
    print('inserting trainingdata {0} of {1}'.format(counter, len(next_batch)))
    try:
        insert_trainingdata_from_chapter(
            d,
            current_category,
            current_round + 1,
            d['topic_1_id'] == zk_category_id
        )
    except Exception as e:
        # Best effort: report the failure (type and message) and continue with the next
        # row. KeyboardInterrupt/SystemExit are not caught, so the loop can be aborted.
        print('{0}: {1}'.format(type(e).__name__, e))

# Use model for all chapters

In [None]:
query = '''
    select *, group_concat(word.word separator ' ') as words from chapter 
    inner join word on chapter.id = word.chapter_id
    group by chapter.id
'''

In [None]:
# Materialise the query result. The column names are passed to the constructor so that an
# empty result still yields a frame with the expected (empty) columns; assigning
# df.columns afterwards would fail with a length mismatch in that case.
res = do_query(query)
df = pd.DataFrame(res.fetchall(), columns=list(res.keys()))
df.head()

In [None]:
X = df[relevant_columns]
X.tail()

In [None]:
# Vectorise each text column with its already-fitted vectorizer (transform only).
words_tfidf, header_tfidf, parent_tfidf, grandparent_tfidf = (
    vectorizer.transform(X[column])
    for vectorizer, column in (
        (words_vec, 'words'),
        (header_vec, 'header_preprocessed'),
        (parent_vec, 'parent_preprocessed'),
        (grandparent_vec, 'grandparent_preprocessed'),
    )
)

# Column-wise concatenation in the same block order as the training matrix.
X_union = scipy.sparse.hstack(
    [words_tfidf, header_tfidf, parent_tfidf, grandparent_tfidf]
)

print(X_union.shape)

In [None]:
# Predict with the selected model; `classifier` is not defined anywhere in this notebook.
y = final_model.predict(X_union)
# Store the predictions as column 'y', which the following cells filter on.
df['y'] = y

In [None]:
len(df.loc[df['y'] == 1])

In [None]:
df.loc[df['y'] == 1]

In [None]:
df.loc[df['y'] == 1].to_csv('found_zk.csv')

## plot ROC curve


In [None]:
# select second class only
y_test_pred_proba = final_model.predict_proba(X_test_union)[:,1]

y_train_pred = final_model.predict(X_train_union)
y_train_pred_proba = final_model.predict_proba(X_train_union)[:,1]

In [None]:
# Use the test-set probabilities computed in the previous cell; `y_pred_proba` is not
# defined anywhere in this notebook.
roc_value = roc_auc_score(y_test, y_test_pred_proba)
roc_value

In [None]:
def evaluate_model(predictions, probs, train_predictions, train_probs,
                   test_labels=None, train_labels=None):
    """Compare machine learning model to baseline performance.

    Prints recall, precision and ROC AUC for the baseline, the test set and the training
    set, then plots the ROC curves of the baseline and the model.

    predictions: predicted classes for the test set.
    probs: predicted scores/probabilities for the test set.
    train_predictions: predicted classes for the training set.
    train_probs: predicted scores/probabilities for the training set.
    test_labels: true test labels; when omitted, the notebook-level variable
        ``test_labels`` is used (the original calling convention).
    train_labels: true training labels; when omitted, the notebook-level variable
        ``train_labels`` is used.

    Raises NameError when labels are neither passed nor defined globally.
    """
    # Backward compatibility: existing call sites set the labels as globals before calling.
    if test_labels is None:
        test_labels = globals().get('test_labels')
    if train_labels is None:
        train_labels = globals().get('train_labels')
    if test_labels is None or train_labels is None:
        raise NameError("test_labels and train_labels must be passed or defined globally")

    # Baseline that predicts the positive class for every test sample.
    all_positive = [1] * len(test_labels)

    baseline = {}
    baseline['recall'] = recall_score(test_labels, all_positive)
    baseline['precision'] = precision_score(test_labels, all_positive)
    baseline['roc'] = 0.5

    results = {}
    results['recall'] = recall_score(test_labels, predictions)
    results['precision'] = precision_score(test_labels, predictions)
    results['roc'] = roc_auc_score(test_labels, probs)

    train_results = {}
    train_results['recall'] = recall_score(train_labels, train_predictions)
    train_results['precision'] = precision_score(train_labels, train_predictions)
    train_results['roc'] = roc_auc_score(train_labels, train_probs)

    for metric in ['recall', 'precision', 'roc']:
        print(f'{metric.capitalize()} Baseline: {round(baseline[metric], 2)} Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}')

    # Calculate false positive rates and true positive rates
    base_fpr, base_tpr, _ = roc_curve(test_labels, all_positive)
    model_fpr, model_tpr, _ = roc_curve(test_labels, probs)

    plt.figure(figsize = (8, 6))
    # Note: this changes the font size for all later figures as well.
    plt.rcParams['font.size'] = 16

    # Plot both curves
    plt.plot(base_fpr, base_tpr, 'b', label = 'baseline')
    plt.plot(model_fpr, model_tpr, 'r', label = 'model')
    plt.legend()
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.show()

In [None]:
test_labels = y_test
train_labels = y_train
evaluate_model(y_test_pred, y_test_pred_proba, y_train_pred, y_train_pred_proba)