In [1]:
import numpy as np
import pandas as pd
import os, glob
import json
import re
import spacy

# Regex for German coordinated compounds with an elided head, e.g.
# "Haus- und Gartenarbeit": a word starting with a letter, an optional single
# whitespace, a hyphen, exactly one whitespace, "und", exactly one whitespace,
# and a second word starting with a letter.
# NOTE(review): only the second word's first-character class includes the
# À-ž range; the first word's does not — confirm this asymmetry is intended.
pattern = re.compile(r"[A-Za-zäöüÄÖÜß]\w*\s?-\sund\s[A-Za-zÀ-žäöüÄÖÜß]\w*")
# Folder (relative to the working directory) that is searched for *.jsonl files.
folder_path = "ads_annotated/"

In [2]:
def get_all_jsonl_job_ad_files(folder_path):
    """Return the paths of all ``*.jsonl`` files directly inside ``folder_path``.

    Parameters
    ----------
    folder_path : str
        Directory to search (not recursive).

    Returns
    -------
    list of str
        Matching paths (``folder_path`` joined with the file name), sorted
        alphabetically. Empty if nothing matches.
    """
    # The files are only listed here, not opened; reading happens later.
    # Sorting makes the processing order (and therefore the row order of the
    # combined result) reproducible, because glob's order depends on the
    # file system.
    return sorted(glob.glob(os.path.join(folder_path, "*.jsonl")))

In [4]:
def get_all_job_ads_from_each_jsonl(jsonl_file):
    """Parse a JSON Lines file into a list of Python objects, one per line.

    Parameters
    ----------
    jsonl_file : str
        Path of the ``.jsonl`` file to read.

    Returns
    -------
    list
        The decoded JSON value of every non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Read explicitly as UTF-8 so umlauts decode the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(jsonl_file, encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing empty line) rather than failing.
        return [json.loads(line) for line in f if line.strip()]

In [5]:
def generate_sentences_df_from_ad(dataset):
    """Build a DataFrame of the job ads that contain at least one TRUNC token.

    For each ad (a dict) the metadata fields are copied and the ad text is
    rebuilt from its ``tokens`` list. Ads without any token whose
    ``pos_fine`` equals ``"TRUNC"`` are left out of the result.

    Parameters
    ----------
    dataset : iterable of dict
        Job ads. Each may carry the keys ``id``, ``year``, ``month``,
        ``channel``, ``creation_date``, ``pipeline_version``, ``text_valid``,
        ``length``, ``language`` and ``tokens`` (a list of dicts with
        ``text`` and ``pos_fine`` entries).

    Returns
    -------
    pandas.DataFrame
        Columns are the nine metadata fields plus ``sentences``. The index is
        the position of the ad in ``dataset`` (gaps where ads were dropped).
        A metadata key missing from an ad yields a missing value in that
        row only.
    """
    metadata_columns = [
        "id",
        "year",
        "month",
        "channel",
        "creation_date",
        "pipeline_version",
        "text_valid",
        "length",
        "language",
    ]

    # One record per ad keeps every column aligned with its ad, even when an
    # ad lacks a key; the text is computed fresh for each ad so nothing can
    # leak over from the previous one.
    records = []
    for ad in dataset:
        tokens = ad.get("tokens") or []
        has_trunc = any(token.get("pos_fine") == "TRUNC" for token in tokens)
        if has_trunc:
            # Every token text is prefixed with a single space (so the text
            # starts with a space). Character spans computed on this text
            # downstream depend on exactly this layout.
            sentence = "".join(
                " " + token["text"] for token in tokens if "text" in token
            )
        else:
            sentence = ""
        record = {column: ad.get(column) for column in metadata_columns}
        record["sentences"] = sentence
        records.append(record)

    df = pd.DataFrame(records, columns=metadata_columns + ["sentences"])

    # Keep only ads with text (i.e. with a TRUNC token); the original
    # positional index labels are preserved.
    df_new = df[df["sentences"] != ""].copy()
    df_new["sentences"] = df_new["sentences"].replace("\n", "", regex=True)
    return df_new

In [6]:
def get_noun_groups(pattern, doc):
    """Find every match of ``pattern`` in ``doc.text``.

    Parameters
    ----------
    pattern : compiled regular expression
        Pattern searched with ``finditer``.
    doc : object with a ``text`` attribute
        The document whose text is searched.

    Returns
    -------
    tuple of (list of str, list of str)
        The matched strings, and for each match its character span in
        ``doc.text`` formatted as ``"<start>-<end>"``.
    """
    matches = list(pattern.finditer(doc.text))
    matched_texts = [found.group(0) for found in matches]
    span_labels = [f"{found.start()}-{found.end()}" for found in matches]
    return matched_texts, span_labels

In [7]:
def get_gs_all_job_ads(pattern, df):
    """Collect the pattern matches and their spans for every row of ``df``.

    Parameters
    ----------
    pattern : compiled regular expression
        Passed through to ``get_noun_groups``.
    df : pandas.DataFrame
        Must have a ``sentences`` column holding the text of each ad.

    Returns
    -------
    tuple of (list of list of str, list of list of str)
        One inner list per row of ``df``, in row order: the matched strings
        and the corresponding ``"<start>-<end>"`` spans.
    """
    nlp = spacy.load("de_core_news_sm")
    all_noun_groups = []
    all_spans = []
    for sentence in df["sentences"]:
        # Only ``doc.text`` is consumed downstream, so run just the tokenizer
        # (``make_doc``) instead of the full tagging/parsing pipeline. spaCy's
        # tokenization is non-destructive, so the text and therefore the
        # match spans are unchanged.
        doc = nlp.make_doc(sentence)
        noun_groups, spans = get_noun_groups(pattern, doc)
        all_noun_groups.append(noun_groups)
        all_spans.append(spans)
    return all_noun_groups, all_spans

In [8]:
def get_gs_df(all_noun_groups, all_spans, df):
    """Expand per-ad match lists into one row per match.

    Parameters
    ----------
    all_noun_groups : list of list of str
        For each row of ``df``, the matched strings.
    all_spans : list of list of str
        For each row of ``df``, the span of each match; each inner list must
        be as long as the corresponding list in ``all_noun_groups``.
    df : pandas.DataFrame
        One row per ad. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` plus ``NOUNGROUPWITHELLIPSIS`` and
        ``span_of_noun_group``, one row per match, with a fresh 0..n-1 index.
        Ads without any match do not appear.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df_new = df.copy()
    df_new["NOUNGROUPWITHELLIPSIS"] = all_noun_groups
    df_new["span_of_noun_group"] = all_spans
    df_gold_standard = df_new.explode(["NOUNGROUPWITHELLIPSIS", "span_of_noun_group"])
    # An empty match list explodes to a single missing value; drop those rows
    # directly instead of going through a sentinel and index labels, which
    # are duplicated after explode.
    df_gold_standard_new = df_gold_standard.dropna(subset=["NOUNGROUPWITHELLIPSIS"])
    return df_gold_standard_new.reset_index(drop=True)

In [9]:
def generate_gold_standard_df(jsonl_file):
    """Run the whole extraction for a single ``.jsonl`` file.

    Loads the ads, builds the per-ad text frame, searches it with the
    module-level ``pattern`` and returns the frame with one row per match.

    Parameters
    ----------
    jsonl_file : str
        Path of the ``.jsonl`` file to process.

    Returns
    -------
    pandas.DataFrame
        The result of ``get_gs_df`` for this file.
    """
    ads = get_all_job_ads_from_each_jsonl(jsonl_file)
    sentences_df = generate_sentences_df_from_ad(ads)
    groups_and_spans = get_gs_all_job_ads(pattern, sentences_df)
    return get_gs_df(*groups_and_spans, sentences_df)

In [10]:
def get_complete_GS_df(folder_path):
    """Process every ``.jsonl`` file in ``folder_path`` and stack the results.

    Prints the number of result rows for each file as it is processed.

    Parameters
    ----------
    folder_path : str
        Directory containing the ``.jsonl`` files.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-file frames. Each file keeps its own
        0..n-1 index, so index labels repeat across files.
    """
    per_file_frames = []
    for path in get_all_jsonl_job_ad_files(folder_path):
        frame = generate_gold_standard_df(path)
        print(len(frame))
        per_file_frames.append(frame)
    return pd.concat(per_file_frames)

In [11]:
# Build the combined frame from every .jsonl file in folder_path; the call
# prints one row count per processed file.
master_GS_df = get_complete_GS_df(folder_path)

32
35
82
96
583
2
139
145
147
144
3432
3572
142
131
1
1
113
91
3090
2801
184
200
111
94
67
53
778
876
81
99
199
180
2744
3213
122
117
3
180
4978
4483
98
84
397
596
63
69
154
148
986
1004
240
179
41
41
628
438
72
77
192
158
58
63
125
90
2991
3274
192
136
3842
4757
0
0


In [12]:
# Total number of rows across all files.
len(master_GS_df)

54734

In [13]:
# Display the combined frame. Index labels restart for each source file, so
# the last label shown is smaller than the total row count.
master_GS_df

Unnamed: 0,id,year,month,channel,creation_date,pipeline_version,text_valid,length,language,sentences,NOUNGROUPWITHELLIPSIS,span_of_noun_group
0,sjmm-11950111030005,1950,3,1,2021-08-23,0.0pre3,-9,223,de,Gesucht : Auf 15. März oder April williges ...,Haus- und Gartenarbeit,73-95
1,sjmm-11950111070006,1950,3,1,2021-08-23,0.0pre3,-9,192,de,"Gesucht routinierter , zuverlässiger , ledig...",Car- und Lastwagen,78-96
2,sjmm-11950111110006,1950,3,1,2021-08-23,0.0pre3,-9,251,de,Gesucht baldmöglichst 1 Anreisser und 1 Hob...,Lehr- und Arbeitszeugnissen,174-201
3,sjmm-11950111160014,1950,3,1,2021-08-23,0.0pre3,-9,313,de,Ich suche für unsere 5 Kinder eine liebe und...,Säuglings- und Kinderpflege,170-197
4,sjmm-11950121020002,1950,3,1,2021-08-23,0.0pre3,-9,93,de,Gesucht für sofort deutsch- und französischs...,deutsch- und französischsprechende,21-55
...,...,...,...,...,...,...,...,...,...,...,...,...
4752,sjmm-32019190181478,2019,3,3,2021-08-23,0.0pre3,-9,1342,de,Polymechaniker CNC Deine Aufgabe Produkti...,Fanuc- und Haidenhein,796-817
4753,sjmm-32019190181479,2019,3,3,2021-08-23,0.0pre3,-9,3689,de,"thyssenkrupp , das sind mehr als 155'000 Mit...",Hochspannungs- und Leistungselektronik,2209-2247
4754,sjmm-32019190181479,2019,3,3,2021-08-23,0.0pre3,-9,3689,de,"thyssenkrupp , das sind mehr als 155'000 Mit...",Team- und Firmenevents,3169-3191
4755,sjmm-32019190181479,2019,3,3,2021-08-23,0.0pre3,-9,3689,de,"thyssenkrupp , das sind mehr als 155'000 Mit...",Einarbeitungs- und Weiterbildungsprogramme,3240-3282


In [39]:
# Write the combined frame to a UTF-8 CSV in the working directory, without
# the index column.
# NOTE(review): this cell's execution count (39) does not follow the previous
# cell (13) — confirm a Restart & Run All reproduces the same file.
master_GS_df.to_csv("gold_standard_ellipsis.csv", encoding="utf-8", index=False)