### Build the dataframe from the raw files

In [1]:
# Import required packages
import pandas as pd
import os
import xml.etree.ElementTree as ET
import spacy
import re
from lingua import Language, LanguageDetectorBuilder

In [29]:
def parse_xml(xml_file):
    """Parse one Nexis article XML file into a single-row DataFrame.

    The article id is the part of the Atom ``id`` element after ``'Item:'`` with
    all hyphens removed. The publication date is the part of the Atom
    ``published`` element before ``'T'``. The text is built from the stripped
    text of every ``<p>`` element below ``bodyText``, joined by single spaces.

    Args:
        xml_file: Path or file object accepted by ``ET.parse``.

    Returns:
        A DataFrame with exactly one row and the columns ``nexis_id``,
        ``title``, ``publication_date``, ``publisher`` and ``text``, or ``None``
        when the article has no body text. In the latter case the module-level
        ``empty_counter`` is incremented.

    Raises:
        AttributeError: If the id, title, published or publicationName element
            is missing (``find`` returns ``None``).
        IndexError: If the id does not contain ``'Item:'``.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    global empty_counter

    # Load the XML file
    root = ET.parse(xml_file).getroot()

    # Prefix -> URI mapping used by the namespaced lookups below
    namespace = {
        'atom': 'http://www.w3.org/2005/Atom',
        'nitf': 'http://iptc.org/std/NITF/2006-10-18/',
        'dc': 'http://purl.org/dc/elements/1.1/'
    }

    # Extract the ID: keep what follows 'Item:' and strip the hyphens
    nexis_id = root.find('.//atom:id', namespace).text
    nexis_id = nexis_id.split('Item:')[1].replace("-", "")

    # Extract the title (may be None when the title element is empty)
    title = root.find('.//atom:title', namespace).text

    # Extract the published date and drop the time component
    published_date = root.find('.//atom:published', namespace).text.split('T')[0]

    # Extract the publisher
    publisher = root.find('.//metadata/publicationInfo/publicationName').text

    # Extract all body paragraphs and join them into a single line.
    # NOTE(review): only the text before the first child element of each <p> is
    # used; text inside or after inline child elements is not included -
    # confirm that this is intended.
    body_paragraphs = []
    body_text_element = root.find('.//nitf:body.content//bodyText', namespace)
    if body_text_element is not None:
        body_paragraphs = [
            p_element.text.strip()
            for p_element in body_text_element.findall('.//p', namespace)
            if p_element.text
        ]
    text = ' '.join(body_paragraphs)

    # Articles without any body text are counted and skipped
    if text == '':
        empty_counter += 1
        return

    # One-row DataFrame with the extracted values
    return pd.DataFrame({
        'nexis_id': [nexis_id],
        'title': [title],
        'publication_date': [published_date],
        'publisher': [publisher],
        'text': [text]
    })

In [30]:
# Root directory of the data
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
folder_path = 'c:\\Friendcloud\\_University\\_Masterarbeit\\Data\\Nexis\\'

# Counter for articles that parse_xml skips because their text body is empty
empty_counter = 0

# Go through all subdirectories and fill raw_data with one single-row DataFrame per article
raw_data = []
for root, _, files in os.walk(folder_path):
    for file in files:
        if not file.endswith('.xml'):
            continue
        # Built outside the try block so that the error message can always name the file
        file_path = os.path.join(root, file)
        try:
            data = parse_xml(file_path)
        except Exception as exc:
            # Report the failing file together with the cause and continue with the next one;
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print('Error in file ' + file_path + ': ' + repr(exc))
            continue
        # parse_xml returns None for articles without a text body
        if data is not None:
            raw_data.append(data)

# Build pandas DataFrame
df_raw = pd.concat(raw_data, ignore_index=True)
print('Number of articles removed: ' + str(empty_counter))
print('Number of articles accepted: ' + str(len(df_raw.index)))

Number of articles removed: 90
Number of articles accepted: 6779


In [31]:
# Display the raw dataframe built above (pandas shows a truncated preview)
df_raw

Unnamed: 0,nexis_id,title,publication_date,publisher,text
0,5MNMH621JB0GF09H0000000,Angst vor dem «harten Brexit» auf der Insel - ...,2017-01-15,AP Deutsch,Wenn Großbritannien Ende März den Ausstieg aus...
1,5SM3THB1DXFJ50MP0000000,Fleischindustrie wehrt sich gegen Marketing fü...,2018-06-21,AP Deutsch,Fleisch aus dem Labor ist noch eine Zukunftsvi...
2,5SM3THB1DXFJ50MY0000000,Der AP-Überblick am Nachmittag,2018-06-21,AP Deutsch,Die AP Weltnachrichten haben heute unter ander...
3,5STNVWH1DXFJ53VM0000000,Laborfleisch soll in drei Jahren auf die Telle...,2018-07-17,AP Deutsch,Maastricht (AP) - Das niederländische Unterneh...
4,5VHK2XG1JB0GF4Y50000000,Israelische Forscher wollen künstliche Steaks ...,2019-02-26,AP Deutsch,"Die Weltbevölkerung wächst, die Nachfrage nach..."
...,...,...,...,...,...
6774,67KW1VK1F15WB4660000000,Kein Titel,2023-02-21,ZEIT Wissen,6 Am anfang drei Fragen 1. Können wir andere m...
6775,67KW1VK1F15WB46B0000000,Leben und schmecken lassen,2023-02-21,ZEIT Wissen,"Ein saftiges Filet, für das kein Huhn sterben ..."
6776,7W29GN20YC2460S30000000,ABSCHIED VOM ALTEN ITALIEN,2009-05-13,ZEIT Wissen,John Dickie: »Delizia! Die Italiener und ihre...
6777,7X8DW4712SK2G0H10000000,Essen aus dem Labor,2009-12-08,ZEIT Wissen,Es ist der letzte Tag auf der Lebensmittelmess...


In [None]:
# Persist the raw dataframe so that the preprocessing section can start from the pickle
df_raw.to_pickle("../Data/df_raw.pkl")

### Basic Data preprocessing

In [3]:
# Reload the pickled raw dataframe as the starting point for preprocessing
df_processed = pd.read_pickle("../Data/df_raw.pkl")

# Display the loaded dataframe
df_processed

Unnamed: 0,nexis_id,title,publication_date,publisher,text
0,5MNMH621JB0GF09H0000000,Angst vor dem «harten Brexit» auf der Insel - ...,2017-01-15,AP Deutsch,Wenn Großbritannien Ende März den Ausstieg aus...
1,5SM3THB1DXFJ50MP0000000,Fleischindustrie wehrt sich gegen Marketing fü...,2018-06-21,AP Deutsch,Fleisch aus dem Labor ist noch eine Zukunftsvi...
2,5SM3THB1DXFJ50MY0000000,Der AP-Überblick am Nachmittag,2018-06-21,AP Deutsch,Die AP Weltnachrichten haben heute unter ander...
3,5STNVWH1DXFJ53VM0000000,Laborfleisch soll in drei Jahren auf die Telle...,2018-07-17,AP Deutsch,Maastricht (AP) - Das niederländische Unterneh...
4,5VHK2XG1JB0GF4Y50000000,Israelische Forscher wollen künstliche Steaks ...,2019-02-26,AP Deutsch,"Die Weltbevölkerung wächst, die Nachfrage nach..."
...,...,...,...,...,...
6774,67KW1VK1F15WB4660000000,Kein Titel,2023-02-21,ZEIT Wissen,6 Am anfang drei Fragen 1. Können wir andere m...
6775,67KW1VK1F15WB46B0000000,Leben und schmecken lassen,2023-02-21,ZEIT Wissen,"Ein saftiges Filet, für das kein Huhn sterben ..."
6776,7W29GN20YC2460S30000000,ABSCHIED VOM ALTEN ITALIEN,2009-05-13,ZEIT Wissen,John Dickie: »Delizia! Die Italiener und ihre...
6777,7X8DW4712SK2G0H10000000,Essen aus dem Labor,2009-12-08,ZEIT Wissen,Es ist der letzte Tag auf der Lebensmittelmess...


In [4]:
# Parse the publication date (unparseable values become NaT) and derive the year of publication
parsed_dates = pd.to_datetime(df_processed['publication_date'], errors='coerce')
df_processed = df_processed.assign(
    publication_date=parsed_dates,
    publication_year=parsed_dates.dt.year,
)
# Bring the columns into their final order
df_processed = df_processed.loc[
    :, ['nexis_id', 'title', 'publication_date', 'publication_year', 'publisher', 'text']
]

In [5]:
# Keep only the first occurrence of every nexis_id and report how many rows were removed
count_before = df_processed.shape[0]
is_repeated_id = df_processed.duplicated(subset='nexis_id', keep='first')
df_processed = df_processed.loc[~is_repeated_id]
count_after = df_processed.shape[0]
print('Number of articles removed: ' + str(count_before - count_after))

Number of articles removed: 3


In [6]:
# Find examples of articles with duplicate text.
# A fixed random_state makes the displayed sample reproducible across runs, and the
# sample size is capped so that the cell also works with fewer than 10 duplicated rows.
duplicate_text_rows = df_processed[df_processed.duplicated(subset='text', keep=False)]
duplicate_text_rows.sample(min(10, len(duplicate_text_rows)), random_state=42)

Unnamed: 0,nexis_id,title,publication_date,publication_year,publisher,text
6707,609WC4P1JBN9716M0000000,Olivenöl und Reis würden zuerst ausgehen,2020-07-10,2020,Wiler Zeitung,Bis vor kurzem war das Szenario in der Schweiz...
3657,7T50VNF12SGXD0820000000,Nummer sicher,2008-01-26,2008,Main-Taunus-Kurier (Germany),Der Bundestag gibt grünes Licht für das neue G...
1022,42KBNX2000SV03YR0000000,,2001-03-12,2001,Der Spiegel,An ein Steak oder eine Roulade traut sich auch...
416,5KR4P0M1DYHXW30J0000000,"""Die Bayer-Monsanto-Fusion ist beunruhigend""",2016-09-17,2016,Bergische Morgenpost,Johannes Remmel Der NRW-Umweltminister sieht R...
4936,5SF7YFP1F16T21FB0000000,"Einen sauberen Burger, bitte!",2018-05-30,2018,St.Galler Tagblatt (Stammausgabe),Lebensmittel Fleischkonsum ohne schlechtes Ge...
844,64HSNP51JBK924760000000,Hersteller von Fleischersatz faszinieren Anleg...,2022-01-13,2022,Börse online,"Pflanzen-Burger statt Weihnachtsgans, Pilz-Ste..."
6719,65T6DDY1DY4BY14H0000000,Bühler spannt mit mehreren Unternehmen zusammen,2022-06-29,2022,Wiler Zeitung,Thomas Griesser Kym Fleischersatz Beim Techno...
4115,57F2G9B1JCN5V4K50000000,WISSENSCHAFT & TECHNIK,2013-01-07,2013,NZZ Folio,"Erst käme der Blitz, dann der Knall. Häuser un..."
4662,5X509TF1F13R82VP0000000,Käfer & Co. als Leckerbissen,2019-09-28,2019,Ried Echo,"Hmmm, lecker sieht der Burger aus! So richtig ..."
2967,62YYVY51F13R80MY0000000,Eine Frage des Vertrauens Kruschel erklärt´s :...,2021-06-22,2021,Lampertheimer Zeitung (Germany),Vor 20 Jahren wurde es eingeführt und dürfte j...


In [7]:
# Keep only the first article for every distinct text and report how many rows were removed
count_before = df_processed.shape[0]
is_repeated_text = df_processed.duplicated(subset='text', keep='first')
df_processed = df_processed.loc[~is_repeated_text]
count_after = df_processed.shape[0]
print('Number of articles removed: ' + str(count_before - count_after))

Number of articles removed: 1278


In [8]:
# Keep only articles whose text is longer than 100 characters
# (articles with 100 characters or fewer are removed)
count_before = df_processed.shape[0]
text_lengths = df_processed['text'].str.len()
df_processed = df_processed.loc[text_lengths.gt(100)]
count_after = df_processed.shape[0]
print('Number of articles removed: ' + str(count_before - count_after))

Number of articles removed: 73


In [9]:
# Remove articles which are not in German using lingua
detector = LanguageDetectorBuilder.from_all_spoken_languages().build()


def _detect_language_code(text):
    """Return the ISO 639-1 name of the detected language, or 'UNKNOWN' if none is detected."""
    language = detector.detect_language_of(text)
    # lingua returns None when it cannot determine a language; such articles are
    # labelled 'UNKNOWN' and are therefore removed together with the non-German ones
    if language is None:
        return 'UNKNOWN'
    return language.iso_code_639_1.name


# Work on an explicit copy so that adding a column never writes into a slice of an earlier frame,
# then detect the language of each article
df_processed = df_processed.copy()
df_processed['language'] = df_processed['text'].map(_detect_language_code)

# Print the language and publisher of articles which are not in German
non_german = df_processed[df_processed['language'] != 'DE']
print(non_german['language'].value_counts())
print(non_german['publisher'].value_counts())

# Remove articles which are not in German
count_before = len(df_processed.index)
df_processed = df_processed[df_processed['language'] == 'DE']
count_after = len(df_processed.index)
print('\nNumber of articles removed: ' + str(count_before - count_after))

language
EN    118
ES      1
Name: count, dtype: int64
publisher
dpa-AFX ProFeed                                                              19
News Bites - People in Business                                              12
American Banking and Market News                                             10
Spiegel Online                                                                7
Industry SnapShot                                                             4
GJAE - German Journal of Agricultural Economics (ehemals Agrarwirtschaft)     4
Agrarwirtschaft                                                               4
Newstex Blogs                                                                 4
Industry SnapShot Summary                                                     3
PR Newswire                                                                   2
MENAFN - Market Reports (English)                                             2
MENAFN - Press Releases (English)                      

In [10]:
def _drop_and_report(frame, keep_mask):
    """Return the rows of ``frame`` where ``keep_mask`` is True and print how many rows were dropped."""
    kept = frame[keep_mask]
    print('\nNumber of articles removed: ' + str(len(frame.index) - len(kept.index)))
    return kept


# Remove articles whose title is exactly one of these listing titles (reported per title)
for listing_title in ('Programmübersicht', 'Programmübersicht Samstag', 'Programmhinweise'):
    df_processed = _drop_and_report(df_processed, df_processed['title'] != listing_title)

# Remove articles with specific nexis_ids (tv program)
TV_PROGRAM_IDS = [
    '4SC3XK90TXHH10SS0000000',
    '4SC3XKC0TXHH101J0000000',
    '4SC3XKJ0TXHH101K0000000',
]
df_processed = _drop_and_report(df_processed, ~df_processed['nexis_id'].isin(TV_PROGRAM_IDS))

# Remove articles with specific nexis_ids (theatre)
THEATRE_IDS = [
    '4JXWDKF0TWRXK1R10000000',
    '3SGDYHS0006XC4P80000000',
    '5KNN3CD1JC3P04FD0000000',
    '4KF5FDD0TWRXK1WY0000000',
    '585XW971JBPW93GX0000000',
    '4S4Y46D0TWX2707B0000000',
    '554MD281F19FX2YB0000000',
    '4S554KS0TWX271450000000',
    '518R7XC1F19FX3P10000000',
    '539ND9G1DYK6Y0950000000',
]
df_processed = _drop_and_report(df_processed, ~df_processed['nexis_id'].isin(THEATRE_IDS))

# Remove all articles whose title contains one of these words (reported per word):
# "Dissertationen" marks lists of dissertations; "Börsentag" and "Börsen-Ticker" are not articles.
# Missing titles count as "does not contain" and are kept.
for title_keyword in ('Dissertationen', 'Börsentag', 'Börsen-Ticker'):
    df_processed = _drop_and_report(
        df_processed,
        ~df_processed['title'].str.contains(title_keyword, na=False),
    )

# Remove articles with specific nexis_ids (no article)
df_processed = _drop_and_report(df_processed, df_processed['nexis_id'] != '65S924K1JBR841DW0000000')

# Remove articles with specific nexis_ids (hidden duplicate)
df_processed = _drop_and_report(df_processed, df_processed['nexis_id'] != '64HSHSG1F15WB1NC0000000')

# Remove articles with publication_year before 1994 (too little data);
# rows without a publication year are removed as well because the comparison is False for them
df_processed = _drop_and_report(df_processed, df_processed['publication_year'] >= 1994)

# Remove articles from the last 4 months before the last article published (includes web-crawled data)
# Find the last article published
last_article = df_processed['publication_date'].max()
df_processed = _drop_and_report(
    df_processed,
    df_processed['publication_date'] < last_article - pd.DateOffset(months=4),
)


Number of articles removed: 4

Number of articles removed: 1

Number of articles removed: 2

Number of articles removed: 3

Number of articles removed: 10

Number of articles removed: 2

Number of articles removed: 6

Number of articles removed: 30

Number of articles removed: 1

Number of articles removed: 1

Number of articles removed: 9

Number of articles removed: 625


In [11]:
# Renumber the rows from zero and discard the helper column that held the detected language
df_processed = (
    df_processed
    .reset_index(drop=True)
    .drop(columns=['language'])
)

In [12]:
count_before = len(df_processed.index)

# Remove all articles that have the same first 100 characters (hidden duplicates).
# Within each group of articles sharing a 100-character prefix only the longest text is kept;
# if several articles tie for the longest, the one furthest down in the dataframe is kept.
# This equals comparing every pair of articles and dropping the shorter one of each matching
# pair (the earlier one on a tie), but uses a stable sort instead of a quadratic double loop.
# NOTE: relies on nexis_id being unique, which the duplicate-id removal earlier in this
# section establishes.
assert df_processed['nexis_id'].is_unique, 'nexis_id must be unique before removing hidden duplicates'

candidates = pd.DataFrame({
    'nexis_id': df_processed['nexis_id'],
    'prefix': df_processed['text'].str[:100],
    'length': df_processed['text'].str.len(),
})
# Texts shorter than 100 characters never take part in the comparison
candidates = candidates[candidates['length'] >= 100]

# Stable ascending sort by length: the last row of every prefix group is the one to keep,
# every other row of the group is a hidden duplicate
ranked = candidates.sort_values('length', kind='mergesort')
drop_ids = ranked.loc[ranked.duplicated(subset='prefix', keep='last'), 'nexis_id'].tolist()

# Remove articles with nexis_ids in drop_ids
df_processed = df_processed[~df_processed['nexis_id'].isin(drop_ids)]

count_after = len(df_processed.index)
print('\nNumber of articles removed: ' + str(count_before - count_after))

# reindex dataframe
df_processed = df_processed.reset_index(drop=True)


Number of articles removed: 226


In [13]:
# Display the preprocessed dataframe (pandas shows a truncated preview)
df_processed

Unnamed: 0,nexis_id,title,publication_date,publication_year,publisher,text
0,5MNMH621JB0GF09H0000000,Angst vor dem «harten Brexit» auf der Insel - ...,2017-01-15,2017,AP Deutsch,Wenn Großbritannien Ende März den Ausstieg aus...
1,5SM3THB1DXFJ50MP0000000,Fleischindustrie wehrt sich gegen Marketing fü...,2018-06-21,2018,AP Deutsch,Fleisch aus dem Labor ist noch eine Zukunftsvi...
2,5SM3THB1DXFJ50MY0000000,Der AP-Überblick am Nachmittag,2018-06-21,2018,AP Deutsch,Die AP Weltnachrichten haben heute unter ander...
3,5STNVWH1DXFJ53VM0000000,Laborfleisch soll in drei Jahren auf die Telle...,2018-07-17,2018,AP Deutsch,Maastricht (AP) - Das niederländische Unterneh...
4,5VHK2XG1JB0GF4Y50000000,Israelische Forscher wollen künstliche Steaks ...,2019-02-26,2019,AP Deutsch,"Die Weltbevölkerung wächst, die Nachfrage nach..."
...,...,...,...,...,...,...
4381,67KW1VK1F15WB4660000000,Kein Titel,2023-02-21,2023,ZEIT Wissen,6 Am anfang drei Fragen 1. Können wir andere m...
4382,67KW1VK1F15WB46B0000000,Leben und schmecken lassen,2023-02-21,2023,ZEIT Wissen,"Ein saftiges Filet, für das kein Huhn sterben ..."
4383,7W29GN20YC2460S30000000,ABSCHIED VOM ALTEN ITALIEN,2009-05-13,2009,ZEIT Wissen,John Dickie: »Delizia! Die Italiener und ihre...
4384,7X8DW4712SK2G0H10000000,Essen aus dem Labor,2009-12-08,2009,ZEIT Wissen,Es ist der letzte Tag auf der Lebensmittelmess...


In [14]:
# Persist the preprocessed dataframe; the text clean-up section reloads it from this pickle
df_processed.to_pickle("../Data/df.pkl")

### Text Clean up

In [15]:
# Size of the working set: "SMALL" keeps the first 20 rows, "MEDIUM" the first 500 rows;
# any other value (such as "LARGE" or "FULL") keeps every row.
VARIANT = "LARGE"

df = pd.read_pickle("../Data/df.pkl")
variant_row_limits = {"SMALL": 20, "MEDIUM": 500}
if VARIANT in variant_row_limits:
    df = df.head(variant_row_limits[VARIANT])

In [16]:
# Display the dataframe selected by VARIANT (pandas shows a truncated preview)
df

Unnamed: 0,nexis_id,title,publication_date,publication_year,publisher,text
0,5MNMH621JB0GF09H0000000,Angst vor dem «harten Brexit» auf der Insel - ...,2017-01-15,2017,AP Deutsch,Wenn Großbritannien Ende März den Ausstieg aus...
1,5SM3THB1DXFJ50MP0000000,Fleischindustrie wehrt sich gegen Marketing fü...,2018-06-21,2018,AP Deutsch,Fleisch aus dem Labor ist noch eine Zukunftsvi...
2,5SM3THB1DXFJ50MY0000000,Der AP-Überblick am Nachmittag,2018-06-21,2018,AP Deutsch,Die AP Weltnachrichten haben heute unter ander...
3,5STNVWH1DXFJ53VM0000000,Laborfleisch soll in drei Jahren auf die Telle...,2018-07-17,2018,AP Deutsch,Maastricht (AP) - Das niederländische Unterneh...
4,5VHK2XG1JB0GF4Y50000000,Israelische Forscher wollen künstliche Steaks ...,2019-02-26,2019,AP Deutsch,"Die Weltbevölkerung wächst, die Nachfrage nach..."
...,...,...,...,...,...,...
4381,67KW1VK1F15WB4660000000,Kein Titel,2023-02-21,2023,ZEIT Wissen,6 Am anfang drei Fragen 1. Können wir andere m...
4382,67KW1VK1F15WB46B0000000,Leben und schmecken lassen,2023-02-21,2023,ZEIT Wissen,"Ein saftiges Filet, für das kein Huhn sterben ..."
4383,7W29GN20YC2460S30000000,ABSCHIED VOM ALTEN ITALIEN,2009-05-13,2009,ZEIT Wissen,John Dickie: »Delizia! Die Italiener und ihre...
4384,7X8DW4712SK2G0H10000000,Essen aus dem Labor,2009-12-08,2009,ZEIT Wissen,Es ist der letzte Tag auf der Lebensmittelmess...


In [17]:
# Load the German language model in Spacy.
# The 'de_core_news_sm' package must be installed in the environment for spacy.load to find it.
nlp = spacy.load('de_core_news_sm')

# Define a function to clean the text
def clean_text(text):
    """Normalise an article text to lower-cased, space-separated lemmas.

    Every run of characters other than ASCII letters, German umlauts, 'ß' and
    spaces is replaced by a space, whitespace is collapsed, and the result is
    lemmatised with the module-level spaCy pipeline ``nlp``. Tokens whose lemma
    is ``'--'`` fall back to their surface form. If the first token carries the
    entity type ``'GPE'``, the first word is removed from the result.

    Args:
        text: Raw article text.

    Returns:
        The cleaned, lemmatised text. An empty string is returned when nothing
        is left after removing special characters.
    """
    # Remove special characters
    text = re.sub("[^A-Za-zäöüÄÖÜß ]+", ' ', text)

    # Remove extra spaces
    text = ' '.join(text.split())

    # Lemmatize the text
    doc = nlp(text)
    lemmatized_text = ' '.join(
        token.lemma_.lower() if token.lemma_ != '--' else token.text.lower()
        for token in doc
    )

    # If the first word has the label "GPE", remove it.
    # The length check avoids an IndexError for texts that are empty after cleaning.
    # NOTE(review): the German spaCy news models appear to label locations as 'LOC'
    # rather than 'GPE' - confirm that this branch can actually trigger.
    if len(doc) > 0 and doc[0].ent_type_ == 'GPE':
        print('GPE found')
        # partition also works when the text consists of a single word (result: '')
        lemmatized_text = lemmatized_text.partition(' ')[2]

    return lemmatized_text

# Build df_full as a copy of df with an additional 'clean_text' column that holds
# the result of clean_text for every article text
df_full = df.assign(clean_text=df['text'].apply(clean_text))

In [18]:
# Keep only the first article for every distinct clean_text and report how many rows were removed
count_before = df_full.shape[0]
is_repeated_clean_text = df_full.duplicated(subset='clean_text', keep='first')
df_full = df_full.loc[~is_repeated_clean_text]
count_after = df_full.shape[0]
print('Number of articles removed: ' + str(count_before - count_after))

# Renumber the rows from zero
df_full = df_full.reset_index(drop=True)

Number of articles removed: 21


In [19]:
# Add word count column to dataframe: the number of spaCy tokens in each cleaned text.
# Counting only needs the tokenizer, so make_doc is used instead of running the whole
# pipeline (tagger, parser, NER, ...) a second time over every article.
# NOTE(review): assumes that no component of the loaded pipeline merges or splits tokens - confirm.
df_full['word_count'] = (
    df_full['clean_text']
    .map(lambda cleaned: len(nlp.make_doc(cleaned)))
    .astype('int64')
)

In [20]:
# Display the dataframe with the added clean_text and word_count columns (truncated preview)
df_full

Unnamed: 0,nexis_id,title,publication_date,publication_year,publisher,text,clean_text,word_count
0,5MNMH621JB0GF09H0000000,Angst vor dem «harten Brexit» auf der Insel - ...,2017-01-15,2017,AP Deutsch,Wenn Großbritannien Ende März den Ausstieg aus...,wenn großbritannien ende märz der ausstieg aus...,777
1,5SM3THB1DXFJ50MP0000000,Fleischindustrie wehrt sich gegen Marketing fü...,2018-06-21,2018,AP Deutsch,Fleisch aus dem Labor ist noch eine Zukunftsvi...,fleisch aus der labor sein noch ein zukunftsvi...,718
2,5SM3THB1DXFJ50MY0000000,Der AP-Überblick am Nachmittag,2018-06-21,2018,AP Deutsch,Die AP Weltnachrichten haben heute unter ander...,der ap weltnachrichten haben heute unter ander...,618
3,5STNVWH1DXFJ53VM0000000,Laborfleisch soll in drei Jahren auf die Telle...,2018-07-17,2018,AP Deutsch,Maastricht (AP) - Das niederländische Unterneh...,maastricht ap der niederländisch unternehmen m...,224
4,5VHK2XG1JB0GF4Y50000000,Israelische Forscher wollen künstliche Steaks ...,2019-02-26,2019,AP Deutsch,"Die Weltbevölkerung wächst, die Nachfrage nach...",der weltbevölkerung wachsen der nachfrage nach...,638
...,...,...,...,...,...,...,...,...
4360,67KW1VK1F15WB4660000000,Kein Titel,2023-02-21,2023,ZEIT Wissen,6 Am anfang drei Fragen 1. Können wir andere m...,an anfang drei frage können wir anderer mit ge...,276
4361,67KW1VK1F15WB46B0000000,Leben und schmecken lassen,2023-02-21,2023,ZEIT Wissen,"Ein saftiges Filet, für das kein Huhn sterben ...",ein saftig filet für der kein huhn sterben mus...,266
4362,7W29GN20YC2460S30000000,ABSCHIED VOM ALTEN ITALIEN,2009-05-13,2009,ZEIT Wissen,John Dickie: »Delizia! Die Italiener und ihre...,john dickie delizia der italiener und ihr küch...,840
4363,7X8DW4712SK2G0H10000000,Essen aus dem Labor,2009-12-08,2009,ZEIT Wissen,Es ist der letzte Tag auf der Lebensmittelmess...,es sein der letzter tag auf der lebensmittelme...,2544


In [21]:
# Persist the cleaned dataframe; the text export section reloads it from this pickle
df_full.to_pickle("../Data/df_full.pkl")

### Text Export

In [2]:
# Size of the export set: "SMALL" keeps the first 20 rows, "MEDIUM" the first 500 rows;
# any other value (such as "LARGE" or "FULL") keeps every row.
VARIANT = "LARGE"

df = pd.read_pickle("../Data/df_full.pkl")
variant_row_limits = {"SMALL": 20, "MEDIUM": 500}
if VARIANT in variant_row_limits:
    df = df.head(variant_row_limits[VARIANT])

In [3]:
# Retain only the columns 'nexis_id', 'title', 'publication_date', 'publication_year',
# 'publisher', 'text' and 'clean_text'
export_columns = ['nexis_id', 'title', 'publication_date', 'publication_year', 'publisher', 'text', 'clean_text']
df = df.loc[:, export_columns]

# Write the dataframe as a tab-separated CSV file, encoded as UTF-8 with BOM so that German characters survive
df.to_csv('../Data/df.csv', index=False, sep='\t', encoding='utf-8-sig')

In [4]:
# Subset: publications from before 2004
df_pre2004 = df.loc[df['publication_year'].lt(2004)]

# Write the subset as a tab-separated CSV file, encoded as UTF-8 with BOM so that German characters survive
df_pre2004.to_csv('../Data/df_pre2004.csv', index=False, sep='\t', encoding='utf-8-sig')

In [5]:
# Subset: publications from the year 2004
df_2004 = df.loc[df['publication_year'].eq(2004)]

# Write the subset as a tab-separated CSV file, encoded as UTF-8 with BOM so that German characters survive
df_2004.to_csv('../Data/df_2004.csv', index=False, sep='\t', encoding='utf-8-sig')

In [6]:
# Subset: publications from after 2004 (2005 onwards)
df_post2004 = df.loc[df['publication_year'].gt(2004)]

# Write the subset as a tab-separated CSV file, encoded as UTF-8 with BOM so that German characters survive
df_post2004.to_csv('../Data/df_post2004.csv', index=False, sep='\t', encoding='utf-8-sig')

In [7]:
# Subset: publications from before 2009
df_first_half = df.loc[df['publication_year'].lt(2009)]

# Write the subset as a tab-separated CSV file, encoded as UTF-8 with BOM so that German characters survive
df_first_half.to_csv('../Data/df_first_half.csv', index=False, sep='\t', encoding='utf-8-sig')

In [8]:
# Subset: publications from 2009 onwards (2009 itself is included)
df_second_half = df.loc[df['publication_year'].ge(2009)]

# Write the subset as a tab-separated CSV file, encoded as UTF-8 with BOM so that German characters survive
df_second_half.to_csv('../Data/df_second_half.csv', index=False, sep='\t', encoding='utf-8-sig')

In [9]:
# Subset: publications from the year 2020 (COVID)
df_2020 = df.loc[df['publication_year'].eq(2020)]

# Write the subset as a tab-separated CSV file, encoded as UTF-8 with BOM so that German characters survive
df_2020.to_csv('../Data/df_2020.csv', index=False, sep='\t', encoding='utf-8-sig')

In [10]:
# Subset: publications from 2020 onwards, 2020 included (COVID)
df_post2020 = df.loc[df['publication_year'].ge(2020)]

# Write the subset as a tab-separated CSV file, encoded as UTF-8 with BOM so that German characters survive
df_post2020.to_csv('../Data/df_post2020.csv', index=False, sep='\t', encoding='utf-8-sig')

In [11]:
# Subset: publications from the year 2001 (Meat)
df_2001 = df.loc[df['publication_year'].eq(2001)]

# Write the subset as a tab-separated CSV file, encoded as UTF-8 with BOM so that German characters survive
df_2001.to_csv('../Data/df_2001.csv', index=False, sep='\t', encoding='utf-8-sig')

In [12]:
# Subset: publications from the year 2009 (Climate)
df_2009 = df.loc[df['publication_year'].eq(2009)]

# Write the subset as a tab-separated CSV file, encoded as UTF-8 with BOM so that German characters survive
df_2009.to_csv('../Data/df_2009.csv', index=False, sep='\t', encoding='utf-8-sig')

In [13]:
# Subset: publications from the year 2013 (Climate)
df_2013 = df.loc[df['publication_year'].eq(2013)]

# Write the subset as a tab-separated CSV file, encoded as UTF-8 with BOM so that German characters survive
df_2013.to_csv('../Data/df_2013.csv', index=False, sep='\t', encoding='utf-8-sig')

In [14]:
# Subset: publications from the year 2019 (Climate)
df_2019 = df.loc[df['publication_year'].eq(2019)]

# Write the subset as a tab-separated CSV file, encoded as UTF-8 with BOM so that German characters survive
df_2019.to_csv('../Data/df_2019.csv', index=False, sep='\t', encoding='utf-8-sig')