In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
import sys
import subprocess
import os

from Preprocessing import exclude_instagram_articles, convert_dates,  filter_by_date

# Ask git for the top-level directory of the enclosing repository and keep it
# as a clean UTF-8 string (trailing newline stripped).
git_toplevel_cmd = 'git rev-parse --show-toplevel'.split()
git_toplevel_raw = subprocess.check_output(git_toplevel_cmd)
root_dir = git_toplevel_raw.decode('utf-8').strip()

# Extend the module search path with the repo root; presumably this is what
# lets the `datahandler` import below resolve — confirm against repo layout.
sys.path.append(root_dir)

from datahandler.DataHandler import DataHandler

# Read the matches CSV that lives under data/dlf/ relative to the repo root.
matches_file_path = os.path.join(root_dir, 'data/dlf/matches_dlf.csv')
actual_matches = pd.read_csv(matches_file_path)

In [2]:
# Fetch both article collections through the project data handler.
dh = DataHandler("dlf")
easy_articles, hard_articles = dh.get_all('easy'), dh.get_all('hard')

# Run every value of the 'date' column through convert_dates, writing the
# result back into the same frame (easy first, then hard).
for frame in (easy_articles, hard_articles):
    frame['date'] = frame['date'].apply(convert_dates)

# The easy collection is additionally passed through the Instagram filter;
# note that the "Easy Articles" count below refers to this filtered frame.
nl_articles = exclude_instagram_articles(easy_articles)

print("Easy Articles:", len(nl_articles))
print("Hard Articles:", len(hard_articles))

Easy Articles: 50
Hard Articles: 1421


In [3]:
from ArticleVectorizer2 import ArticleVectorizer
# Options for the customised vectorizer, collected in one mapping and then
# unpacked into the constructor.
article_vectorizer_options = dict(
    ngram_range=(1, 2),
    convert_segmented_words=True,
    lowercase=True,
    stop_words=True,
    non_alnum=True,
    capitalized_only=True,
    # max_features=10 is currently disabled
    encoding="utf-8",
)
article_vectorizer = ArticleVectorizer(**article_vectorizer_options)

# Second instance constructed without any arguments (class defaults).
default_vectorizer = ArticleVectorizer()

# Disabled scratch code. NOTE(review): it refers to `vectorizer`, a name this
# cell does not define — adjust before re-enabling.
#articles = [article for article in hard_articles['text']]
#matrix = vectorizer.fit_transform(articles)
#vectorizer.vectorizer.vocabulary_

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Experiment parameters: the two dates handed to filter_by_date and the
# article column whose text is vectorised further down.
date1, date2 = "2023-05-01", "2025-05-15"
content = "title"

# Apply the same date filter to the easy and the hard collection (in that order).
NL_A, DLF_A = (
    filter_by_date(frame, date1, date2)
    for frame in (nl_articles, hard_articles)
)

# Report how many articles remain in each collection after filtering.
for label, frame in (("Easy Articles:", NL_A), ("Hard Articles:", DLF_A)):
    print(label, len(frame))

# Customised project vectorizer; its options are gathered in a mapping and
# unpacked into the constructor. (max_df=0.1 is currently disabled.)
avectorizer_options = {
    "ngram_range": (1, 2),
    "convert_segmented_words": True,
    "lowercase": True,
    "stop_words": True,
    "non_alnum": True,
    "capitalized_only": True,
    "encoding": "utf-8",
}
Avectorizer = ArticleVectorizer(**avectorizer_options)

# Alternative vectorizers kept side by side for comparison; none of them is
# used in the visible remainder of this cell.
Dvectorizer = ArticleVectorizer(stop_words='german')  # project class, only stop_words set
Cvectorizer = CountVectorizer()                       # scikit-learn defaults
Tvectorizer = TfidfVectorizer()                       # scikit-learn defaults

# Stand-alone TF-IDF re-weighting step (wired into `pipeline` below).
tfidf_transformer = TfidfTransformer()

# Materialise the selected column of each collection as a plain list and
# build one corpus with the easy texts first, followed by the hard texts.
texts_easy = list(NL_A[content])
texts_hard = list(DLF_A[content])
combined_corpus = [*texts_easy, *texts_hard]

# Two-step pipeline: project vectorizer followed by TF-IDF weighting.
# NOTE(review): the pipeline is only defined here; the visible code below
# calls Avectorizer directly and never fits or applies `pipeline`.
pipeline_steps = [
    ('vectorizer', Avectorizer),
    ('tfidf', tfidf_transformer),
]
pipeline = Pipeline(steps=pipeline_steps)

# Fit on the easy texts only, then encode the combined corpus in one call.
# NOTE(review): fitting on texts_easy alone presumably limits the feature
# space to terms seen in easy texts — confirm this is intended.
Avectorizer.fit(texts_easy)
matrix = Avectorizer.transform(combined_corpus)

# combined_corpus lists the easy texts first, so the first len(NL_A) rows
# belong to the easy articles and the remaining rows to the hard ones.
n_easy = len(NL_A)
matrix1, matrix2 = matrix[:n_easy], matrix[n_easy:]

# Pairwise similarity: one row per easy article, one column per hard article.
cosine_similarities = cosine_similarity(matrix1, matrix2)

# For each easy article, print the hard article with the highest similarity
# score, followed by both URLs and the score itself.
for i in range(len(NL_A)):
    similarity_row = cosine_similarities[i]
    best_match_index = similarity_row.argmax()  # first column holding the row maximum
    best_match_score = similarity_row[best_match_index]

    easy_row = NL_A.iloc[i]
    hard_row = DLF_A.iloc[best_match_index]

    print("easy:", easy_row['title'])
    print(easy_row['url'])
    print("hard:", hard_row['title'])
    print(hard_row['url'])
    print("score:", best_match_score)
    print()

Easy Articles: 50
Hard Articles: 1421
easy: Baerbock fordert mehr Hilfe für Palästinenser
https://www.nachrichtenleicht.de/gaza-berbock-100.html
hard: Baerbock sichert Pazifik-Staaten Hilfe zu
https://www.deutschlandfunk.de/baerbock-sichert-pazifik-staaten-hilfe-zu-100.html
score: 0.5773502691896258

easy: Bahn und Lok-Führer haben sich geeinigt
https://www.nachrichtenleicht.de/bahn-gdl-lokfuehrer-streik-einigung-102.html
hard: Deutsche Bahn muss Mehrkosten von Stuttgart 21 alleine tragen
https://www.deutschlandfunk.de/deutsche-bahn-muss-mehrkosten-von-stuttgart-21-alleine-tragen-102.html
score: 0.7071067811865475

easy: Viele Tote bei Terror-Anschlag auf Konzert-Halle bei Moskau
https://www.nachrichtenleicht.de/anschlag-moskau-106.html
hard: Parlament in Moskau verschärft Regelungen über "ausländische Agenten"
https://www.deutschlandfunk.de/parlament-in-moskau-verschaerft-regelungen-ueber-auslaendische-agenten-102.html
score: 0.4472135954999579

easy: 7 Helfer im Gaza-Streifen bei Ang