In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
import sys
import subprocess
import os

root_dir = subprocess.check_output('git rev-parse --show-toplevel'.split()).decode('utf-8').strip()
sys.path.append(root_dir)

from datahandler.DataHandler import DataHandler

matches_file_path = os.path.join(root_dir, 'data/dlf/matches_dlf.csv')
actual_matches = pd.read_csv(matches_file_path)

In [2]:
dh = DataHandler("dlf")
easy_articles = dh.get_all('easy')
hard_articles = dh.get_all('hard')

In [3]:
def preprocess_text(text):
    # Tokenize text
    tokens = word_tokenize(text, language='german')
    
    # Remove punctuation and numbers
    tokens = [token for token in tokens if token.isalpha()]
    
    # Remove stopwords
    stop_words = set(stopwords.words('german'))
    tokens = [token for token in tokens if token.lower() not in stop_words]
    
    # Join tokens back into a single string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

In [4]:
import nltk
nltk.download('stopwords')
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np

[nltk_data] Downloading package stopwords to /home/simon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
all_articles = pd.concat([easy_articles, hard_articles])
preprocessed_corpus = [preprocess_text(article) for article in all_articles['text']]

In [6]:
tfidf_vectorizer = TfidfVectorizer()

# Compute TF-IDF vectors
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_corpus)

# Access TF-IDF matrix
print(tfidf_matrix.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Assuming tfidf_matrix contains the TF-IDF representation of your documents
# Compute cosine similarity between all pairs of documents
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Get indices of the pairs with the highest cosine similarity scores
indices = [(i, j) for i in range(len(cosine_similarities)) for j in range(i+1, len(cosine_similarities))]
indices.sort(key=lambda x: cosine_similarities[x[0], x[1]], reverse=True)


# Get the indices of the pair with the highest cosine similarity
highest_similarity_index = indices[0]

# Get the documents corresponding to the highest similarity pair
doc1_index, doc2_index = highest_similarity_index
doc1 = all_articles.iloc[doc1_index]
doc2 = all_articles.iloc[doc2_index]


In [12]:
pp_easy = [preprocess_text(article) for article in easy_articles['text']]
pp_hard = [preprocess_text(article) for article in hard_articles['text']]

max_corpus_size = max(len(pp_easy), len(pp_hard))

vectorizer1 = TfidfVectorizer(max_features=max_corpus_size)
vectorizer2 = TfidfVectorizer(max_features=max_corpus_size)

matrix1 = vectorizer1.fit_transform(pp_easy)
matrix2 = vectorizer2.fit_transform(pp_hard)

cosine_similarities = cosine_similarity(matrix1, matrix2)

In [13]:
pp_easy = [preprocess_text(article) for article in easy_articles['title']]
pp_hard = [preprocess_text(article) for article in hard_articles['title']]

#if len(pp_easy) < len(pp_hard):
#    pp_easy.extend([''] * (len(pp_hard) - len(pp_easy)))
#elif len(pp_hard) < len(pp_easy):
#    pp_hard.extend([''] * (len(pp_easy) - len(pp_hard)))

max_corpus_size = max(len(pp_easy), len(pp_hard))

vectorizer = TfidfVectorizer()
pp_combined = pp_easy + pp_hard


matrix = vectorizer.fit_transform(pp_combined)
matrix1 = matrix[:len(pp_easy)]
matrix2 = matrix[len(pp_easy):]

cosine_similarities = cosine_similarity(matrix1, matrix2)

best_matches = []
for i in range(len(pp_easy)):
    best_match_index = cosine_similarities[i].argmax()
    best_match_score = cosine_similarities[i, best_match_index]
    print("easy:", easy_articles.iloc[i]['title'])
    print(easy_articles.iloc[i]['url'])
    print("hard:", hard_articles.iloc[best_match_index]['title'])
    print(hard_articles.iloc[i]['url'])
    print("score:", best_match_score)
    print()
    best_match = (pp_easy[i], pp_hard[best_match_index], best_match_score)
    best_matches.append(best_match)


easy: Baerbock fordert mehr Hilfe für Palästinenser
https://www.nachrichtenleicht.de/gaza-berbock-100.html
hard: Baerbock sichert Pazifik-Staaten Hilfe zu
https://www.deutschlandfunk.de/abwicklung-ueber-smartphones-dominiert-mittlerweile-den-online-einkauf-102.html
score: 0.5421168576047318

easy: Bahn und Lok-Führer haben sich geeinigt
https://www.nachrichtenleicht.de/bahn-gdl-lokfuehrer-streik-einigung-102.html
hard: Deutsche Bahn muss Mehrkosten von Stuttgart 21 alleine tragen
https://www.deutschlandfunk.de/afd-bundesvorstand-beantragt-ausschlussverfahren-gegen-bayerischen-landtagsabgeordneten-halemba-102.html
score: 0.27715097681785006

easy: Viele Tote bei Terror-Anschlag auf Konzert-Halle bei Moskau
https://www.nachrichtenleicht.de/anschlag-moskau-106.html
hard: Parlament in Moskau verschärft Regelungen über "ausländische Agenten"
https://www.deutschlandfunk.de/auslandspresse-kritisiert-al-dschasira-verbot-in-israel-100.html
score: 0.2926129339440721

easy: Cannabis
ist in Deutsc