In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import sys
import subprocess
import os

# Ask git for the top-level directory of the enclosing repository so the paths
# below do not depend on the directory the kernel was started from.
_git_toplevel_cmd = 'git rev-parse --show-toplevel'.split()
_git_stdout = subprocess.check_output(_git_toplevel_cmd)
root_dir = _git_stdout.decode('utf-8').strip()

# Put the repository root on the import path; the project-local import that
# follows is resolved relative to it.
sys.path.append(root_dir)

from datahandler.DataHandler import DataHandler

# Read the matches CSV that lives under the repository's data directory.
# NOTE(review): presumably these are the reference easy/hard pairs — confirm.
matches_file_path = os.path.join(root_dir, 'data/mdr/matches_mdr.csv')
actual_matches = pd.read_csv(matches_file_path)

In [2]:
# Create the data handler with the "mdr" argument, then pull both article sets
# from it in a fixed order: 'easy' first, 'hard' second.
dh = DataHandler("mdr")
easy_articles, hard_articles = [dh.get_all(level) for level in ('easy', 'hard')]

In [3]:
# Tunable settings for the matching pipeline, named here instead of inlined.
MAX_TFIDF_FEATURES = 1000  # vocabulary cap passed to the TF-IDF vectorizer
N_TOPICS = 10              # number of NMF components
NMF_SEED = 1               # fixed random_state for the NMF fit

# Stack the easy texts first and the hard texts after them, so one
# vectorizer/NMF fit covers both sets and the rows can later be split back
# apart purely by position.
all_articles = pd.concat([easy_articles['text'], hard_articles['text']])

# Vectorize the text using TF-IDF
# NOTE(review): stop_words='english' only filters English stop words — confirm
# that this matches the language of the article texts.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=MAX_TFIDF_FEATURES)
tfidf_matrix = tfidf_vectorizer.fit_transform(all_articles)

# Apply NMF for topic modeling
nmf = NMF(n_components=N_TOPICS, random_state=NMF_SEED)
nmf_features = nmf.fit_transform(tfidf_matrix)

# Normalize the NMF features
nmf_features = normalize(nmf_features)

# Separate the transformed features back into easy and hard sets.
# This relies on the concat order above: the first len(easy_articles) rows are
# the easy articles, everything after that is the hard articles.
easy_features = nmf_features[:len(easy_articles)]
hard_features = nmf_features[len(easy_articles):]

# Compute cosine similarity between easy and hard articles
# (rows = easy articles, columns = hard articles).
similarity_matrix = cosine_similarity(easy_features, hard_features)

# Find the best matches: for each easy article, the position of the hard
# article with the highest similarity. Several easy articles may select the
# same hard article.
matches = similarity_matrix.argmax(axis=1)

# Extract URLs for matched articles.
# hard_articles.iloc[matches] reorders/repeats the hard rows so that row i is
# the hard article chosen for easy article i.
# NOTE(review): dh.search_by is defined outside this notebook; its return value
# is used as-is for the output columns — confirm it yields the URL.
easy_urls = easy_articles['url'].apply(lambda url: dh.search_by("easy", "url", url))
hard_urls = hard_articles.iloc[matches]['url'].apply(lambda url: dh.search_by("hard", "url", url))

# Create a DataFrame with the matches.
# The two results carry unrelated index labels (easy_urls: the easy frame's
# labels; hard_urls: the labels of the selected hard rows, which repeat when a
# hard article is chosen more than once). Building the frame directly from
# them would align on those labels instead of pairing row i with row i, so
# both are reset to a plain positional index first.
matches_df = pd.DataFrame({
    'easy': easy_urls.reset_index(drop=True),
    'hard': hard_urls.reset_index(drop=True),
})

# Save the matches to a CSV file
matches_df.to_csv('matched_articles.csv', index=False)