**Welcome to the Google Colab notebook for Team 15 - ML Mavericks, part of the Data Science Group Project Module at the University of Birmingham for the 2023/2024 cohort. This notebook contains all the code implementations for our system, designed to automate the creation of "See Also" sections in Wikipedia articles.**

# 1. Collecting and Preparing the Data


## 1.1. Stage One: Selecting Articles' Titles

In [None]:
import csv

import pandas as pd

# Load the headerless clickstream TSV with custom column names for easier understanding.
# (The clickstream data we used is available on this link: https://dumps.wikimedia.org/other/clickstream/2024-01/)
#
# Two parser settings matter for article titles:
#   * quoting=csv.QUOTE_NONE  - the dump is plain tab-separated text; a title that contains a
#     double quote must be read literally instead of being treated as the start of a quoted field.
#   * keep_default_na=False   - titles such as "NaN", "NA" or "Null" must stay strings rather
#     than being converted to missing values (which groupby would later silently drop).
df = pd.read_csv(
    "clickstream-enwiki-2024-01.tsv",
    sep='\t',
    names=['From', 'To', 'Type', 'Total Clicks'],
    dtype={'From': str, 'To': str, 'Type': str},
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)

# Just a quick check to see the first few rows and make sure our columns are correctly named.
print(df.head())


In [None]:
# Total the clicks received by every destination page, order the pages from most to
# least clicked, and give the two resulting columns their output names.
result_df = (
    df.groupby('To')['Total Clicks']
    .sum()
    .reset_index()
    .sort_values(by='Total Clicks', ascending=False)
    .rename(columns={'To': 'Title', 'Total Clicks': 'Total clicks'})
)

# Keep only the 60k most-clicked titles and write them out (without the index column).
result_df.iloc[:60000].to_csv('top_60k_titles.csv', index=False)


## 1.2. Stage Two: Delving into Article Features

In [None]:
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

def fetch_xtools_article_info(project, article, session):
    """Fetch the revision count and quality assessment of an article from XTools.

    Args:
        project: Project domain used in the XTools URL (e.g. "en.wikipedia.org").
        article: Article title; spaces are replaced with underscores.
        session: Object exposing a requests-style ``get(url, timeout=...)`` method.

    Returns:
        ``{"revisions": ..., "quality": ...}`` on HTTP 200 ("quality" falls back to
        "Unknown" when no assessment value is present), otherwise an empty dict.
    """
    from urllib.parse import quote  # local import keeps this function self-contained

    # Percent-encode the title so characters such as "?" or "#" stay part of the path
    # instead of starting a query string / fragment. "/" is left as-is (quote's default).
    article_formatted = quote(article.replace(" ", "_"))
    url = f"https://xtools.wmcloud.org/api/page/articleinfo/{project}/{article_formatted}"

    # A timeout prevents one stalled connection from blocking a worker thread forever.
    response = session.get(url, timeout=30)
    if response.status_code == 200:
        data = response.json()
        assessment = data.get("assessment")
        if not isinstance(assessment, dict):
            # Missing or unexpectedly-shaped assessment: treat as "no assessment".
            assessment = {}
        return {
            "revisions": data.get("revisions", 0),
            "quality": assessment.get("value", "Unknown")
        }
    else:
        print(f"Couldn't get data for {article}: HTTP {response.status_code}")
        return {}

def fetch_article_details(title, session):
    """Collect the features of one article from the MediaWiki, Pageviews and XTools APIs.

    Args:
        title: Article title to look up.
        session: Object exposing a requests-style ``get`` method.

    Returns:
        A dict with the keys "title", "size", "total_views", "first_paragraph",
        "article_quality" and "article_categories", or an empty dict when the
        MediaWiki query fails or carries no page data.
    """
    from urllib.parse import quote  # local import keeps this function self-contained

    request_timeout = 30  # seconds; avoids a stalled connection blocking a worker forever

    base_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts|info|categories",
        "titles": title,
        "exintro": "",
        "explaintext": "",
        "inprop": "url",
        "cllimit": "max",  # Getting max categories
        "clshow": "!hidden"  # Excluding hidden categories
    }
    info_response = session.get(base_url, params=params, timeout=request_timeout)
    if info_response.status_code != 200:
        print(f"Couldn't fetch article details for {title}: HTTP {info_response.status_code}")
        return {}

    # The API can answer HTTP 200 with an error payload that has no 'query'/'pages' section.
    pages = info_response.json().get('query', {}).get('pages', {})
    if not pages:
        print(f"Couldn't fetch article details for {title}: no page data in the API response")
        return {}
    page_info = next(iter(pages.values()))

    # Extracting categories, removing only the leading 'Category:' prefix
    category_prefix = 'Category:'
    categories = []
    for category in page_info.get('categories', []):
        name = category['title']
        if name.startswith(category_prefix):
            name = name[len(category_prefix):]
        categories.append(name.strip())

    # Sum the daily user page views for January 2024. The title is fully percent-encoded
    # (including "/") so titles such as "AC/DC" or ones ending in "?" form a valid path.
    total_views = 0
    encoded_title = quote(title.replace(' ', '_'), safe='')
    views_url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/user/{encoded_title}/daily/20240101/20240131"
    views_response = session.get(views_url, timeout=request_timeout)
    if views_response.status_code == 200:
        views_data = views_response.json()
        if 'items' in views_data:
            total_views = sum(item['views'] for item in views_data['items'])
        else:
            print(f"No view data for {title}")
    else:
        print(f"Couldn't fetch view data for {title}: HTTP {views_response.status_code}")

    xtools_info = fetch_xtools_article_info("en.wikipedia.org", title, session)

    return {
        "title": title,
        "size": page_info.get('length', 0),
        "total_views": total_views,
        "first_paragraph": page_info.get('extract', ""),
        "article_quality": xtools_info.get("quality", "Unknown"),
        "article_categories": categories  # Categories list without 'Category:' prefix, excluding hidden categories
    }

def create_articles_df(titles, user_agent):
    """Fetch the details of many articles concurrently and return them as a DataFrame.

    Args:
        titles: Iterable of article titles.
        user_agent: Value sent as the ``User-Agent`` header on every request.

    Returns:
        A DataFrame with one row per successfully fetched article. Titles whose fetch
        raised, or returned an empty dict, are reported/skipped rather than stored as
        blank rows.
    """
    articles_data = []
    with requests.Session() as session:
        session.headers.update({'User-Agent': user_agent})
        with ThreadPoolExecutor(max_workers=10) as executor:
            future_to_title = {executor.submit(fetch_article_details, title, session): title for title in titles}
            # Rows are collected in completion order, not in the order of `titles`.
            for future in as_completed(future_to_title):
                title = future_to_title[future]
                try:
                    article_details = future.result()
                except Exception as exc:
                    print(f"{title} encountered an issue: {exc}")
                    continue
                # fetch_article_details returns {} on failure; appending it would
                # create an all-NaN row in the resulting DataFrame.
                if article_details:
                    articles_data.append(article_details)
    return pd.DataFrame(articles_data)

def process_batches(titles, user_agent, batch_size=100):
    """Fetch article details in consecutive batches, writing one CSV file per batch.

    Batch ``k`` (1-based) covers ``titles[(k-1)*batch_size : k*batch_size]`` and is saved
    as ``batch_<k>_articles.csv``; a timing line is printed after each batch.
    """
    batch_starts = range(0, len(titles), batch_size)
    for batch_number, offset in enumerate(batch_starts, start=1):
        batch_titles = titles[offset:offset + batch_size]

        started_at = time.time()
        batch_df = create_articles_df(batch_titles, user_agent)
        csv_filename = f"batch_{batch_number}_articles.csv"
        batch_df.to_csv(csv_filename, index=False)
        finished_at = time.time()

        print(f"Batch {batch_number} done: {len(batch_titles)} articles saved to {csv_filename} in {finished_at - started_at:.2f} seconds.")

# User agent information
user_agent = "mah338@student.bham.ac.uk / University of Birmingham - Data Science Project"

# Load the titles as plain strings. With pandas' default NA handling a title such as
# "NaN", "NA" or "Null" would come back as a float NaN, and the fetch functions would
# then fail on `title.replace(...)`. Empty titles are dropped.
top_60k_titles_csv = [
    title
    for title in pd.read_csv("top_60k_titles.csv", dtype={'Title': str}, keep_default_na=False)['Title'].tolist()
    if title
]

# Process batches
process_batches(top_60k_titles_csv, user_agent)


## 1.3. The Final Dataset: Combining Files

In [None]:
import pandas as pd
import os

# All batch CSV files are assumed to be in the current directory
path_to_csv_files = "."

# Name of the combined output file (read again by the analysis cells below)
combined_csv_name = "600_field_combined.csv"

# Candidate batch file names; any that do not exist are reported and skipped below
csv_files = [f"batch_{i}_articles.csv" for i in range(1, 603)]  # Generating list of CSV file names

# Read every existing batch file, then concatenate once at the end
# (growing a DataFrame with concat inside the loop re-copies all rows on every iteration)
batch_frames = []
for file in csv_files:
    file_path = os.path.join(path_to_csv_files, file)
    if os.path.exists(file_path):  # Checking if the file exists
        batch_frames.append(pd.read_csv(file_path))
    else:
        print(f"File {file} doesn't exist.")

# pd.concat raises on an empty list, so fall back to an empty DataFrame
combined_df = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()

# Now combined_df contains all the data from the batch CSV files that were found
print(f"Combined DataFrame has {len(combined_df)} instances.") #This is just to make sure

# Saving the combined DataFrame to a new CSV file
combined_df.to_csv(combined_csv_name, index=False)

print(f"All files have been combined and saved to {combined_csv_name}.")


# 2. Exploratory Data Analysis

## 2.1 An In-depth Examination of Article Size

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Load the dataset
data = pd.read_csv('600_field_combined.csv')

# Plot the histogram and keep what plt.hist returns: the per-bin counts and the bin edges.
# They are needed further down to report the most populated bins.
counts, bins, _ = plt.hist(data['size'], bins=50, color='grey', edgecolor='black')
plt.axvline(x=6000, color='black', linestyle='--', label='Threshold at 6K bytes')
plt.title('Article Size Distribution')
plt.xlabel('Size (bytes)')
plt.ylabel('Frequency')
plt.xlim(0, 350000)  # Fixed upper limit for the x-axis
plt.legend()

# Remove the box around the graph
for side in ('top', 'right', 'left', 'bottom'):
    plt.gca().spines[side].set_visible(False)

# Save the plot
plt.savefig('size.png', dpi=1000, bbox_inches='tight')

# Show the plot
plt.show()

# Find the bins with the highest frequencies
sorted_indices = counts.argsort()[::-1]  # Sort indices by count
max_count_bins = [(bins[i], bins[i + 1]) for i in sorted_indices[:2]]  # Get top two bins
max_count_values = [counts[i] for i in sorted_indices[:2]]  # Get counts of top two bins

# Print info about top frequency bins
for i in range(len(max_count_bins)):
    print(f"Bin {i+1}: Range {max_count_bins[i]} with a frequency of {max_count_values[i]}")



## 2.2 An In-Depth Examination of Article Quality Distribution

In [None]:
# Quality grades shown individually, in display order; everything else goes into "Others"
named_qualities = ['FA', 'A', 'GA', 'B', 'C', 'Start', 'Stub']
quality_order = named_qualities + ['Others']

# Group article qualities, combining less common categories into "Others"
data['article_quality_grouped'] = data['article_quality'].apply(lambda x: x if x in named_qualities else 'Others')

# reindex (rather than label indexing) keeps the fixed order and reports 0 for any grade
# that does not occur in the data instead of raising a KeyError
quality_counts_grouped = data['article_quality_grouped'].value_counts().reindex(quality_order, fill_value=0)

# Create bar plot
bars = plt.bar(quality_counts_grouped.index, quality_counts_grouped.values, color='grey', edgecolor='black')
plt.title('Article Quality Distribution', pad=20)
plt.xlabel('Article Quality')
plt.ylabel('Number of Articles')
plt.xticks(rotation=90)  # Rotate x-axis labels vertically

# Add numbers on top of bars for better visibility
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.05*yval, f'{int(yval)}',
             ha='center', va='bottom', rotation=0, color='black', fontsize=8, zorder=3)  # Set zorder to bring text to front

# Remove the box around the graph
for side in ('top', 'right', 'left', 'bottom'):
    plt.gca().spines[side].set_visible(False)

# Set the y-axis range relative to the tallest bar
plt.ylim(0, max(quality_counts_grouped.values) * 1.03)
plt.tight_layout()
plt.savefig('quality.png', dpi=1000, bbox_inches='tight')

plt.show()


## 2.3 An In-Depth Examination of Article Category Distribution

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from ast import literal_eval
from collections import Counter

# Define a function to safely turn a stored category value into a list
def safe_literal_eval(x):
    """Return the category collection encoded by ``x``, or ``[]`` if there is none.

    * A string is parsed with ``ast.literal_eval``; unparseable text, or text that
      does not evaluate to a list/tuple/set, yields ``[]``.
    * A list/tuple/set is returned unchanged.
    * Anything else (e.g. ``None`` or the float NaN pandas uses for missing cells)
      yields ``[]`` so callers can always iterate over the result.
    """
    if isinstance(x, str):
        try:
            x = literal_eval(x)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Exceptions literal_eval can raise on malformed input
            return []
    if isinstance(x, (list, tuple, set)):
        return x
    return []

# Convert string representations of lists into actual lists.
# Note: this overwrites the 'article_categories' column of `data` in place.
data['article_categories'] = data['article_categories'].apply(safe_literal_eval)

# Adjusted category mapping using certain words.
# Maps each general category name to the keywords/phrases that indicate it; it is passed
# to assign_general_category below. Some keywords appear under more than one general
# category (e.g. 'biography', 'genetics', 'e-sports').
category_mapping = {
    'Sports': [
        'olympic sport', 'football', 'basketball', 'sportspeople', 'athletic', 'soccer', 'tennis',
        'athletics', 'baseball', 'rugby', 'cricket', 'volleyball', 'golf', 'swimming',
        'track and field', 'hockey', 'table tennis', 'badminton', 'skiing', 'snowboarding',
        'skating', 'cycling', 'boxing', 'mixed martial arts', 'wrestling', 'fencing', 'rowing',
        'sailing', 'equestrian', 'gymnastics', 'weightlifting', 'biathlon', 'triathlon', 'marathon',
        'sprint', 'judo', 'taekwondo', 'karate', 'archery', 'shooting sports', 'darts', 'bowling',
        'billiards', 'snooker', 'bodybuilding', 'surfing', 'motorsport', 'racing', 'figure skating',
        'doping in sports', 'sports nutrition', 'sports medicine', 'sports psychology',
        'physical training', 'sports equipment', 'team sports', 'individual sports', 'extreme sports',
        'water sports', 'winter sports', 'outdoor sports', 'indoor sports', 'professional sports',
        'amateur sports', 'college sports', 'youth sports', 'master sports', 'olympic games', 'world cup',
        'championships', 'sports leagues', 'sports teams', 'sportsmanship', 'coaching', 'sports strategy',
        'sports analytics', 'sports history', 'sports culture', 'fan culture', 'sports broadcasting',
        'sports journalism', 'sports awards', 'sports records', 'sports events', 'stadiums', 'arenas',
        'sports fans', 'athlete training', 'sports science'
    ],
    'Technology': [
        'software', 'hardware', 'internet', 'video game', 'computer', 'programming', 'ai',
        'artificial intelligence', 'gadget', 'mobile device', 'smartphone', 'tablet', 'laptop',
        'desktop', 'operating system', 'application', 'app development', 'user interface',
        'user experience', 'data science', 'machine learning', 'robotics', 'automation', 'blockchain',
        'cryptocurrency', 'cloud computing', 'big data', 'data analysis', 'networking',
        'cybersecurity', 'information security', 'hacking', 'ethical hacking', 'virtual reality',
        'augmented reality', 'drones', 'wearable technology', 'IoT', 'Internet of Things',
        'semiconductors', 'silicon chips', 'quantum computing', 'database', 'data management',
        'UI/UX design', 'web development', 'digital marketing', 'SEO', 'search engine optimization',
        'social media', 'e-commerce', 'fintech', 'financial technology', 'tech startup', 'innovation',
        'tech policy', 'privacy', 'tech ethics', 'software engineering', 'network infrastructure',
        'wireless technology', '5G', 'telecommunications', 'nanotechnology', 'biotechnology',
        'tech trends', 'gaming consoles', 'e-sports', 'tech reviews', 'tech tutorials',
        'coding languages', 'software development kit', 'SDK', 'open source', 'API',
        'application programming interface', 'tech integration', 'tech education', 'STEM',
        'science technology engineering mathematics', 'tech entrepreneurship', 'tech investment',
        'tech venture', 'tech gadgets', 'home automation', 'smart home', 'tech support',
        'tech forums', 'tech community', 'tech events', 'tech conferences', 'tech exhibitions'
    ],
    'Literature': [
        'novel', 'poetry', 'writer', 'book', 'literary genre', 'prose', 'drama', 'playwright',
        'short story', 'biography', 'essay', 'anthology', 'classic literature', 'literary criticism',
        'literary theory', 'non-fiction', 'fiction', 'science fiction', 'fantasy', 'mystery',
        'horror', 'historical novel', 'romance', 'graphic novel', 'comic book', 'memoir',
        'autobiography', 'epic', 'sonnet', 'haiku', 'limerick', 'ballad', 'literary device',
        'narrative structure', 'plot', 'character development', 'theme', 'motif', 'symbolism',
        'dialogue', 'rhetoric', 'satire', 'parody', 'allegory', 'critique', 'manuscript', 'publishing',
        'e-book', 'audiobook', 'translation', 'book series', 'author', 'poet', 'novelist', 'editor',
        'literary journal', 'book review', 'book club', 'reading', 'literature festival', 'literary award',
        'bestseller', 'classic', 'literary canon', 'literary movement', 'literary period', 'poetic form',
        'prose style', 'literary agent', 'book fair', 'public domain', 'copyright in literature',
        'academic writing', 'scholarly publication', 'creative writing', 'young adult literature',
        'children’s literature', 'oral tradition', 'folklore', 'mythology', 'literary scholarship',
        'text analysis', 'literature education', 'literary studies'
    ],
    'History': [
        'historical event', 'ancient history', '20th century', 'historian', 'medieval history',
        'world war', 'historical figure', 'modern history', 'renaissance', 'industrial revolution',
        'civilization', 'empire', 'kingdom', 'monarchy', 'dynasty', 'archaeology', 'artifact',
        'chronology', 'historiography', 'cultural heritage', 'historical research', 'military history',
        'political history', 'social history', 'economic history', 'history of science',
        'history of technology', 'art history', 'oral history', 'primary source', 'archive', 'documentary',
        'biography', 'timeline', 'genealogy', 'prehistoric', 'classical antiquity', 'colonialism',
        'revolution', 'exploration', 'historic site', 'museum', 'historical society', 'history education',
        'public history', 'historical reenactment', 'historical fiction', 'national history',
        'local history', 'global history', 'transnational history', 'history conference',
        'history publication', 'history journal', 'historical methodology', 'chronicle', 'epic',
        'folklore', 'mythology', 'ancient texts', 'inscription', 'paleography', 'numismatics',
        'heraldry', 'philately', 'cartography', 'historical maps', 'historical narrative',
        'historical analysis', 'historical period', 'historical drama', 'historical documentary',
        'age of discovery', 'age of enlightenment', 'middle ages', 'renaissance history', 'baroque',
        'classicism', 'romanticism', 'victorian era', 'modernism', 'postmodernism', 'contemporary history',
        'digital history', 'historical simulation'
    ],
    'Politics': [
        'politician', 'political party', 'election', 'government', 'democracy', 'political science',
        'legislation', 'public policy', 'governance', 'diplomacy', 'international relations',
        'political campaign', 'voting', 'civic engagement', 'civil rights', 'human rights',
        'political ideology', 'conservatism', 'liberalism', 'socialism', 'communism', 'anarchism',
        'federalism', 'parliamentary system', 'presidential system', 'monarchy', 'dictatorship',
        'geopolitics', 'political theory', 'political economy', 'political history', 'political philosophy',
        'political ethics', 'civic education', 'political analysis', 'political strategy', 'lobbying',
        'advocacy', 'activism', 'public administration', 'bureaucracy', 'electoral system',
        'political debate', 'political discourse', 'political leadership', 'nation-state', 'sovereignty',
        'nationalism', 'patriotism', 'political culture', 'political reform', 'political crisis',
        'campaign finance', 'voter turnout', 'political sociology', 'political psychology',
        'political communication', 'political commentary', 'political journalism', 'political satire',
        'political parties', 'independent politics', 'grassroots politics', 'party politics',
        'political movement', 'political coalition', 'government institution', 'public office',
        'political office', 'election law', 'political rights', 'political representation',
        'political negotiation', 'political advocacy', 'municipal politics', 'regional politics',
        'national politics', 'international politics', 'geopolitical conflict', 'political stability',
        'political change', 'policy analysis', 'public affairs', 'political consulting',
        'electoral politics', 'political management'
    ],
    'Science': [
        'biology', 'physics', 'chemistry', 'space', 'astronomy', 'earth science', 'environment',
        'genetics', 'botany', 'zoology', 'ecology', 'molecular biology', 'biochemistry',
        'microbiology', 'neuroscience', 'evolution', 'immunology', 'cellular biology',
        'quantum mechanics', 'thermodynamics', 'particle physics', 'nuclear physics',
        'astrophysics', 'cosmology', 'planetary science', 'geochemistry', 'geophysics',
        'meteorology', 'climatology', 'oceanography', 'paleontology', 'crystallography',
        'inorganic chemistry', 'organic chemistry', 'analytical chemistry', 'physical chemistry',
        'material science', 'science research', 'scientific method', 'experimental science',
        'scientific theory', 'scientific discovery', 'natural science', 'applied science',
        'interdisciplinary science', 'scientific community', 'scientific journal',
        'peer-reviewed research', 'laboratory', 'scientific experiment', 'science education',
        'science communication', 'science policy', 'science funding', 'science and technology',
        'science history', 'science ethics', 'environmental science', 'conservation biology',
        'wildlife science', 'earth systems', 'atmospheric science', 'space exploration',
        'rocket science', 'satellite technology', 'science innovation', 'science awards',
        'scientific breakthrough', 'scientific collaboration', 'science conference',
        'science exhibition', 'science debate', 'science advocacy', 'citizen science',
        'science literacy', 'science outreach', 'scientific literacy', 'science curriculum',
        'STEM education', 'scientific investigation', 'science fair', 'science festival',
        'science workshop', 'scientific inquiry', 'science news', 'science media'
    ],
    'Health': [
        'medicine', 'medical science', 'healthcare', 'nutrition', 'disease', 'psychology',
        'wellness', 'public health', 'epidemiology', 'pathology', 'pharmacology', 'anatomy',
        'physiology', 'genetics', 'oncology', 'cardiology', 'neurology', 'dermatology',
        'endocrinology', 'gastroenterology', 'immunology', 'ophthalmology', 'pediatrics',
        'psychiatry', 'radiology', 'surgery', 'veterinary medicine', 'nursing', 'dentistry',
        'mental health', 'preventive medicine', 'alternative medicine', 'holistic health',
        'sports medicine', 'physical therapy', 'occupational therapy', 'personal health', 'fitness',
        'exercise', 'weight management', 'diet', 'supplements', 'vitamins', 'mental wellness',
        'stress management', 'self-care', 'sleep hygiene', 'hygiene', 'sexual health',
        # The next two entries previously contained a double quote (women"s / men"s) in place
        # of the apostrophe, so they could never equal real category text.
        'reproductive health', "women's health", "men's health", 'pediatric health', 'geriatric health',
        'chronic conditions', 'infectious diseases', 'vaccination', 'public health policy',
        'health education', 'community health', 'health promotion', 'health insurance', 'medical research',
        'clinical trials', 'medical diagnostics', 'health technology', 'health informatics', 'e-health',
        'telemedicine', 'patient care', 'patient safety', 'medical ethics', 'health literacy',
        'health communication', 'health facilities', 'emergency medicine', 'critical care',
        'intensive care', 'health systems', 'global health', 'environmental health', 'health disparities',
        'health economics', 'health laws', 'health regulations', 'medical devices', 'medical imaging',
        'health data', 'medical records', 'health interventions', 'health outcomes', 'health risk factors',
        'health services', 'healthcare quality', 'healthcare access', 'healthcare management'
    ],
    'Entertainment': [
        'movies', 'television', 'music', 'theater', 'comedy', 'dance', 'pop culture', 'celebrities',
        'film industry', 'TV shows', 'documentaries', 'streaming services', 'live concerts', 'festivals',
        'awards shows', 'reality TV', 'animation', 'video games', 'board games', 'nightlife',
        'performing arts', 'opera', 'ballet', 'musical theatre', 'drama', 'sitcoms', 'talk shows',
        'radio', 'podcasts', 'audiobooks', 'music videos', 'songwriting', 'record labels',
        'music production', 'concert tours', 'theatre productions', 'cinematography', 'directing',
        'screenwriting', 'playwriting', 'acting', 'stand-up comedy', 'magic', 'circus',
        'celebrity gossip', 'fan clubs', 'fan conventions', 'cosplay', 'gaming', 'e-sports',
        'art exhibitions', 'museums', 'gallery shows', 'book readings', 'literary festivals',
        'entertainment news', 'media criticism', 'film criticism', 'music criticism', 'theatre criticism',
        'celebrity interviews', 'red carpet events', 'film festivals', 'entertainment technology',
        'VR in entertainment', 'AR in entertainment', 'special effects', 'visual effects',
        'production design', 'costume design', 'makeup artistry', 'choreography', 'talent shows',
        'variety shows', 'game shows', 'sports entertainment', 'interactive entertainment',
        'theme parks', 'amusement parks', 'carnivals', 'casinos', 'gambling', 'lotteries',
        'entertainment law', 'media studies', 'entertainment industry', 'celebrity culture',
        'entertainment history', 'cultural impact of entertainment', 'influencer culture', 'social media stars'
    ]
}

def assign_general_category(specific_categories, mapping):
    """Map an article's specific categories onto general categories by keyword matching.

    A general category is assigned when any of its keywords occurs in a specific
    category as a run of whole, consecutive, whitespace-separated words. Matching is
    case-insensitive on both sides, so single-word keywords behave as plain word
    equality while multi-word keywords (e.g. 'olympic sport') and mixed-case keywords
    (e.g. 'IoT') can match as well.

    Args:
        specific_categories: list/tuple/set of category names; any other value
            (e.g. NaN for a missing cell) is treated as "no categories".
        mapping: dict of general category name -> iterable of keywords/phrases.

    Returns:
        Alphabetically sorted list of the matched general category names.
    """
    if not isinstance(specific_categories, (list, tuple, set)):
        return []

    general_categories = set()
    for specific in specific_categories:
        # Lowercase, collapse whitespace and pad with spaces so that a padded keyword
        # found as a substring is guaranteed to align with whole-word boundaries.
        padded_category = ' ' + ' '.join(str(specific).lower().split()) + ' '
        for general, keywords in mapping.items():
            if general in general_categories:
                continue  # already assigned; no need to test its keywords again
            for keyword in keywords:
                normalised_keyword = ' '.join(str(keyword).lower().split())
                if normalised_keyword and f' {normalised_keyword} ' in padded_category:
                    general_categories.add(general)
                    break
    return sorted(general_categories)

# Tag every article with the general categories implied by its specific categories
data['general_category'] = data['article_categories'].apply(
    lambda specific_list: assign_general_category(specific_list, category_mapping)
)

# Gather every assigned general category across all articles into one flat list
all_general_categories = []
for assigned in data['general_category']:
    all_general_categories.extend(assigned)

# Tally how often each general category was assigned
general_category_counts = Counter(all_general_categories)

# (category, count) pairs ordered from the most to the least frequent category
sorted_general_category_counts = sorted(
    general_category_counts.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Split the pairs into parallel tuples of names and counts for plotting
categories, counts = zip(*sorted_general_category_counts)

plt.figure(figsize=(10, 8))
plt.bar(categories, counts, color='grey', edgecolor='black')
plt.title('Article Counts by General Category')
plt.xlabel('General Category')
plt.ylabel('Number of Articles')
plt.xticks(rotation=45, ha="right")

# Hide all four borders of the plot area
for side in ('top', 'right', 'left', 'bottom'):
    plt.gca().spines[side].set_visible(False)
plt.savefig('Category.png', dpi=1000, bbox_inches='tight')

plt.show()


## 2.4. An In-Depth Examination of Article Views

In [None]:
import matplotlib.pyplot as plt
import numpy as np  # This will be used for calculating histogram data

# Views to plot (rows without a view count are ignored)
views = data['total_views'].dropna()

# Calculate histogram data: 1000 equal-width bins over the displayed range 0-300000
counts, bin_edges = np.histogram(views, bins=1000, range=(0, 300000))

# Find the largest bin
largest_bin_index = np.argmax(counts)
largest_bin_count = counts[largest_bin_index]
largest_bin_range = (bin_edges[largest_bin_index], bin_edges[largest_bin_index + 1])

# Plotting the histogram with the SAME bin edges used above, so the annotated bin is
# exactly the tallest bar that is drawn. (Letting plt.hist choose its own 1000 bins over
# the full data range produces different bars from the ones the annotation describes.)
plt.hist(views, bins=bin_edges, color='grey', edgecolor='black')
plt.title('Distribution of Total Views in January 2024', loc='right', pad=20)
plt.xlabel('Total Views')
plt.ylabel('Frequency')
plt.xlim(0, 300000)

# Remove the box around the graph
for side in ('top', 'right', 'left', 'bottom'):
    plt.gca().spines[side].set_visible(False)

# Annotate the largest bin, just above the top of its bar
vertical_offset = largest_bin_count * 0.02
annotation_y_position = largest_bin_count + vertical_offset

plt.text(largest_bin_range[0] + (largest_bin_range[1]-largest_bin_range[0])/2, annotation_y_position,
         f'{int(largest_bin_count)} articles\n({int(largest_bin_range[0])}-{int(largest_bin_range[1])} views)',
         ha='center', va='bottom')

plt.savefig('views.png', dpi=1000, bbox_inches='tight')
plt.show()


# 3. NLP and Semantic Vectors Generation

## 3.1. Semantic Vectors Generation Using BERT

In [None]:
# Note: We used Google Colab due to its computational efficiency.

!pip install sentence-transformers
import pandas as pd
from sentence_transformers import SentenceTransformer
import time
import ast  # For converting string representations of lists into actual lists

# Location of the combined dataset on the mounted Google Drive
file_path = "/content/drive/MyDrive/Data Science - UoB/Term 2/DS Project/Dataset and code file/21th Feb NLP/600_field_combined.csv"

# Read the dataset, keep only the three text features used for the embedding,
# and restrict it to the first 60000 rows (copied so later edits don't touch a view)
embedding_columns = ['title', 'first_paragraph', 'article_categories']
df = pd.read_csv(file_path)
df = df[embedding_columns].iloc[:60000].copy()

# Sentence-BERT model used to turn each text into an embedding vector
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Convert 'article_categories' from string representation of list to a single string
def convert_categories_to_string(category_list_str):
    """Return the categories as one comma-separated string.

    Args:
        category_list_str: Either the string representation of a list (as stored in the
            CSV, e.g. "['A', 'B']") or an already-parsed list/tuple/set of categories.
            Missing values (None / NaN) and any other non-collection value give ''.

    Returns:
        The categories joined with ', '.

    Raises:
        ValueError / SyntaxError: if a string input is not a valid Python literal.
    """
    if isinstance(category_list_str, str):
        categories = ast.literal_eval(category_list_str)
    else:
        categories = category_list_str

    # pandas represents an empty cell as float NaN; join() cannot iterate over that.
    if not isinstance(categories, (list, tuple, set)):
        return ''
    return ', '.join(str(category) for category in categories)

# Apply the function to convert categories
df['article_categories'] = df['article_categories'].apply(convert_categories_to_string)

# Concatenate title, first paragraph, and article categories into a single text column for embedding.
# Missing first paragraphs are replaced by '' first; astype(str) alone would embed the literal text "nan".
df['text'] = df['title'].astype(str) + ' ' + df['first_paragraph'].fillna('').astype(str) + ' ' + df['article_categories'].astype(str)

# Generate BERT embeddings for each text instance
start_time = time.time()
embeddings = model.encode(df['text'].tolist(), show_progress_bar=True, convert_to_tensor=True)
end_time = time.time()

# Move the tensor to host memory before converting: .numpy() raises for a tensor that
# lives on a GPU, which is where the model runs on a GPU-enabled Colab runtime.
embedding_matrix = embeddings.detach().cpu().numpy()

# Create a DataFrame to store the semantic vectors (one 'bert_<i>' column per dimension)
semantic_df = pd.DataFrame(embedding_matrix, columns=[f'bert_{i}' for i in range(embedding_matrix.shape[1])])

# Concatenate the semantic vectors DataFrame with the DataFrame containing the original features
# NOTE(review): new_df is not written to disk in this cell, while the next section reads
# '600_field_combined_with_vectors.csv' - confirm where that file is produced.
new_df = pd.concat([
    df.reset_index(drop=True),
    semantic_df
], axis=1)

# Calculate processing time
processing_time = end_time - start_time
print(f"Processing time: {processing_time:.2f} seconds")


## 3.2. BERT Embeddings Validation through PCA

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Read the dataset that already contains the BERT vector columns
data = pd.read_csv('600_field_combined_with_vectors.csv')

# Row labels of the hand-picked articles to visualise
specified_indices = [521, 520, 221, 2290, 94, 186, 4541, 1759, 21329, 28794, 20170, 38807, 81, 165, 11, 49, 3152, 2254, 10369, 10817, 12882]

# Pull out those articles and their embedding columns (bert_0 ... bert_767)
sampled_data = data.loc[specified_indices]
embeddings = sampled_data.loc[:, 'bert_0':'bert_767']

# Project the embeddings onto their first two principal components
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)

# Scatter plot with one labelled point per article
plt.figure(figsize=(12, 10))
for i, txt in enumerate(sampled_data['title']):
    # The articles at positions 10 and 2 of the sample are left off the plot
    if i in (10, 2):
        continue
    x_coord = reduced_embeddings[i, 0]
    y_coord = reduced_embeddings[i, 1]
    # Alternate labels slightly above (even positions) / below (odd positions) the point
    label_above = i % 2 == 0
    vertical_position = y_coord + 0.01 if label_above else y_coord - 0.01
    plt.scatter(x_coord, y_coord, color='grey', edgecolors='black', alpha=0.6)
    plt.text(x_coord + 0.01, vertical_position, txt, fontsize=10, color='black', ha='left', va='bottom' if label_above else 'top')

plt.title('PCA of Article Vector Embeddings')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.grid(True, linestyle='--', linewidth=0.5)
plt.axhline(y=0, color='k', linestyle='-', linewidth=0.5)
plt.axvline(x=0, color='k', linestyle='-', linewidth=0.5)

# Hide all four borders of the plot area
for side in ('top', 'right', 'left', 'bottom'):
    plt.gca().spines[side].set_visible(False)
plt.savefig('embeddings.png', dpi=1000, bbox_inches='tight')

plt.show()


# 4. Workflow of the Automated "See Also" System

## 4.1. Input Article, Cosine Similarity Calculation, and Size Exclusion

In [None]:
from sklearn.preprocessing import normalize
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

import pandas as pd

# Dataset holding the article features together with their BERT vector columns
file_name = 'full_dataset_with_vectors.csv'
data = pd.read_csv(file_name)

# Embedding columns (bert_0 ... bert_767) as a plain NumPy array, one row per article
embeddings = data.loc[:, 'bert_0':'bert_767'].to_numpy()

# Scale every embedding with sklearn's normalize (default settings) to unit length
norm_semantic_vectors = normalize(embeddings)

def recommend_articles_with_info(current_article_index, data, top_n=20, candidate_pool=20, min_size=6000):
    """Return the input article followed by its most similar, large-enough articles.

    Similarities are computed against the module-level ``norm_semantic_vectors``
    array, so row ``i`` of ``data`` must correspond to row ``i`` of that array.

    Args:
        current_article_index: Positional row index of the input article.
            Negative values count from the end.
        data: DataFrame with at least the columns ``title``, ``size``,
            ``total_views``, ``Introduction``, ``article_quality`` and
            ``article_categories``.
        top_n: Maximum number of recommendations to return.
        candidate_pool: How many of the most similar articles are considered
            before the size filter is applied.
        min_size: Candidates whose ``size`` is below this value are discarded.

    Returns:
        DataFrame whose first row is the input article (``Similarity Score``
        is NaN) followed by the recommendations in descending similarity order.

    Raises:
        IndexError: If ``current_article_index`` is out of range.
    """
    n_articles = len(norm_semantic_vectors)
    if not -n_articles <= current_article_index < n_articles:
        raise IndexError(
            f"current_article_index {current_article_index} is out of range for {n_articles} articles"
        )
    # Resolve negative positions so the self-exclusion mask below really matches.
    current_article_index = current_article_index % n_articles

    # Cosine similarity between the input article and every article.
    similarities = cosine_similarity([norm_semantic_vectors[current_article_index]], norm_semantic_vectors)[0]

    # Rank all articles by similarity (descending) and drop the input article itself.
    sorted_indices = np.argsort(similarities)[::-1]
    sorted_indices = sorted_indices[sorted_indices != current_article_index]

    # Keep the closest candidates, then apply the size exclusion in one vectorised step.
    candidate_indices = sorted_indices[:candidate_pool]
    candidate_sizes = data['size'].to_numpy()[candidate_indices]
    filtered_indices = candidate_indices[candidate_sizes >= min_size][:top_n]

    selected_features = ["title", "size", "total_views", "Introduction", "article_quality", "article_categories"]

    recommendations_df = data.iloc[filtered_indices][selected_features].copy()
    recommendations_df['Similarity Score'] = similarities[filtered_indices]

    # The input article goes first; it has no similarity score with itself.
    current_article_info = data.iloc[[current_article_index]][selected_features].copy()
    current_article_info['Similarity Score'] = np.nan

    return pd.concat([current_article_info, recommendations_df], ignore_index=True)

# Example usage. `top_n` is passed as an explicit keyword: no module-level
# variable of that name is defined in this section, so referencing one would
# raise NameError on a fresh kernel.
current_article_index = 301
recommendations_df = recommend_articles_with_info(current_article_index, data, top_n=20)



## 4.2. Sorting by Number of Shared Categories

In [None]:
import ast
# Parse the input article's stored category list (a string literal) into a Python object.
original_categories = ast.literal_eval(data['article_categories'].iloc[current_article_index])

# Function to count shared categories
def count_shared_categories(target_categories, original_categories):
    """Count the categories a candidate article shares with the input article.

    Args:
        target_categories: The candidate's categories, either as the string
            representation of a list (as stored in the CSV) or as an already
            parsed list/tuple/set. Missing values (None, NaN) are accepted.
        original_categories: Iterable of the input article's categories.

    Returns:
        Number of distinct categories present in both; 0 when the candidate
        has no category information.

    Raises:
        ValueError, SyntaxError: If a string value is not a valid Python literal.
    """
    if isinstance(target_categories, str):
        # Convert the string representation of the list into an actual list.
        target_categories_list = ast.literal_eval(target_categories)
    elif isinstance(target_categories, (list, tuple, set, frozenset)):
        target_categories_list = target_categories
    else:
        # Empty CSV cells are read back as NaN; they share nothing.
        return 0

    # Set intersection removes duplicates and finds the common elements.
    return len(set(target_categories_list) & set(original_categories))

# Apply the function to each row in the DataFrame to calculate NSC
# (the number of categories shared with the input article).
recommendations_df['NSC'] = recommendations_df['article_categories'].apply(lambda x: count_shared_categories(x, original_categories))

# Keep the input article pinned at row 0. Its similarity score is NaN, which
# sort_values places last among rows with equal NSC, so sorting the whole frame
# could move it away from the top (e.g. when it has no categories at all).
input_article_row = recommendations_df.iloc[:1]

# Rank the candidates by 'NSC' (descending), then by 'Similarity Score' (descending):
# articles with more shared categories come first, ties are broken by similarity.
ranked_candidates = recommendations_df.iloc[1:].sort_values(by=['NSC', 'Similarity Score'], ascending=[False, False])
recommendations_df = pd.concat([input_article_row, ranked_candidates])

# Keep 11 rows: the input article plus its top 10 candidates.
top_10_recommendations = recommendations_df.head(11)


## 4.3. Arranging by Views and Quality, and Final Selection

In [None]:
# Split the frame: the first row is taken to be the original article,
# everything after it is a candidate.
original_article = top_10_recommendations.head(1)
rest_of_articles = top_10_recommendations.iloc[1:]

# Order the candidates from most to least viewed.
sorted_rest = rest_of_articles.sort_values('total_views', ascending=False)

# Put the original article back on top and renumber the rows from zero.
sorted_df = pd.concat([original_article, sorted_rest]).reset_index(drop=True)


## Quality ranking
# Numerical score for each quality class; a lower score sorts first.
quality_mapping = {
    'FA': 1,
    'A': 2,
    'GA': 3,
    'B': 4,
    'C': 5,
    'Start': 6,
    'Stub': 7,
    'FL': 8,
    'AL': 9,
    'BL': 10,
    'CL': 11,
    'NA': 12,
    '???': 13,
}

# Score every article; classes missing from the mapping get 14.
sorted_df['quality_score'] = sorted_df['article_quality'].map(lambda quality: quality_mapping.get(quality, 14))

# Re-rank the candidates only: best quality first, most viewed first within a class.
rest_of_articles_after_quality = sorted_df.iloc[1:].sort_values(
    by=['quality_score', 'total_views'],
    ascending=[True, False],
)

# The original article stays on top; renumber the rows again.
final_sorted_df = pd.concat([sorted_df.iloc[:1], rest_of_articles_after_quality]).reset_index(drop=True)

# Show the original article and the five best-ranked candidates.
final_sorted_df.head(6)


## 4.4. Steps Combined

In [None]:
from sklearn.preprocessing import normalize
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import ast

# Read the article table (metadata columns plus bert_0 ... bert_767).
file_path = 'full_dataset_with_vectors.csv'
data = pd.read_csv(file_path)

# Take the BERT embedding columns as an array and normalise them.
embeddings = data.loc[:, 'bert_0':'bert_767'].to_numpy()
norm_embeddings = normalize(embeddings)

def recommend_articles(article_index, data, top_n=5):
    """Run the whole "See Also" pipeline for one article.

    Steps: cosine similarity against the module-level ``norm_embeddings`` array,
    keep the 20 most similar articles, drop those smaller than 6,000 bytes,
    rank by number of shared categories (ties broken by similarity) and keep 10,
    keep the ``top_n`` most viewed of those, and finally order them by quality.

    Args:
        article_index: Positional row index of the input article. Negative
            values count from the end.
        data: DataFrame aligned row-for-row with ``norm_embeddings``; needs the
            columns ``size``, ``article_categories``, ``total_views`` and
            ``article_quality``.
        top_n: Number of recommendations to return.

    Returns:
        DataFrame with the recommended rows of ``data`` plus the columns
        ``similarity`` and ``quality_score``, best quality first.

    Raises:
        IndexError: If ``article_index`` is out of range.
    """
    n_articles = len(norm_embeddings)
    if not -n_articles <= article_index < n_articles:
        raise IndexError(f"article_index {article_index} is out of range for {n_articles} articles")
    # Resolve negative positions so the self-exclusion mask below really matches.
    article_index = article_index % n_articles

    # Compute cosine similarity between the input article and every article.
    similarities = cosine_similarity([norm_embeddings[article_index]], norm_embeddings)[0]

    # Rank by similarity and exclude the input article explicitly. Dropping the
    # first ranked entry is not reliable: an article with an identical embedding
    # can be ranked ahead of the input article.
    sorted_indices = np.argsort(similarities)[::-1]
    sorted_indices = sorted_indices[sorted_indices != article_index]

    # Select the top 20 based on similarity.
    top_20_indices = sorted_indices[:20]

    # Filter out articles smaller than 6,000 bytes from the top 20.
    sizes = data['size'].to_numpy()
    filtered_indices = [i for i in top_20_indices if sizes[i] >= 6000]

    # Count the categories each remaining candidate shares with the input article.
    category_column = data['article_categories']
    original_categories = set(ast.literal_eval(category_column.iloc[article_index]))
    shared_counts = [
        (i, len(original_categories & set(ast.literal_eval(category_column.iloc[i]))))
        for i in filtered_indices
    ]

    # Sort by the number of shared categories, then by similarity, and take the top 10.
    shared_counts.sort(key=lambda pair: (-pair[1], -similarities[pair[0]]))
    top_10_indices = [i for i, _ in shared_counts[:10]]

    # Prepare the recommendations DataFrame.
    recommendations = data.iloc[top_10_indices].copy()
    recommendations['similarity'] = similarities[top_10_indices]

    # Keep the `top_n` most viewed candidates (previously hard-coded to 5).
    recommendations = recommendations.sort_values(by='total_views', ascending=False).head(top_n)

    # Map article qualities to numerical scores; unknown classes get 9.
    # NOTE(review): this mapping differs from the one used in section 4.3
    # ('List' here versus 'FL'/'AL'/'BL'/'CL'/'NA'/'???' there) - confirm which is intended.
    quality_scores = {'FA': 1, 'A': 2, 'GA': 3, 'B': 4, 'C': 5, 'Start': 6, 'Stub': 7, 'List': 8}
    recommendations['quality_score'] = recommendations['article_quality'].map(lambda x: quality_scores.get(x, 9))

    # A stable sort keeps the views-descending order inside each quality class.
    final_recommendations = recommendations.sort_values(by='quality_score', ascending=True, kind='mergesort')

    return final_recommendations

# Demo: build the recommendation list for the article stored at row 53.
article_index = 53
final_recommendations_df = recommend_articles(article_index=article_index, data=data, top_n=5)
final_recommendations_df


# 5. Current See Also Lists Extraction


## 5.1. Extracting Current See Also Sections For All Articles Using Web Scraping

In [None]:
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup  # For parsing HTML
import math
import time

def fetch_section_content(title, session, section_title="See also"):
    """Return the link texts found in one section of an English Wikipedia article.

    Two calls are made to the MediaWiki ``parse`` API: the first lists the
    article's sections to find the index of ``section_title`` (compared
    case-insensitively), the second fetches that section's HTML.

    Args:
        title: Title of the article.
        session: Object with a ``requests.Session``-compatible ``get`` method.
        section_title: Heading of the section to extract.

    Returns:
        List of the non-empty link texts in the section, excluding "edit"
        links. Empty when the article or the section cannot be retrieved
        (non-200 response, API error payload, or no such section).
    """
    api_url = "https://en.wikipedia.org/w/api.php"
    # Without a timeout a stalled connection would block its worker forever.
    request_timeout = 30

    sections_params = {
        "action": "parse",
        "page": title,
        "prop": "sections",
        "format": "json"
    }
    sections_response = session.get(api_url, params=sections_params, timeout=request_timeout)
    if sections_response.status_code != 200:
        return []

    # An API error (e.g. a missing page) arrives with HTTP 200 but no "parse"
    # key, so every level is looked up defensively.
    sections = sections_response.json().get("parse", {}).get("sections", [])
    wanted_heading = section_title.lower()
    see_also_section_index = next(
        (section.get("index") for section in sections if section.get("line", "").lower() == wanted_heading),
        None,
    )
    if not see_also_section_index:
        return []

    content_params = {
        "action": "parse",
        "page": title,
        "section": see_also_section_index,
        "format": "json",
        "prop": "text"
    }
    content_response = session.get(api_url, params=content_params, timeout=request_timeout)
    if content_response.status_code != 200:
        return []

    html_content = content_response.json().get("parse", {}).get("text", {}).get("*", "")
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, "html.parser")
    see_also_articles = []
    for link in soup.find_all('a'):
        link_text = link.get_text().strip()
        # Filter out 'edit' links and empty strings.
        if link_text and link_text.lower() != "edit":
            see_also_articles.append(link_text)

    return see_also_articles

def fetch_article_see_also(title, session):
    """Collect one article's "See also" entries into a record.

    Returns a dict with the article ``title``, the list of
    ``see_also_articles`` returned by ``fetch_section_content`` and
    ``see_also_count``, the length of that list.
    """
    linked_titles = fetch_section_content(title, session)
    record = {"title": title}
    record["see_also_articles"] = linked_titles
    record["see_also_count"] = len(linked_titles)
    return record

def create_articles_see_also_df(titles, user_agent):
    """Fetch the "See also" entries of many articles concurrently.

    Titles are processed by a pool of 10 worker threads sharing one HTTP
    session. Progress is printed every 250 articles. After every 1000 articles
    a cumulative snapshot of all results so far is written to
    ``articles_see_also_<k>.csv``; once the run has finished, any remainder is
    written to one additional file that contains the complete result.

    Args:
        titles: Sized collection of article titles.
        user_agent: Value sent as the ``User-Agent`` header of every request.

    Returns:
        DataFrame with the columns ``title``, ``see_also_articles`` and
        ``see_also_count``; articles whose fetch raised are reported on stdout
        and left out.
    """
    articles_data = []
    total_articles = len(titles)
    articles_processed = 0
    batch_start_time = time.time()

    def _save_snapshot(file_number):
        # Write everything collected so far to the numbered CSV file.
        file_name = f'articles_see_also_{file_number}.csv'
        pd.DataFrame(articles_data).to_csv(file_name, index=False)
        print(f"Saved {file_name}")

    with requests.Session() as session:
        session.headers.update({'User-Agent': user_agent})
        with ThreadPoolExecutor(max_workers=10) as executor:
            future_to_title = {executor.submit(fetch_article_see_also, title, session): title for title in titles}

            for future in as_completed(future_to_title):
                articles_processed += 1
                title = future_to_title[future]
                try:
                    articles_data.append(future.result())
                except Exception as exc:
                    # A single failing article must not abort the whole run.
                    print(f"{title} generated an exception: {exc}")

                # Update the progress and timing after every 250 articles processed
                if articles_processed % 250 == 0 or articles_processed == total_articles:
                    elapsed_time = time.time() - batch_start_time
                    progress_percentage = (articles_processed / total_articles) * 100
                    print(f"Progress: {articles_processed}/{total_articles} articles processed ({math.floor(progress_percentage)}%). Time for batch: {elapsed_time:.2f} seconds.")
                    batch_start_time = time.time()

                # Snapshot only on exact multiples of 1000. Also saving here when
                # the last article completes would overwrite the previous
                # snapshot file and duplicate the final save below.
                if articles_processed % 1000 == 0:
                    _save_snapshot(articles_processed // 1000)

    # Save the remainder (and with it the complete result) once the loop is done.
    if articles_processed % 1000 != 0:
        _save_snapshot((articles_processed // 1000) + 1)

    # Hand the collected records back so the caller can keep working with them.
    return pd.DataFrame(articles_data, columns=["title", "see_also_articles", "see_also_count"])

# Usage: collect the current "See also" entries for every title in the dataset.
user_agent = "mah338@student.bham.ac.uk"
titles_list = data['title'].to_list()
df = create_articles_see_also_df(titles=titles_list, user_agent=user_agent)
df

