#A Needle in a Data Haystack (67978) - Final Project
#Analyzing and Predicting Trends in Academic Research




Noa Ben Gallim (noa.bengallim@mail.huji.ac.il)

Itamar Edelstein (itamar.edelstein@mail.huji.ac.il)

Idan Hippach (idan.hippach@mail.huji.ac.il)


In [1]:
import json
import os
import ast
import re
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from tqdm import tqdm
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from collections import defaultdict, Counter
from networkx.algorithms.community import louvain_communities

##Preprocessing

In [None]:
def preprocess(filename):
    """
    Preprocess a single MAG papers txt file and save the result as a CSV.

    The input is read as one JSON record per line. Each file is preprocessed
    individually; the resulting CSV files are combined afterwards.

    Steps:
      * drop records with a duplicated 'id'
      * drop the columns that are not needed for the analysis
      * drop rows containing any missing value
      * keep only papers with 2000 <= year < 2020
      * keep only papers where every author has both 'org' and 'org_id'

    The result is written to f'{filename}_preprocessed.csv' (the suffix is
    appended to the full input name, extension included). Nothing is returned.
    """
    # First pass: count the lines only so tqdm can display real progress.
    # A context manager is used so the file handle is closed right away.
    with open(filename, 'r', encoding='utf-8') as file:
        total_lines = sum(1 for _ in file)

    data = []
    with open(filename, 'r', encoding='utf-8') as file:
        for line in tqdm(file, total=total_lines, desc="Processing file"):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not records
                continue
            data.append(json.loads(line))

    df = pd.DataFrame(data)

    df = df.drop_duplicates(subset='id')
    # errors='ignore' because not every file necessarily has every column
    df = df.drop(columns=['venue', 'page_start', 'page_end', 'doc_type', 'publisher',
                          'issue', 'volume', 'url', 'doi', 'indexed_abstract',
                          'references', 'abstract'], errors='ignore')
    df = df.dropna()

    # Filter by year
    df = df[(df['year'] >= 2000) & (df['year'] < 2020)]

    # Check if all authors have both 'org' and 'org_id'
    def all_authors_have_name_and_id(author_list):
        return all('org' in author and 'org_id' in author for author in author_list)

    df = df[df['authors'].apply(all_authors_have_name_and_id)]

    df.to_csv(f'{filename}_preprocessed.csv', index=False, encoding='utf-8')


In [None]:
def combine_dataframes(folder_path, df_names):
    """
    Combine all the preprocessed MAG papers CSV files into one DataFrame.

    folder_path: folder that contains the preprocessed CSV files.
    df_names: names of the CSV files (relative to folder_path) to combine.

    Rows with a missing 'authors' or 'fos' value are removed from each file
    before concatenation. The combined DataFrame is written to
    '<folder_path>/mag_papers_combined.csv' and also returned.
    """
    df_list = []

    for df_name in df_names:
        # Use the loop variable itself; the path must not depend on any
        # name defined outside this function.
        file_path = os.path.join(folder_path, df_name)
        df = pd.read_csv(file_path)
        df = df.dropna(subset=['authors', 'fos'])  # Remove rows where 'authors' or 'fos' contain NaN
        df_list.append(df)

    # Concatenate all DataFrames into one DataFrame
    combined_df = pd.concat(df_list, ignore_index=True)
    combined_df.to_csv(f'{folder_path}/mag_papers_combined.csv')
    return combined_df


Once you have downloaded the MAG papers files, follow these steps:  
1. Process each file individually using the `preprocess` function.  
2. After preprocessing, merge all the resulting files into a single DataFrame.  

In [None]:
mag_papers_files = ['mag_papers.txt']  # Replace with the names of your MAG papers files

# Preprocess every raw file on its own; each call writes its result to
# f'{file_name}_preprocessed.csv' (see preprocess).
for file_name in mag_papers_files:
    preprocess(file_name)

In [None]:
# preprocess() appends '_preprocessed.csv' to the full input file name, so
# 'mag_papers.txt' becomes 'mag_papers.txt_preprocessed.csv'.
preprocessed_mag_papers_files = ['mag_papers.txt_preprocessed.csv']  # Replace with the names of your preprocessed files
# Replace the placeholder with the folder that holds the preprocessed files
mag_papers = combine_dataframes('your_mag_papers_folder_path', preprocessed_mag_papers_files)

We will focus exclusively on the 'year' and 'fos' fields. To simplify processing, we'll extract the 'fos' names into a string format for easier manipulation:

In [None]:
def get_fos_names(fos_list):
    """
    Pull the field-of-study (FoS) names out of a string that holds a
    list-like structure and return them as a list of strings.

    Only single-quoted tokens made up of ASCII letters and spaces that are
    immediately followed by a comma are recognised; everything else in the
    string is skipped.
    """
    reg = r"'[a-zA-Z ]+',"
    extracted = []
    for match in re.finditer(reg, fos_list):
        quoted = match.group()
        # Drop the opening quote and the closing "'," pair
        extracted.append(quoted[1:-2])
    return extracted

In [None]:
# Keep only the papers that have every field needed by the analysis
mag_papers = mag_papers.dropna(subset=['id', 'title', 'authors', 'year', 'fos'])
mag_papers['fos_names'] = mag_papers['fos'].apply(get_fos_names)

# Take an explicit copy: columns are assigned on fos_data below, and
# assigning on a column-subset of mag_papers would be a chained assignment
# (SettingWithCopyWarning, ambiguous view-vs-copy semantics).
fos_data = mag_papers[['year', 'fos_names']].copy()
# Store each paper's FoS names as one ', '-separated string
fos_data['fos_names'] = fos_data['fos_names'].apply(', '.join)

##1. Visualizing Top Fields of Study

In [None]:
def top_fos_bar_graph(fos_data, top_n=10):
    """
    Draw a bar chart of the top_n most frequent fields of study.

    fos_data must provide a 'fos_names' column whose values are
    ', '-separated strings of field-of-study names.
    """
    # Flatten every paper's names into one long list and tally them
    joined_names = ', '.join(fos_data['fos_names'])
    occurrences = Counter(joined_names.split(', '))

    ranking = pd.DataFrame(occurrences.items(), columns=['fos_name', 'count'])
    ranking = ranking.sort_values(by='count', ascending=False)
    top_names = ranking['fos_name'].head(top_n)
    top_counts = ranking['count'].head(top_n)

    plt.figure(figsize=(10, 6))
    plt.bar(top_names, top_counts, color='skyblue')

    plt.xlabel('Fields of Study')
    plt.ylabel('Count')
    plt.title(f'Top {top_n} Fields of Study Across All Years')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

Run the following code to visualize the top fields of study:

In [None]:
top_fos_bar_graph(fos_data)

##2. Visualizing Research Trends Over Time

In [None]:
def get_fos_per_year(fos_data):
    """
    Count field-of-study (FoS) occurrences separately for every year.

    Returns a dict that maps each year found in fos_data['year'] to a
    Counter of the FoS names appearing in that year's papers. The
    'fos_names' values are expected to be ', '-separated strings.
    """
    return {
        year: Counter(
            name
            for joined_names in papers['fos_names']
            for name in joined_names.split(', ')
        )
        for year, papers in fos_data.groupby('year')
    }

In [None]:
def plot_fos_trends(fos_data, top_n=10):
    """
    Plot yearly counts for the top_n fields of study (FoS) as line charts.

    The top fields are chosen by their total count over all years; one line
    per field is drawn with the year on the x axis.
    """
    fos_per_year = get_fos_per_year(fos_data)

    # Sum the per-year counters to rank the fields over the whole period
    overall = Counter()
    for yearly_counts in fos_per_year.values():
        overall.update(yearly_counts)
    top_fos = [name for name, _ in overall.most_common(top_n)]

    # Years are ordered newest-first and the last entry is dropped.
    # NOTE(review): this leaves the earliest year out of the plot -
    # confirm that this is intended.
    year_range = sorted((int(year) for year in fos_per_year), reverse=True)
    year_range = year_range[:-1]

    fos_trends = {
        name: [fos_per_year.get(year, Counter()).get(name, 0) for year in year_range]
        for name in top_fos
    }

    plt.figure(figsize=(10, 6))
    for name, yearly_values in fos_trends.items():
        plt.plot(year_range, yearly_values, label=name)

    plt.xlabel('Year')
    plt.ylabel('Count')
    plt.title(f'Trends for Top {top_n} Fields of Study Over Time')
    plt.xticks(year_range)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

Run the following code to visualize the research trends over the years:

In [None]:
plot_fos_trends(fos_data)

##3. Clustering Research Fields with K-Means

In [None]:
def kmeans_clustering(data, X, n_clusters):
    """
    Clusters the data using K-means and visualizes the clusters with word clouds.

    data: DataFrame with a 'fos_names' column of ', '-separated strings.
    X: a vectorized representation of the fos_names column from the data
       (one row per row of data).
    n_clusters: number of K-means clusters.

    Side effect: the cluster label of every row is stored in
    data[f'category_cluster_{n_clusters}'].

    One word cloud is drawn per cluster on a two-column grid. The grid has
    enough rows for every cluster, also when n_clusters is odd or 1; any
    spare panel is left blank.
    """
    fos_list = data['fos_names']
    cluster_column = f'category_cluster_{n_clusters}'

    # K-means clustering to group fields into general categories
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    data[cluster_column] = kmeans.fit_predict(X)

    n_cols = 2
    # Ceiling division so an odd number of clusters still gets a panel each
    n_rows = (n_clusters + n_cols - 1) // n_cols
    # squeeze=False keeps axes two-dimensional even for a single row
    _, axes = plt.subplots(nrows=n_rows,
                           ncols=n_cols,
                           figsize=(20, 40),
                           squeeze=False)

    for cluster, ax in enumerate(axes.flatten()):
        ax.axis('off')
        if cluster >= n_clusters:
            # Spare panel of the grid: nothing to draw
            continue

        terms_in_cluster = ', '.join(fos_list[data[cluster_column] == cluster])
        terms_dict = Counter(terms_in_cluster.split(', '))

        # Plot a wordcloud for this cluster
        wordcloud = WordCloud(width=800,
                              height=400,
                              background_color='white').generate_from_frequencies(terms_dict)

        ax.imshow(wordcloud, interpolation='bilinear')
        ax.set_title(f'Cluster {cluster}', fontsize=12)

    plt.tight_layout()
    plt.show()

In [None]:
def custom_tokenizer(text):
    """
    Tokenize a ', '-separated string, trimming the surrounding whitespace
    of every piece.
    """
    pieces = text.split(', ')
    return list(map(str.strip, pieces))


# Use a TfidfVectorizer to transform the fos_names column into a TF-IDF matrix.
# custom_tokenizer does the splitting, so the default token pattern is
# disabled with token_pattern=None.
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, token_pattern=None)
X = vectorizer.fit_transform(fos_data['fos_names'])

In [None]:
def clustering(fos_data, n_clusters):
    """
    Run K-means clustering on the fields of study for several cluster counts.

    fos_data: DataFrame with a 'fos_names' column of ', '-separated strings.
    n_clusters: iterable of cluster counts; kmeans_clustering is called once
                per value, all on the same TF-IDF matrix.
    """
    # Build the TF-IDF matrix once and share it between all runs
    tfidf = TfidfVectorizer(tokenizer=custom_tokenizer, token_pattern=None)
    features = tfidf.fit_transform(fos_data['fos_names'])

    for cluster_count in n_clusters:
        kmeans_clustering(fos_data, features, n_clusters=cluster_count)


Execute the code below to perform K-Means clustering on research fields and visualize the clusters using word clouds:

In [None]:
# n_clusters is a list of cluster counts; clustering() runs once per entry,
# so the empty list left here produces no output until it is filled in.
n_clusters = []  # Specify the numbers of clusters you want to explore
clustering(fos_data, n_clusters)

##4. Community Detection in Research Fields

In [None]:
def collaboration_graph(data):
    """
    Build the field-of-study (FoS) co-occurrence graph.

    Every node is a FoS name. Two nodes are connected when they appear in
    the same paper, and the edge 'weight' is the number of times that pair
    co-occurred. data must provide a 'fos_names' column of ', '-separated
    strings.
    """
    G = nx.Graph()

    for joined_names in data['fos_names']:
        fields_of_study = joined_names.split(', ')

        # Visit every unordered pair of positions within this paper
        for position, first in enumerate(fields_of_study):
            for second in fields_of_study[position + 1:]:
                if G.has_edge(first, second):
                    G[first][second]['weight'] += 1
                else:
                    G.add_edge(first, second, weight=1)

    return G

In [None]:
def get_aggregated_graph(G, communities):
    """
    Collapse the collaboration graph so that each community becomes one node.

    Node i of the result stands for communities[i] and carries a 'size'
    attribute equal to the number of members. Every edge of G whose
    endpoints lie in different communities adds 1 to the 'weight' of the
    edge between those two communities (the weight stored on the G edge
    itself is not used).
    """
    collapsed = nx.Graph()
    community_of = {}

    # One node per community, and a lookup from member to community index
    for community_id, members in enumerate(communities):
        collapsed.add_node(community_id, size=len(members))
        community_of.update(dict.fromkeys(members, community_id))

    # Count the edges that cross community borders
    for u, v in G.edges():
        source = community_of[u]
        target = community_of[v]
        if source == target:
            continue
        crossings = collapsed.get_edge_data(source, target, default={'weight': 0})['weight']
        collapsed.add_edge(source, target, weight=crossings + 1)

    return collapsed

In [None]:
def plot_communities(G, communities, title, labels):
    """
    Draw the aggregated community graph on a circular layout.

    G: aggregated graph whose nodes are community indices carrying a 'size'
       attribute and whose edges carry a 'weight' attribute.
    communities: the detected communities; only their number is used, to
       pick one rainbow colour per community.
    title: figure title.
    labels: mapping from node to the label text drawn on it.
    """
    pos = nx.circular_layout(G)

    # One colour per community, looked up by node index
    palette = cm.rainbow(np.linspace(0, 1, len(communities)))
    node_fill = [palette[node] for node in G.nodes()]
    node_sizes = [attrs['size'] for _, attrs in G.nodes(data=True)]
    # Scale the weights down so the edge widths stay drawable
    edge_widths = [attrs['weight'] * 0.0001 for _, _, attrs in G.edges(data=True)]

    plt.figure(figsize=(15, 15))

    nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color=node_fill)
    nx.draw_networkx_edges(G, pos, width=edge_widths)
    nx.draw_networkx_labels(G, pos, labels, font_size=12)

    plt.title(title, fontsize=20)

    plt.show()

In [None]:
def get_community_labels(communities):
    """
    Generate a label for each community based on the fields of study
    present within the community.

    A community that contains one or more of the well-known top-level fields
    is labelled with their display names, one per line. Otherwise the label
    is made of up to three of the community's own members.
    Returns a dict mapping the community index to its label.
    """
    # (member to look for, text to display), in display order
    known_fields = (
        ('mathematics', 'Mathematics'),
        ('physics', 'Physics'),
        ('chemistry', 'Chemistry'),
        ('materials science', 'Materials Science'),
        ('medicine', 'Medicine'),
        ('biology', 'Biology'),
        ('humanities', 'Social Science\nand Humanities'),
        ('engineering', 'Engineering'),
        ('geology', 'Geology'),
        ('environmental science', 'Environmental Science'),
        ('computer science', 'Computer Science'),
    )

    labels = {}
    for community_idx, community in enumerate(communities):
        fields = [display for member, display in known_fields if member in community]

        if fields:
            labels[community_idx] = '\n'.join(fields)
        else:
            labels[community_idx] = ',\n'.join(list(community)[:3])

    return labels

In [None]:
def get_fos_counts_per_community(communities, data):
    """
    Count the frequency of FOS terms for each community

    This function is used to determine the labels for the communities by printing
    the top 10 most common FOS terms and their counts for each community.
    Although it is not part of the community detection pipeline, it is left here for
    documentation purposes to help in understanding and interpreting community labels.

    communities: iterable of collections of FOS names.
    data: DataFrame with a 'fos_names' column of ', '-separated strings.
    Returns a list with one Counter per community, in the order of communities.
    """
    # Count every FOS term once over the whole data set, instead of walking
    # all the rows again for each community.
    total_counts = Counter(
        fos
        for joined_names in data['fos_names']
        for fos in joined_names.split(', ')
    )

    # Filtering the shared counter keeps the terms in first-seen order, which
    # is the order Counter.most_common uses to break ties.
    return [
        Counter({fos: count for fos, count in total_counts.items() if fos in community})
        for community in communities
    ]


def print_most_common_fos_in_communities(community_fos_counts):
    """
    Print, for every community, its 10 most common FOS terms with their counts.

    community_fos_counts is a sequence of Counters, one per community. The
    output helps in choosing and interpreting the community labels; it is
    not part of the community detection pipeline and is kept for
    documentation purposes.
    """
    for community_idx, fos_counter in enumerate(community_fos_counts):
        entries = []
        for fos, count in fos_counter.most_common(10):
            entries.append(f'{fos} ({count})')
        common_fos_str = ', '.join(entries)
        print(f"Community {community_idx}: {common_fos_str}")

Use the code below to identify communities in research fields for your chosen time range and visualize them:

In [None]:
# end_year is exclusive: range(start_year, end_year) stops at end_year - 1
start_year, end_year = 2000, 2020  # Set the desired start and end years

# One figure per year: build that year's FoS co-occurrence graph, detect its
# Louvain communities, collapse each community into one node and plot it.
# NOTE(review): no seed is passed to louvain_communities, so the detected
# communities may differ between runs - confirm whether that matters here.
for year in range(start_year, end_year):
    collaboration_G = collaboration_graph(fos_data[fos_data['year'] == year])
    communities = louvain_communities(collaboration_G)
    aggregated_graph = get_aggregated_graph(collaboration_G, communities)
    labels = get_community_labels(communities)
    plot_communities(aggregated_graph,
                     communities,
                     title=f'Louvain Communities for Year {year}',
                     labels=labels)