# Visualizing Research Trends with bar and line graphs

In [1]:
from tqdm import tqdm  # Import tqdm for progress bar
import json
import pandas as pd
import ast
from collections import Counter
import matplotlib.pyplot as plt
import os

ModuleNotFoundError: No module named 'tqdm'

In [None]:
def preprocess(filename):
    """
    Preprocess a single JSON-lines txt file and save the cleaned records.

    Every non-blank line of the file is parsed as one JSON record. Records are
    de-duplicated on 'id', bulky columns are dropped, rows with any missing
    value are removed, only years in [2000, 2020) are kept, and only papers in
    which every author entry carries both an 'org' and an 'org_id' key survive.
    The result is written to 'mag_papers_preprocessed.csv' and
    'mag_papers_preprocessed.pkl' in the current working directory.

    :param filename: path to the txt file with one JSON object per line
    :return: None
    """
    file_path = filename

    # Count the lines first so tqdm can show real progress. The context
    # manager guarantees the handle used for counting is closed again.
    with open(file_path, 'r', encoding='utf-8') as file:
        total_lines = sum(1 for _ in file)

    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in tqdm(file, total=total_lines, desc="Processing file"):
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line is not a record; json.loads('')
                # would raise and abort the whole run.
                continue
            data.append(json.loads(line))
    df = pd.DataFrame(data)

    df = df.drop_duplicates(subset='id')
    # errors='ignore': not every dump contains all of these columns.
    df = df.drop(columns=['venue', 'page_start', 'page_end', 'doc_type', 'publisher', 'issue', 'volume',
                          'url', 'doi', 'indexed_abstract', 'references', 'abstract'], errors='ignore')
    df = df.dropna()

    df = df[(df['year'] >= 2000) & (df['year'] < 2020)]

    def all_authors_have_org_and_id(author_list):
        # True only when every author entry has both affiliation keys.
        return all('org' in author and 'org_id' in author for author in author_list)

    df = df[df['authors'].apply(all_authors_have_org_and_id)]

    print(df.columns)

    df.to_csv('mag_papers_preprocessed.csv', index=False, encoding='utf-8')
    df.to_pickle('mag_papers_preprocessed.pkl')


def combine_dataframes(folder_path):
    """
    Concatenate every preprocessed CSV in a folder into one combined CSV.

    Each '*.csv' file in the folder is read, rows lacking 'authors' or 'fos'
    are dropped, and both columns are parsed from their string form with
    ast.literal_eval. The concatenated frame is written to
    'mag_papers_combined.csv' inside the same folder (the DataFrame index is
    written as well, as before).

    :param folder_path: folder holding the CSV files; an empty value means
                        the current working directory
    :return: None
    :raises ValueError: if the folder contains no CSV file to combine
    """
    output_name = 'mag_papers_combined.csv'

    # os.listdir('') raises, and joining '' with '/' would point the output at
    # the filesystem root, so treat an empty folder name as the current dir.
    folder_path = folder_path or '.'

    # Skip a combined file left over from an earlier run; otherwise every
    # re-run would feed the previous output back in and duplicate all rows.
    file_list = [f for f in os.listdir(folder_path)
                 if f.endswith('.csv') and f != output_name]
    df_list = []

    for file_name in file_list:
        print(file_name)
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path)
        df = df.dropna(subset=['authors', 'fos'])
        df['authors'] = df['authors'].apply(ast.literal_eval)
        df['fos'] = df['fos'].apply(ast.literal_eval)
        df_list.append(df)

    if not df_list:
        raise ValueError(f'No CSV files to combine in {folder_path!r}')

    # Concatenate all DataFrames into one DataFrame
    combined_df = pd.concat(df_list, ignore_index=True)
    combined_df.to_csv(os.path.join(folder_path, output_name))


def save_to_pickle(df, filename='mag_papers_combined.pkl'):
    """
    Persist a DataFrame as a pickle file.

    :param df: the DataFrame to serialize
    :param filename: destination path of the pickle file
    :return: None
    """
    target = filename
    df.to_pickle(target)


def load_or_read_csv_or_pickle():
    """
    Return the combined papers DataFrame, preferring the pickle cache.

    When 'mag_papers_combined.pkl' exists in the working directory it is
    loaded directly. Otherwise 'mag_papers_combined.csv' is read and the
    resulting frame is pickled so that the next call takes the cached path.

    :return: the combined DataFrame
    """
    pickle_file = 'mag_papers_combined.pkl'  # cache holding the full data set

    if not os.path.exists(pickle_file):
        # No cache yet: read the CSV once and create the cache from it.
        frame = pd.read_csv('mag_papers_combined.csv', encoding='utf-8', low_memory=False)
        save_to_pickle(frame, pickle_file)
        return frame

    return pd.read_pickle(pickle_file)


######
# bar plot
######
def process_fos_in_chunks_given_df(df, chunk_size=10000):
    """
    Count how often each field of study (FoS) name occurs in the dataframe.

    Rows with a missing 'fos' value or a 'year' of 2020 or later are ignored.
    Each 'fos' cell is expected to be a list of dicts with a 'name' key,
    either as its string representation (as read from CSV) or already parsed.
    The input dataframe is never modified.

    :param df: the dataframe; must have 'fos' and 'year' columns
    :param chunk_size: how many records at a time
    :return: Counter object mapping FoS name -> number of occurrences
    """
    fos_counter = Counter()

    # divide the df into smaller chunks
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size]

        # Select rows with a boolean mask instead of assigning a parsed column
        # back onto the slice, which triggers SettingWithCopyWarning.
        keep = chunk['fos'].notna() & (chunk['year'] < 2020)

        for raw_fos in chunk.loc[keep, 'fos']:
            # Cells loaded from CSV are strings; cells that were parsed
            # earlier are already lists and must not go through literal_eval.
            fos_list = ast.literal_eval(raw_fos) if isinstance(raw_fos, str) else raw_fos
            fos_counter.update(fos['name'] for fos in fos_list)

    return fos_counter


def viz_bar_graph(top_n=10, chunk_size=10000):
    """
    Draw a bar chart of the most frequent fields of study.

    Loads the combined data, counts FoS names chunk by chunk, plots the
    top_n names, saves the figure as 'viz.png', shows it, and finally dumps
    the complete counter to 'fos_counter.json'.

    :param top_n: how many fields of study to visualize
    :param chunk_size: how many records to process at a time
    :return: None
    """
    papers = load_or_read_csv_or_pickle()
    fos_counter = process_fos_in_chunks_given_df(papers, chunk_size)

    # Rank all names by count, then keep the leading top_n rows for the plot.
    ranking = pd.DataFrame(fos_counter.items(), columns=['fos_name', 'count'])
    ranking = ranking.sort_values(by='count', ascending=False)
    leaders = ranking.iloc[:top_n]

    plt.figure(figsize=(10, 6))
    plt.bar(leaders['fos_name'], leaders['count'], color='skyblue')
    plt.xlabel('Fields of Study')
    plt.ylabel('Count')
    plt.title(f'Top {top_n} Fields of Study Across All Years')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('viz.png')
    plt.show()

    # Keep the full counter on disk so later runs can reuse it.
    with open('fos_counter.json', 'w') as out_file:
        json.dump(fos_counter, out_file)


def process_fos_in_chunks(chunk_size=10000):
    """
    Read 'mag_papers_combined.pkl' and count FoS names per publication year.

    Rows with a missing 'fos' value or a 'year' of 2020 or later are ignored.
    Each 'fos' cell may be the string form of a list of dicts or an
    already-parsed list; every dict must carry a 'name' key.

    :param chunk_size: how many records to process at a time
    :return: dict mapping year -> Counter of FoS names for that year
    """
    fos_per_year = {}

    df = pd.read_pickle('mag_papers_combined.pkl')

    for i in range(0, len(df), chunk_size):
        chunk = df.iloc[i:i + chunk_size]

        # Filter with a mask rather than writing a parsed column back onto
        # the slice (that assignment raises SettingWithCopyWarning).
        keep = chunk['fos'].notna() & (chunk['year'] < 2020)

        for year, fos_column in chunk[keep].groupby('year')['fos']:
            year_counter = fos_per_year.setdefault(year, Counter())
            for raw_fos in fos_column:
                # Only string cells need parsing; parsed lists are used as-is.
                fos_list = ast.literal_eval(raw_fos) if isinstance(raw_fos, str) else raw_fos
                year_counter.update(fos['name'] for fos in fos_list)

    return fos_per_year


######
# line plot
######

def process_fos_per_year_in_chunks(chunk_size=10000):
    """
    Load the combined data and count FoS names per publication year.

    The data comes from load_or_read_csv_or_pickle(). Rows with a missing
    'fos' value or a 'year' of 2020 or later are ignored. Each 'fos' cell may
    be the string form of a list of dicts or an already-parsed list; every
    dict must carry a 'name' key.

    :param chunk_size: how many records to process at a time
    :return: dict mapping year -> Counter of FoS names for that year
    """
    fos_per_year = {}

    df = load_or_read_csv_or_pickle()

    # Process the df in chunks
    for i in range(0, len(df), chunk_size):
        chunk = df.iloc[i:i + chunk_size]

        # Filter with a mask rather than writing a parsed column back onto
        # the slice (that assignment raises SettingWithCopyWarning).
        keep = chunk['fos'].notna() & (chunk['year'] < 2020)

        for year, fos_column in chunk[keep].groupby('year')['fos']:
            year_counter = fos_per_year.setdefault(year, Counter())
            for raw_fos in fos_column:
                # Only string cells need parsing; parsed lists are used as-is.
                fos_list = ast.literal_eval(raw_fos) if isinstance(raw_fos, str) else raw_fos
                year_counter.update(fos['name'] for fos in fos_list)

    return fos_per_year


def plot_fos_trends(fos_per_year, top_n=10):
    """
    Plot the trends for the top N Fields of Study over time.

    The top N names are chosen by their total count across all years. One
    line per name is drawn over the years that fall inside 2000-2019; the
    figure is saved as 'fos_trends.png' and then shown.

    :param fos_per_year: the dictionary with Fields of Study counts per year
                         (year -> Counter of FoS names)
    :param top_n: how many fields of study to plot
    :return: None
    """
    total_fos_counter = Counter()
    for year_counter in fos_per_year.values():
        total_fos_counter.update(year_counter)

    top_fos = [fos for fos, _ in total_fos_counter.most_common(top_n)]

    # NOTE(review): the latest year is dropped here exactly as before,
    # presumably because it is incomplete in the data; confirm.
    candidate_years = sorted(fos_per_year.keys())[:-1]

    # Decide which years are plotted BEFORE building the series, so the x
    # values and every y series always have the same length. Previously the
    # out-of-range years were skipped for y but kept for x, which made
    # plt.plot fail on mismatched lengths.
    plotted_years = [year for year in candidate_years if 2000 <= int(year) <= 2019]

    fos_trends = {
        fos: [fos_per_year[year].get(fos, 0) for year in plotted_years]
        for fos in top_fos
    }

    plt.figure(figsize=(10, 6))
    for fos in top_fos:
        plt.plot(plotted_years, fos_trends[fos], label=fos)

    plt.xlabel('Year')
    plt.ylabel('Count')
    plt.title(f'Trends for Top {top_n} Fields of Study Over Time')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('fos_trends.png')
    plt.show()


def save_fos_per_year_to_file(fos_per_year, filename='fos_per_year.json'):
    """
    Write the per-year FoS counts to a JSON file unless it already exists.

    :param fos_per_year: dict mapping year -> Counter of FoS names
    :param filename: destination JSON path; an existing file is left untouched
    :return: None
    """
    already_saved = os.path.exists(filename)
    if already_saved:
        return

    with open(filename, 'w') as out_file:
        # Years become plain ints and counters plain dicts so json can
        # serialize them.
        serializable = {}
        for year, counter in fos_per_year.items():
            serializable[int(year)] = dict(counter)
        json.dump(serializable, out_file)


def viz_with_trends(top_n=10, chunk_size=10000, save_file='fos_per_year.json'):
    """
    Compute per-year FoS counts, store them, and plot the top N trends.

    :param top_n: how many fields of study to visualize
    :param chunk_size: how many records to process at a time
    :param save_file: where to save the processed data
    :return: None
    """
    yearly_counts = process_fos_per_year_in_chunks(chunk_size)

    # Store the counts first so they can be reused, then draw the line plot.
    save_fos_per_year_to_file(yearly_counts, save_file)
    plot_fos_trends(yearly_counts, top_n)


Run the following code to visualize the research trends with bar and line graphs.

In [None]:
filename = 'mag_papers.txt'  # choose the file
preprocess(filename)
# '.' = current working directory. An empty string makes os.listdir fail and
# would place the combined CSV at the filesystem root.
folder_name = '.'
combine_dataframes(folder_name)
viz_bar_graph(top_n=10)
viz_with_trends(top_n=10)

# Community Detection in Research Fields

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import networkx as nx
import re
from networkx.algorithms.community import louvain_communities
from collections import Counter

In [None]:
def get_fos_names(fos_list):
    """
    Extract the field of science (FoS) names from a 'fos' cell.

    The cell is normally the string form of a list of dicts that each carry a
    'name' key. It is parsed with ast.literal_eval so that every name is
    returned, including names with digits, hyphens or non-ASCII letters and
    names that are not followed by a comma. If the string cannot be parsed,
    the function falls back to a regex that only finds quoted, letters-only
    names followed by a comma.

    :param fos_list: the 'fos' cell, as a string or an already-parsed list
    :return: list of FoS names as strings
    """
    import ast  # local import: this cell's import list does not include ast

    parsed = fos_list
    if isinstance(fos_list, str):
        try:
            parsed = ast.literal_eval(fos_list)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None  # not a valid literal; use the regex fallback below

    if isinstance(parsed, (list, tuple)):
        names = []
        for entry in parsed:
            if isinstance(entry, dict) and 'name' in entry:
                names.append(str(entry['name']))
            elif isinstance(entry, str):
                names.append(entry)
        return names

    # Fallback for malformed input: quoted, letters-only names followed by a
    # comma. [1:-2] strips the opening quote and the trailing "',".
    reg = r"'[a-zA-Z ]+',"
    return [name[1:-2] for name in re.findall(reg, str(fos_list))]


def get_fos_counts_per_community(communities, data):
    """
    Count the frequency of FOS terms for each community.

    This function is used to determine the labels for the communities: its
    result feeds the printout of the most common FOS terms per community.
    It is not part of the community detection pipeline and is kept for
    documentation purposes, to help interpret the community labels.

    :param communities: iterable of collections of FOS names
    :param data: dataframe whose 'fos_names' column holds ', '-joined names
    :return: list with one Counter per community, in the same order, giving
             how often each of its FOS terms occurs in data
    """
    # Count every term once over the whole frame. The original rescanned and
    # re-split all rows for each community, repeating identical work.
    overall_counts = Counter()
    for joined_names in data['fos_names']:
        overall_counts.update(joined_names.split(', '))

    # Restrict the global counts to the terms belonging to each community.
    return [
        Counter({fos: count for fos, count in overall_counts.items() if fos in community})
        for community in communities
    ]


def print_most_common_fos_in_communities(community_fos_counts):
    """
    Print one line per community with its ten most frequent FOS terms.

    This is a helper for choosing community labels: each line shows the
    community index followed by 'term (count)' entries. It is not part of the
    community detection pipeline and is kept for documentation purposes.

    :param community_fos_counts: list of Counters, one per community
    :return: None
    """
    for community_idx, fos_counter in enumerate(community_fos_counts):
        entries = []
        for fos, count in fos_counter.most_common(10):
            entries.append(f'{fos} ({count})')
        common_fos_str = ', '.join(entries)
        print(f"Community {community_idx}: {common_fos_str}")


def collaboration_graph(data):
    """
    Build the co-occurrence graph of fields of science (FoS).

    Every FoS is a node. Two nodes are linked when they appear in the same
    paper, and the edge 'weight' is the number of such co-occurrences.

    :param data: dataframe whose 'fos_names' column holds ', '-joined names
    :return: the weighted networkx Graph
    """
    G = nx.Graph()

    for joined_names in data['fos_names']:
        fields = joined_names.split(', ')

        # Visit each unordered pair of fields of this paper exactly once.
        for position, first in enumerate(fields):
            for second in fields[position + 1:]:
                if G.has_edge(first, second):
                    G[first][second]['weight'] += 1
                else:
                    G.add_edge(first, second, weight=1)
    return G


def get_aggregated_graph(G, communities):
    """
    Collapse the collaboration graph into one node per community.

    Each community becomes a node (its index) with a 'size' attribute equal
    to the number of members. The 'weight' of an edge between two communities
    is the number of edges of G that connect them.

    :param G: the original collaboration graph
    :param communities: sequence of node collections covering the nodes of G
    :return: the aggregated networkx Graph
    """
    collapsed = nx.Graph()
    membership = {}

    # Assign nodes to communities
    for community_id, members in enumerate(communities):
        collapsed.add_node(community_id, size=len(members))
        membership.update(dict.fromkeys(members, community_id))

    # Add edges between communities
    for u, v in G.edges():
        source, target = membership[u], membership[v]
        if source == target:
            continue  # edges inside one community are not represented
        if not collapsed.has_edge(source, target):
            collapsed.add_edge(source, target, weight=1)
        else:
            collapsed[source][target]['weight'] += 1

    return collapsed


def plot_communities(G, communities, title, path, labels):
    """
    Draw the aggregated community graph on a circle and save the figure.

    :param G: aggregated graph; nodes are community indices with a 'size'
              attribute, edges carry a 'weight' attribute
    :param communities: the communities; only their number is used, to build
                        the colour palette
    :param title: figure title
    :param path: where the figure is saved
    :param labels: dict mapping node -> label text
    :return: None
    """
    layout = nx.circular_layout(G)

    # One rainbow colour per community, looked up by node index.
    palette = cm.rainbow(np.linspace(0, 1, len(communities)))
    node_sizes = [G.nodes[node]['size'] for node in G.nodes()]
    node_color_map = [palette[node] for node in G.nodes()]
    # Edge weights are scaled down so they are usable as line widths.
    edge_widths = [G[u][v]['weight'] * 0.0001 for u, v in G.edges()]

    plt.figure(figsize=(15, 15))

    nx.draw_networkx_nodes(G, layout, node_size=node_sizes, node_color=node_color_map)
    nx.draw_networkx_edges(G, layout, width=edge_widths)
    nx.draw_networkx_labels(G, layout, labels, font_size=12)

    plt.title(title, fontsize=20)

    plt.savefig(path)
    plt.show()


def get_community_labels(communities):
    """
    Build a display label for every community.

    A community is labelled with the names of all known top-level fields it
    contains, one per line. When it contains none of them, the label is made
    of up to three of its own members.

    :param communities: sequence of collections of field of science names
    :return: dict mapping community index -> label string
    """
    # (member to look for, text shown in the label), in display order
    known_fields = (
        ('mathematics', 'Mathematics'),
        ('physics', 'Physics'),
        ('chemistry', 'Chemistry'),
        ('materials science', 'Materials Science'),
        ('medicine', 'Medicine'),
        ('biology', 'Biology'),
        ('humanities', 'Social Science\nand Humanities'),
        ('engineering', 'Engineering'),
        ('geology', 'Geology'),
        ('environmental science', 'Environmental Science'),
        ('computer science', 'Computer Science'),
    )

    labels = {}
    for community_idx, community in enumerate(communities):
        matched = [text for member, text in known_fields if member in community]
        if matched:
            labels[community_idx] = '\n'.join(matched)
        else:
            labels[community_idx] = ',\n'.join(list(community)[:3])

    return labels

Run the following code to detect communities in research fields and visualize them.

In [None]:
import os

mag_papers = pd.read_csv('mag_papers_combined.csv')
mag_papers = mag_papers.dropna(subset=['id', 'title', 'authors', 'year', 'fos'])
mag_papers['fos_names'] = mag_papers['fos'].apply(get_fos_names)

# .copy() gives an independent frame, so the assignment below does not write
# into a slice of mag_papers (which raises SettingWithCopyWarning).
fos_data = mag_papers[['year', 'fos_names']].copy()
fos_data['fos_names'] = fos_data['fos_names'].apply(lambda x: ', '.join(x))

# The figures are saved under plots/; create the folder up front because
# savefig fails when the target directory is missing.
os.makedirs('plots', exist_ok=True)

# One Louvain partition and one figure per publication year.
for year in range(2000, 2020):
    collaboration_G = collaboration_graph(fos_data[fos_data['year'] == year])
    communities = louvain_communities(collaboration_G)
    aggregated_graph = get_aggregated_graph(collaboration_G, communities)
    labels = get_community_labels(communities)
    plot_communities(aggregated_graph,
                     communities,
                     title=f'Louvain Communities for Year {year}',
                     path=f'plots/louvain_{year}.png',
                     labels=labels)

# Clustering Research Papers with K-Means

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import networkx as nx
import ast
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from collections import defaultdict, Counter
from networkx.algorithms.community import louvain_communities

In [None]:
def kmeans_clustering(data, X, n_clusters):
    """
    Cluster the papers with K-Means and draw one word cloud per cluster.

    The cluster id of every row is stored in a new column of data named
    'category_cluster_<n_clusters>'. For each cluster the FoS terms of its
    rows are counted; the counts feed a word cloud and the ten most common
    terms are collected. The figure is saved as
    'cloud_<n_clusters>_clusters.png' and then shown.

    :param data: dataframe with a 'fos_names' column of ', '-joined names;
                 it is modified in place (a cluster column is added)
    :param X: feature matrix with one row per row of data
    :param n_clusters: number of K-Means clusters
    :return: dict mapping cluster id -> ', '-joined ten most common terms
    """
    fos_list = data['fos_names']
    cluster_column = f'category_cluster_{n_clusters}'

    # K-means clustering to group fields into general categories
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    data[cluster_column] = kmeans.fit_predict(X)

    # Round the row count up so an odd number of clusters still gets a panel
    # for its last cluster (int(n_clusters / 2) silently dropped it, and gave
    # zero rows for a single cluster). squeeze=False keeps axes 2-D always.
    n_rows = (n_clusters + 1) // 2
    fig, axes = plt.subplots(nrows=n_rows,
                             ncols=2,
                             figsize=(20, 40),
                             squeeze=False)

    clusters_dict = dict()

    for cluster, ax in enumerate(axes.flatten()):
        if cluster >= n_clusters:
            # Spare panel of an odd layout: leave it blank.
            ax.axis('off')
            continue

        terms_in_cluster = ', '.join(fos_list[data[cluster_column] == cluster])
        terms_dict = Counter(terms_in_cluster.split(', '))
        common_terms = ', '.join([term for term, count in terms_dict.most_common(10)])
        clusters_dict[cluster] = common_terms

        # Plot a word cloud for this cluster
        wordcloud = WordCloud(width=800,
                              height=400,
                              background_color='white').generate_from_frequencies(terms_dict)

        ax.imshow(wordcloud, interpolation='bilinear')
        ax.set_title(f'Cluster {cluster}', fontsize=12)
        ax.axis('off')

    plt.tight_layout()
    plt.savefig(f'cloud_{n_clusters}_clusters.png')
    plt.show()

    return clusters_dict


def custom_tokenizer(text):
    """
    Split a ', '-joined string of FoS names into whitespace-trimmed tokens.

    :param text: names joined with ', '
    :return: list of tokens
    """
    pieces = text.split(', ')
    return list(map(str.strip, pieces))


def clustering(fos_data):
    """
    Run K-Means on TF-IDF features of the FoS names for several cluster counts.

    The 'fos_names' column is vectorized with custom_tokenizer, and
    kmeans_clustering is run for 18, 14, 12 and 10 clusters.

    :param fos_data: dataframe with a 'fos_names' column of ', '-joined names
    :return: dict mapping number of clusters -> the dict returned by
             kmeans_clustering for that run (previously this result was
             built and then discarded)
    """
    # token_pattern=None: tokens come from custom_tokenizer, not a regex.
    vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, token_pattern=None)
    X = vectorizer.fit_transform(fos_data['fos_names'])

    clusters_common_terms = dict()

    for n_clusters in [18, 14, 12, 10]:
        clusters_common_terms[n_clusters] = kmeans_clustering(fos_data, X, n_clusters=n_clusters)
        print()

    return clusters_common_terms


def plot_word_clouds(start_year, end_year):
    """
    Draw one word cloud of FoS names per year, side by side.

    Reads the module-level mag_papers dataframe. For every year in
    [start_year, end_year) the 'w' weights of each FoS name are summed and
    rendered as a word cloud. The figure is saved as
    'word_clouds_<start_year>_<end_year>.png' and then shown.

    :param start_year: first year to plot (inclusive)
    :param end_year: end of the range (exclusive)
    :return: None
    """
    years = list(range(start_year, end_year))

    # One panel per year. The original always created two panels and failed
    # with IndexError for ranges longer than two years. Ten inches per panel
    # keeps the original 20x10 figure for a two-year range.
    n_cols = max(len(years), 1)
    fig, axes = plt.subplots(nrows=1, ncols=n_cols, figsize=(10 * n_cols, 10), squeeze=False)
    axes = axes.flatten()

    for ax, year in zip(axes, years):
        # Read the column directly; assigning a parsed column onto a filtered
        # slice of mag_papers raises SettingWithCopyWarning.
        fos_column = mag_papers.loc[mag_papers['year'] == year, 'fos']

        fos_weight_dict = dict()

        for raw_fos in fos_column:
            # Cells read from CSV are strings; parsed lists are used as-is.
            fos_list = ast.literal_eval(raw_fos) if isinstance(raw_fos, str) else raw_fos
            for fos_dict in fos_list:
                name = fos_dict['name']
                fos_weight_dict[name] = fos_weight_dict.get(name, 0) + fos_dict['w']

        if fos_weight_dict:
            # Skipped for a year without data: WordCloud needs at least one
            # word and raises on an empty frequency table.
            wordcloud = WordCloud(width=800,
                                  height=400,
                                  background_color='white').generate_from_frequencies(fos_weight_dict)
            ax.imshow(wordcloud, interpolation='bilinear')

        ax.set_title(f'{year}', fontsize=16)
        ax.axis('off')

    plt.tight_layout()
    plt.savefig(f'word_clouds_{start_year}_{end_year}.png')
    plt.show()

Run the following code to cluster research papers with K-Means and plot word clouds.

In [None]:
# Load the combined papers and keep only rows with all required fields.
required_columns = ['id', 'title', 'authors', 'year', 'fos']
mag_papers = pd.read_csv('mag_papers_combined.csv').dropna(subset=required_columns)
mag_papers['fos_names'] = mag_papers['fos'].apply(get_fos_names)

# One figure for every pair of consecutive years: 2000-2001, 2002-2003, ...
for year in range(2000, 2020, 2):
    plot_word_clouds(year, year + 2)