In [None]:
import numpy as np
import scipy as sc
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from pybaselines import Baseline, utils
import os
import re
from matplotlib.colors import LogNorm

In [None]:
#Function: Find All Tags
def findAllTags(Taglibrary, TagTypes = ('Backbone:', 'Bond:', 'Property:', 'Application:', 'Source:')):
    """Collect the distinct tags of every tag type found in the tag library.

    Parameters
    ----------
    Taglibrary : pandas.DataFrame
        Full tag library. Its 'Manual Tags' column holds, per entry, one string
        of tags separated by '; '. Cells that are not strings (no data) are skipped.
    TagTypes : sequence of str
        Tag-type markers. A tag belongs to a type when the marker occurs
        anywhere in the tag (substring test).

    Returns
    -------
    list of list of str
        One sublist per entry of TagTypes, in the same order. Each sublist holds
        every distinct tag of that type in first-seen order. Tags are passed
        through re.escape so special characters in the data cannot cause
        matching issues when the tag is later used as a pattern.
    """
    AllTags = [[] for _ in TagTypes]
    #one set of raw tags per tag type: duplicates are detected by exact match,
    #so a tag that merely is a substring of an earlier tag is still recorded
    seen = [set() for _ in TagTypes]

    for cell in Taglibrary['Manual Tags']:
        #non-string cells carry no tag data
        if not isinstance(cell, str):
            continue
        for tag in cell.split('; '):
            for position, tagType in enumerate(TagTypes):
                if tagType in tag and tag not in seen[position]:
                    seen[position].add(tag)
                    AllTags[position].append(re.escape(tag))

    #return a list of all tags for each tag type
    return AllTags



#Function: Slice By Tag
def sliceByTag(Taglibrary, AND_tags = (), OR_tags = (), NOT_tags = ()):
    """Return the library entries whose 'Manual Tags' satisfy the tag filters.

    An entry is kept when its 'Manual Tags' string contains ALL of the
    AND_tags, AT LEAST ONE of the OR_tags (only checked when OR_tags is not
    empty) and NONE of the NOT_tags. Entries without tag data never contain a tag.

    Every tag is matched with Series.str.contains, i.e. as a regular
    expression, so tags in AND_tags, OR_tags and NOT_tags should be escaped
    to avoid matching issues with special characters.

    Parameters
    ----------
    Taglibrary : pandas.DataFrame
        Tag library (full or already sliced) with a 'Manual Tags' column.
    AND_tags, OR_tags, NOT_tags : iterable of str
        Tag patterns as described above.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the matching rows in their original order, each
        library row appearing at most once. When at least one tag is given the
        index is renumbered from 0; with no tags a deep copy of the input is returned.
    """
    AND_tags, OR_tags, NOT_tags = list(AND_tags), list(OR_tags), list(NOT_tags)

    #no filters: hand back an independent copy of the library
    if not (AND_tags or OR_tags or NOT_tags):
        return Taglibrary.copy(deep = True)

    def has_tag(tag):
        #plain boolean array so combining masks never depends on index labels
        return Taglibrary['Manual Tags'].str.contains(tag, na=False).to_numpy(dtype=bool)

    #row masks are combined instead of merging dataframes, so duplicated
    #library rows are not multiplied in the result
    keep = np.ones(len(Taglibrary), dtype=bool)
    for tag in AND_tags:
        keep &= has_tag(tag)

    if OR_tags:
        has_any_or_tag = np.zeros(len(Taglibrary), dtype=bool)
        for tag in OR_tags:
            has_any_or_tag |= has_tag(tag)
        keep &= has_any_or_tag

    for tag in NOT_tags:
        keep &= ~has_tag(tag)

    return Taglibrary[keep].reset_index(drop = True)

#Function: Count Publications and Citations by Year
def CountPubsandCitationsbyYear(Taglibrary):
    """Count publications and sum citations for each publication year.

    Parameters
    ----------
    Taglibrary : pandas.DataFrame
        Tag library (can be sliced) with 'Publication Year' and 'Citations' columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by Publication Year (one row per year present in the library,
        in the sorted order produced by groupby) with two columns:
        'Number of Publications' and 'Number of Citations'.
    """
    #'size' counts every entry of the year, so publications with a missing
    #Author (or any other missing field) are still counted
    per_year = Taglibrary.groupby('Publication Year')['Citations'].agg(['size', 'sum'])
    return per_year.rename(columns = {'size': 'Number of Publications', 'sum': 'Number of Citations'})


#Look up the module-level list of (escaped) tags that belongs to a tag type name
def _tags_for_type(tag_type):
    #lambdas defer each global lookup until that tag type is actually requested
    tag_sources = {
        'Backbone': lambda: Backbones,
        'Bond': lambda: Bonds,
        'Property': lambda: Properties,
        'Application': lambda: Applications,
    }
    if tag_type not in tag_sources:
        raise ValueError('Incorrect Tag Type inputted')
    return tag_sources[tag_type]()


#Generate a dataframe and csv file for every possible tag pair and the number of papers and citations that have both, save it, and then plot a heat map
def tag_pair_analysis(Taglibrary, TagType1, TagType2, save = True, savename = False, plot = True):
    """Count papers and citations for every pair of tags from two tag types.

    Parameters
    ----------
    Taglibrary : pandas.DataFrame
        Tag library (can be sliced); passed to sliceByTag for every tag pair.
    TagType1, TagType2 : str
        One of 'Backbone', 'Bond', 'Property' or 'Application'. The tags of each
        type are read from the module-level lists Backbones, Bonds, Properties
        and Applications.
    save : bool
        When true, write the pair table to '<TagType1> <TagType2> pairs.csv'.
    savename : str or False
        Prefix of the heatmap image ('<savename> heatmap.png'). When falsy the
        image is named '<TagType1> <TagType2> heatmap.png'. It does not affect
        the CSV file name.
    plot : bool
        When true, display the pivoted tables and draw/save the heatmap.

    Raises
    ------
    ValueError
        If TagType1 or TagType2 is not one of the supported tag types.
    """
    Tags1 = _tags_for_type(TagType1)
    Tags2 = _tags_for_type(TagType2)

    pair_rows = []
    for tag1 in Tags1:
        #strip the escaping backslashes and the "<type>: " prefix for readable labels
        label1 = tag1.replace('\\', '').replace(TagType1 + ': ', '')
        for tag2 in Tags2:
            label2 = tag2.replace('\\', '').replace(TagType2 + ': ', '')
            #slice Tag: entries that carry both tags
            sliced_df = sliceByTag(Taglibrary, AND_tags = [tag1,tag2])
            num_match = len(sliced_df)
            num_citations = sliced_df['Citations'].sum()
            pair_rows.append([label1, label2, num_match, num_citations])
    df = pd.DataFrame(pair_rows, columns=[TagType1, TagType2, 'Number of Matches', 'Number of Citations'])
    df = df.sort_values('Number of Matches', ascending = False)

    if save:
        #the CSV name is always built from the tag types (savename is only used for the heatmap)
        df.to_csv(TagType1 + ' ' + TagType2 + ' pairs.csv')

    if plot:
        #matrix of match counts, rows and columns ordered by descending totals
        df = df.pivot(index = TagType1, columns = TagType2, values = 'Number of Matches')
        df = df.reindex(df.sum(axis = 1).sort_values(ascending=False).index, axis=0)
        df = df.reindex(df.sum(axis = 0).sort_values(ascending=False).index, axis=1)
        #append a totals column and a totals row
        df['Total Per ' + TagType1] = df.sum(axis = 1)
        df.loc['Total Per ' + TagType2] = df.sum(axis = 0)

        #pair counts only: the totals column/row are blanked out
        df_values = df.copy()
        df_values['Total Per ' + TagType1] = np.nan
        df_values.loc['Total Per ' + TagType2] = np.nan
        #display is expected to be supplied by the notebook environment
        display(df_values)

        #totals only: everything except the last row and last column is blanked out
        df_sums = df.copy()
        df_sums.iloc[:-1,:-1] = np.nan
        display(df_sums)

        #two heatmaps share one axes: pair counts in blue, totals in red; NaN cells are masked
        #NOTE(review): vmin is passed together with norm; whether vmin takes effect may depend on the seaborn/matplotlib version - confirm
        fig, ax = plt.subplots(figsize=(20, 20))
        sns.heatmap(ax = ax, data = df_values, annot = True, annot_kws={'size': 8}, cmap='Blues', linewidths=0.5, norm=LogNorm(), fmt='g', vmin=0.1, cbar = False, mask=df_values.isna())
        sns.heatmap(ax = ax, data = df_sums, annot = True, annot_kws={'size': 8}, cmap='Reds', linewidths=0.5, norm=LogNorm(), fmt='g', vmin=0.1, cbar = False, mask=df_sums.isna())
        if savename:
            plt.savefig(savename + ' heatmap.png')
        else:
            plt.savefig(TagType1 + ' ' + TagType2 + ' heatmap.png')
        plt.show()

#outputs number of citations and papers for each single tag
def single_tag_analysis(Taglibrary, filename = False):
    """Write a CSV with the match count and citation sum of every known tag.

    For each tag in the module-level AllTags (all tag types), the library is
    sliced to the entries carrying that tag; the number of entries and the sum
    of their 'Citations' are recorded next to the tag with its escaping
    backslashes removed.

    Parameters
    ----------
    Taglibrary : pandas.DataFrame
        Tag library (can be sliced) passed to sliceByTag.
    filename : str or False
        Output name without extension; '.csv' is appended. When falsy the
        table is written to 'Single Tag Analysis.csv'.
    """
    records = []
    for tag_group in AllTags:
        for escaped_tag in tag_group:
            #slice Tag
            matches = sliceByTag(Taglibrary, AND_tags = [escaped_tag])
            match_count = len(matches)
            citation_total = matches['Citations'].sum()
            records.append([escaped_tag.replace('\\', ''), match_count, citation_total])

    summary = pd.DataFrame(records, columns=['Tag', 'Number of Matches', 'Number of Citations'])
    output_name = filename + '.csv' if filename else 'Single Tag Analysis.csv'
    summary.to_csv(output_name)

In [None]:
#Input basepath that goes to tag library data in .csv format
data_filepath = "FINAL_Dynamic-Polymer-Annotated-Library-DPAL.csv"

#Read in exported tag library information into dataframe
#(sep = None with the python engine lets pandas detect the delimiter)
Taglibrary = pd.read_csv(data_filepath, sep = None, engine='python')

#Parse out the citation count for each row: the text before the first space of 'Extra'
#Taking the first split element directly (rather than expanding into two columns)
#also works when no 'Extra' entry contains a space
Taglibrary['Citations'] = Taglibrary['Extra'].str.split(' ', n=1).str[0]

#Force all entries to be numeric or NaN
Taglibrary['Citations'] = pd.to_numeric(Taglibrary['Citations'], errors='coerce')

#Generate a list of all tags found in tag library and assign to each tag type
#(positions follow the default TagTypes order of findAllTags)
AllTags = findAllTags(Taglibrary)
Backbones = AllTags[0]
Bonds = AllTags[1]
Properties = AllTags[2]
Applications = AllTags[3]
Source = AllTags[4]

In [None]:
#Get Number of Publications and Citations by Year for the entire tag library
df = CountPubsandCitationsbyYear(Taglibrary)
print(df['Number of Publications'].sum())
print(df['Number of Citations'].sum())
df.to_csv('Total Publications and Citations by Year.csv')

#Get Number of Publications and Citations by Year for each Application
for application in Applications:
    sliced_df = sliceByTag(Taglibrary, AND_tags = [application])
    df = CountPubsandCitationsbyYear(sliced_df)
    #file name uses the readable tag: escaping backslashes and the "Application: " prefix removed
    df.to_csv('Application- ' + application.replace('\\', '').replace('Application: ', '') + ' Publications and Citations by Year.csv')

#Get Number of Publications and Citations by Year for each Property
#(the loop variable is not named "property" so the builtin of that name stays usable in later cells)
for property_tag in Properties:
    sliced_df = sliceByTag(Taglibrary, AND_tags = [property_tag])
    df = CountPubsandCitationsbyYear(sliced_df)
    #file name uses the readable tag: escaping backslashes and the "Property: " prefix removed
    df.to_csv('Property- ' + property_tag.replace('\\', '').replace('Property: ', '') + ' Publications and Citations by Year.csv')

In [None]:
#Heatmap generation: Bond/Property tag pairs over the whole tag library
tag_pair_analysis(
    Taglibrary,
    TagType1 = 'Bond',
    TagType2 = 'Property',
    save = True,
    plot = True,
)

In [None]:
#Case Study Analysis

#Case Study 2
#slice the library by each property tag and by both tags together, printing the size of every slice
Glass_Transition_Only = sliceByTag(
    Taglibrary,
    AND_tags = ['Property: Glass Transition'],
)
Ionic_Conductivity_Only = sliceByTag(
    Taglibrary,
    AND_tags = ['Property: Ionic Conductivity'],
)
Glass_and_Ionic = sliceByTag(
    Taglibrary,
    AND_tags = ['Property: Glass Transition', 'Property: Ionic Conductivity'],
)
print(len(Glass_Transition_Only))
print(len(Ionic_Conductivity_Only))
print(len(Glass_and_Ionic))

#per-tag match and citation counts within each slice, one csv per slice
single_tag_analysis(Glass_Transition_Only, filename = 'glass only single tag analysis')
single_tag_analysis(Ionic_Conductivity_Only, filename = 'ionic only single tag analysis')
single_tag_analysis(Glass_and_Ionic, filename = 'glass and ionic single tag analysis')

#Case Study 3
#entries tagged with both Biocompatibility and Soft Robotics, then their Backbone/Bond pair counts
Biocompatible_SoftRobotics = sliceByTag(
    Taglibrary,
    AND_tags = ['Property: Biocompatibility', 'Application: Soft Robotics'],
)
print(len(Biocompatible_SoftRobotics))
tag_pair_analysis(
    Biocompatible_SoftRobotics,
    TagType1 = 'Backbone',
    TagType2 = 'Bond',
    save = True,
    savename = 'Biocompatible_SoftRobotics',
    plot = True,
)
