# DeMetRA - review figures

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Is-processed dataset

In [2]:
# Directory holding the input tables. The trailing slash matters: the read_csv
# paths below are built by plain string interpolation.
assets_directory = '../assets/'

# NOTE(review): `lit` is not referenced again in the visible part of this notebook — confirm it is still needed.
lit = pd.read_csv(f'{assets_directory}MPS_literature_cleaned.csv')
# `d` is the default data frame of the plotting helpers defined below, which
# read its `<variable>_targ` / `<variable>_base` columns.
d = pd.read_csv(f'{assets_directory}MPS_base_target.csv')

In [None]:

# Named project colours, stored as '#' + 8 hex digits.
# `pie_count` passes the values, in this insertion order, as the wedge colours.
demetra_pallete = {
    'caribbean-current': '#00635dff',
    'vista-blue': '#809bceff',
    'xanthous': '#ffc145ff',
    'fairy-tale': '#eac4d5ff',
    'red-ncs': '#b20d30ff',
    'jasmine': '#ffd685ff',
    'yinmn-blue': '#395993ff',
    'light-sea-green': '#00b8abff',
    'quinacridone-magenta': '#873157ff',
    'lavender-blush': '#feecf0ff'}


### Pie charts

In [None]:
def pie_count(ax, var, df=d):
    """Draw a pie chart of the value frequencies of column `var` on `ax`.

    Missing values are counted as their own category. Each wedge is labelled
    with its category value and annotated with its percentage share; wedge
    colours are taken from `demetra_pallete` in order.
    """
    frequencies = df.value_counts(var, dropna=False)

    ax.pie(frequencies,
           labels=frequencies.index,
           autopct='%1.1f%%',
           colors=demetra_pallete.values())


def base_target_overlap(var, df=d):
    """Return the share of rows whose base and target values agree, as text.

    Compares columns `{var}_base` and `{var}_targ` of `df` row by row and
    returns the rounded percentage of equal rows, e.g. '42%'. Missing values
    (NaN) never compare equal, so rows with a missing value count as mismatches.

    Returns 'NA' for an empty data frame instead of dividing by zero.
    """
    ovlp = df[f'{var}_base'] == df[f'{var}_targ']

    if len(ovlp) == 0:
        return 'NA'

    # Mean of a boolean series == fraction of True values
    return f'{round(ovlp.mean() * 100)}%'


def compare_base_target(var_list, df=d):
    """Draw target/base pie charts and their percentage overlap for several variables.

    Builds a 3-row figure: row A (target) and row B (base) hold one pie chart
    per variable, row C shows the share of rows where base and target agree.
    The first, narrow column holds the rotated row labels.

    Parameters
    ----------
    var_list : list of str
        Variable stems; `df` must contain `<stem>_targ` and `<stem>_base` columns.
        Underscores in a stem become line breaks in the column title.
    df : pandas.DataFrame
        Data to summarise (defaults to the module-level `d`).
    """
    fs = dict(big=30, small=15)

    # Column 0 holds the row labels; columns 1..len(var_list) hold one variable each
    n_cols = len(var_list) + 1
    row_targ = [f'A{i}' for i in range(n_cols)]
    row_base = [f'B{i}' for i in range(n_cols)]
    row_ovlp = [f'C{i}' for i in range(n_cols)]

    fig, axs = plt.subplot_mosaic([row_targ, row_base, row_ovlp], figsize=(25, 13),
                                  height_ratios=[1,1,.5], width_ratios=[.2]+[1]*len(var_list))

    label_txt = dict(x=.5, y=.5, ha='center', va='center', fontsize=fs['big'], rotation=90)

    # Row labels: text only, no axes frame
    for key, row_name in (('A0', 'Target'), ('B0', 'Base'), ('C0', 'Match')):
        axs[key].text(s=row_name, transform=axs[key].transAxes, **label_txt)
        axs[key].axis('off')

    for i, var in enumerate(var_list, start=1):
        targ_ax, base_ax, ovlp_ax = axs[f'A{i}'], axs[f'B{i}'], axs[f'C{i}']

        # Forward `df` explicitly: without it the helpers fall back to the
        # global `d` and a caller-supplied frame would be silently ignored
        pie_count(targ_ax, f'{var}_targ', df=df)
        pie_count(base_ax, f'{var}_base', df=df)

        ovlp_ax.text(x=.5, y=.5, s=base_target_overlap(var, df=df), fontsize=fs['big'], ha='center', va='center',
                     transform=ovlp_ax.transAxes)
        ovlp_ax.axis('off')

        targ_ax.set_title('\n'.join(var.split('_')), fontsize=fs['big'], fontweight='bold')


# Pie-chart comparison for the four variables (read from the default frame `d`)
compare_base_target(var_list = ['Tissue', 'Array','Ancestry','Developmental_period'])

### Sankey plots

In [None]:
def sankey(ax, var, left_labels, right_labels, d=d, left='targ', right='base',
           title_left='Target', title_right='Base', spacer=10, fss={'sm': 14, 'l': 15, 'xl': 25}):
    """Draw a two-sided Sankey diagram for one variable on `ax` and return the match rate.

    Each side is a stack of coloured bars (one per label, heights proportional
    to the row counts, separated by `spacer`); curved strips connect every
    observed (left, right) combination and take the colour of their left label.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    var : str
        Variable stem; columns `{var}_{left}` and `{var}_{right}` of `d` are used.
    left_labels, right_labels : dict
        `{category: {'color': <colour>}}`, in the top-to-bottom drawing order.
        The dictionaries are not modified. Categories present in the data but
        missing here are not drawn (they still count towards the total).
    d : pandas.DataFrame holding the two columns.
    left, right : str
        Column suffixes for the two sides.
    title_left, title_right : str
        Titles written above the two bar stacks.
    spacer : int
        Vertical gap (in count units) between consecutive bars.
    fss : dict
        Font sizes for labels ('sm'), side titles ('l') and the main title ('xl').

    Returns
    -------
    float
        Percentage of rows whose left and right categories are mapped to the
        same colour. Grey (and any category missing from the label dicts) never
        counts as a match.
    """
    left_col, right_col = f'{var}_{left}', f'{var}_{right}'

    # One row per observed (left, right) combination with its frequency.
    # NOTE(review): the 'count' column name presumably requires pandas >= 2.0 — confirm.
    counts = pd.DataFrame(d[[left_col, right_col]].value_counts(dropna=False)).reset_index()

    total = counts['count'].sum()

    def size_estimator(label_dict, side):
        # Work on a copy so the caller's label dictionaries are never mutated
        label_dict = {label: dict(specs) for label, specs in label_dict.items()}

        size_list = list()

        for label in label_dict.keys():
            label_count = int(counts.loc[counts[f'{var}_{side}']==label, 'count'].sum())
            label_dict[label]['size'] = label_count
            size_list.append(label_count)

        cumulative_sum = np.cumsum(size_list).tolist()

        # Bar i starts after the previous bars plus i gaps, and is `size` tall
        top_pos = [0] + [c+(spacer*(i+1)) for i,c in enumerate(cumulative_sum[:-1])]
        bottom_pos = [c+(spacer*i) for i,c in enumerate(cumulative_sum)]

        for i, label in enumerate(label_dict.keys()):
            label_dict[label]['top'] = top_pos[i]
            label_dict[label]['bottom'] = bottom_pos[i]

        return label_dict

    left_dict = size_estimator(left_labels, side=left)
    right_dict = size_estimator(right_labels, side=right)

    def label_y(label_dict, label):
        # Tall bars get their label anchored at the top, short bars at the centre
        if label_dict[label]['size'] > spacer:
            y = label_dict[label]['top']+1
            va='top'
        else:
            y = label_dict[label]['top'] + label_dict[label]['size']*0.5
            va='center'

        string_spacer = '\n' if label_dict[label]['size'] > 5 else ' '
        percent_count = round(label_dict[label]['size'] / total * 100)
        percent_count = percent_count if percent_count > 0 else '<1'

        s = f"{label}{string_spacer}({percent_count}%)"

        return dict(y=y, s=s, va=va)

    def smooth_step(y_left, y_right):
        # Step from the left to the right level, smoothed by two moving averages
        kernel = 0.05 * np.ones(20)
        ys = np.array(50 * [y_left] + 50 * [y_right])
        ys = np.convolve(ys, kernel, mode='valid')
        return np.convolve(ys, kernel, mode='valid')

    # Draw left counts
    for label in left_dict.keys():
        ax.fill_between(x=[0, 1], y1=left_dict[label]['top'], y2=left_dict[label]['bottom'],
                        color=left_dict[label]['color'], edgecolor=None)
        ax.text(x=-0.1, **label_y(left_dict, label), ha='right', fontsize=fss['sm'])

    # Draw right counts
    for label in right_dict.keys():
        ax.fill_between(x=[9, 10], y1=right_dict[label]['top'], y2=right_dict[label]['bottom'],
                        color=right_dict[label]['color'], alpha=1, edgecolor=None)
        ax.text(x=10.1, **label_y(right_dict, label), ha='left', fontsize=fss['sm'])

    # Add titles on each side
    titlespecs = dict(y=-10, va='center',ha='center', fontweight='bold', fontsize=fss['l'])
    ax.text(x=0.5, s=title_left, **titlespecs)
    ax.text(x=9.5, s=title_right, **titlespecs)

    # Draw strips
    for left_label in left_dict.keys():

        for right_label in right_dict.keys():

            strip_color = left_dict[left_label]['color'] # Color strip according to the left side

            strip_size = counts.loc[(counts[left_col]==left_label) & (counts[right_col]==right_label), 'count']

            if len(strip_size) > 0:
                strip_size = int(strip_size.iloc[0])

                # Lower and upper edge of the strip (same curve, shifted by the strip size)
                ys_d = smooth_step(left_dict[left_label]['top'],
                                   right_dict[right_label]['top'])
                ys_u = smooth_step(left_dict[left_label]['top'] + strip_size,
                                   right_dict[right_label]['top'] + strip_size)

                # Advance the running 'top' of both bars so the next strip starts below this one
                left_dict[left_label]['top'] += strip_size
                right_dict[right_label]['top'] += strip_size

                ax.fill_between(np.linspace(1, 9, len(ys_d)), ys_d, ys_u, alpha=0.4, color=strip_color, edgecolor=None)

    largest_count = max(left_dict[list(left_dict.keys())[-1]]['bottom'], right_dict[list(right_dict.keys())[-1]]['bottom'])
    ax.set_xlim(-0.1, 10.1)
    ax.set_ylim(-10, largest_count+10)
    ax.invert_yaxis()
    ax.axis('off')

    # Add superior title
    ax.set_title(' '.join(var.split('_')), fontweight='bold', fontsize=fss['xl'], pad=25)

    # Also return the overall overlap: compare the colours assigned to each side.
    # Categories without a label entry fall back to grey (= never a match)
    # instead of raising a KeyError after the whole figure has been drawn.
    def color_of(labels, category):
        return labels.get(category, {}).get('color', 'grey')

    color_counts = counts.copy()
    color_counts[left_col] = [color_of(left_labels, i) for i in counts[left_col]]
    color_counts[right_col] = [color_of(right_labels, i) for i in counts[right_col]]

    is_match = (color_counts[left_col] == color_counts[right_col]) & (color_counts[left_col] != 'grey')
    match = int(color_counts.loc[is_match, 'count'].sum())
    match_percent = match / total * 100

    return match_percent

def display_match(ax, match, fs=22): 
    """Write 'Match: <rounded match>%' near the top of `ax` and hide the axes frame."""
    caption = f'Match: {round(match)}%'
    text_style = dict(fontsize=fs, ha='center', va='center', transform=ax.transAxes)

    ax.text(x=.5, y=.95, s=caption, **text_style)
    ax.axis('off')

In [None]:
# ['Tissue', 'Array','Ancestry','Developmental_period']
def get_counts(var):
    """Tabulate the (target, base) combinations of `var` in `d`.

    Prints the unique target values, then the unique base values, and returns
    the table of combinations with their frequencies (missing values included).
    """
    targ_col, base_col = f'{var}_targ', f'{var}_base'

    pair_counts = d[[targ_col, base_col]].value_counts(dropna=False)
    counts = pd.DataFrame(pair_counts).reset_index()

    for column in (targ_col, base_col):
        print(counts[column].unique())

    return counts

# List the categories present on each side (useful for writing the sankey label dictionaries)
get_counts('Developmental_period')

In [None]:
# Figure 3: 2x2 grid of Sankey diagrams (panels A-D), each followed by a short
# text panel (a-d) that displays the match percentage returned by `sankey`.
fig, axs = plt.subplot_mosaic('AB;ab;CD;cd', figsize=(20, 30),
                              height_ratios=[1,.2, 1,.2], gridspec_kw=dict(hspace=0, wspace=0.7))
    

# In every panel the label dictionaries fix the drawing order and the colours.
# Labels that share a colour across the two sides are counted as matching by
# `sankey`; grey is excluded from the match.

# Panel A/a: array
a = sankey(axs['A'], var='Array', 
           left_labels = {'450K': {'color': 'darkgreen'}, 
                      'EPICv1': {'color': 'mediumpurple'},
                      'Multiple (450K, EPICv1)': {'color': 'orange'},
                      'Multiple (450K, GMEL (~3000 CpGs from EPICv1))': {'color': 'orange'},
                      'Multiple (450K, EPICv2)': {'color': 'orange'}},
           right_labels = {'450K': {'color': 'darkgreen'}, 
                       'EPICv1': {'color': 'mediumpurple'}, 
                       'Multiple (450K, EPICv1)': {'color': 'orange'}})

display_match(axs['a'], a)

# Panel B/b: tissue
b = sankey(axs['B'], var='Tissue',
           left_labels = {'Peripheral blood': {'color':'crimson'},
                          'Whole blood': {'color':'crimson'},
                          'Dried bloodspot': {'color':'crimson'},
                          'Blood-clots': {'color':'crimson'},
                          'Cord blood': {'color':'darkred'},
                          'Saliva': {'color':'lightblue'},
                          'Buccal cells': {'color':'teal'},
                          'Tumour cells': {'color':'orange'},
                          'Not reported': {'color':'grey'}}, 
           right_labels = {'Peripheral blood': {'color':'crimson'},
                           'Whole blood': {'color':'crimson'},
                           'Cord blood': {'color':'darkred'},
                           'Multiple (Cord blood, Dried bloodspot)': {'color':'crimson'},
                           'Multiple (Cord blood, Whole blood)': {'color':'crimson'},
                           'Multiple (Whole blood, HPCs)': {'color':'crimson'},
                           'Leukocytes': {'color':'mediumpurple'},
                           'Tumour cells': {'color':'orange'}})

display_match(axs['b'], b)

# Panel C/c: ancestry
c = sankey(axs['C'], var='Ancestry',
           left_labels = {'White': {'color':'pink'}, 
                          'European': {'color':'pink'}, 
                          'Mixed': {'color':'purple'}, 
                          'Hispanic': {'color': 'orange'},
                          'African': {'color':'crimson'},
                          'Not reported': {'color': 'grey'}}, 
           right_labels = {'White': {'color':'pink'},
                           'European': {'color':'pink'},
                           'Mixed': {'color':'purple'}, 
                           'Hispanic': {'color': 'orange'},
                           'Not reported': {'color':'grey'}})

display_match(axs['c'], c)


# Panel D/d: developmental period
dp = sankey(axs['D'], var='Developmental_period',
           left_labels = {'Birth': {'color':'darkblue'}, 
               'Very early childhood': {'color':'#4132d4'}, 
               'Mid childhood': {'color':'#7566ff'}, 
               'Late childhood': {'color':'#beb7ff'}, 
               'Childhood and adolescence': {'color':'#f0cdff'}, 
               'Adolescence': {'color':'purple'}},
           right_labels = {'Birth': {'color':'darkblue'}, 
                'Mid childhood': {'color':'#7566ff'}, 
                'Late childhood': {'color':'#beb7ff'}, 
                'Childhood': {'color':'blue'},
                'Childhood and adolescence': {'color':'#f0cdff'}, 
                'Birth, Childhood and adolescence': {'color':'#7b07d0'},
                'Adolescence': {'color':'purple'},
                'Adults':{'color':'teal'},
                'Not reported': {'color':'grey'}})

display_match(axs['d'], dp)

# Written to the current working directory
fig.savefig('Figure3.pdf', dpi=400, bbox_inches='tight')

In [None]:
# d_targ.groupby(d_targ.What_is_available).Identifier_base.value_counts(dropna=False)

### Author network

In [3]:
# import networkx as nx
import plotly.graph_objects as go
import textwrap
import pickle

In [4]:
# Read the graph object from file.
# `assets_directory` already ends with '/', so the file name is appended
# directly (same convention as the read_csv calls above).
# NOTE: unpickling can execute arbitrary code — only load files produced by this project.
with open(f'{assets_directory}Publications_network.pkl', 'rb') as file:
    G = pickle.load(file)


In [40]:
import networkx as nx
# Flatten the graph into an edge table (one row per edge, with a `source` column)
edges_df = nx.to_pandas_edgelist(G)

# Keep only the edges whose source node name starts with 'Author'
# NOTE(review): presumably every author-paper edge has the author as `source` — confirm for this graph.
edges_df = edges_df.loc[edges_df.source.str.startswith('Author'), ]

# Number of edges (i.e. publications) per author node
pub_count = edges_df.value_counts('source')
print(len(pub_count), 'unique authors')

# Reduce to only those with more than one publication
pub_count_filtered = pub_count[pub_count > 1].reset_index(drop=False)

# Display name = node name without its 'Author/' style prefix (text after the last '/')
pub_count_filtered['Author'] = pub_count_filtered.source.apply(lambda x: x.split('/')[-1])

import plotly.express as px

# One bar per author; bar height is that author's `count`
fig = px.histogram(pub_count_filtered, x="Author", y='count')

fig.update_layout(
    title='Number of publications per author',
    width=1300,
    height=500,
    xaxis_title='Author',
    yaxis_title='Number of publications',
    showlegend=False
)
# tickmode='linear' so that every author gets a (rotated) tick label
fig.update_xaxes(tickangle=65, tickmode='linear')

fig.show()

268 unique authors


#### My network


In [None]:
from scholarly import scholarly

# Build a bipartite author-paper graph from the Google Scholar profile of `seed_author`.
# NOTE: this overwrites the `G` loaded from the pickle file above.
seed_author = 'Serena Defina'
G = nx.Graph()
search_query = scholarly.search_author(seed_author)

# Take the first matching profile; fail with a clear message (instead of a
# bare StopIteration) when the search returns nothing
first_match = next(search_query, None)
if first_match is None:
    raise ValueError(f'No Google Scholar profile found for "{seed_author}"')
author = scholarly.fill(first_match)

for p in author['publications']:
    publication_filled = scholarly.fill(p)
    # Named `pub_bib` (not `bib`) so the `bib` table used by the abstract
    # analysis cells further down is not clobbered
    pub_bib = publication_filled['bib']
    title = pub_bib['title'].replace(':', ' ')
    title = f"Paper/{title}"
    authors = pub_bib['author'].split(' and ')
    # Separate loop variable: do not shadow the `author` profile being iterated
    for coauthor in authors:
        coauthor = coauthor.replace('.', '')
        # Drop one-letter initials and put each remaining name part on its own line
        coauthor = '\n'.join([n for n in coauthor.split(' ') if len(n) > 1])
        coauthor = f"Author/{coauthor}"
        G.add_edge(coauthor, title)

In [None]:
#pos = nx.nx_pydot.graphviz_layout(G, prog='sfdp')
# Fixed seed: the force-directed layout is random, this keeps the figure reproducible
pos = nx.spring_layout(G, seed=42)

seed_node = 'Author/Serena\nDefina'
author_nodes = [n for n in G.nodes() if n.startswith('Author/')]
paper_nodes = [n for n in G.nodes() if n.startswith('Paper/')]

# Node size scales with the longest author node name so that labels fit inside
max_len = max([len(n) for n in author_nodes], default=0)

plt.figure(figsize=(30, 30))

# Co-authors (everyone except the seed author)
nx.draw_networkx_nodes(G, pos,
    nodelist=[n for n in author_nodes if n != seed_node],
    node_color='skyblue',
    node_size=max_len*100)

# Seed author, drawn larger and darker
nx.draw_networkx_nodes(G, pos,
    nodelist=[n for n in author_nodes if n == seed_node],
    node_color='darkblue',
    node_size=max_len*200)

# Papers
nx.draw_networkx_nodes(G, pos,
    nodelist=paper_nodes,
    node_color='darkgreen',
    node_size=1000)

nx.draw_networkx_edges(G, pos)
# Only author nodes are labelled (name without the 'Author/' prefix)
nx.draw_networkx_labels(G, pos,
    labels={n: n.split('/')[-1].replace(' ', '') for n in author_nodes},
    font_color='w',
    font_weight='bold')

plt.axis('off');

# Other figures 

In [None]:
def n_scores_by_pub(color_by="Category"):
    """Show a stacked bar chart of the number of MPSs per publication.

    One bar per publication title (rows of the global `data`), ordered by
    total count and split/coloured by the `color_by` column.

    Parameters
    ----------
    color_by : str
        Column of `data` used to colour the bars. When `color_maps` has an
        entry for it, that value -> colour mapping is used; otherwise Plotly's
        default colour sequence applies.
    """
    # Color set-up: `color_discrete_map` must be a mapping, so fall back to an
    # empty one (= default colours) rather than a colour-scale name
    if color_by in color_maps.keys():
        color_map = color_maps[color_by]
    else:
        color_map = {}

    fig = px.histogram(data,
                       x="Title", color=color_by, color_discrete_map = color_map,
                       nbins=186, # Number of papers
                       # hover_name=data.Title.apply( lambda t: "<br>".join(textwrap.wrap(t, width=80))),
                       # hover_data=["Sample size","Journal"],
                       # category_orders = {"Category": list(data["Category"].value_counts().index)},
                       title="Number of MPSs by publication", width=1400, height=500).update_xaxes(categoryorder="total ascending")
    # Make it pretty
    axes_style = dict(mirror=True, showline=True, linecolor='black')
    fig.update_yaxes(title_text='<b>Number of MPSs</b>', gridcolor='lightgrey', ticks='outside', **axes_style,
                     tickmode = 'array',
                     tickvals = [1, 5, 10, 20, 30, 40, 50, 70, 100])
    # Titles are too long to print as tick labels
    fig.update_xaxes(title_text='<b>Publication</b>', **axes_style,
                     showticklabels=False)
    fig.update_layout(plot_bgcolor='whitesmoke', margin=dict(l=10, r=10, t=25, b=10))

    fig.show()


# Same chart three times, coloured by a different column each time
# ("Category" is the function's default)
for grouping in ("Category", "Tissue", "Array"):
    n_scores_by_pub(grouping)

In [None]:
# Create fictional age data: each developmental period is mapped to
# [median, min, max]; the three values are assigned to Age_median / Age_min / Age_max below
age_encoding = {
    "Birth": [0, 0, 0],
    "Very early childhood": [1, 0, 2],
    "Early childhood": [3, 2, 4],
    "Mid childhood": [5, 4, 7],
    "Late childhood": [9, 8, 11],
    "Childhood": [5, 1, 11],
    "Childhood and adolescence": [13, 10, 16],
    "Adolescence": [14, 12, 18],
    "Not reported": [20, 20, 20]
}

# NOTE(review): `data` (and `category_color_map` below) are not defined in the visible cells — confirm where they come from.
# Adds three columns to `data` in place; a period missing from `age_encoding` raises KeyError.
data[["Age_median","Age_min","Age_max"]] = [age_encoding[l] for l in data["Developmental period"]]

# Plot: one bubble per row, sized by sample size (missing sizes -> 0), with
# horizontal error bars spanning Age_min..Age_max
fig = px.scatter(data, x="Age_median", y="Tissue", size=data["Sample size"].fillna(0), size_max = 70,
                 error_x=data["Age_max"] - data["Age_median"], error_x_minus=data["Age_median"] - data["Age_min"],
                 hover_name=data.Title.apply( lambda t: "<br>".join(textwrap.wrap(t, width=80))),
                 color="Category", color_discrete_map = category_color_map)

# Make it pretty 
fig.update_traces(marker=dict(opacity = .5))
axes_style = dict(mirror=True, ticks='outside', showline=True, linecolor='black', gridcolor='lightgrey')
fig.update_yaxes(title_text='<b>Tissue</b>', **axes_style)
# Tick 0 is relabelled "Birth" and tick 20 "Not reported", matching the encoding above
fig.update_xaxes(title_text='<b>Age of the sample</b> (years) {median, range}', **axes_style, 
                 tickmode = 'array',
                 tickvals = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
                 ticktext = ["Birth", 2, 4, 6, 8, 10, 12, 14, 16, 18, "Not reported"])
fig.update_layout(plot_bgcolor='whitesmoke', width=1400, height=600, margin=dict(l=10, r=10, t=25, b=10))

fig

In [None]:
# Plot: histogram of the number of CpGs, stacked by category (most frequent category first)
# NOTE(review): "Number of CpGs" is only coerced to numeric in a later cell — confirm this cell works on a fresh kernel.
fig = px.histogram(data, x="Number of CpGs", color="Category", # log_y=True, # barnorm="percent",
                   # animation_frame="Number of CpGs bins", # animation_group="Category",
                   color_discrete_map = category_color_map, nbins=200,
                   category_orders = {"Category": list(data["Category"].value_counts().index)},
                   title="Number of CpGs by category", width=1400, height=700)

# Make it pretty 
axes_style = dict(mirror=True, ticks='outside', showline=True, linecolor='black', gridcolor='lightgrey')
fig.update_yaxes(title_text='<b>Count</b>', **axes_style)
fig.update_xaxes(title_text='<b>Number of CpGs</b>', **axes_style)
                 # tickmode = 'array',
                 # tickvals = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000],
                 # ticktext = ["1", "1k", "2k", "3k", "4k", "5k", "6k", "7k", "8k", "9k", ">10k"])

# Add range slider (the slider itself is given the fixed range 0-1000)
fig.update_layout(
    plot_bgcolor='whitesmoke',
    xaxis=dict(
        # autorange=False,
        # range = [0, 1000], 
        rangeslider=dict(visible=True, 
                         autorange=False,
                         range=[0,1000]),
        type="linear"
    )
)

In [None]:
# Clean up: TMP
# Modifies `data` in place: non-numeric CpG counts become NaN, then two helper columns are added.
data["Number of CpGs"] = pd.to_numeric(data["Number of CpGs"], errors='coerce')
# Values above 10,000 are capped at 10,000 (NaN stays NaN)
# NOTE(review): "Number of CpGs crop" is not used in the visible cells.
data["Number of CpGs crop"] = [10_000 if i > 10_000 else i for i in data["Number of CpGs"]]
# Four size classes; values outside 0-160,000 end up without a bin
data["Number of CpGs bins"] = pd.cut(data["Number of CpGs"],
                                     bins=[0, 1_000, 5_000, 10_000, 160_000], include_lowest=True,
                                     labels=['< 1k', '1 to 5k', '5 to 10k', "> 10k"])

# Plot: same histogram as before, with one animation frame per size class
fig = px.histogram(data, x="Number of CpGs", color="Category", # log_y=True, # barnorm="percent",
                   animation_frame="Number of CpGs bins", # animation_group="Category",
                   color_discrete_map = category_color_map, nbins=150,
                   category_orders = {"Category": list(data["Category"].value_counts().index)},
                   title="Number of CpGs by category", width=1400, height=700)

# Make it pretty 
axes_style = dict(mirror=True, ticks='outside', showline=True, linecolor='black', gridcolor='lightgrey')
fig.update_yaxes(title_text='<b>Count</b>', **axes_style)
fig.update_xaxes(title_text='<b>Number of CpGs</b>', **axes_style)
                 # tickmode = 'array',
                 # tickvals = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000],
                 # ticktext = ["1", "1k", "2k", "3k", "4k", "5k", "6k", "7k", "8k", "9k", ">10k"])
fig.update_layout(plot_bgcolor='whitesmoke')

fig

In [None]:
# Animated overview: CpG count per category, one facet per array and one
# animation frame per year; marker size = sample size (missing -> 0)
fig = px.scatter(
    data,
    x="Category",
    y="Number of CpGs",
    animation_frame="Year",
    animation_group="Title",
    color="Tissue",
    color_discrete_map=tissue_color_map,
    size=data["Sample size"].fillna(0),
    hover_name="Title",
    facet_col="Array",
    width=1400,
    height=650,
    title="Bit of everything",
)
# Unused options kept for reference:
# log_x=True, size_max=45, range_x=[100,100000], range_y=[25,90]
fig

In [None]:
# Effect size against number of CpGs; marker size = sample size (missing -> 0)
fig = px.scatter(
    data,
    x="Number of CpGs",
    y="Effect size phenotype",
    size=data["Sample size"].fillna(0),
    color="Tissue",
    hover_name="Title",
)
# Unused options kept for reference:
# log_x=True, size_max=45, range_x=[100,100000], range_y=[25,90]
fig

In [None]:
# Polar bar chart of the number of rows per (category, tissue) combination
fig = px.bar_polar(
    data[["Category", "Tissue"]].value_counts().to_frame().reset_index(),
    r="count",
    theta="Category",
    color="Tissue",
    width=900,
    height=900,
)
# Unused options kept for reference:
# template="plotly_dark", line_close=True, color_discrete_sequence=px.colors.sequential.Plasma_r
fig

# Abstract analysis 

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# text = bib.Abstract

# Concatenate all abstracts into one corpus (missing abstracts contribute nothing)
text = " ".join(abstract for abstract in bib.Abstract.fillna(""))
# Report the number of whitespace-separated words; len(text) would count characters
print ("There are {} words in the combination of all review.".format(len(text.split())))


In [None]:
# Stopwords: the wordcloud defaults plus field-specific and boilerplate terms
stopwords = set(STOPWORDS) | {
    "CpG", "CpGs", "DNA", "DNAm", "methylation", "episignature", "episignatures", "epigenetic",
    "gene", "genes", "genetic", "variant", "variants",
    "score", "scores", "result", "results", "analysis", "study", "studies", "cohort", "sample",
    "n", "p", "CI", "data", "significant",
    "background", "conclusion", "copyright", "model", "finding", "method", "methods",
    "associated", "association", "associations", "effect", "effects", "related",
    "identify", "identified", "provide", "assessed", "using", "used", "based",
    "specific", "distinct", "one", "two", "three", "may", "including", "within",
    "patient", "patients", "individual", "individuals", "subject", "case", "cases",
}

# Build the cloud from the combined abstracts
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)

# Show it on a frameless square canvas
fig, ax = plt.subplots(figsize=(5, 5), facecolor='lightskyblue', layout='constrained')
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off");

In [None]:
def wordcould_by_category(category, extra_stopwords=()):
    """Draw a word cloud of the abstracts belonging to one category.

    Selects the rows of `bib` whose `Identifier` occurs in `data` for the given
    `Category`, joins their abstracts (missing ones are skipped) and plots the
    resulting word cloud on a figure coloured with `category_color_map[category]`.

    Parameters
    ----------
    category : str
        Value of `data.Category` to select.
    extra_stopwords : iterable of str, optional
        Additional words to exclude, on top of the built-in list.
    """
    category_ids = [int(i) for i in pd.unique(data.loc[data.Category==category, "Identifier"])]
    subcat = bib.loc[bib.Identifier.isin(category_ids)]

    text = " ".join(abstract for abstract in subcat.Abstract.fillna(""))

    # Create stopword list:
    stopwords = set(STOPWORDS)
    stopwords.update(["CpG","CpGs","DNA","DNAm","methylation","episignature","episignatures","epigenetic","gene","genes","genetic","variant","variants",
                      "score","scores","result","results","analysis","study","studies","cohort","sample","n","p","CI","data","significant",
                      "background","conclusion","copyright","model","finding","method","methods",
                      "associated","association","associations","effect","effects","related","identify","identified","provide","assessed","using","used","based",
                      "specific","distinct","one","two","three", "may","including","within",
                      "patient","patients","individual","individuals","subject", "case","cases"])
    # Separate update so that any iterable (list, tuple, set) is accepted
    stopwords.update(extra_stopwords)

    # Generate a word cloud image
    wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)

    # split() without an argument ignores the empty tokens produced by missing
    # abstracts and repeated spaces, so the reported count is the real word count
    n_words = len(text.split())

    # Display the generated image
    fig, ax = plt.subplots(figsize=(10, 5), facecolor=category_color_map[category], layout='constrained')
    # the matplotlib way:
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.set_title(category, fontsize=18, fontweight="bold")
    ax.text(1,1.1, "Based on {} words from {} abstracts.".format(n_words, subcat.shape[0]), fontsize=12,
            ha="right", transform=ax.transAxes)
    ax.axis("off");

# One word cloud per category, in order of first appearance in `data`
for c in data.Category.unique():
    wordcould_by_category(c)

# LLM playground

In [None]:
from llama_cpp import Llama
# model source: https://huggingface.co/TheBloke/orca_mini_v3_7B-GGUF 
# Bits: 4	Size: 4.08 GB	Max RAM required: 6.58 GB
# The model file is read from the parent directory; n_ctx=2048 is the context size passed to the loader.
LLM = Llama(model_path="../orca_mini_v3_7b.Q4_K_M.gguf", n_ctx=2048) # medium, balanced quality - recommended # max_new_tokens = 4096, max_tokens = 2000,

In [None]:
# create a text prompt around entry 10 of `bib.Abstract`
# NOTE(review): the question wording ("what statistical or machine-learning and what method was used") looks garbled — confirm the intended phrasing.
prompt = f'Based on the following abstract, what was the sample size of the study and what statistical or machine-learning and what method was used: \n\n"{bib.Abstract[10]}"'

# generate a response (takes several seconds)
# NOTE(review): max_tokens=2000 plus the prompt may not fit in the n_ctx=2048 context the model was loaded with — confirm.
output = LLM(prompt, max_tokens=2000)

In [None]:
# display the prompt followed by the text of the first returned choice
print(prompt, "\n\n----", output["choices"][0]["text"])