In [9]:
import pandas as pd
import numpy as np
import altair as alt
import eco_style
alt.themes.enable('dark')

ThemeRegistry.enable('dark')

In [12]:
dfs = pd.read_html("https://www.educatingsilicon.com/2024/05/09/how-much-llm-training-data-is-there-in-the-limit/")

In [13]:
df = dfs[0]
df.columns = df.iloc[0]
df = df[1:]
df = df.iloc[[1,2], :4]
df.columns = ['name', 'words', 'tokens', 'rel_size']
df['is_llm'] = True

src_df = dfs[1]
src_df.columns = src_df.iloc[0]
src_df = src_df[1:]
src_df.columns = ['name'] + list(src_df.columns[1:])

src_df = src_df.iloc[:, :4]
src_df.columns = ['name', 'words', 'tokens', 'rel_size']
src_df['is_llm'] = False

names_to_keep = ['Twitter / X', 'Every unique book', 'All podcasts', 'Academic articles']
src_df = src_df[src_df['name'].isin(names_to_keep)]

df = df.reset_index(drop=True)
src_df = src_df.reset_index(drop=True)

df = pd.concat([df, src_df])
df

def parse_num(x):
    if "billion" in x:
        return float(x.split()[0]) /1000
    if "trillion" in x:
        return float(x.split()[0])
    
df['Words'] = df['words'].apply(parse_num)

df

Unnamed: 0,name,words,tokens,rel_size,is_llm,Words
0,Llama 3,11 trillion,15T,1.0,True,11.0
1,GTP-4,5 trillion,6.5T,0.5,True,5.0
0,Academic articles,800 billion,1T,0.07,False,0.8
1,Every unique book,16 trillion,21T,1.4,False,16.0
2,Twitter / X,8 trillion,11T,0.7,False,8.0
3,All podcasts,560 billion,0.75T,0.05,False,0.56


In [14]:
df.name

0              Llama 3
1                GTP-4
0    Academic articles
1    Every unique book
2          Twitter / X
3         All podcasts
Name: name, dtype: object

In [32]:
alt.themes.enable("dark")

chart = alt.Chart(df).mark_bar(color="rgb(54,184,180)").encode(
    y=alt.Y('name:N', title=None, type='nominal', axis=alt.Axis(labelExpr='datum.value + ""'),  sort='-x'),
    x=alt.X('Words:Q', title='', axis=alt.Axis(labelExpr='datum.label + "T"')),
    opacity = alt.Opacity('is_llm:N', scale=alt.Scale(domain=[True, False], range=[1, 0.5]), legend=None),    
).properties(
    background='rgba(0,0,0,0)',
    height=200,
    width=1000
)

chart.save("llm_comp.png", scale_factor=3)
                                               


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
WARN Channel opacity should not be used with an unsorted discrete field.


# BIG Data

In [None]:
d = [
    {
        "size": 1,
        "label": "1MB",
        "desc": "An XLSX of GDP subcomponents",
    },
    {
        "size": 15,
        "label": "15MB",
        "desc": "Prices recorded by the ONS in one month"
    },
    {
        "size": 260000,
        "label": "260GB",
        "desc": "All price metadata (Davies et al)"
    },
    {
        "size": 
    }
]

# Storage

In [43]:
df = pd.read_csv("storage.csv")
df = df.drop(columns=['Entity', 'Code'])
df.columns = [c.replace("Historical price of", "") for c in df.columns]
df.columns = [c.title() for c in df.columns]
df = df.melt(id_vars=['Year'], var_name='series', value_name='value')
df = df.dropna()

df['label'] = np.where(df.Year == df.groupby('series')['Year'].transform('max'), df['series'], '')

df['Year'] = pd.to_datetime(df['Year'], format='%Y')

base = alt.Chart(df).encode(
    x=alt.X('Year:T', title=''),
    y=alt.Y('value:Q', title='Price per GB', scale=alt.Scale(type='log'), axis=alt.Axis(labelExpr="format(datum.value, ',.0f') + ' $/GB'")),
    color=alt.Color('series:N', title='', legend=None),
)

lines = base.mark_line()

labels = base.mark_text(align='left', dx=5).encode(
    text='label'
)

chart = lines + labels
chart = chart.properties(
    width=800,
    height=400,
    background='rgba(0,0,0,0)'
)

chart.save("storage.png", scale_factor=3)  
chart.save("storage.json")
chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


# Internet

In [57]:
df = pd.read_csv("number-of-internet-users.csv")
df = df.drop(columns=['Code'])



df = df.melt(id_vars=['Year', 'Entity'], var_name='series', value_name='value')
df = df.dropna()

df.columns = ['date', 'region', 'series', 'value']

df['date'] = pd.to_datetime(df['date'], format='%Y')

df['label'] = np.where(df.date == df.groupby('region')['date'].transform('max'), df['region'], '')

base = alt.Chart(df).encode(
    x=alt.X('date:T', title=''),
    y=alt.Y('value:Q', title='Number of users', axis=alt.Axis(labelExpr="format(datum.value, ',.0f')")),
    color=alt.Color('region:N', title='', legend=None),
    strokeDash=alt.condition(alt.datum.region == 'World', alt.value([1, 0]), alt.value([4, 2])),
)

lines = base.mark_line()

labels = base.mark_text(align='left', dx=5).encode(
    text='label'
)

chart = lines + labels

chart = chart.properties(
    width=800,
    height=400,
    background='rgba(0,0,0,0)'
)

chart.save("internet_users.png", scale_factor=3)
chart.save("internet_users.json")

chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
WARN strokeDash dropped as it is incompatible with "text".
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Unnamed: 0,Year,Entity,series,value
0,1990,Africa,Number of Internet users,0
1,1991,Africa,Number of Internet users,5434
2,1992,Africa,Number of Internet users,16232
3,1993,Africa,Number of Internet users,48970
4,1994,Africa,Number of Internet users,112387
...,...,...,...,...
209,2016,World,Number of Internet users,3256215846
210,2017,World,Number of Internet users,3483746104
211,2018,World,Number of Internet users,3795522333
212,2019,World,Number of Internet users,4194081858


In [47]:
df

Unnamed: 0,name,words,tokens,rel_size,is_llm,Words
0,Llama 3,11 trillion,15T,1.0,True,11.0
1,GTP-4,5 trillion,6.5T,0.5,True,5.0
0,Academic articles,800 billion,1T,0.07,False,0.8
1,Every unique book,16 trillion,21T,1.4,False,16.0
2,Twitter / X,8 trillion,11T,0.7,False,8.0
3,All podcasts,560 billion,0.75T,0.05,False,0.56


In [12]:
df

Unnamed: 0,NaN,Training Set (Words),Training Set (Tokens),Relative size (Llama 3 = 1),NaN.1,NaN.2,NaN.3
2,Llama 3,11 trillion,15T,1.0,,,
3,GTP-4,5 trillion,6.5T,0.5,,,
