The existence and mechanisms of chess addiction are a long-standing question: forum posts on Lichess mention it [as early as 2014](https://lichess.org/forum/general-chess-discussion/online-chess-addiction), and the topic even predates the online chess era, as this [1988 Chicago Reader article demonstrates](https://chicagoreader.com/news-politics/addicted-to-chess/).


Chess addiction has always been a recurring subject online, and while its formal existence should be left to rigorous studies, it has led me to wonder more generally about how players spend their time playing on Lichess, which we will delve into.

The data was aggregated from the January 2023 Lichess rated games (freely available on the [Lichess database](https://database.lichess.org)). Games shorter than 4 plies were filtered out. For each account (which from now on I will consider equivalent to one user), I kept their username, as well as, for each time control:
*   Number of games
*   Average rating
*   Approximate time spent (in seconds), which is based on the exact time control only, and [used by Lichess](https://lichess.org/faq#time-controls) to differentiate between blitz/bullet/etc: **(clock initial time in seconds) + 40 × (clock increment)**.
*   Real time spent (in seconds), based on clocks. Does not take into account the use of the `+15s` button, which was considered negligible.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Global matplotlib styling, applied to every figure created afterwards
plt.style.use("dark_background")
for rc_key, rc_value in {
    "axes.facecolor": "#444445",
    "figure.facecolor": "#333",
    "grid.color": "#fff",
    "grid.linestyle": "-.",
    "grid.linewidth": 0.2,
}.items():
    plt.rcParams[rc_key] = rc_value

# Base figure size; larger figures scale these two values
DFT_WIDTH = 6.4 # inches
DFT_HEIGHT = 4.8 # inches

# Time controls, in the order used by every per-perf loop and plot
perfs = ["ultrabullet", "bullet", "blitz", "rapid", "classical"]
# Generic palette, paired positionally with `perfs` by some of the plots
colors = [
    "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
    "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf",
]
# Fixed color per time control, declared in the same order as `perfs`
COLOR_MAP = {
    "ultrabullet": "#fdb462",
    "bullet": "#81b1d2",
    "blitz": "#fa8174",
    "rapid": "#b3de69",
    "classical": "#bfbbd9",
}

In [None]:
# Column dtypes handed to read_csv: usernames as strings, and pandas'
# "Int64" dtype for every per-perf time and rating column.
dtypes = {"username": "string"}

for perf in perfs:
    for suffix in ("approximate_time", "real_time", "avg_rating"):
        dtypes[f"{perf}_{suffix}"] = "Int64"

def restrict_to_perf(perf: str, with_error=True):
    """Return the column names describing a single time control.

    The list always starts with "username", followed by the `perf`-prefixed
    games, average rating, approximate time and real time columns. When
    `with_error` is true, the `perf`-prefixed relative error column is
    appended last.
    """
    suffixes = ["games", "avg_rating", "approximate_time", "real_time"]
    if with_error:
        suffixes.append("relative_error")
    return ["username"] + [f"{perf}_{suffix}" for suffix in suffixes]

# NOTE(review): the commented-out path below looks like an alternative export
# of the same data — confirm before deleting it.
#df = pd.read_csv("time-spent.csv",dtype=dtypes)
# Load the per-user aggregates, forcing the dtypes declared in `dtypes`
df = pd.read_csv("time-spent-2023-01.csv",dtype=dtypes)
# Displayed as the cell output to check the dtypes that were actually applied
df.dtypes

In [None]:
# Work on a copy: `df` keeps the raw integer seconds used by the numeric
# plots, while `df_time` holds human-readable durations.
df_time = df.copy()


def convert(c):
    """Replace column `c` of `df_time`, read as seconds, by timedeltas.

    The column is converted in place on the module-level `df_time` frame.
    """
    # One vectorized call instead of a Python-level lambda per row, and the
    # lowercase "s" unit: the uppercase "S" alias is deprecated in recent
    # pandas releases.
    df_time[c] = pd.to_timedelta(df_time[c], unit="s")


for perf in perfs:
    convert(f"{perf}_approximate_time")
    convert(f"{perf}_real_time")
df_time

Here is an extract from the data

In [None]:
# Markdown preview of the bullet columns (without the relative error) for the
# first two players having more than 2 bullet games
print(df_time[df_time["bullet_games"] > 2].iloc[:2][restrict_to_perf("bullet",False)].to_markdown())

|    | username            |   bullet_games |   bullet_avg_rating | bullet_approximate_time   | bullet_real_time   |
|---:|:--------------------|---------------:|--------------------:|:--------------------------|:-------------------|
|  0 | ange_de_la_mode     |             85 |                1430 | 0 days 01:32:20           | 0 days 02:26:59    |
|  5 | Ankit_khandelwal_07 |            214 |                1316 | 0 days 03:39:00           | 0 days 05:33:30    |

The total data has ~~2,073,933~~ 2,073,569 (removing bots) distinct users!

In [None]:
# Summary statistics of the frame whose time columns were converted to timedeltas
df_time.describe()

## A quick glance

A simple first approach is to check how the time played is distributed across the different time controls.

In [None]:
def pie_per_column(df, column: str):
    """Sum one family of per-perf columns over all rows of `df`.

    `column` is a format template with a single `{}` placeholder
    (e.g. "{}_games") that is filled with each entry of the module-level
    `perfs` list. Returns the column sums as a list, in `perfs` order.
    """
    return [df[column.format(perf)].sum() for perf in perfs]

games = pie_per_column(df,"{}_games")
approx_time = pie_per_column(df,"{}_approximate_time")
real_time = pie_per_column(df,"{}_real_time")

# One pie per metric, side by side, drawn left to right in this order
pies = [
    (games, "Number of games per Time Control"),
    (real_time, "Real time played per Time Control"),
    (approx_time, "Approximate time played per Time Control"),
]

fig, axs = plt.subplots(ncols=3,figsize=(DFT_WIDTH * 2.5, DFT_HEIGHT))
for ax, (totals, title) in zip(axs, pies):
    # COLOR_MAP is declared in the same order as `perfs`, so its values line
    # up with the slice labels.
    ax.pie(totals, labels=perfs, colors=COLOR_MAP.values(), autopct="%d%%")
    ax.set_title(title)

As expected, the number of games alone is not a good indicator of how much total time is spent playing. The approximate time formula is also quite close to the truth.

In [None]:
# Bullet-only view of the data, heaviest bullet players (by real time) first
bullet_columns = restrict_to_perf("bullet", False)
df1 = df[bullet_columns].sort_values("bullet_real_time", ascending=False)
# Top 20 players by real time spent in bullet
df1.head(20)

Now that we have a first overview of the time spent by time control, it's interesting to consider how that time is distributed among players, starting with bullet.

In [None]:
# 2x2 grid of axes, twice the base figure size in each direction
fig, axs = plt.subplots(ncols=2,nrows=2,figsize=(DFT_WIDTH * 2, DFT_HEIGHT * 2))
# Flatten the 2D array of axes so the panels can be iterated in a single sequence
axs = axs.flatten()
def hist(df, ax, data, color,perf,log: bool):
    """Draw a 100-bin histogram of one column of `df` on `ax`.

    `data` is a dict providing the "column" to plot and the "title" of the
    axis. `perf` is used as the legend label, and `log` switches the count
    axis to a logarithmic scale.
    """
    # The values are cast to plain floats before being handed to matplotlib
    values = df[data["column"]].astype("float64")
    ax.hist(values, bins=100, color=color, log=log, label=perf)
    ax.set_title(data["title"])
    # An optional data["xlim"] is currently not applied:
    #ax.set_xlim(left=0,right=data.get("xlim"))

def perf_hist(df, axs, perf: str,include_avg=False,log=False):
    """Plot the distributions of one time control, one histogram per axis.

    The panels are, in order: number of games, average rating (only when
    `include_avg` is true), approximate time and real time. Panels are
    paired with `axs` through `zip`, so the shorter of the two sequences
    decides how many histograms are drawn.

    Parameters:
        df: frame holding the `{perf}_*` columns.
        axs: iterable of matplotlib axes, one per panel.
        perf: time control name; also selects the color in `COLOR_MAP`.
        include_avg: add the average rating panel.
        log: use a logarithmic count axis on every panel.
    """
    # "xlim" is carried along for `hist`, which reads it only in a
    # commented-out line for now.
    panels = [{
        "column": f"{perf}_games",
        "title": "number of games",
        "xlim": 2000,
    }]
    if include_avg:
        panels.append({
            "column": f"{perf}_avg_rating",
            "title": "average rating",
        })
    panels.extend([
        {
            "column": f"{perf}_approximate_time",
            "title": "approximate time",
            "xlim": 100_000,
        },
        {
            "column": f"{perf}_real_time",
            # Was mislabelled "approximate time", duplicating the previous panel
            "title": "real time",
            "xlim": 100_000,
        },
    ])
    for ax, panel in zip(axs, panels):
        hist(df, ax, panel, COLOR_MAP[perf], perf, log=log)

# Bullet panels, including the average rating one
perf_hist(df1, axs, "bullet", include_avg=True)
fig.suptitle("Bullet distribution by")

The distributions of the number of games and of the time spent seem to follow a heavy-tailed distribution, while the rating distribution follows the expected normal law. A big surprise compared to the [online distribution](https://lichess.org/stat/rating/distribution/bullet) is the importance of new players, represented by the big spike around 1500.

Let's see if the heavy-tailed distribution is shared by the other time controls, and use a **logarithmic** y-axis this time.

In [None]:
# One row of three panels per time control
n_rows = len(perfs)
fig, axs = plt.subplots(
    nrows=n_rows,
    ncols=3,
    figsize=(DFT_WIDTH * 2.5, DFT_HEIGHT * n_rows),
)

for row_axes, perf in zip(axs, perfs):
    perf_hist(df, row_axes, perf, log=True)
    # The legend on the first panel identifies the time control of the row
    row_axes[0].legend()
fig.suptitle("Distribution by Time Control of", y=0.92, size="x-large")

While this overview clearly demonstrates that they all follow the same law, the lack of a common scale makes it difficult to compare them.

In [None]:
# Histogram the real time of every time control into 10000 buckets,
# ignoring missing values.
distribs = [np.histogram(df[f"{perf}_real_time"].dropna(),bins=10000) for perf in perfs]

# np.histogram returns (counts, bucket edges) with one more edge than counts.
# Each bucket is represented by its center so that x and y have the same
# length, and each pair is stored as [x, y].
distrib_plot = []
for counts, edges in distribs:
    centers = (edges[:-1] + edges[1:]) / 2
    distrib_plot.append([centers, counts])

# All time controls share one log-log plot so they can be compared directly
for perf, (centers, counts) in zip(perfs, distrib_plot):
    plt.loglog(centers, counts, color=COLOR_MAP[perf], label=perf)
plt.legend()

In [None]:
# Check whether dropping the tail (keeping only values below the median)
# yields a more normal-looking distribution
column = "bullet_real_time"
below_median = df[column] < df[column].quantile(0.50)
df[below_median][column].hist(bins=100)
# It does not: the remaining values still look heavy-tailed

In [None]:
# Relative error of the approximate time with respect to the real time
# spent: |approximate - real| / real, per time control
df_avg_error = df.copy()
for perf in perfs:
    approximate = df[f"{perf}_approximate_time"]
    real = df[f"{perf}_real_time"]
    df_avg_error[f"{perf}_relative_error"] = (approximate - real).abs() / real

# Restrict the display to blitz so that all its columns are visible
df_avg_error[restrict_to_perf("blitz")]

In [None]:
# Names of the relative error columns, one per time control
# NOTE(review): not referenced by the other statements of this cell — confirm
# it is used elsewhere before removing it.
avg_errors = [f"{p}_relative_error" for p in perfs]
def filter_relative_error(df, perfs):
    """Return the relative errors of each time control without the top 1%.

    For every entry of `perfs`, the `{perf}_relative_error` column of `df`
    is kept only where it is strictly below its own 0.99 quantile. The
    result is a list of Series, in `perfs` order.
    """
    trimmed = []
    for perf in perfs:
        errors = df[f"{perf}_relative_error"]
        cutoff = errors.quantile(0.99)
        trimmed.append(errors[errors < cutoff])
    return trimmed

def better_boxplot(ax, df, perfs):
    """Draw one colored box per time control of `perfs` on `ax`.

    The boxes show the relative errors returned by `filter_relative_error`,
    are drawn without outlier markers, and are filled with the `COLOR_MAP`
    color of their time control.
    """
    bplot = ax.boxplot(
        filter_relative_error(df, perfs),
        patch_artist=True,
        sym='',
        labels=perfs,
    )
    for perf, box in zip(perfs, bplot['boxes']):
        box.set_facecolor(COLOR_MAP[perf])

    
# Two panels with a 4:1 width ratio: every time control but the last one on
# the left, the last one alone on the right, each with its own y-axis
fig, axs = plt.subplots(
    nrows=1,
    ncols=2,
    figsize=(DFT_WIDTH * 1.5, DFT_HEIGHT),
    gridspec_kw={"width_ratios": [4, 1]},
)
for ax, shown_perfs in zip(axs, (perfs[:-1], perfs[-1:])):
    better_boxplot(ax, df_avg_error, shown_perfs)

In [None]:
# Summary statistics of the relative error of each time control, with the
# upper percentiles spelled out
df_avg_error[[f"{p}_relative_error" for p in perfs]].describe(percentiles=[.5, .75,.90,.95,.99])

In [None]:
# Investigating why the max error is so high

max_avg_error_first = df_avg_error.sort_values("blitz_relative_error", ascending=False)
# Keep players with at least 10 blitz games and show the 20 largest errors
at_least_10_games = max_avg_error_first.query("blitz_games>=10")
at_least_10_games[restrict_to_perf("blitz")][:20]

In [None]:
# Relative error histograms restricted to players with more than x games
x = 30
for (perf, color) in zip(perfs, colors):
    enough_games = df_avg_error[f"{perf}_games"] > x
    more_x_games = df_avg_error[enough_games].copy()
    print(f"{len(more_x_games)} players with more than {x} {perf} games")
    more_x_games.hist(column=f"{perf}_relative_error", bins=100, color=color)

In [None]:
# Scatter plot per time control: average rating on x, real time spent on y.
# Each call to df.plot.scatter is made without an `ax`, and points use size 1.
for (perf, color) in zip(perfs, colors):
    df.plot.scatter(x=f"{perf}_avg_rating",y=f"{perf}_real_time",color=color,s=1)

In [None]:
# Weighted histogram: total real time played per average rating, one panel
# per time control stacked vertically
fig, axs = plt.subplots(nrows=len(perfs), figsize=(DFT_WIDTH, DFT_HEIGHT * len(perfs)))
for (perf, color, ax) in zip(perfs, colors, axs):
    rating_column = f"{perf}_avg_rating"
    time_column = f"{perf}_real_time"
    # Both columns are read as nullable "Int64": drop the rows where either
    # value is missing and hand plain floats to matplotlib, as `hist` does.
    played = df[[rating_column, time_column]].dropna().astype("float64")
    ax.hist(played[rating_column], bins=100, color=color, weights=played[time_column])
    ax.set_title(f"{perf} rating distribution, weighted by time played")