RQ2 - What are the current Docker image tagging practices?
==============

# Dependencies and Configurations

## Import Dependencies

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import itertools
from datetime import datetime
import time
import matplotlib.pyplot as plt
import matplotlib.dates as mdate
import matplotlib.ticker as ticker
import psycopg2
import os, gc
import re

In [2]:
from pandarallel import pandarallel

## Database Configurations

In [3]:
# Connection settings are read from the environment; each one falls back to
# the literal default given here when the variable is not set.
POSTGRESQL_USER = os.getenv('POSTGRESQL_USER', 'dockerstudy')
POSTGRESQL_PASSWORD = os.getenv('POSTGRESQL_PASSWORD', 'dockerstudy')
POSTGRESQL_HOST_IP = os.getenv('POSTGRESQL_HOST_IP', 'localhost')
POSTGRESQL_PORT = os.getenv('POSTGRESQL_PORT', '5432')
POSTGRESQL_DATABASE = os.getenv('POSTGRESQL_DATABASE', 'dockerstudy')

## Load the relevant tables

In [4]:
# Read the three tables used by this notebook into DataFrames.
# The queries run inside try/finally so the connection is closed even when
# one of the reads raises; previously a failed query skipped conn.close().
conn = psycopg2.connect(database=POSTGRESQL_DATABASE, user=POSTGRESQL_USER, password=POSTGRESQL_PASSWORD, host=POSTGRESQL_HOST_IP, port=POSTGRESQL_PORT)
try:
    Tags = pd.read_sql_query('select * from public.tag', con = conn)
    Images = pd.read_sql_query('select * from public.image', con = conn)
    Repositories = pd.read_sql_query('select * from public.repository', con = conn)
finally:
    conn.close()

In [6]:
# Row count of the tag table and the number of distinct image names in it.
tag_total = len(Tags)
tagged_image_total = len(Tags['image_name'].unique())
print('Number of tags: ', tag_total)
print('Number of Docker images with tags:', tagged_image_total)

Number of tags:  30896696
Number of Docker images with tags: 2988676


## Results

In [6]:
# Number of tag rows per image name.
image_tags_count = Tags.groupby(by='image_name')['id'].count()

# Names of the images that own exactly one tag.
owns_single_tag = image_tags_count == 1
images_with_one_tag = pd.Series(image_tags_count[owns_single_tag].index)

# Names of the images that own a tag called `latest`.
latest_rows = Tags[['image_name', 'tag_name']].query("tag_name=='latest'")
images_with_latest_tag = pd.Series(latest_rows['image_name'].unique())

# An image in both sets has `latest` as its one and only tag.
only_latest_count = images_with_one_tag[images_with_one_tag.isin(images_with_latest_tag)].shape[0]
distinct_image_count = len(Tags['image_name'].unique())
print('Proportion of Docker images that have only one default latest tag: ', only_latest_count / distinct_image_count)

Proportion of Docker images that have only one default latest tag:  0.504849973700729


In [7]:
# Names of images that own at least one tag whose name is not `latest`.
non_latest_rows = Tags[['image_name', 'tag_name']].query("tag_name!='latest'")
name_of_images_with_selfdefined_tag = pd.Series(non_latest_rows['image_name'].unique())
print('Number of Docker images that have self-defined tags:', len(name_of_images_with_selfdefined_tag))

Number of Docker images that have self-defined tags: 1479833


In [8]:
# Select the two needed columns before copying, so the intermediate frame
# does not duplicate every column of Tags.
images_with_selfdefined_tags = Tags.loc[Tags['image_name'].isin(name_of_images_with_selfdefined_tag), ['image_name', 'tag_name']].copy()

# Per image: does any of its tags equal `latest`?  Computed with a vectorized
# groupby/any instead of calling a Python lambda once per image.
# NOTE: despite the variable's name, a value of True means the image HAS a
# `latest` tag; the images without one are the False entries counted below.
name_of_images_with_selfdefined_tags_without_latest_tag = (
    images_with_selfdefined_tags['tag_name']
    .eq('latest')
    .groupby(images_with_selfdefined_tags['image_name'])
    .any()
)
images_lacking_latest = int((~name_of_images_with_selfdefined_tags_without_latest_tag).sum())
print('Proportion of Docker images with self-defined tags that do not have a latest tag: ', images_lacking_latest / name_of_images_with_selfdefined_tags_without_latest_tag.shape[0])

Proportion of Docker images with self-defined tags that do not have a latest tag:  0.7049545455466935


In [9]:
# Names of images that own two or more tags.
names_with_two_plus_tags = pd.Series(image_tags_count[image_tags_count>=2].index)

# Of the images with a self-defined tag, keep those that own two or more tags.
owns_two_plus = name_of_images_with_selfdefined_tag.isin(names_with_two_plus_tags)
multi_tag_image_names = name_of_images_with_selfdefined_tag[owns_two_plus]

# All tag rows of those images, reduced to the columns used further down.
multi_tag_rows = Tags[Tags['image_name'].isin(multi_tag_image_names)].copy()
images_with_multiple_selfdefined_tags = multi_tag_rows[['image_name', 'tag_name', 'last_updated']]
print('Number of Docker images that have multiple tags', len(images_with_multiple_selfdefined_tags['image_name'].unique()))

Number of Docker images that have multiple tags 757349


In [74]:
def get_latest_tag_lag_days(x):
    """Days between the first row's update time and the `latest` tag's.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of one image with columns 'tag_name' and 'last_updated'.
        The first row is used as the reference, so the caller is expected
        to pass the rows already ordered newest-first.

    Returns
    -------
    int or None
        ``(first_row.last_updated - latest_row.last_updated).days``, using
        the first row whose tag_name is 'latest'.  None when there is no
        such row or when the subtraction fails (e.g. non-datetime values).

    The input frame is not modified; positions are resolved with ``iloc``
    instead of overwriting ``x.index``.
    """
    is_latest = (x['tag_name'] == 'latest').values
    if not is_latest.any():
        return None
    # argmax on a boolean array gives the position of the first True.
    latest_position = int(is_latest.argmax())
    last_updated = x['last_updated']
    try:
        return (last_updated.iloc[0] - last_updated.iloc[latest_position]).days
    except Exception:
        # Best effort: an unusable timestamp pair yields "unknown", but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return None
    
def get_latest_tag_lag_versions(x):
    """Zero-based row position of the first `latest` tag within a group.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of one image with a 'tag_name' column.  The position is
        counted from the first row, so the result depends on the row
        order the caller supplies.

    Returns
    -------
    int or None
        Position of the first row whose tag_name is 'latest', or None
        when no row carries that tag.

    The input frame is not modified; the position is computed from a
    boolean array instead of overwriting ``x.index``.
    """
    is_latest = (x['tag_name'] == 'latest').values
    if not is_latest.any():
        return None
    # argmax on a boolean array gives the position of the first True.
    return int(is_latest.argmax())

In [75]:
# Work on a copy: drop rows with a missing value in any column, then order
# the rows from most to least recently updated.
sorted_images_with_multiple_selfdefined_tags = (
    images_with_multiple_selfdefined_tags
    .copy()
    .dropna()
    .sort_values(by='last_updated', ascending=False)
)

# Per image: lag of the `latest` tag measured in days.
pandarallel.initialize()
latest_tag_lag_days = (
    sorted_images_with_multiple_selfdefined_tags
    .copy()
    .groupby(by='image_name')
    .parallel_apply(get_latest_tag_lag_days)
)

# Per image: lag of the `latest` tag measured in row positions.
pandarallel.initialize()
latest_tag_lag_versions = (
    sorted_images_with_multiple_selfdefined_tags
    .copy()
    .groupby(by='image_name')
    .parallel_apply(get_latest_tag_lag_versions)
)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [114]:
# Images whose `latest` position is greater than 1, as a share of all
# entries in latest_tag_lag_versions (entries without a value count in the
# denominator only).
lagging_image_count = int((latest_tag_lag_versions>1).sum())
print('Proportion of Docker images that have multiple self-defined tags, but the tag does not actually point to the lastest version of the image: ', lagging_image_count/len(latest_tag_lag_versions))

Proportion of Docker images that have multiple self-defined tags, but the tag does not actually point to the lastest version of the image:  0.11387659783677483


In [56]:
# Images whose `latest` position is 5 or more, as a share of all entries in
# latest_tag_lag_versions.
five_behind_count = int((latest_tag_lag_versions>=5).sum())
print('Proportion of Docker images that there are at least 5 released versions beyond the version the latest tag points to: ', five_behind_count/len(latest_tag_lag_versions))

Proportion of Docker images that there are at least 5 released versions beyond the version the latest tag points to:  0.039400837929118034


In [116]:
# Images whose `latest` tag lags by 90 days or more, as a share of all
# entries in latest_tag_lag_days.
ninety_day_lag_count = int((latest_tag_lag_days>=90).sum())
print('Proportion of Docker images that have a latest tag pointing to an image that was updated more than 3 months before the current most recent version of the image: ', ninety_day_lag_count/len(latest_tag_lag_days))

Proportion of Docker images that have a latest tag pointing to an image that was updated more than 3 months before the current most recent version of the image:  0.04411418921807533


In [8]:
pandarallel.initialize()
# Number of branches per repository; None when `branches` is not a list.
Repositories['branch_count']=Repositories['branches'].parallel_apply(lambda x: len(x) if isinstance(x, list) else None)

# repo_id of every repository that has tags, OR has releases, OR has more
# than one branch (the same id can appear several times; only membership is
# used below).  pd.concat replaces the chained Series.append calls, which
# no longer exist in pandas >= 2.0; ignore_index gives the 0..n-1 index.
id_of_repo_with_releases_and_tags = pd.concat(
    [
        Repositories.loc[Repositories['tags'].notnull(), 'repo_id'],
        Repositories.loc[Repositories['releases'].notnull(), 'repo_id'],
        Repositories.loc[Repositories['branch_count']>1, 'repo_id'],
    ],
    ignore_index=True,
)

# Images whose source repository is in that id set, reduced to the columns
# used later and re-indexed 0..n-1.
images_whose_repo_has_tags_or_releases = (
    Images.loc[
        Images['source_repo_id'].isin(id_of_repo_with_releases_and_tags),
        ['image_name', 'created_at', 'updated_at', 'source_repo_id'],
    ]
    .reset_index(drop=True)
)

INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [9]:
# Repository rows whose repo_id is in the selected id set.
repo_is_targeted = Repositories['repo_id'].isin(id_of_repo_with_releases_and_tags)
targeted_Repositories = Repositories[repo_is_targeted]

# Tag rows (name columns only) that belong to the selected images.
tag_is_targeted = Tags['image_name'].isin(images_whose_repo_has_tags_or_releases['image_name'])
targeted_Tags = Tags.loc[tag_is_targeted, ['image_name', 'tag_name']]

In [12]:
pandarallel.initialize()
# Attach each image's repository `tags` value.
# A repo_id -> tags dict is built once and looked up per image, instead of
# running a full DataFrame.query scan of targeted_Repositories for every
# row.  keep='first' mirrors the previous `.values[0]` choice when a repo_id
# occurs more than once.  No pandarallel call is left in this cell, so
# pandarallel.initialize() is not needed here.
_repos_first = targeted_Repositories.drop_duplicates(subset='repo_id', keep='first')
_repo_tags_by_id = dict(zip(_repos_first['repo_id'], _repos_first['tags']))
images_whose_repo_has_tags_or_releases['repo_tags'] = pd.Series(
    [_repo_tags_by_id[repo_id] for repo_id in images_whose_repo_has_tags_or_releases['source_repo_id']],
    index=images_whose_repo_has_tags_or_releases.index,
    dtype=object,  # keep lists / None exactly as stored
)

INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [13]:
pandarallel.initialize()
# Attach each image's repository `branches` value.
# A repo_id -> branches dict is built once and looked up per image, instead
# of running a full DataFrame.query scan of targeted_Repositories for every
# row.  keep='first' mirrors the previous `.values[0]` choice when a repo_id
# occurs more than once.  No pandarallel call is left in this cell, so
# pandarallel.initialize() is not needed here.
_repos_first = targeted_Repositories.drop_duplicates(subset='repo_id', keep='first')
_repo_branches_by_id = dict(zip(_repos_first['repo_id'], _repos_first['branches']))
images_whose_repo_has_tags_or_releases['repo_branches'] = pd.Series(
    [_repo_branches_by_id[repo_id] for repo_id in images_whose_repo_has_tags_or_releases['source_repo_id']],
    index=images_whose_repo_has_tags_or_releases.index,
    dtype=object,  # keep lists / None exactly as stored
)

INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [14]:
pandarallel.initialize()
# Attach to each image the list of its tag names.
# The tag names are grouped once per image and then looked up by name,
# instead of running a DataFrame.query scan of targeted_Tags for every image
# (which also interpolated the image name into the query string).
# Images without any tag row get an empty list, as before.  No pandarallel
# call is left in this cell, so pandarallel.initialize() is not needed here.
_tag_names_by_image = targeted_Tags.groupby('image_name')['tag_name'].agg(list).to_dict()
images_whose_repo_has_tags_or_releases['image_tags'] = pd.Series(
    # list(...) gives every row its own list object.
    [list(_tag_names_by_image.get(image_name, [])) for image_name in images_whose_repo_has_tags_or_releases['image_name']],
    index=images_whose_repo_has_tags_or_releases.index,
    dtype=object,
)

INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [52]:
def check_tag_release_naming(x):
    """Tell whether any image tag equals a tag name or branch name of the repo.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide 'repo_tags', 'repo_branches' and 'image_tags'.
        'repo_tags' and 'repo_branches' are read as sequences of dicts with
        a 'name' key; a value that is not a list or tuple (None, NaN, ...)
        is treated as "no names" instead of raising.
        'image_tags' is read as an iterable of tag names; None means empty.

    Returns
    -------
    bool or None
        True when at least one image tag matches a repo tag or branch name,
        False when none does, None when the values cannot be put into sets
        (e.g. 'image_tags' is not iterable).
    """
    def _names(entries):
        # Missing JSON can arrive as None or as a float NaN; both mean "none".
        if not isinstance(entries, (list, tuple)):
            return []
        return [entry['name'] for entry in entries]

    repo_tags = _names(x['repo_tags'])
    repo_branches = _names(x['repo_branches'])
    image_tags = x['image_tags'] if x['image_tags'] is not None else []
    try:
        # (tags & image) | (branches & image) == (tags | branches) & image
        return bool((set(repo_tags) | set(repo_branches)) & set(image_tags))
    except Exception:
        # Best effort for unusable values, without swallowing
        # KeyboardInterrupt/SystemExit as the bare except did.
        return None

def check_sha_naming(x):
    """Heuristic test for a SHA-like tag name.

    Parameters
    ----------
    x : object
        A tag name; normally a string.

    Returns
    -------
    bool
        True when ``x`` has 40 or more characters and ``x.isalnum()`` is
        true.  False otherwise, including when ``x`` has no length or no
        ``isalnum`` method (e.g. None).

    Note that the check accepts any alphanumeric string of sufficient
    length; it does not restrict the characters to hexadecimal digits.
    """
    try:
        return bool(len(x) >= 40 and x.isalnum())
    except Exception:
        # Non-string input counts as "not a SHA"; unlike the former bare
        # except, KeyboardInterrupt/SystemExit still propagate.
        return False

In [22]:
pandarallel.initialize()
# Row-wise flag: does any image tag match a repo tag or branch name?
images_whose_repo_has_tags_or_releases['tag_release_naming'] = images_whose_repo_has_tags_or_releases.parallel_apply(check_tag_release_naming, axis=1)

# Derive the creation / update year columns, first on the filtered frame and
# then on the full Images table.
for frame in (images_whose_repo_has_tags_or_releases, Images):
    for timestamp_col, year_col in (('created_at', 'create_year'), ('updated_at', 'update_year')):
        frame[year_col] = frame[timestamp_col].parallel_apply(lambda x:x.year if x is not None else None)

INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [73]:
# Count the rows whose tag_release_naming flag is True.
follows_release_naming = images_whose_repo_has_tags_or_releases['tag_release_naming']==True
print('Number of Docker images that follow the release tagging practices:', len(images_whose_repo_has_tags_or_releases[follows_release_naming]))

Number of Docker images that follow the release pipeline versioning practices: 65509


In [54]:
pandarallel.initialize()
# One boolean per tag row: does the tag name pass check_sha_naming?
all_tag_names = Tags['tag_name']
SHA_naming = all_tag_names.parallel_apply(check_sha_naming)

INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [74]:
# Distinct images owning at least one tag flagged by SHA_naming.
sha_tagged_image_names = Tags.loc[SHA_naming, 'image_name'].unique()
print('Number of Docker images that follow the commit/digest SHA pinning practices:', len(sha_tagged_image_names))

Number of Docker images that follow the commit/digest SHA pinning practices: 35136
