# Load the dataset

The `photos.tsv000` file contains metadata for each photo in the dataset. Particularly interesting are the stats about the number of views and downloads for each photo.

In [60]:
import glob
import os

import numpy as np
import pandas as pd

# Load the photo metadata table (tab-separated, first row holds the column names)
dataset_path = './unsplash-dataset/lite'
photos = pd.read_csv(
    os.path.join(dataset_path, "photos.tsv000"),
    sep='\t',
    header=0,
)

# Report how many photos were loaded
print('Photos count:', len(photos))

Photos count: 25000


# Compute the Downloads/Views ratio

The ratio between the downloads and the views is computed as a percentage for readability.

In [61]:
# Downloads/views ratio, expressed as a percentage
photos["ratio"] = photos["stats_downloads"].mul(100).div(photos["stats_views"])

# Explore the dataset

## Plot views vs downloads

In [62]:
import plotly.express as px

# Scatter plot of downloads against views, one point per photo
def plot_views_vs_downloads(data):
    """Build a scatter plot of downloads (y axis) against views (x axis).

    Args:
        data: Table handed to ``px.scatter``. The call references the columns
            ``stats_views``, ``stats_downloads``, ``photo_image_url`` and
            ``ratio``.

    Returns:
        The figure created by ``px.scatter``; it is returned, not displayed.
    """
    axis_labels = {"stats_views": "Views", "stats_downloads": "Downloads"}
    hover_columns = ["photo_image_url", "ratio"]
    scatter = px.scatter(
        data,
        x="stats_views",
        y="stats_downloads",
        hover_data=hover_columns,
        labels=axis_labels,
        title="Downloads vs Views",
        opacity=0.4,
        height=500,
        width=1200,
    )
    return scatter

There seem to be some outliers in the data: a few photos have more than 40M views.

In [63]:
# Scatter plot of views against downloads for the whole dataset
fig = plot_views_vs_downloads(photos)

# Overlay a dashed red reference line where downloads equal 1% of views
fig.add_shape(
    type="line",
    x0=0,
    y0=0,
    x1=160_000_000,
    y1=1_600_000,
    line_color="red",
    line_dash="dash",
    opacity=0.5,
)
# Label the reference line
fig.add_annotation(
    text="1% downloads/views ratio",
    x=120_000_000,
    y=1_200_000,
    font={"color": "red", "size": 14},
    arrowcolor='red',
)

Zooming in on the area containing most of the photos, there seem to be two downloads/views ratio clusters - one around 1% and one close to 0% - plus some outliers.

In [64]:
# Same scatter plot, restricted to photos below 30M views and below 400k downloads
fig = plot_views_vs_downloads(
    photos[
        photos['stats_views'].lt(30_000_000)
        & photos['stats_downloads'].lt(400_000)
    ]
)

# Overlay a dashed red reference line where downloads equal 1% of views
fig.add_shape(
    type="line",
    x0=0,
    y0=0,
    x1=30_000_000,
    y1=300_000,
    line_color="red",
    line_dash="dash",
    opacity=0.5,
)
# Label the reference line
fig.add_annotation(
    text="1% downloads/views ratio",
    x=25_000_000,
    y=250_000,
    font={"color": "red", "size": 14},
    arrowcolor='red',
)

## Inspect the photos

Look at the "best" and "worst" performing photos and their metadata

In [65]:
from IPython.display import Image

# This function is used to show basic information for several photos from the dataset
def show_dataset(photos, photos_to_show):
    """Display the first ``photos_to_show`` rows of ``photos`` with basic metadata.

    For every shown row the image is rendered from its URL, then the
    downloads/views ratio, the download count and the view count are printed,
    followed by the photo page URL, the photographer and the AI description.

    Args:
        photos: DataFrame providing the columns ``photo_image_url``, ``ratio``,
            ``stats_downloads``, ``stats_views``, ``photo_url``,
            ``photographer_first_name``, ``photographer_last_name``,
            ``photographer_username`` and ``ai_description``. Rows are shown in
            the frame's current order.
        photos_to_show: Maximum number of rows to show. Zero or a negative
            value shows nothing.
    """
    for shown, (_, photo) in enumerate(photos.iterrows()):
        # Check the limit BEFORE rendering, so that a limit of 0 shows no photo
        if shown >= photos_to_show:
            break

        display(Image(url=photo["photo_image_url"], width=200, retina=True))

        print(photo["ratio"], photo["stats_downloads"], photo["stats_views"])
        print(f'Image: {photo["photo_url"]}')
        print(f'Photographer: {photo["photographer_first_name"]} {photo["photographer_last_name"]} ({photo["photographer_username"]})')
        print(f'AI description: {photo["ai_description"]}')
        print()

Filter out photos with very few downloads to avoid skewing the statistics

In [66]:
# Boolean mask selecting the photos that have more than 1000 downloads
photos_filter = photos["stats_downloads"].gt(1000)
print(
    f"Selected photos: {photos_filter.sum()}, "
    f"{100*photos_filter.sum()/photos_filter.size}%"
)

Selected photos: 21290, 85.16%


## Best Photos

In [67]:
# Ten filtered photos with the highest downloads/views ratio
show_dataset(photos.loc[photos_filter].sort_values("ratio", ascending=False), 10)

51.55404401157555 117934 228758
Image: https://unsplash.com/photos/PGeslSkvPQg
Photographer: m h (hoppm)
AI description: grayscale photo of flowers



22.11683007802103 328233 1484087
Image: https://unsplash.com/photos/yN9l6LJVBIU
Photographer: Mike Fox (thefoxicon)
AI description: low-angle photography of palm trees



10.530359548810226 9457 89807
Image: https://unsplash.com/photos/3JQbGLh17Gw
Photographer: Wolfgang Hasselmann (wolfgang_hasselmann)
AI description: two black birds



9.788371692894106 36979 377785
Image: https://unsplash.com/photos/8bySAsOHk6M
Photographer: Jolan Wathelet (jowa)
AI description: black and white bird on grass



9.088705115427095 10122 111369
Image: https://unsplash.com/photos/JOFKIzygu70
Photographer: Bit Cloud (bitcloudphotography)
AI description: tipi tent on snowfield near trees during night



7.866245226619374 11000 139838
Image: https://unsplash.com/photos/FygfEHNM1b8
Photographer: Francesco Ungaro (francesco_ungaro)
AI description: green grass photography



7.1147804512823525 38771 544936
Image: https://unsplash.com/photos/CJ1AsPVhtCE
Photographer: Denis Lesak (denislesak)
AI description: Jelly Fish



5.994991758590085 37826 630960
Image: https://unsplash.com/photos/pazLjzzTs94
Photographer: Pascal van de Vendel (pascalvendel)
AI description: nan



5.927813309421076 37945 640118
Image: https://unsplash.com/photos/-N235jQ01v4
Photographer: Johny vino (johnyvino)
AI description: underwater photography of jelly fish



5.591291510391136 39237 701752
Image: https://unsplash.com/photos/ftqmVBz3aPg
Photographer: Jason Leung (ninjason)
AI description: closeup photo of green cactus



## Worst Photos

In [68]:
# Ten filtered photos with the lowest downloads/views ratio
show_dataset(photos.loc[photos_filter].sort_values("ratio", ascending=True), 10)

0.005309026099182435 1048 19739967
Image: https://unsplash.com/photos/CBa301Yn7F8
Photographer: Jack Church (jackchurch)
AI description: green pine trees on mountain under white clouds during daytime



0.008357257079436497 9136 109318164
Image: https://unsplash.com/photos/XFmznQhx9lM
Photographer: Timothy Eberly (timothyeberly)
AI description: orange leaf trees



0.008452765679944407 9235 109254182
Image: https://unsplash.com/photos/4oovIxttThA
Photographer: Anton Shuvalov (a8ka)
AI description: aerial view of houses near ocean



0.009893890120992566 10993 111108976
Image: https://unsplash.com/photos/3kgiW7ufPEM
Photographer: LOGAN WEAVER (lgnwvr)
AI description: white string light



0.009902118311401684 11011 111198429
Image: https://unsplash.com/photos/RMHF7BY3XXo
Photographer: Michael Baccin (michaelbaccin)
AI description: brown wooden docks



0.010679687874983114 12656 118505336
Image: https://unsplash.com/photos/BkR842UVXqk
Photographer: Olena Sergienko (olenkasergienko)
AI description: pink petaled flower



0.012715441241768715 15060 118438674
Image: https://unsplash.com/photos/GRLN5FC4cLg
Photographer: Will Turner (turner_imagery)
AI description: high angle photography of cliff



0.012794373059571843 2575 20126035
Image: https://unsplash.com/photos/ugA20jqhHL4
Photographer: Daniele Franchi (daniele_franchi)
AI description: snow covered pine trees near frozen lake



0.01580307733365993 17698 111990846
Image: https://unsplash.com/photos/-93ArahrTKc
Photographer: Ivan Bandura (unstable_affliction)
AI description: ocean photography



0.017455118601189933 3496 20028509
Image: https://unsplash.com/photos/A7v5KWxnKQA
Photographer: Dimitar Belchev (belchev)
AI description: tiger street painting



## Downloads/Views Ratio Histograms

In [69]:
# Histogram of the downloads/views ratio for the filtered photos.
# The 100 evenly spaced edges between 0 and 2.5 define 99 bins; np.histogram
# does not count ratios that fall outside the outer edges.
ratio_histogram, histogram_intervals = np.histogram(
    photos.loc[photos_filter, "ratio"],
    bins=np.linspace(0, 2.5, num=100),
)

# Plot a histogram
def plot_download_views_histogram(data, histogram_intervals):
    """Display a bar chart with one bar per histogram bin.

    Args:
        data: Bar heights, one value per bin.
        histogram_intervals: Bin edges. The first edge is skipped, so each bar
            is drawn at the right edge of its bin; this expects one more edge
            than there are values in ``data``.

    The chart is shown with ``display``; nothing is returned.
    """
    display(px.bar(
        y=data,
        x=histogram_intervals[1:],
        labels=dict(x="Download/Views Ratio [%]", y="Number of photos"),
        # Title typo fixed: "Ration" -> "Ratio"
        title="Downloads/Views Ratio Histogram",
        height=300,
        width=1200,
    ))

In [70]:
# Bar chart of the number of photos per ratio bin
plot_download_views_histogram(
    data=ratio_histogram,
    histogram_intervals=histogram_intervals,
)

In [71]:
# Bar chart of the running total of photos across the ratio bins
plot_download_views_histogram(
    data=ratio_histogram.cumsum(),
    histogram_intervals=histogram_intervals,
)