# Load the dataset

In [1]:
import numpy as np
import pandas as pd
import glob

# Directory holding the dataset files and the document names to load from it.
path = './unsplash-dataset/'
documents = ['photos']
datasets = {}

# Build one DataFrame per document from every file whose name starts with
# "<doc>.tsv" (a document may be split across several such files).
for doc in documents:
    # glob.glob() returns matches in arbitrary order; sorting keeps the row
    # order of the concatenated frame identical from run to run.
    files = sorted(glob.glob(path + doc + ".tsv*"))
    if not files:
        # Fail with an explicit message instead of the opaque
        # "No objects to concatenate" raised by pd.concat([]).
        raise FileNotFoundError(f"No files matching '{path + doc}.tsv*' were found")

    subsets = [pd.read_csv(filename, sep='\t', header=0) for filename in files]

    # Stored inside the loop so that every document is kept, not only the
    # last one iterated.
    datasets[doc] = pd.concat(subsets, axis=0, ignore_index=True)

# Add the Download/Views ratio

In [19]:
# Work on the photos table loaded into `datasets`
photos = datasets['photos']

# Downloads per 100 views (the downloads/views ratio as a percentage),
# stored as a new "ratio" column on the same frame
photos["ratio"] = photos["stats_downloads"].mul(100).div(photos["stats_views"])

print(f"Dataset size: {photos.shape}")

Dataset size: (25000, 31)


# Explore the dataset

In [31]:
from IPython.display import Image

def show_dataset(photos, photos_to_show):
    """Display the first ``photos_to_show`` rows of ``photos``, one photo each.

    For every row the image at ``photo_image_url`` is rendered, followed by
    printed lines with the ratio, download count and view count, the photo
    page URL, the photographer's name and username, and both descriptions.

    Args:
        photos: DataFrame providing the columns ``photo_image_url``,
            ``ratio``, ``stats_downloads``, ``stats_views``, ``photo_url``,
            ``photographer_first_name``, ``photographer_last_name``,
            ``photographer_username``, ``photo_description`` and
            ``ai_description``. Rows are shown in the frame's current order.
        photos_to_show: Maximum number of rows to display. Zero or a
            negative value displays nothing.

    Returns:
        None. The function only renders images and prints text.
    """
    # head() limits the rows before iterating, so a limit of 0 shows nothing
    # (a counter checked after displaying would always show one photo).
    # max() guards against negative values, which head() would interpret as
    # "all rows except the last n".
    limit = max(photos_to_show, 0)

    # NOTE: `display` is not imported in this cell; it relies on the name
    # being available in the notebook namespace.
    for _, photo in photos.head(limit).iterrows():
        display(Image(url=photo["photo_image_url"], width=200, retina=True))

        print(photo["ratio"], photo["stats_downloads"], photo["stats_views"])
        print(f'Image: {photo["photo_url"]}')
        print(f'Photographer: {photo["photographer_first_name"]} {photo["photographer_last_name"]} ({photo["photographer_username"]})')
        # NOTE(review): the "Descirption" label is misspelled; it is kept
        # as-is so re-run output matches the output recorded in the notebook.
        print(f'Descirption: {photo["photo_description"]}')
        print(f'AI description: {photo["ai_description"]}')
        print()

In [27]:
# Boolean mask selecting photos with more than one million views.
# NOTE(review): the output recorded below this cell reports 100% selected and
# later cells list photos with fewer views, so the stored output was
# presumably produced with a different threshold -- re-run to confirm.
photos_filter = photos["stats_views"].gt(1000000)

# Report how many photos pass the filter, as a count and as a percentage
print(f"Selected photos: {photos_filter.sum()}, {100*photos_filter.sum()/photos_filter.size}%")

Selected photos: 25000, 100.0%


In [32]:
# Ten filtered photos with the highest downloads/views ratio (descending sort)
show_dataset(
    photos.loc[photos_filter].sort_values("ratio", ascending=False),
    10,
)

51.55404401157555 117934 228758
Image: https://unsplash.com/photos/PGeslSkvPQg
Photographer: m h (hoppm)
Descirption: Have you ever seen petals fall from a tree full of cherry blossoms? It’s like snow, but the fall patterns are more swirly. This photo, I think, captures a lot of subtle motion.
AI description: grayscale photo of flowers



22.11683007802103 328233 1484087
Image: https://unsplash.com/photos/yN9l6LJVBIU
Photographer: Mike Fox (thefoxicon)
Descirption: nan
AI description: low-angle photography of palm trees



10.530359548810226 9457 89807
Image: https://unsplash.com/photos/3JQbGLh17Gw
Photographer: Wolfgang Hasselmann (wolfgang_hasselmann)
Descirption: nan
AI description: two black birds



9.788371692894106 36979 377785
Image: https://unsplash.com/photos/8bySAsOHk6M
Photographer: Jolan Wathelet (jowa)
Descirption: nan
AI description: black and white bird on grass



9.088705115427095 10122 111369
Image: https://unsplash.com/photos/JOFKIzygu70
Photographer: Bit Cloud (bitcloudphotography)
Descirption: nan
AI description: tipi tent on snowfield near trees during night



7.866245226619374 11000 139838
Image: https://unsplash.com/photos/FygfEHNM1b8
Photographer: Francesco Ungaro (francesco_ungaro)
Descirption: nan
AI description: green grass photography



7.1147804512823525 38771 544936
Image: https://unsplash.com/photos/CJ1AsPVhtCE
Photographer: Denis Lesak (denislesak)
Descirption: nan
AI description: Jelly Fish



5.994991758590085 37826 630960
Image: https://unsplash.com/photos/pazLjzzTs94
Photographer: Pascal van de Vendel (pascalvendel)
Descirption: This beautifull small animal( half an inch) lives in Azia, I did several attempts to get a nice picture of this animal but it is so hard to find one and to make a nice picture it,s even harder. But this time my diveguide found one at 25 meters dept while diving in the Philipinnes. Our divecomputer told us that we had 9 minutes to take the picture. The first minutes this seahorse was moving all the times and could not focus on him. Then he curled his tiny tail around the coral and i made some shots. With 1 minute to go we went up and finally i had my picture.
AI description: nan



5.927813309421076 37945 640118
Image: https://unsplash.com/photos/-N235jQ01v4
Photographer: Johny vino (johnyvino)
Descirption: nan
AI description: underwater photography of jelly fish



5.591291510391136 39237 701752
Image: https://unsplash.com/photos/ftqmVBz3aPg
Photographer: Jason Leung (ninjason)
Descirption: nan
AI description: closeup photo of green cactus



In [33]:
# Ten filtered photos with the lowest downloads/views ratio (ascending sort)
show_dataset(
    photos.loc[photos_filter].sort_values("ratio", ascending=True),
    10,
)

0.005309026099182435 1048 19739967
Image: https://unsplash.com/photos/CBa301Yn7F8
Photographer: Jack Church (jackchurch)
Descirption: BC weather.
AI description: green pine trees on mountain under white clouds during daytime



0.008357257079436497 9136 109318164
Image: https://unsplash.com/photos/XFmznQhx9lM
Photographer: Timothy Eberly (timothyeberly)
Descirption: Fall color in the countryside of Eastern Washington
AI description: orange leaf trees



0.008452765679944407 9235 109254182
Image: https://unsplash.com/photos/4oovIxttThA
Photographer: Anton Shuvalov (a8ka)
Descirption: nan
AI description: aerial view of houses near ocean



0.009893890120992566 10993 111108976
Image: https://unsplash.com/photos/3kgiW7ufPEM
Photographer: LOGAN WEAVER (lgnwvr)
Descirption: PORTRAITS INSTAGRAM - @LGNWVRPRTRTS  EDITORIAL INSTAGRAM - @LGNWVRPHTO  PERSONAL INSTAGRAM - @LGNWVR
AI description: white string light



0.009902118311401684 11011 111198429
Image: https://unsplash.com/photos/RMHF7BY3XXo
Photographer: Michael Baccin (michaelbaccin)
Descirption: Walking in the summer times
AI description: brown wooden docks



0.010679687874983114 12656 118505336
Image: https://unsplash.com/photos/BkR842UVXqk
Photographer: Olena Sergienko (olenkasergienko)
Descirption: nan
AI description: pink petaled flower



0.012715441241768715 15060 118438674
Image: https://unsplash.com/photos/GRLN5FC4cLg
Photographer: Will Turner (turner_imagery)
Descirption: nan
AI description: high angle photography of cliff



0.012794373059571843 2575 20126035
Image: https://unsplash.com/photos/ugA20jqhHL4
Photographer: Daniele Franchi (daniele_franchi)
Descirption: Dobbiaco Lake  👋 Small donation, huge appreciation paypal.me/DanieleFranchi 🙏🙏🙏
AI description: snow covered pine trees near frozen lake



0.01580307733365993 17698 111990846
Image: https://unsplash.com/photos/-93ArahrTKc
Photographer: Ivan Bandura (unstable_affliction)
Descirption: The cliffs
AI description: ocean photography



0.017455118601189933 3496 20028509
Image: https://unsplash.com/photos/A7v5KWxnKQA
Photographer: Dimitar Belchev (belchev)
Descirption: nan
AI description: tiger street painting



In [7]:
import plotly.express as px

# Scatter plot of downloads against views for every photo; the image URL and
# the ratio are added to each point's hover data.
fig = px.scatter(
    data_frame=photos,
    x="stats_views",
    y="stats_downloads",
    hover_data=["photo_image_url", "ratio"],
)
fig.show()

# Save the photos list

In [None]:
# TODO: save a list of the photos (this cell does not contain any code yet)
