In [3]:
import numpy as np
import pandas as pd
import glob

# This code is adapted from the examples provided with the datase
path = './unsplash-dataset/lite/'
documents = ['photos', 'keywords']
datasets = {}

for doc in documents:
    files = glob.glob(path + doc + ".tsv*")

    subsets = []
    for filename in files:
        df = pd.read_csv(filename, sep='\t', header=0)
        subsets.append(df)

    datasets[doc] = pd.concat(subsets, axis=0, ignore_index=True)

In [12]:
keywords = datasets['keywords'].copy()
keywords["suggested_by_user"] = keywords["suggested_by_user"].apply(lambda x: int(x == 't'))
keywords["is_nature"] = keywords["keyword"].apply(lambda x: int(x == "nature"))

keyword_aggs = dict(
    keyword="count",
    ai_service_1_confidence="count",
    ai_service_2_confidence="count",
    is_nature="sum",
    suggested_by_user="sum",
)

keywords_summary = keywords.groupby("photo_id").agg(keyword_aggs)

keywords_summary['suggested_by_ai'] = keywords_summary['keyword'] - keywords_summary['suggested_by_user']
# keywords_summary['photo_id'] = keywords_summary.index

display(keywords_summary)

Unnamed: 0_level_0,keyword,ai_service_1_confidence,ai_service_2_confidence,is_nature,suggested_by_user,suggested_by_ai
photo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
--2IBUMom1I,142,128,10,1,10,132
--6JlGcHl-w,117,111,10,0,0,117
--Jy_8mvs4E,123,101,10,0,19,104
--SDX4KWIbA,145,136,10,1,5,140
--Tn3E5ZtfQ,40,7,10,1,29,11
...,...,...,...,...,...,...
zzW47lWoaAA,139,129,10,1,10,129
zzWCZDaeSRw,118,108,10,1,10,108
zzboOsdmQkY,127,123,10,0,0,127
zzi-6FCQtF8,124,118,10,1,0,124


In [19]:
import plotly.express as px

display(px.scatter(
    keywords_summary, 
    x="suggested_by_ai", 
    y="suggested_by_user", 
    # hover_name="photo_id",
    color="is_nature",
    opacity=0.3,
    color_continuous_scale=['#ff0000', '#00ff00'],
))

In [14]:
import plotly.express as px
import numpy as np

# Compute the downloads/views ratio histogram
keywords_histogram, histogram_intervals = np.histogram(
    keywords_summary, 
    bins=np.linspace(0,200,20)
)

# Plot the histogram
display(px.bar(
    y=keywords_histogram, 
    x=histogram_intervals[1:], 
    height=300,
    width=1500,
))

In [9]:
photos = datasets['photos']
photos["ratio"] = 100 * photos["stats_downloads"] / photos["stats_views"]
photos = photos[["photo_id", "ratio", "stats_views", "stats_downloads"]]

# Merge
photos = photos.merge(keywords_summary, left_on='photo_id', right_on='photo_id')

photos = photos[photos["ratio"] <= 3]


display(px.scatter(photos, x="ratio", y="suggested_by_user"))





ValueError: 'photo_id' is both an index level and a column label, which is ambiguous.