In [1]:
import json
import urllib
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import plotly
import plotly.graph_objs as go
import requests
from PIL import Image
from plotly import tools
from plotly.offline import iplot
# Switch plotly into offline notebook mode before any iplot() call below.
plotly.offline.init_notebook_mode(connected=True)

In [2]:
# Load the training image URLs, one per line.
# The encoding is pinned so decoding does not depend on the platform's locale default.
with open('SMP/train_img.txt', 'r', encoding='utf-8') as f:
    imgs = f.read().splitlines()

In [3]:
# Report how many image URLs were loaded (the label reads "number of images").
print('圖片數量', len(imgs))

圖片數量 305613


In [4]:
# Peek at the first three image URLs.
imgs[:3]

['https://www.flickr.com/photos/58708830@N00/385070026',
 'https://www.flickr.com/photos/97042891@N00/943750056',
 'https://www.flickr.com/photos/18583731@N07/3246928439']

In [5]:
# Load the per-photo category records from JSON.
# The encoding is pinned so decoding does not depend on the platform's locale default.
with open('SMP/train_category.json', 'r', encoding='utf-8') as f:
    category = json.load(f)

In [6]:
# Report how many category records were loaded (the label reads "category count").
print('category 數量', len(category))

category 數量 305613


In [7]:
# Peek at the first three raw category records.
category[:3]

[{'Category': 'Fashion',
  'Concept': 'glam',
  'Pid': '775',
  'Uid': '59@N75',
  'Subcategory': 'Fashion'},
 {'Category': 'Travel&Active&Sports',
  'Concept': 'fifa',
  'Pid': '1075',
  'Uid': '1@N18',
  'Subcategory': 'Soccer'},
 {'Category': 'Entertainment',
  'Concept': 'cinema',
  'Pid': '4890',
  'Uid': '351@N64',
  'Subcategory': 'Movies'}]

In [8]:
# Turn the list of category records into a DataFrame (one row per record).
category = pd.DataFrame(category)

In [9]:
# Show the first rows of the category table.
category.head()

Unnamed: 0,Category,Concept,Pid,Subcategory,Uid
0,Fashion,glam,775,Fashion,59@N75
1,Travel&Active&Sports,fifa,1075,Soccer,1@N18
2,Entertainment,cinema,4890,Movies,351@N64
3,Holiday&Celebrations,old,6568,Birthday,6@N59
4,Food,thirsty,7079,Drinks,1617@N40


In [10]:
# Number of distinct values in each column of the category table.
category.nunique()

Category           11
Concept           668
Pid            305613
Subcategory        77
Uid             38312
dtype: int64

In [11]:
# Pie chart of how many photos fall into each top-level category.
category_counts = category.Category.value_counts()
trace = go.Pie(labels=category_counts.index, values=category_counts.values)
layout = go.Layout(title='All Categories')
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [12]:
# Summary statistics of the number of photos per concept.
category['Concept'].value_counts().describe()

count     668.000000
mean      457.504491
std       465.098008
min         1.000000
25%       101.750000
50%       247.500000
75%       740.000000
max      1874.000000
Name: Concept, dtype: float64

In [13]:
# Bar chart of the ten most frequent concepts (value_counts sorts in descending order).
top_concepts = category.Concept.value_counts().head(10)
trace = go.Bar(x=top_concepts.index, y=top_concepts.values)
layout = go.Layout(title='Top 10 Concepts')
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [14]:
# Box plot of the per-concept photo counts.
concept_counts = category.Concept.value_counts()
trace = go.Box(y=concept_counts)
layout = go.Layout(title='Concepts number distribution')
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [15]:
# Summary statistics of the number of photos per subcategory.
category['Subcategory'].value_counts().describe()

count       77.000000
mean      3969.000000
std       3248.907164
min         15.000000
25%       1510.000000
50%       3301.000000
75%       5811.000000
max      16390.000000
Name: Subcategory, dtype: float64

In [16]:
# Split the subcategory counts at their median: keep the subcategories above it
# and collapse everything else into a single 'others' entry.
subcategory_counts = category.Subcategory.value_counts()
threshold = subcategory_counts.quantile(.5)
mask = subcategory_counts > threshold
others = subcategory_counts[~mask].sum()
sub_cat_hist = subcategory_counts[mask]
sub_cat_hist['others'] = others

In [17]:
# Pie chart of the subcategory counts prepared in sub_cat_hist.
trace = go.Pie(
    labels=sub_cat_hist.index,
    values=sub_cat_hist.values,
)
layout = go.Layout(title='Top 50% subcategories vs others')
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [18]:
# Load the per-photo tag records from JSON.
# The encoding is pinned so decoding does not depend on the platform's locale default.
with open('SMP/train_tags.json', 'r', encoding='utf-8') as f:
    tags = json.load(f)

In [19]:
# Report how many tag records were loaded (the label reads "number of tags").
print('標籤數量', len(tags))

標籤數量 305613


In [20]:
# Peek at the first three raw tag records.
tags[:3]

[{'Alltags': 'rock punk transgender tranny electronicmusic electro glam electronica luisdrayton fusionrecords thefusionnetwork lmwcphotography',
  'Pid': '775',
  'Uid': '59@N75',
  'Mediatype': 'photo',
  'Title': 'Luis Drayton - Edinburgh shoot #6'},
 {'Alltags': 'brazil rio brasil riodejaneiro by maria fifa military games arena da pan claudio barra 2007 futsal olympicgames lenk claudiolara aqutico copabacana sunsetinrio militaryworldgames brasll brazll estdioolmpicojoohavelange praiasdorio unitedkingdomofengenhodedentro rio2016 clcrio clcbr amanhecernorio claudiol avembaixadorabelardobueno clccam olimpadasmilitares mundialmilitarrio2011 claudiorio atraes carnivalbyclaudio claudoil engenhobyclaudio estdioolmpicojoohavelangebyclaudio barradatijucabyclaudio carnavalbyclaudio rio450 rio450anos flickrbyclaudio claudioarena lapabyclaudio rio2016byclaudio brasil2014byclaudio rio2014byclaudio brazil2014byclaudio csim2011 claudioparque hsbcbyclaudio claudiomundial rlodejaneiro rlodejanelro c

In [21]:
# Turn the list of tag records into a DataFrame (one row per record).
tags = pd.DataFrame(tags)

In [22]:
# Show the first rows of the tags table.
tags.head()

Unnamed: 0,Alltags,Mediatype,Pid,Title,Uid
0,rock punk transgender tranny electronicmusic e...,photo,775,Luis Drayton - Edinburgh shoot #6,59@N75
1,brazil rio brasil riodejaneiro by maria fifa m...,photo,1075,Arena da Barra - Arena HSBC - Arena do PAN #...,1@N18
2,old cinema beauty marilyn photoshop movie joke...,photo,4890,MARILYN 2015,351@N64
3,pictures old family scans brothers sister 1958...,photo,6568,Knikkertijd - 1959,6@N59
4,hot sahara animal animals desert bottles drink...,photo,7079,CAMELS01,1617@N40


In [23]:
# Count how often each whitespace-separated tag occurs across all Alltags strings.
# Counter is a dict subclass, so the .items() / .values() calls in later cells keep working.
tag_dict = Counter(tag for row in tags.Alltags for tag in row.split())

In [24]:
# Rank (tag, count) pairs by count, highest first, and keep the first ten.
ranked_tags = sorted(tag_dict.items(), key=lambda pair: pair[1], reverse=True)
top_10_tags = ranked_tags[:10]

In [25]:
# Share of all tag occurrences accounted for by the ten most frequent tags.
top_10_total = sum(count for _tag, count in top_10_tags)
top_10_total / sum(tag_dict.values())

0.031081837759672205

In [26]:
# Pie chart of the ten most frequent tags and their counts.
tag_names = [name for name, _count in top_10_tags]
tag_counts = [count for _name, count in top_10_tags]
trace = go.Pie(labels=tag_names, values=tag_counts)
layout = go.Layout(title='Top 10 tags')
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [27]:
# Pie chart of the Mediatype values found in the tags table.
mediatype_counts = tags.Mediatype.value_counts()
trace = go.Pie(labels=mediatype_counts.index, values=mediatype_counts.values)
layout = go.Layout(title='Media Type')
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [28]:
# Load the per-photo post-date / location records from JSON.
# The encoding is pinned so decoding does not depend on the platform's locale default.
with open('SMP/train_temporalspatial.json', 'r', encoding='utf-8') as f:
    temporalspatial = json.load(f)

In [29]:
# Report how many temporal/spatial records were loaded (the label reads "number of locations").
print('位置數量', len(temporalspatial))

位置數量 305613


In [30]:
# Peek at the first three raw temporal/spatial records.
temporalspatial[:3]

[{'Postdate': '1446016778',
  'Uid': '59@N75',
  'Pid': '775',
  'Longitude': '',
  'Geoaccuracy': '0',
  'Latitude': '0'},
 {'Postdate': '1454983379',
  'Uid': '1@N18',
  'Pid': '1075',
  'Longitude': '',
  'Geoaccuracy': '0',
  'Latitude': '0'},
 {'Postdate': '1433118604',
  'Uid': '351@N64',
  'Pid': '4890',
  'Longitude': '',
  'Geoaccuracy': '0',
  'Latitude': '0'}]

In [31]:
# Turn the list of temporal/spatial records into a DataFrame (one row per record).
temporalspatial = pd.DataFrame(temporalspatial)

In [32]:
# Show the first rows of the temporal/spatial table.
temporalspatial.head()

Unnamed: 0,Geoaccuracy,Latitude,Longitude,Pid,Postdate,Uid
0,0,0,,775,1446016778,59@N75
1,0,0,,1075,1454983379,1@N18
2,0,0,,4890,1433118604,351@N64
3,0,0,,6568,1451577600,6@N59
4,0,0,,7079,1425744438,1617@N40


In [33]:
# Summarise the three location-related columns.
geo_columns = ['Geoaccuracy', 'Longitude', 'Latitude']
temporalspatial[geo_columns].describe()

Unnamed: 0,Geoaccuracy,Longitude,Latitude
count,305613,305613.0,305613
unique,16,17229.0,17595
top,0,,0
freq,270798,270684.0,270684


In [34]:
# Fraction of rows holding the most frequent Geoaccuracy value.
# value_counts() sorts by frequency, so position 0 is the top value. .iloc is used
# because the index holds the column's values (strings in the preview above), not
# integer labels, so [0] would rely on the deprecated positional fallback.
temporalspatial.Geoaccuracy.value_counts(normalize=True).iloc[0]

0.8860814166936616

In [35]:
# Fraction of rows holding the most frequent Longitude value.
# value_counts() sorts by frequency, so position 0 is the top value. .iloc is used
# because the index holds the column's values (strings in the preview above), not
# integer labels, so [0] would rely on the deprecated positional fallback.
temporalspatial.Longitude.value_counts(normalize=True).iloc[0]

0.8857083959124775

In [36]:
# Fraction of rows holding the most frequent Latitude value.
# value_counts() sorts by frequency, so position 0 is the top value. .iloc is used
# because the index holds the column's values (strings in the preview above), not
# integer labels, so [0] would rely on the deprecated positional fallback.
temporalspatial.Latitude.value_counts(normalize=True).iloc[0]

0.8857083959124775

In [37]:
# Load the additional per-photo records from JSON.
# The encoding is pinned so decoding does not depend on the platform's locale default.
with open('SMP/train_additional.json', 'r', encoding='utf-8') as f:
    addition = json.load(f)

In [38]:
# Report how many additional records were loaded (the label reads "number of extras").
print('額外數量', len(addition))

額外數量 305613


In [39]:
# Peek at the first three raw additional records.
addition[:3]

[{'Mediastatus': 'ready',
  'Pathalias': 'luisdrayton',
  'Ispublic': '1',
  'Pid': '775',
  'Uid': '59@N75'},
 {'Mediastatus': 'ready',
  'Pathalias': 'claudiolara',
  'Ispublic': '1',
  'Pid': '1075',
  'Uid': '1@N18'},
 {'Mediastatus': 'ready',
  'Pathalias': 'rizzato',
  'Ispublic': '1',
  'Pid': '4890',
  'Uid': '351@N64'}]

In [40]:
# Turn the list of additional records into a DataFrame (one row per record).
addition = pd.DataFrame(addition)

In [41]:
# Show the first rows of the additional-info table.
addition.head()

Unnamed: 0,Ispublic,Mediastatus,Pathalias,Pid,Uid
0,1,ready,luisdrayton,775,59@N75
1,1,ready,claudiolara,1075,1@N18
2,1,ready,rizzato,4890,351@N64
3,1,ready,4kleuren,6568,6@N59
4,1,ready,glosackmd,7079,1617@N40


In [42]:
# Pie chart of the Ispublic values.
ispublic_counts = addition.Ispublic.value_counts()
trace = go.Pie(labels=ispublic_counts.index, values=ispublic_counts.values)
layout = go.Layout(title='Ispublic')
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [43]:
# Pie chart of the Mediastatus values.
mediastatus_counts = addition.Mediastatus.value_counts()
trace = go.Pie(labels=mediastatus_counts.index, values=mediastatus_counts.values)
layout = go.Layout(title='Media Status')
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [44]:
# Number of distinct Pathalias values (the trailing label reads "kinds").
print(addition.Pathalias.nunique(), '種')

23629 種


In [45]:
# Load the training labels, one value per line (kept as strings at this point).
# The encoding is pinned so decoding does not depend on the platform's locale default.
with open('SMP/train_label.txt', 'r', encoding='utf-8') as f:
    label = f.read().splitlines()

In [46]:
# Report how many labels were loaded (the label reads "number of answers").
print('答案數量', len(label))

答案數量 305613


In [47]:
# Peek at the first three raw label strings.
label[:3]

['11.18', '15.15', '10.99']

In [48]:
# Wrap the list of label strings in a single-column DataFrame (the column is named 0).
label = pd.DataFrame(label)
label.head()

Unnamed: 0,0
0,11.18
1,15.15
2,10.99
3,8.63
4,11.16


In [49]:
# The labels were read as strings; convert column 0 to floats, then summarise it.
label[0] = label[0].astype(float)
label[0].describe()

count    305613.000000
mean          6.405524
std           2.473008
min           1.000000
25%           4.700000
50%           6.230000
75%           7.980000
max          16.560000
Name: 0, dtype: float64

In [50]:
# Box plot of the label values in column 0.
label_values = label[0]
trace = go.Box(y=label_values)
layout = go.Layout(title='Answer distribution')
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)