<a href="https://colab.research.google.com/github/leyixu21/master_thesis/blob/main/clean_flickr_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparations

In [2]:
! pip install geopandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting geopandas
  Downloading geopandas-0.10.2-py2.py3-none-any.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 4.8 MB/s 
[?25hCollecting fiona>=1.8
  Downloading Fiona-1.8.21-cp37-cp37m-manylinux2014_x86_64.whl (16.7 MB)
[K     |████████████████████████████████| 16.7 MB 52.6 MB/s 
Collecting pyproj>=2.2.0
  Downloading pyproj-3.2.1-cp37-cp37m-manylinux2010_x86_64.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 52.2 MB/s 
Collecting munch
  Downloading munch-2.5.0-py2.py3-none-any.whl (10 kB)
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting click-plugins>=1.0
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Installing collected packages: munch, cligj, click-plugins, pyproj, fiona, geopandas
Successfully installed click-plugins-1.1.1 cligj-0.7.2 fiona-1.8.21 geopandas-0.10.2 munch-2.5.0 pyproj-3.2.1


In [3]:
from google.colab import drive
drive.mount('/content/drive')

path='/content/drive/MyDrive/master_thesis_data/'

Mounted at /content/drive


In [51]:
import pandas as pd
from pandas.io.formats.format import NA

import folium

import geopandas as gpd

# Preprocess Flick Data

## Check Data Quality

In [5]:
# read boundary data
london = gpd.read_file(path+'statistical-gis-boundaries-london/ESRI/MSOA_2004_London_High_Resolution.shp')

# convert the coordination system to wgs84
london = london.to_crs('epsg:4326')
london.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [82]:
output_geojson = 'london_flickr_'
y = 2021

gdf = gpd.read_file(path+output_geojson+str(y)+'.geojson')

In [7]:
gdf.columns

Index(['Unnamed: 0', 'accuracy', 'context', 'datetaken',
       'datetakengranularity', 'datetakenunknown', 'farm', 'geo_is_contact',
       'geo_is_family', 'geo_is_friend', 'geo_is_public', 'height_n', 'id',
       'isfamily', 'isfriend', 'ispublic', 'latitude', 'license', 'longitude',
       'machine_tags', 'owner', 'place_id', 'secret', 'server', 'tags',
       'title', 'url_n', 'views', 'width_n', 'woeid', 'geometry'],
      dtype='object')

In [83]:
# select necessary columns
gdf = gdf[['id', 'Unnamed: 0', 'owner', 'accuracy', 'context', 'datetaken', 'datetakenunknown', 'farm', 'tags', 'title', 'geometry']]

# convert string to datetime format
gdf['datetaken'] = pd.to_datetime(gdf['datetaken'], format='%Y-%m-%d %H:%M:%S')
# sort by datetaken
gdf = gdf.sort_values(by='datetaken')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [84]:
# remove records with no tags
gdf = gdf.dropna(subset=['tags'])

# get data with accuracy larger than 14
gdf = gdf[gdf['accuracy']>=14]

In [85]:
gdf

Unnamed: 0.1,id,Unnamed: 0,owner,accuracy,context,datetaken,datetakenunknown,farm,tags,title,geometry
0,50786422771,363,50256734@N05,16,0,2021-01-01 08:23:36,0,66,barnet eastbarnet eastbarnetvillage churchhill...,"St Mary's Church Hall, East Barnet",POINT (-0.16115 51.63960)
1,50786422771,529,50256734@N05,16,0,2021-01-01 08:23:36,0,66,barnet eastbarnet eastbarnetvillage churchhill...,"St Mary's Church Hall, East Barnet",POINT (-0.16115 51.63960)
2,50786782471,94,50256734@N05,16,0,2021-01-01 08:24:06,0,66,barnet eastbarnet eastbarnetvillage churchhill...,"St Mary's Church Hall, East Barnet",POINT (-0.16115 51.63960)
3,50786884952,1236,50256734@N05,16,0,2021-01-01 08:24:35,0,66,barnet eastbarnet eastbarnetvillage churchhill...,"St Mary's Church Hall, East Barnet",POINT (-0.16115 51.63960)
4,50786884617,1154,50256734@N05,16,0,2021-01-01 08:25:00,0,66,barnet eastbarnet eastbarnetvillage churchhill...,"St Mary's Church Hall, East Barnet",POINT (-0.16115 51.63960)
...,...,...,...,...,...,...,...,...,...,...,...
31275,51797753548,35093,16255303@N00,15,0,2021-12-31 07:50:38,0,66,london globe kew oxfordstreet christmas boroug...,12 30 21 Globe (176 of 184),POINT (-0.09716 51.50819)
31276,51791444921,34347,193930781@N03,16,0,2021-12-31 12:25:27,0,66,abxair dhl boeing767 767300f cargo eddk cgn eg...,N372CM | London Heathrow | 31/12/21,POINT (-0.45902 51.47000)
31277,51791411415,34727,13407304@N05,16,0,2021-12-31 12:39:37,0,66,tolworth broadway a240 surrey greater london e...,IMGP3303,POINT (-0.28157 51.38004)
31278,51790789628,34323,13407304@N05,16,0,2021-12-31 12:43:52,0,66,tolworth broadway a240 surrey greater london e...,IMGP3306,POINT (-0.28157 51.38004)


In [86]:
# group data by users
gdf_users_gb = gdf.groupby('owner')

# get the list of users
users_ls = list(gdf_users_gb.groups.keys())

In [132]:
# visualize the distribution of photos taken by the specific user
user_test = gdf_users_gb.get_group('50256734@N05')

map = folium.Map(location = [51.9,0], zoom_start = 10)

for date, title, tags, geometry in user_test[['datetaken', 'title', 'tags', 'geometry']].values:
    # Place the markers with the popup labels and data
    folium.Marker(location = [geometry.xy[1][0], geometry.xy[0][0]],
                  tooltip ='<strong>Date: </strong>{0}<br><strong>Title: </strong>{1}</br><strong>Tags: </strong>{2}'.format(date, title, tags)
                  ).add_to(map)

map


## Clean Data

## Visualize Data

In [None]:
# Python program to generate WordCloud

# importing all necessary modules
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd

# Reads 'Youtube04-Eminem.csv' file
# df = pd.read_csv(r"Youtube04-Eminem.csv", encoding ="latin-1")

comment_words = ''
stopwords = set(STOPWORDS)

# iterate through the csv file
for val in newyork_flickr_2021.tags:
  # typecaste each val to string
  val = str(val)
  if val != 'newyork' and val != 'New York' and val != 'newyorkcity' and val != 'covid' and val != 'covid19':
    # split the value
    tokens = val.split()
    
    # Converts each token into lowercase
    for i in range(len(tokens)):
      tokens[i] = tokens[i].lower()
    
    comment_words += " ".join(tokens)+" "

wordcloud = WordCloud(width = 800, height = 800,
				background_color ='white',
				stopwords = stopwords,
				min_font_size = 10).generate(comment_words)

# plot the WordCloud image					
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()


In [None]:
# get unique tags

# importing all necessary modules
from wordcloud import WordCloud, STOPWORDS

comment_words = ''
stopwords = set(STOPWORDS)
tag_ls = []
# iterate through the csv file
for val in newyork_flickr_2021.tags:
  # typecaste each val to string
  val = str(val)
  
  # split the value
  tokens = val.split()
  
  # Converts each token into lowercase
  for i in range(len(tokens)):
    tokens[i] = tokens[i].lower()
    tag_ls.append(tokens[i])

  comment_words += " ".join(tokens)+" "

set(tag_ls)

{'tourgroup',
 'upperbay',
 'arthuravenue',
 'ducks',
 'maloney',
 'winterhat',
 'carriestrafficon155thstreetfromseventhavenuetotheintersectionwithedgecombeavenueandstnicholasplacethebridgeis2',
 'ilovebk',
 'all',
 'bright',
 'nycbrooklyn',
 'conservatorygarden',
 'journalist',
 'f3hp',
 'supertall',
 'sonyalpha',
 'halloween',
 'trainstation',
 'metropolitantransitauthority',
 'hoodie',
 'the',
 'january',
 'wayfinding',
 'felendzer',
 'hardware',
 'signsofgaypride2021nyc',
 'voigtlander',
 '350900',
 'fromthecar',
 'minion',
 'a6blq',
 'coordinates40°48′40″n73°56′00″wcarries4tracksofthemetronorthrailroadcrossesharlemriverlocalemanhattanandthebronxinnewyorkcitymaintainedbymetropolitantransportationauthority',
 'streamers',
 'foosball',
 'owl',
 'greenlight',
 'motrcycle',
 'bdsf',
 'devotion',
 'cabbagetown',
 'unga',
 'expedition',
 'island',
 '90sce',
 'ciroflexmodeld',
 'pathway',
 'streetview',
 'leicamp',
 'oil',
 'late',
 '540feet770mlongintotal',
 'pier',
 'fresh',
 'cmacgm',
