### 6.3 Geographic Visualisation  
1. Import data and libraries  
2. Data wrangling  
3. Data cleaning  
4. Plotting a choropleth  
5. Discussing the result  

In [1]:
# 1. Importing data and libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import os
import folium
import json

In [2]:
# This option ensures the charts are displayed in the notebook without the need to "call" them specifically.
%matplotlib inline

In [3]:
# create a path
path = r'/Users/macbook/Library/CloudStorage/OneDrive-Personal/Data Analisys/Boat Sales'

In [4]:
# Import ".json" file for the entire globe.
Country_geo =  r'/Users/macbook/Library/CloudStorage/OneDrive-Personal/Data Analisys/Boat Sales/02 Data/Prepared Data/custom.geo-2.json'

In [5]:
# That's just in case you want to look at the JSON file contents here too:

f = open(r'/Users/macbook/Library/CloudStorage/OneDrive-Personal/Data Analisys/Boat Sales/02 Data/Prepared Data/custom.geo-2.json')
  
# returns JSON object asa dictionary
data = json.load(f)
  
# Iterating through the json list
for i in data['features']:
    print(i)

{'type': 'Feature', 'properties': {'featurecla': 'Admin-0 country', 'scalerank': 1, 'labelrank': 2, 'sovereignt': 'Germany', 'sov_a3': 'DEU', 'adm0_dif': 0, 'level': 2, 'type': 'Sovereign country', 'tlc': '1', 'admin': 'Germany', 'adm0_a3': 'DEU', 'geou_dif': 0, 'geounit': 'Germany', 'gu_a3': 'DEU', 'su_dif': 0, 'subunit': 'Germany', 'su_a3': 'DEU', 'brk_diff': 0, 'name': 'Germany', 'name_long': 'Germany', 'brk_a3': 'DEU', 'brk_name': 'Germany', 'brk_group': None, 'abbrev': 'Ger.', 'postal': 'D', 'formal_en': 'Federal Republic of Germany', 'formal_fr': None, 'name_ciawf': 'Germany', 'note_adm0': None, 'note_brk': None, 'name_sort': 'Germany', 'name_alt': None, 'mapcolor7': 2, 'mapcolor8': 5, 'mapcolor9': 5, 'mapcolor13': 1, 'pop_est': 83132799, 'pop_rank': 16, 'pop_year': 2019, 'gdp_md': 3861123, 'gdp_year': 2019, 'economy': '1. Developed region: G7', 'income_grp': '1. High income: OECD', 'fips_10': 'GM', 'iso_a2': 'DE', 'iso_a2_eh': 'DE', 'iso_a3': 'DEU', 'iso_a3_eh': 'DEU', 'iso_n3':

In [6]:
# Import boat_data_cleaned.csv
df = pd.read_pickle(os.path.join(path,'02 Data', 'Prepared Data', 'boat_sales_cleaned2.pkl'))

In [7]:
df.head()

Unnamed: 0,Boat Type,Manufacturer,Type,Year Built,Length,Width,Material,Number of views last 7 days,EUR Price,Country,City,Pricing Category,Boat Class
1,Center console boat,Terhi power boats,new boat from stock,2020,4.0,1.5,Thermoplastic,75,3490.0,Germany,BÃ¶nningstedt,Low-priced,Class A
4,Fishing Boat,Linder power boats,new boat from stock,2019,3.55,1.46,Aluminium,58,3399.0,Germany,Bayern,Low-priced,Class A
10,Fishing Boat,Terhi power boats,"Used boat,Electric",1987,4.35,1.75,GRP,239,3570.0,Switzerland,Seengen,Low-priced,Class A
12,Sport Boat,GS Nautica power boats,Used boat,2004,4.7,2.0,GRP,69,3500.0,Italy,Lake Garda,Low-priced,Class A
13,Runabout,Kimple power boats,new boat from stock,2020,4.4,1.65,Aluminium,113,4692.0,Switzerland,Zugersee,Low-priced,Class A


### 2. Data Wrangling

In [8]:
# Select only the necessary columns and put them in a list called columns


In [9]:
#create a new column for total view for each country
df['Total_views'] = df.groupby(['Country'])['Number of views last 7 days'].transform(np.sum)

In [10]:
df

Unnamed: 0,Boat Type,Manufacturer,Type,Year Built,Length,Width,Material,Number of views last 7 days,EUR Price,Country,City,Pricing Category,Boat Class,Total_views
1,Center console boat,Terhi power boats,new boat from stock,2020,4.00,1.50,Thermoplastic,75,3490.00,Germany,BÃ¶nningstedt,Low-priced,Class A,203469
4,Fishing Boat,Linder power boats,new boat from stock,2019,3.55,1.46,Aluminium,58,3399.00,Germany,Bayern,Low-priced,Class A,203469
10,Fishing Boat,Terhi power boats,"Used boat,Electric",1987,4.35,1.75,GRP,239,3570.00,Switzerland,Seengen,Low-priced,Class A,316879
12,Sport Boat,GS Nautica power boats,Used boat,2004,4.70,2.00,GRP,69,3500.00,Italy,Lake Garda,Low-priced,Class A,116749
13,Runabout,Kimple power boats,new boat from stock,2020,4.40,1.65,Aluminium,113,4692.00,Switzerland,Zugersee,Low-priced,Class A,316879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9877,Cabin Boat,Princess power boats,Used boat,1974,7.20,2.70,GRP,1299,5100.00,Switzerland,Rhein,Low-priced,Class I,316879
9878,"Sport Boat,Wakeboard/Wakesurf,Water ski",Tullio Abbate power boats,"Used boat,Unleaded",1980,6.00,2.10,GRP,1917,5098.98,Switzerland,Lake of Zurich,Low-priced,Class I,316879
9882,Fishing Boat,Staempfli power boats,"Used boat,Unleaded",1984,6.00,1.62,Plastic,288,5049.00,Switzerland,Bielersee,Low-priced,Class I,316879
9885,Sport Boat,BlueCraft power boats,"new boat from stock,Unleaded",2020,4.40,1.80,GRP,354,4499.00,Germany,Nordrhein-Westfalen,Low-priced,Class A,203469


In [11]:
# subsetting the columns for the map
view_map = df[['Country', 'Total_views']]

In [12]:
view_map.head()

Unnamed: 0,Country,Total_views
1,Germany,203469
4,Germany,203469
10,Switzerland,316879
12,Italy,116749
13,Switzerland,316879


In [13]:
view_map.shape

(6001, 2)

### 3. Data cleaning

In [14]:
# Checking subset for nulls
view_map.isnull().sum()

Country        0
Total_views    0
dtype: int64

In [15]:
# The frequency of listing by countries
view_map['Country'].value_counts(dropna = False)

Germany                         1209
Italy                            918
Switzerland                      900
France                           870
Spain                            508
Croatia (Hrvatska)               472
Netherlands                      417
United Kingdom                   139
Portugal                         115
Austria                           66
Denmark                           63
Greece                            41
Turkey                            41
Belgium                           40
Malta                             25
Slovenia                          25
Poland                            23
Finland                           22
Sweden                            13
Montenegro                        12
Slovak Republic                    9
Lake Constance                     9
Lake Geneva                        6
Monaco                             6
Cyprus                             5
Hungary                            4
United States                      4
E

In [16]:
boat_country = df['Country'].value_counts(dropna = False).rename_axis('Country').reset_index(name='counts')

In [17]:
boat_country

Unnamed: 0,Country,counts
0,Germany,1209
1,Italy,918
2,Switzerland,900
3,France,870
4,Spain,508
5,Croatia (Hrvatska),472
6,Netherlands,417
7,United Kingdom,139
8,Portugal,115
9,Austria,66


In [18]:
#Extracting update clean boat data
df.to_csv(os.path.join(path, 'boat_data_cleaned.csv'))

In [19]:
#edit croatia name
boat_country['Country'][5] = 'Croatia'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  boat_country['Country'][5] = 'Croatia'


In [20]:
boat_country

Unnamed: 0,Country,counts
0,Germany,1209
1,Italy,918
2,Switzerland,900
3,France,870
4,Spain,508
5,Croatia,472
6,Netherlands,417
7,United Kingdom,139
8,Portugal,115
9,Austria,66


In [21]:
#Extracting boat_country data
boat_country.to_csv(os.path.join(path,'02 Data', 'Prepared Data','boat_country.csv'))

In [22]:
# drop duplicate values
view_map = view_map.drop_duplicates()

In [23]:
 boat_country['Country'][5]

'Croatia'

In [24]:
view_map.shape

(50, 2)

In [25]:
view_map.sort_values(by = 'Total_views', ascending=False)

Unnamed: 0,Country,Total_views
10,Switzerland,316879
1,Germany,203469
12,Italy,116749
18,France,79308
189,Croatia (Hrvatska),62671
44,Spain,60234
101,Netherlands,50666
193,Portugal,13337
64,Austria,12666
605,United Kingdom,12005


In [26]:
# Setting up a map
map = folium.Map(location = [100, 0], zoom_start = 1.5)


folium.Choropleth(
    geo_data = Country_geo, 
    data = view_map,
    columns = ['Country', 'Total_views'],
    key_on = 'feature.properties.name', 
    fill_color = 'YlOrBr', fill_opacity=0.5, line_opacity=0.1,
    legend_name = "No of views").add_to(map)
folium.LayerControl().add_to(map)

map

In [27]:
# Setup a folium map at a high-level zoom
map = folium.Map(location = [100, 0], zoom_start = 1.5)

# Choropleth maps bind Pandas Data Frames and json geometries.This allows us to quickly visualize data combinations
folium.Choropleth(
    geo_data = Country_geo, 
    data = view_map,
    columns = ['Country', 'Total_views'],
    key_on = 'feature.properties.name', # this part is very important - check your json file to see where the KEY is located
    fill_color = 'YlOrBr', fill_opacity=0.6, line_opacity=0.1,
    legend_name = "rating").add_to(map)
folium.LayerControl().add_to(map)

map

### 5. Discussing the result

Discuss the results and what they mean in a markdown section.  
Does the analysis answer any of your existing research questions?  
Does the analysis lead you to any new research questions?  
Question: What is the location of the most viewed boat?  
The most view boats are located in Switzerland, Germany and Italy with 316,879, 203,469 and 116,749 views respectively. Although, the number of boat listed in each countries also varies.  
However, Germany (1209), Italy (918) and Switzerland (900) has the highest listings.  
I don't have a new research question. However, I would like to explore the characteristics of the listings in order to analyze the profitability of each listing if I can.  