In [None]:
# Import necessary libraries and tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import datetime as dt
from sklearn.cluster import DBSCAN
import warnings 
warnings.filterwarnings('ignore')

# Load the dataset from 'livedata.json' (path relative to the working directory);
# later cells read its 'id', 'latitude' and 'longitude' columns.
df = pd.read_json('livedata.json')

In [None]:
# Have a first look at the data: show the first rows of the DataFrame
df.head()

In [None]:
# Show the last rows of the DataFrame
df.tail()

In [None]:
# Show summary statistics of the data
df.describe()

### Let's visualize the data in a scatter plot using Seaborn
Requirements:
- Figure size of 8x6
- x-axis is 'latitude' and y-axis is 'longitude'
- Legend by 'id'

In [None]:
'''
Visualize the data by scatter plot
'''
# Please fill your answer in '...'
# figsize takes a (width, height) tuple
plt.figure(figsize=(...))
# data is the DataFrame to plot; x, y and hue select the columns used for the
# two axes and for the colour grouping
sns.scatterplot(data=..., x=..., y=..., hue=...)
# Anchor the legend at (1, 0.8) -- presumably so it sits beside the plot
# instead of covering the points
plt.legend(bbox_to_anchor=[1, 0.8])
plt.show()

# DBSCAN (**D**ensity-**B**ased **S**patial **C**lustering of **A**pplications with **N**oise)
Before going to the main model, we want you to have a fundamental understanding of the algorithm. DBSCAN is a popular **unsupervised** learning method used in model building and machine learning. In machine learning, unsupervised learning aims to identify patterns in data sets containing data points that are neither classified nor labeled. Indeed, clustering is one of the main concerns of unsupervised learning problems. 
**DBSCAN** is a clustering algorithm designed to find areas in the data that have a high density of observations, as opposed to areas that are sparse in observations. DBSCAN works with these rules:
- Divides the dataset into n dimensions
- For each point in the dataset, DBSCAN forms an n dimensional shape around that data point, and then counts how many data points fall within that shape.
- DBSCAN counts this shape as a cluster. DBSCAN iteratively expands the cluster, by going through each individual point within the cluster, and counting the number of other data points nearby. Take the graphic below for an example:
<!-- ![image.png](attachment:1ee0125c-ad99-42d3-aae4-e9a1b7af5039.png) -->
<img src="https://miro.medium.com/max/2000/1*zbm_3K647rvNDmgL6HWUNQ.png" alt="Drawing" style="width:70%;">

After DBSCAN has done so, it will start at a random point (in this case let's assume it was one of the red points), and it will count how many other points are nearby. DBSCAN will continue this process until no other data points are nearby, and then it will look to form a second cluster. 
There are a few parameters and specifications that we need to give DBSCAN before it does its work. These are: 
- eps: the maximum distance between two samples for one to be considered as in the neighborhood of the other.
- min_samples: the number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself.
- metric: the metric to use when calculating distance between instances in a feature array.

### Define the model
Let's define the model using the DBSCAN algorithm. In this case:
- We want to trace anyone who has been in close contact with the infected patient, i.e. within 6 feet (1.8288 meters).
- A cluster needs a minimum of 2 samples to be formed.
- The utilized metric is 'haversine'.

In [None]:
# Please fill your answer in '...' 
# NOTE: scikit-learn's 'haversine' metric expects [latitude, longitude] in
# radians and measures distances in radians. The coordinates are therefore
# converted with np.radians, and eps must be given in radians as well:
#     eps = distance_in_km / 6371.0088   (mean Earth radius in km)
model = DBSCAN(eps=..., min_samples=..., metric=...).fit(
    np.radians(df[['latitude', 'longitude']])
)

### Plot the model with legend by 'cluster'

In [None]:
# Plot every sample and colour it by the cluster label DBSCAN assigned to it.
labels = model.labels_
fig = plt.figure(figsize=(8,6))
# x and y are passed by keyword: recent seaborn releases (>= 0.12) accept only
# `data` as a positional argument and raise a TypeError otherwise.
sns.scatterplot(
    x=df['latitude'],
    y=df['longitude'],
    hue=[f'cluster {x}' for x in labels],
)
plt.legend(bbox_to_anchor = [1, 1])
plt.show()

# Tracing infected people
Let's find the people who have had close contact with the infected patient. To do so, we’ll define the function 'get_infected_names', which prompts for the name of an infected patient:

In [None]:
'''
Define the function to trace the infected persons
'''
# Please fill your answer in '...'
def get_infected_names():
    """Prompt for an infected person's id and trace that person's contacts.

    Reads a name from standard input, asking again until it matches an entry
    of ``df['id']``. Stores the labels of the fitted ``model`` in a
    ``df['cluster']`` column, then (once the '...' placeholders are filled
    in) gathers the other ids that share a cluster with the infected person.

    Returns:
        list: ``infected_list``, intended to hold the ids of the close
        contacts, excluding the infected person.
    """
    # Build the list of known ids once instead of on every re-prompt.
    known_ids = df['id'].tolist()
    infected_name = input()
    while infected_name not in known_ids:
        print('No name found! Please do again!')
        infected_name = input()

    # Side effect: adds (or overwrites) the 'cluster' column on the global df.
    df['cluster'] = model.labels_.tolist()
    infected_name_clusters = []

    # Try to find out which cluster does the input name belong to:
    # - Step 1: Go through the list of cluster and find out all the clusters
    #           that the infected name belongs to.
    # - Step 2: Check whether those clusters are already included in the
    #           infected_name_clusters list or not.
    for i in range(...):
        if ...:
            if ...:
                pass
            else:
                ...

    # Try to find out the persons who have close contact with the infected
    # patient:
    # - Step 1: Find out the ids (names) that belong to each cluster in
    #           infected_name_clusters.
    # - Step 2: Add these names ((!) different from infected_name (!)) to the
    #           infected list.
    infected_list = []
    for cluster in infected_name_clusters:
        if cluster != -1:  # -1 means: not belonging to any valid cluster
            ids_in_cluster = df.loc[..., ...]
            for i in range(...):
                member_id = ids_in_cluster.iloc[...]
                if (member_id not in infected_list) and (...):
                    ...
                else:
                    pass

    return infected_list