## Initialisation  

Loading of packages and files. There are some packages which will be loaded at a later stage for ease of reference, though I understand that the best practice is to import all required packages up front.

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import datetime

# For encoding of hashed id into something easier to tag
from sklearn import preprocessing

# Maps require the folium package, which needs branca and python-abi as dependencies.
# Installed using conda-forge. On government workstation may not be so straightforward due to internet separation.
import folium
from folium.plugins import MarkerCluster

# For text mining
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import wordcloud

In [5]:
# Load the raw sample dataset from a CSV file in the working directory.
data = pd.read_csv("sd_sample_dataset.csv")

In [35]:
#data.head(3)

## PRE-PROCESSING  

This section details the rough steps taken to process the data.

In [36]:
# Work on a copy so that the loaded frame `data` is left unmodified by the steps below.
data2 = data.copy()

In [37]:
# Inspect the original column names before they are renamed.
data2.columns

Index(['Case Record Key', 'Received Date', 'Received Time',
       'Case Owner Agency', 'Mobile Categories', 'Mobile Sub Categories',
       'Description', 'Incident Block', 'Incident Street Name',
       'Incident Postal Code', 'Channel', 'address', 'location_desc',
       'Customer_ID', 'latitude', 'longitude'],
      dtype='object')

In [38]:
# Shorter, upper-case column names for easier wrangling.
# The names are assigned by position, so the order below must match the
# order of the columns in the loaded file.
newcols = [
    "CASE ID",
    "DATE",
    "TIME",
    "AGENCY",
    "CATEGORY",
    "SUBCATEGORY",
    "DESCRIPTION",
    "BLOCK",
    "STREET_NAME",
    "POSTAL_CODE",
    "CHANNEL",
    "ADDRESS",
    "LOCATION_DESC",
    "CUST_ID",
    "LATITUDE",
    "LONGITUDE",
]
data2.columns = newcols

In [39]:
# Check the unique values of the agency column.
print(data2["AGENCY"].unique())

['HDB' 'NParks' 'Enterprise SG' 'SFA' 'LTA' 'NEA' 'PA']


In [40]:
# Mapping from the original agency names to short upper-case tags.
# Kept as a variable so that it can be used to convert back if needed.
agency_dict = {
    "NParks": "NPARKS",
    "Enterprise SG": "ESG",
}

# Agencies that are not keys of the mapping are left unchanged.
data2["AGENCY"] = data2["AGENCY"].replace(to_replace = agency_dict)

In [41]:
# CATEGORY holds a single value: only safe distancing.
print(data2["CATEGORY"].unique())

# CHANNEL holds a single value: only OS App. Perhaps the Android version was not developed yet (May 2020).
print(data2["CHANNEL"].unique())

['Safe Distancing']
['OS App']


In [42]:
# Upper-case the values for consistency with the other tags.
# The vectorised .str accessor is used instead of .apply(str.upper) because
# str.upper raises TypeError on missing values (NaN is a float), whereas
# .str.upper() leaves them as NaN.
data2["CATEGORY"] = data2["CATEGORY"].str.upper()
data2["CHANNEL"] = data2["CHANNEL"].str.upper()

In [43]:
# CATEGORY and CHANNEL carry the same value in every row of this sample, so
# they are removed for the purposes of this assessment.
# With the full data they could support further analysis; what kind of
# analysis would depend on the problem statement being asked.
data2 = data2.drop(["CATEGORY", "CHANNEL"], axis = "columns")

In [44]:
# List the raw sub-category labels; easier keys are provided for them so they can be categorised later on.
print(data2["SUBCATEGORY"].unique())

['HDB Town/Neighbourhood Ctrs & HDB Common Areas' 'Parks & PCNs'
 'Malls and Commercial Areas' 'Coffeeshops & F&B in HDB Estates'
 'Transport Nodes' 'Hawker Ctrs & Wet Markets' 'Community Clubs']


In [45]:
# Short keys for each raw sub-category label.
subcat_dict = {
    "HDB Town/Neighbourhood Ctrs & HDB Common Areas": "HDB_ESTATE_AREA",
    "Parks & PCNs": "PARKS_PCNS",
    "Malls and Commercial Areas": "MALLS_COMMERCIALS",
    "Coffeeshops & F&B in HDB Estates": "HDB_F&B_COFFEESHOP",
    "Transport Nodes": "TRANSPORT_NODES",
    "Hawker Ctrs & Wet Markets": "HAWKER_WETMKT",
    "Community Clubs": "COMMUNITY_CLUBS",
}
data2["SUBCATEGORY"] = data2["SUBCATEGORY"].replace(to_replace = subcat_dict)

In [46]:
#data2.head(2)

In [47]:
# Join the DATE and TIME strings with a space and parse the result in one step.
# Expected layout: day/month/year followed by a 12-hour clock time with AM/PM.
data2["DATETIME"] = pd.to_datetime(
    data2["DATE"] + " " + data2["TIME"],
    format = "%d/%m/%Y %I:%M:%S %p",
)

In [48]:
# DATE and TIME are subsumed in DATETIME, so the new frame leaves them out.
data3 = data2.drop(["DATE", "TIME"], axis = "columns")

In [49]:
# CUST_ID differentiates customers. Encode it as integer labels so that it is
# easier to refer to, like a case id.

label_enc = preprocessing.LabelEncoder()
label_enc.fit(data3["CUST_ID"])
test = label_enc.transform(data3["CUST_ID"])

data3["NEWCUST_ID"] = test

In [50]:
# Keep the case id together with the original and encoded customer ids, just
# in case the original id is needed again, then remove CUST_ID from the
# working frame.
unused_df = data3[["CASE ID", "CUST_ID", "NEWCUST_ID"]].copy()
data3 = data3.drop(columns = ["CUST_ID"])

In [51]:
#data3.head(3)

In [52]:
# Only run if you have not downloaded the stopwords
# Ensure that the downloaded package is in a folder that exists in nltk.data.path. Otherwise have to add it.
# nltk.download("stopwords")

## Analysis  

This section will analyse the data and attempt to generate insights.

In [53]:
# Latitude and longitude of Singapore, as [latitude, longitude].
# NOTE(review): create_folium's default `location` repeats these values literally rather than using `sg`.
sg = [1.3521, 103.8198]

__Create functions for ease of visualisation and processing.__

In [26]:
def create_folium(df, location = [1.3521, 103.8198], tiles = "CartoDB positron", zoom_start = 11, filenm = "output.html"):
    '''
    Plot every row of ``df`` as a clustered marker on a folium map and save the map as HTML.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "LATITUDE", "LONGITUDE", "POSTAL_CODE" and "DESCRIPTION".
    location : list
        [latitude, longitude] on which the map is centred.
    tiles : str
        Tile set passed to ``folium.Map``.
    zoom_start : int
        Initial zoom level passed to ``folium.Map``.
    filenm : str
        Path of the HTML file that the map is saved to.

    Returns
    -------
    None. The map is written to ``filenm``.

    Notes
    -----
    Rows whose latitude or longitude is missing are skipped, as they cannot be
    placed on the map.
    '''
    # Local import so that the function carries its own dependency.
    import html

    mymap = folium.Map(location = location, tiles = tiles, zoom_start = zoom_start)

    # Add marker clusters.
    marker_cluster = MarkerCluster().add_to(mymap)

    rows = zip(df["LATITUDE"], df["LONGITUDE"], df["POSTAL_CODE"], df["DESCRIPTION"])
    for lat, lon, postal, description in rows:
        if pd.isna(lat) or pd.isna(lon):
            continue

        # The popup text is rendered as HTML: the line break has to be <br>,
        # and the free-text fields are escaped so that user-entered markup is
        # shown literally. str() guards against non-string values such as NaN.
        popup = "Location:" + html.escape(str(postal)) + "<br>" + html.escape(str(description))
        folium.Marker([lat, lon], popup = popup).add_to(marker_cluster)

    mymap.save(filenm)

In [27]:
# Create function for data processing.

def text_mine(text, punctuation = "!@#$%^&*()_+<>?:.,;", stop_words = None):
    '''
    Tokenize text into words, strip punctuation and remove stop words.

    Parameters
    ----------
    text : str or list of str
        Text to process. A list is joined with single spaces first.
    punctuation : str
        Characters that are removed from every token.
    stop_words : collection of str, optional
        Words to discard. Defaults to the NLTK English stop word list.

    Returns
    -------
    list of str
        The remaining tokens, in their original order and original casing.

    Requires nltk.word_tokenize and the NLTK stopwords corpus.
    '''
    if stop_words is None:
        # The corpus file id is lower-case; "English" is not found on
        # case-sensitive file systems.
        stop_words = set(stopwords.words("english"))

    if isinstance(text, list):
        text = " ".join(text)

    # Translation table that deletes every punctuation character.
    strip_punctuation = str.maketrans("", "", punctuation)

    # Stemming (to group e.g. walk and walking) is deliberately not done, for
    # readability of the wordcloud output.
    filtered = []
    for token in word_tokenize(text):
        word = token.translate(strip_punctuation)

        # word_tokenize separates punctuation into its own tokens, which are
        # empty after stripping and therefore dropped.
        if word == "":
            continue

        # The stop word list is lower-case, so capitalised words such as "The"
        # are compared in lower case as well.
        if word in stop_words or word.lower() in stop_words:
            continue

        filtered.append(word)

    return filtered

In [28]:
# Create function for wordcloud
def create_save_cloud(text, max_words = 100, bg_color = "white", filenm = "Output.png"):
    '''
    Generate a word cloud from ``text`` and save it as an image.

    Parameters
    ----------
    text : str or iterable of str
        Text for the cloud. Anything that is not already a string is joined
        with single spaces.
    max_words : int
        Maximum number of words shown in the cloud.
    bg_color : str
        Background colour of the cloud.
    filenm : str
        Path of the image file that the figure is saved to.

    Returns
    -------
    None. The figure is written to ``filenm``.

    Requires installation/imports of wordcloud.
    '''
    # A plain string is used as-is; previously only lists were handled, and any
    # other input left the text variable undefined.
    if isinstance(text, str):
        input_txt = text
    else:
        input_txt = " ".join(text)

    word_cloud = wordcloud.WordCloud(max_words = max_words, background_color = bg_color, width=1600, height=800).generate(input_txt)

    # Draw the generated image without axes and save it.
    plt.figure(figsize=(20,10))
    plt.imshow(word_cloud, interpolation='bilinear')
    plt.axis("off")
    plt.savefig(filenm)

__Apply these functions to the desired data slices.__

In [71]:
# All feedbacks
# create_folium(data3, filenm = "cluster_feedback_all.html")

In [None]:
# 1 map + 1 wordcloud for each category.
# By agency in charge.

# The output files are written into this folder, so make sure it exists.
os.makedirs("./categories", exist_ok = True)

categories = list(data3["AGENCY"].unique())
filenms_html = ["./categories/" + agency + ".html" for agency in categories]
filenms_png = ["./categories/" + agency + ".png" for agency in categories]

for idx, cat in enumerate(categories):
    df = data3.loc[data3["AGENCY"] == cat, :]
    create_folium(df, filenm = filenms_html[idx])

    # Missing descriptions are dropped because str.join raises TypeError on NaN.
    text = " ".join(df["DESCRIPTION"].dropna().astype(str))
    cleaned = text_mine(text)
    create_save_cloud(cleaned, max_words = 150, filenm = filenms_png[idx])

In [None]:
# By time of day: 1 map + 1 wordcloud for each time window.

# The output files are written into this folder, so make sure it exists.
os.makedirs("./time", exist_ok = True)

start_time = ["00:00:00", "08:00:00", "19:00:00"]
end_time = ["07:59:59", "18:59:59", "23:59:59"]
times = ["12mn_8am", "8am_7pm", "7pm_12mn"]
filenms_html = ["./time/" + time_day + ".html" for time_day in times]
filenms_png = ["./time/" + time_day + ".png" for time_day in times]

# between_time filters on a datetime index, so index by DATETIME once up front.
data3_by_time = data3.set_index("DATETIME")

for idx, cat in enumerate(times):

    # Pick the bounds of this window by position, so that each output covers
    # its own time range rather than the first window every time.
    df = data3_by_time.between_time(start_time[idx], end_time[idx])
    create_folium(df, filenm = filenms_html[idx])

    # Missing descriptions are dropped because str.join raises TypeError on NaN.
    text = " ".join(df["DESCRIPTION"].dropna().astype(str))
    cleaned = text_mine(text)
    create_save_cloud(cleaned, max_words = 150, filenm = filenms_png[idx])

In [None]:
# By first and last day (1st May and 19th May, going by the bounds below).
# NOTE(review): the original note said 20th May while the bounds use 19th May - confirm which is intended.

# The output files are written into this folder, so make sure it exists.
os.makedirs("./day", exist_ok = True)

start_day = ["2020-05-01 00:00:00", "2020-05-19 00:00:00"]
end_day = ["2020-05-01 23:59:59", "2020-05-19 23:59:59"]
dates = ["May-01", "May-19"]

filenms_html = ["./day/" + day + ".html" for day in dates]
filenms_png = ["./day/" + day + ".png" for day in dates]

for idx, cat in enumerate(dates):
    # Pick the bounds of this day by position, so that each output covers its
    # own day rather than the first day every time.
    df = data3.loc[data3['DATETIME'].between(start_day[idx], end_day[idx]), :]
    create_folium(df, filenm = filenms_html[idx])

    # Missing descriptions are dropped because str.join raises TypeError on NaN.
    text = " ".join(df["DESCRIPTION"].dropna().astype(str))
    cleaned = text_mine(text)
    create_save_cloud(cleaned, max_words = 200, filenm = filenms_png[idx])

#### *Power users who show strong support in reporting residents who potentially flout safe distancing measures*  

To drill into 'top 10' users of the app.

In [82]:
# The ten encoded customer ids with the most records, with their record counts.
data3["NEWCUST_ID"].value_counts().nlargest(10)

3953    107
1732     75
447      36
3126     31
3346     30
5868     30
6297     26
3629     24
1560     23
2522     23
Name: NEWCUST_ID, dtype: int64

In [85]:
# Encoded ids of the ten customers with the most records, most active first.
records_per_customer = data3["NEWCUST_ID"].value_counts()
top10 = records_per_customer.nlargest(10).index.tolist()
print(top10)

[3953, 1732, 447, 3126, 3346, 5868, 6297, 3629, 1560, 2522]


In [86]:
# Check each top user in turn, starting with the postal codes, to see whether
# they keep reporting the same place or not.
message = "User (based on hashed ID): {}. Unique postal codes reported = {}"
for top in top10:
    is_this_user = data3["NEWCUST_ID"] == top
    postal = list(data3.loc[is_this_user, "POSTAL_CODE"].unique())
    print(message.format(top, postal))

User (based on hashed ID): 3953. Unique postal codes reported = ['752510']
User (based on hashed ID): 1732. Unique postal codes reported = ['389379', '398721', '389380']
User (based on hashed ID): 447. Unique postal codes reported = ['150091', '320082', '320092', '320087', '50335', '89168', '328827', '50005', '50004', '320090', '89167', '80334', '151115', '50336']
User (based on hashed ID): 3126. Unique postal codes reported = ['121416', '120418', '121420']
User (based on hashed ID): 3346. Unique postal codes reported = ['18955', 'NIL', '39805', '18972', '18940', nan, '307591', '18953']
User (based on hashed ID): 5868. Unique postal codes reported = [nan, '679041']
User (based on hashed ID): 6297. Unique postal codes reported = [nan, '576462', 'NIL']
User (based on hashed ID): 3629. Unique postal codes reported = ['398722', '389379', '398721', '398730', nan]
User (based on hashed ID): 1560. Unique postal codes reported = ['NIL', '460420']
User (based on hashed ID): 2522. Unique postal 

In [None]:
# Power users
# 1 map + 1 wordcloud + 1 text file of descriptions for each of the top users.

# The output files are written into this folder, so make sure it exists.
os.makedirs("./power_users", exist_ok = True)

filenms_html = ["./power_users/" + str(top) + ".html" for top in top10]
filenms_png = ["./power_users/" + str(top) + ".png" for top in top10]
filenms_txt = ["./power_users/" + str(top) + ".txt" for top in top10]

for idx, top in enumerate(top10):
    df = data3.loc[data3["NEWCUST_ID"] == top, :]
    create_folium(df, filenm = filenms_html[idx])

    # Missing descriptions are dropped because str.join raises TypeError on NaN.
    descriptions = df["DESCRIPTION"].dropna().astype(str)

    text = " ".join(descriptions)
    cleaned = text_mine(text)
    create_save_cloud(cleaned, max_words = 150, filenm = filenms_png[idx])

    # One description per line. write() is used because the joined text is a
    # single string, not a list of lines.
    with open(filenms_txt[idx], "w", encoding="utf-8") as file:
        file.write("\n".join(descriptions))