# Title
by Zaki Aslam, Hector Palafox Prieto, Jennifer Tsang, and Samrawit Mezgebo Tsegay

In [15]:
import numpy as np
import pandas as pd
import altair as alt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import BernoulliNB
from scipy.stats import lognorm, randint

# Summary

Here is the summary of our project...

- similar to the "About" section in the README

# Introduction

- provide some relevant background information on the topic so that someone unfamiliar with it will be prepared to understand the rest of your report
- clearly state the question you tried to answer with your project
- identify and describe the dataset that was used to answer the question

# Methods & Results

## Data

## Analysis

- Describe in written English the methods you used to perform your analysis from beginning to end, narrating the code that does the analysis.
- your report should include code which:
    - loads data from the original source on the web
    - wrangles and cleans the data from its original (downloaded) format to the format necessary for the planned classification or clustering analysis
    - performs a summary of the data set that is relevant for exploratory data analysis related to the planned classification analysis
    - creates a visualization of the dataset that is relevant for exploratory data analysis related to the planned classification analysis
    - performs classification or regression analysis
    - creates a visualization of the result of the analysis
- Note: all tables and figures should have a figure/table number and a legend.

In [2]:
# Read the semicolon-delimited food vendor CSV from the project's data folder
food_vendors = pd.read_csv("../data/food-vendors.csv", sep=";")

food_vendors

Unnamed: 0,KEY,VENDOR_TYPE,STATUS,BUSINESS_NAME,LOCATION,DESCRIPTION,GEO_LOCALAREA,Geom,geo_point_2d
0,EB04,vendor_food,open,,South Side of 1700 Beach Ave - East Side of Bi...,Hot Dogs,West End,"{""coordinates"": [-123.142926123068, 49.2844925...","49.2844925173, -123.142926123"
1,OD05,vendor_food,open,,South Side of 1700 East Broadway - 37 Metres E...,Hot Dogs,Kensington-Cedar Cottage,"{""coordinates"": [-123.069072741187, 49.2622160...","49.2622160477, -123.069072741"
2,DT10,vendor_food,open,,Expo Boulevard - 20 Metres West of Abbott St,Hot Dogs,Downtown,"{""coordinates"": [-123.108008079327, 49.2787930...","49.2787930538, -123.108008079"
3,C18,vendor_food,open,Aussie Pie Guy,Authorised Parking Meter - 1000 W Pender St,Australian Pies,Downtown,"{""coordinates"": [-123.118192426382, 49.2869519...","49.2869519782, -123.118192426"
4,C17,vendor_food,open,Le Tigre,Authorised Parking Meter North Side of Alberni...,Chinese Cuisine,West End,"{""coordinates"": [-123.12561890857, 49.28659154...","49.2865915482, -123.125618909"
...,...,...,...,...,...,...,...,...,...
86,DT52,vendor_food,open,Japadog,West Side of 800 Burrard St - 2.5 Metres North...,Hot Dogs,West End,"{""coordinates"": [-123.124363670916, 49.2824199...","49.2824199268, -123.124363671"
87,GM10,vendor_food,open,Potato Tornado,West Side of 800 Granville St - 6 Metres South...,Potato Based Dishes,Downtown,"{""coordinates"": [-123.120079603127, 49.2812971...","49.2812971849, -123.120079603"
88,C25,vendor_food,open,Roaming Dragon,Authorised Parking Meter East Side of 1600 Man...,Asian Fusion,Mount Pleasant,"{""coordinates"": [-123.106548732322, 49.2713078...","49.2713078459, -123.106548732"
89,C26,vendor_food,open,Super Thai,Authorised Parking Meter North Side of 1000 W ...,Thai Cuisine,Downtown,"{""coordinates"": [-123.122623764676, 49.2855321...","49.285532115, -123.122623765"


In [3]:
# Data wrangling and cleaning

# In one chain: drop the columns treated as irrelevant, add the boolean
# `is_hotdog` target, and replace missing business names with an empty string.
clean_food = (
    food_vendors
    .drop(columns=["KEY", "VENDOR_TYPE", "STATUS"])
    .assign(
        is_hotdog=lambda df: df["DESCRIPTION"] == "Hot Dogs",
        BUSINESS_NAME=lambda df: df["BUSINESS_NAME"].fillna(""),
    )
)

clean_food.head()

Unnamed: 0,BUSINESS_NAME,LOCATION,DESCRIPTION,GEO_LOCALAREA,Geom,geo_point_2d,is_hotdog
0,,South Side of 1700 Beach Ave - East Side of Bi...,Hot Dogs,West End,"{""coordinates"": [-123.142926123068, 49.2844925...","49.2844925173, -123.142926123",True
1,,South Side of 1700 East Broadway - 37 Metres E...,Hot Dogs,Kensington-Cedar Cottage,"{""coordinates"": [-123.069072741187, 49.2622160...","49.2622160477, -123.069072741",True
2,,Expo Boulevard - 20 Metres West of Abbott St,Hot Dogs,Downtown,"{""coordinates"": [-123.108008079327, 49.2787930...","49.2787930538, -123.108008079",True
3,Aussie Pie Guy,Authorised Parking Meter - 1000 W Pender St,Australian Pies,Downtown,"{""coordinates"": [-123.118192426382, 49.2869519...","49.2869519782, -123.118192426",False
4,Le Tigre,Authorised Parking Meter North Side of Alberni...,Chinese Cuisine,West End,"{""coordinates"": [-123.12561890857, 49.28659154...","49.2865915482, -123.125618909",False


In [5]:
# Split the cleaned vendors into a 70% training set and the remaining test set;
# the fixed random_state keeps the split reproducible across runs.
train_data, test_data = train_test_split(clean_food, random_state=522, train_size=0.7)


In [6]:
# Summary EDA: flag vendors without a business name and inspect class balance

# `train_data` comes out of train_test_split as a subset of `clean_food`, so
# adding columns to it in place can raise pandas' SettingWithCopyWarning.
# `.assign()` builds a new frame instead, which also keeps this cell safe to
# re-run. Both columns remain available on `train_data` for later cells.
train_data = train_data.assign(
    text_is_na=train_data["BUSINESS_NAME"] == "",
    is_hotdog=train_data["DESCRIPTION"] == "Hot Dogs",
)

# Left panel: number of vendors per food description, sorted by count.
description_counts = alt.Chart(train_data).mark_bar().encode(
    x="count()",
    y=alt.Y("DESCRIPTION").sort("-x"),
)

# Right panel: vendor count (as colour) for each combination of
# "business name is missing" and "vendor sells hot dogs".
name_vs_label = alt.Chart(train_data).mark_bar().encode(
    x="text_is_na",
    y="is_hotdog",
    color="count()",
)

description_counts | name_vs_label

In [16]:
# Data visualization for EDA


52

In [8]:
def build_pipeline(model):
    """Build a text-classification pipeline around the given estimator.

    Parameters
    ----------
    model : estimator
        The final step of the pipeline (e.g. ``DummyClassifier()`` or
        ``BernoulliNB()``).

    Returns
    -------
    sklearn.pipeline.Pipeline
        A pipeline that runs a default ``CountVectorizer`` followed by
        ``model``. The vectorizer step is reachable as
        ``named_steps["countvectorizer"]``.
    """
    vectorizer = CountVectorizer()
    return make_pipeline(vectorizer, model)

In [10]:
# Baseline classification: bag-of-words features fed to a dummy classifier

# The business name is the only feature; the hot dog flag is the target.
X_train, y_train = train_data["BUSINESS_NAME"], train_data["is_hotdog"]
X_test, y_test = test_data["BUSINESS_NAME"], test_data["is_hotdog"]

bag_of_words = build_pipeline(DummyClassifier())
bag_of_words.fit(X_train, y_train)

# Feature names produced by the fitted CountVectorizer step of the pipeline.
fitted_vectorizer = bag_of_words.named_steps["countvectorizer"]
vocab = fitted_vectorizer.get_feature_names_out()

95

In [7]:
# Second model: the same bag-of-words pipeline with a Bernoulli naive Bayes
# classifier as the final step (built here, not yet fitted).
naive_bayes = build_pipeline(model=BernoulliNB())


In [8]:
# visualization of the result of the analysis


# Discussion

- summarize what you found
- discuss whether this is what you expected to find?
- discuss what impact could such findings have?
- discuss what future questions could this lead to?

# References

At least 4 citations relevant to the project (the format is your choice; just be consistent across the references).

# Note:
- Make sure to render to PDF form and push to GitHub.
- Everyone should contribute equally to the code and writing!
- Git commit messages should be meaningful; they will be marked!
- Use GitHub issues to assign tasks and communicate!
- Proper grammar and full sentences in README