# MMTC Webscrape

In [1]:
# import libraries
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# launch browser
# ChromeDriverManager().install() supplies the driver path that splinter is started with
executable_path = {'executable_path': ChromeDriverManager().install()}
url = 'https://knowthefactsmmj.com/mmtc/'

# open a visible (non-headless) Chrome window and load the page to scrape
browser = Browser('chrome', headless=False, **executable_path)
browser.visit(url)

In [3]:
# create Beautiful Soup object
# snapshot the HTML currently held by the browser, then parse it with the built-in parser
html = browser.html
soup = BeautifulSoup(markup=html, features='html.parser')

## Dispensary Centers

In [4]:
# extract dispensary center table
# look the <table> up by its id attribute
table = soup.find('table', attrs={'id': 'approveddispensing441'})

In [5]:
# extract table headers

# the first <tr> of the table is used as the header row
header_row = table.find('tr')

# collect the stripped <span> text from each <td> of that row
header_names = []
for header in header_row.find_all('td'):
    header_names.append(header.find('span').text.strip())

In [6]:
# confirm data collection: show the header names extracted from the table's first row
print(header_names)

['Name', 'Phone', 'Email', 'Authorization Status', 'License Number']


In [7]:
# extract table data

# skip the first <tr>, which was already consumed as the header row
data_rows = table.find_all('tr')[1:]

# build one list of stripped cell strings per remaining row
row_data = []
for row in data_rows:
    row_data.append([cell.text.strip() for cell in row.find_all('td')])

In [8]:
# confirm data collection: pretty-print the per-row lists of cell text gathered from the table
pprint(row_data)

[['Ayr Cannabis Dispensary',
  '833-254-4877',
  'Info@libertyhealthsciences.com',
  'Dispensing Authorization',
  'MMTC-2015-0002'],
 ['Cannabist',
  '800-714-9215',
  'info@col-carefl.com',
  'Dispensing Authorization',
  'MMTC-2017-0011'],
 ['Cookies Florida, Inc.',
  'n/a',
  'n/a',
  'Dispensing Authorization',
  'MMTC-2019-0018'],
 ['Curaleaf',
  '877-303-0741',
  'info.fl@curaleaf.com',
  'Dispensing Authorization',
  'MMTC-2015-0001'],
 ['FLUENT',
  '833-735-8368',
  'info@getfluent.com',
  'Dispensing Authorization',
  'MMTC-2015-0003'],
 ['Gold Leaf', 'n/a', 'n/a', 'Dispensing Authorization', 'MMTC-2019-0019'],
 ['Green Dragon',
  '720-600-9555',
  'support@greendragon.com',
  'Dispensing Authorization',
  'MMTC-2019-0021'],
 ['GrowHealthy',
  '863-223-8882',
  'info@GrowHealthy.com',
  'Dispensing Authorization',
  'MMTC-2016-0007'],
 ['GTI (Rise Dispensaries)',
  '305-306-8772',
  'FLinfo@gtigrows.com',
  'Dispensing Authorization',
  'MMTC-2017-0013'],
 ['House of Platinum

In [9]:
# import data into a dataframe
# one row per scraped table row, labelled with the scraped header names
df_companies = pd.DataFrame(data=row_data, columns=header_names)

df_companies

Unnamed: 0,Name,Phone,Email,Authorization Status,License Number
0,Ayr Cannabis Dispensary,833-254-4877,Info@libertyhealthsciences.com,Dispensing Authorization,MMTC-2015-0002
1,Cannabist,800-714-9215,info@col-carefl.com,Dispensing Authorization,MMTC-2017-0011
2,"Cookies Florida, Inc.",,,Dispensing Authorization,MMTC-2019-0018
3,Curaleaf,877-303-0741,info.fl@curaleaf.com,Dispensing Authorization,MMTC-2015-0001
4,FLUENT,833-735-8368,info@getfluent.com,Dispensing Authorization,MMTC-2015-0003
5,Gold Leaf,,,Dispensing Authorization,MMTC-2019-0019
6,Green Dragon,720-600-9555,support@greendragon.com,Dispensing Authorization,MMTC-2019-0021
7,GrowHealthy,863-223-8882,info@GrowHealthy.com,Dispensing Authorization,MMTC-2016-0007
8,GTI (Rise Dispensaries),305-306-8772,FLinfo@gtigrows.com,Dispensing Authorization,MMTC-2017-0013
9,House of Platinum Cannabis,,,Dispensing Authorization,MMTC-2018-0014


In [10]:
# clean up placeholder and display values; each replace returns a new frame,
# so the result is rebound to df_companies instead of mutating it in place
df_companies = (
    df_companies
    .replace('n/a', np.nan)                # change all n/a values to null
    .replace('MüV', 'MuV')                 # renaming
    .replace('Sunnyside*', 'Sunnyside')    # renaming
)


df_companies

Unnamed: 0,Name,Phone,Email,Authorization Status,License Number
0,Ayr Cannabis Dispensary,833-254-4877,Info@libertyhealthsciences.com,Dispensing Authorization,MMTC-2015-0002
1,Cannabist,800-714-9215,info@col-carefl.com,Dispensing Authorization,MMTC-2017-0011
2,"Cookies Florida, Inc.",,,Dispensing Authorization,MMTC-2019-0018
3,Curaleaf,877-303-0741,info.fl@curaleaf.com,Dispensing Authorization,MMTC-2015-0001
4,FLUENT,833-735-8368,info@getfluent.com,Dispensing Authorization,MMTC-2015-0003
5,Gold Leaf,,,Dispensing Authorization,MMTC-2019-0019
6,Green Dragon,720-600-9555,support@greendragon.com,Dispensing Authorization,MMTC-2019-0021
7,GrowHealthy,863-223-8882,info@GrowHealthy.com,Dispensing Authorization,MMTC-2016-0007
8,GTI (Rise Dispensaries),305-306-8772,FLinfo@gtigrows.com,Dispensing Authorization,MMTC-2017-0013
9,House of Platinum Cannabis,,,Dispensing Authorization,MMTC-2018-0014


## Dispensary Locations

In [11]:
# extract dispensary locations table
# look the <table> up by its id attribute
table = soup.find('table', attrs={'id': 'DataTables_Table_0'})

In [12]:
# extract table headers
# the column titles are the <th> cells inside the table's <thead>
header_row = table.find('thead')

header_names = []
for header in header_row.find_all('th'):
    header_names.append(header.text.strip())

In [13]:
# confirm data collection: show the column names read from the table's <thead>
print(header_names)

['COMPANY', 'ADDRESS', 'EMAIL ADDRESS', 'PHONE', 'CITY', 'ZIP CODE', 'COUNTY']


In [14]:
# extract table data
# the data rows are the <tr> elements inside the table's <tbody>
data = table.find('tbody')
data_rows = data.find_all('tr')

# build one list of stripped cell strings per row
row_data = []
for row in data_rows:
    row_data.append([cell.text.strip() for cell in row.find_all('td')])

In [15]:
# confirm data collection: pretty-print the rows read from the table's <tbody>
pprint(row_data)

[['Ayr Cannabis Dispensary',
  '6325 N Orange Blossom Trail',
  '-',
  '-',
  'Orlando',
  '32810',
  'Orange'],
 ['Ayr Cannabis Dispensary',
  '7390 Aloma Avenue',
  '-',
  '-',
  'Winter Park',
  '32792',
  'Orange'],
 ['Ayr Cannabis Dispensary',
  '4650 N Alafaya Trail',
  '-',
  '-',
  'Orlando',
  '32826',
  'Orange'],
 ['Ayr Cannabis Dispensary',
  '440 N State Road 19',
  '-',
  '-',
  'Palatka',
  '32177',
  'Putnam'],
 ['Ayr Cannabis Dispensary',
  '6930 Cypress Gardens Boulevard',
  '-',
  '-',
  'Winter Haven',
  '33884',
  'Polk'],
 ['Ayr Cannabis Dispensary',
  '13832 Landstar Boulevard',
  '-',
  '-',
  'Orlando',
  '32824',
  'Orange'],
 ['Ayr Cannabis Dispensary',
  '3701 W Lake Mary Boulevard',
  '-',
  '-',
  'Lake Mary',
  '32746',
  'Seminole'],
 ['Ayr Cannabis Dispensary',
  '8898 SW 144th Terrace',
  '-',
  '-',
  'Palmetto Bay',
  '33158',
  'Miami-Dade'],
 ['Ayr Cannabis Dispensary',
  '21529 Village Lakes Shopping Center Drive',
  '-',
  '-',
  'Land O’ Lakes',

In [16]:
# extract table data from every page of the locations table

# Start from a freshly loaded page: the loop below advances the table with
# 'Next', so re-running this cell without a reload would begin on whatever
# page the previous run stopped at and collect the wrong rows.
browser.visit(url)
soup = BeautifulSoup(browser.html, 'html.parser')

# find table pages
pagination = soup.find('div', id='DataTables_Table_0_paginate')
pagination_buttons = pagination.find_all('a')
# the second-to-last link is read as the highest page number
# (presumably the links are Previous, 1..N, Next -- confirm against the page)
last_page_button = pagination_buttons[-2]
last_page = int(last_page_button.text)+1

row_data = []

# loop through all table pages
for x in range(1, last_page):
    # re-parse the live page on every pass so the rows of the current table page are read
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', id='DataTables_Table_0')
    table_data = table.find('tbody')
    data_rows = table_data.find_all('tr')
    row_data.extend([[cell.text.strip() for cell in row.find_all('td')] for row in data_rows])

    # go to next page, except after the final page where there is nothing left to load
    if x < last_page - 1:
        browser.links.find_by_partial_text('Next').click()

In [17]:
# confirm data collection: pretty-print the rows accumulated across the table pages
pprint(row_data)

[['Ayr Cannabis Dispensary',
  '6325 N Orange Blossom Trail',
  '-',
  '-',
  'Orlando',
  '32810',
  'Orange'],
 ['Ayr Cannabis Dispensary',
  '7390 Aloma Avenue',
  '-',
  '-',
  'Winter Park',
  '32792',
  'Orange'],
 ['Ayr Cannabis Dispensary',
  '4650 N Alafaya Trail',
  '-',
  '-',
  'Orlando',
  '32826',
  'Orange'],
 ['Ayr Cannabis Dispensary',
  '440 N State Road 19',
  '-',
  '-',
  'Palatka',
  '32177',
  'Putnam'],
 ['Ayr Cannabis Dispensary',
  '6930 Cypress Gardens Boulevard',
  '-',
  '-',
  'Winter Haven',
  '33884',
  'Polk'],
 ['Ayr Cannabis Dispensary',
  '13832 Landstar Boulevard',
  '-',
  '-',
  'Orlando',
  '32824',
  'Orange'],
 ['Ayr Cannabis Dispensary',
  '3701 W Lake Mary Boulevard',
  '-',
  '-',
  'Lake Mary',
  '32746',
  'Seminole'],
 ['Ayr Cannabis Dispensary',
  '8898 SW 144th Terrace',
  '-',
  '-',
  'Palmetto Bay',
  '33158',
  'Miami-Dade'],
 ['Ayr Cannabis Dispensary',
  '21529 Village Lakes Shopping Center Drive',
  '-',
  '-',
  'Land O’ Lakes',

In [18]:
# import data into a dataframe
# one row per scraped location row, labelled with the scraped column names
df_locations = pd.DataFrame(data=row_data, columns=header_names)

df_locations

Unnamed: 0,COMPANY,ADDRESS,EMAIL ADDRESS,PHONE,CITY,ZIP CODE,COUNTY
0,Ayr Cannabis Dispensary,6325 N Orange Blossom Trail,-,-,Orlando,32810,Orange
1,Ayr Cannabis Dispensary,7390 Aloma Avenue,-,-,Winter Park,32792,Orange
2,Ayr Cannabis Dispensary,4650 N Alafaya Trail,-,-,Orlando,32826,Orange
3,Ayr Cannabis Dispensary,440 N State Road 19,-,-,Palatka,32177,Putnam
4,Ayr Cannabis Dispensary,6930 Cypress Gardens Boulevard,-,-,Winter Haven,33884,Polk
...,...,...,...,...,...,...,...
584,VidaCann,2007 W Kennedy Boulevard,-,-,Tampa,33606,Hillsborough
585,VidaCann,5203 Cortez Road W,-,-,Bradenton,34210,Manatee
586,VidaCann,1101 S Powerline Road,-,-,Deerfield Beach,33442,Broward
587,VidaCann,1663 Georgia Street NE,-,-,Palm Bay,32907,Brevard


In [19]:
# scraping is finished (no later cell uses the browser), so end the browser session
browser.quit()

In [20]:
# export df

df_companies_path = 'exports/df_companies.csv'
df_locations_path = 'exports/df_locations.csv'

# to_csv does not create missing directories, so make sure the target folder exists first
for export_path in (df_companies_path, df_locations_path):
    Path(export_path).parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the row index out of the exported files
df_companies.to_csv(df_companies_path, index=False)
df_locations.to_csv(df_locations_path, index=False)