In [1]:
# Imports

# Data import and manipulation
import pandas as pd
# Math
import numpy as np
# Let's go ahead and seed the notebook, for reproducibility
np.random.seed(113)

# This library allows us to search for a zipcode and retrieve lat/long data
from uszipcode import SearchEngine

# And then this library will help fill in the gaps for any nulls
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# And this will provide a progress bar
from tqdm import tqdm

In [2]:
# Specifying dtypes for faster input
dtypes = {
    "REPORTER_NAME": "object",
    "REPORTER_ZIP": "int64",
    "BUYER_NAME": "object",
    "BUYER_ZIP": "int64",
    "DRUG_NAME": "object",
    "TRANSACTION_DATE": "object",
    "DOSAGE_UNIT": "float64",
}

# Reading in only the columns related to the buyer's location
data = pd.read_csv("data/arcos-tx-statewide-itemized.tsv",
                   sep='\t',
                   usecols=["REPORTER_NAME", "REPORTER_ZIP", "BUYER_NAME", 
                            "BUYER_ZIP", "DRUG_NAME", "TRANSACTION_DATE", 
                            "DOSAGE_UNIT"],
                   dtype=dtypes)

In [3]:
data.head()

Unnamed: 0,REPORTER_NAME,REPORTER_ZIP,BUYER_NAME,BUYER_ZIP,DRUG_NAME,TRANSACTION_DATE,DOSAGE_UNIT
0,BELLCO DRUG CORP,11701,LONGHORN DRUG CO,75662,HYDROCODONE,1112006,500.0
1,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,1042006,500.0
2,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,11022006,400.0
3,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,1042007,100.0
4,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2142007,100.0


In [4]:
data.shape

(12108468, 7)

In [5]:
# First, let's change the transaction date to a datetime object
# Need to fill in the empty leading zeros, for single-digit months
data["TRANSACTION_DATE"] = data["TRANSACTION_DATE"].str.zfill(8)

In [6]:
# And now, changing to datetime
data["TRANSACTION_DATE"] = pd.to_datetime(data["TRANSACTION_DATE"], 
                                          format='%m%d%Y')
data["TRANSACTION_DATE"].head()

0   2006-01-11
1   2006-01-04
2   2006-11-02
3   2007-01-04
4   2007-02-14
Name: TRANSACTION_DATE, dtype: datetime64[ns]

### Getting Zipcode Latitude/Longitude Data

In [7]:
# Finding all unique reporter zipcodes
reporter_zip_unique = data["REPORTER_ZIP"].unique()

In [8]:
# And then all unique buyer zipcodes
buyer_zip_unique = data["BUYER_ZIP"].unique()

In [9]:
# And then combining the unique zipcodes into a list
all_zips = []
for r_zipc in reporter_zip_unique:
    all_zips.append(r_zipc)
for b_zipc in buyer_zip_unique:
    all_zips.append(b_zipc)

In [10]:
# Then making sure there are only unique zipcodes in our total list
unique_zips = set(all_zips)

In [11]:
# So we have 1375 total unique zipcodes in our dataset
len(unique_zips)

1375

In [12]:
# Instantiating the zipcode search engine
search = SearchEngine(simple_zipcode=True)

In [13]:
# Creating a dictionary and then lists to hold those lat/long values
zipcode_dict = {}
lats = []
longs = []

# Appending lat/long values to those lists
for zipcode in unique_zips:
    zipc = search.by_zipcode(zipcode)
    zipcode_dict[zipcode] = [zipc.lat, zipc.lng]
    lats.append(zipcode_dict[zipcode][0])
    longs.append(zipcode_dict[zipcode][1])

In [14]:
all_zips_df = pd.DataFrame.from_dict(zipcode_dict, orient='index', 
                                     columns=["LAT", "LONG"])

In [15]:
all_zips_df.reset_index(inplace=True)

In [16]:
all_zips_df.rename(columns={"index": "ZIP"}, inplace=True)

In [17]:
all_zips_df.head()

Unnamed: 0,ZIP,LAT,LONG
0,90249,33.9,-118.32
1,65801,,
2,33126,25.78,-80.3
3,33155,25.74,-80.31
4,74137,36.02,-95.94


In [53]:
all_zips_df.shape

(1375, 3)

In [18]:
none_zips = all_zips_df.loc[all_zips_df["LAT"].isna() == True]

In [20]:
# Instantiating our geolocator
geolocator = Nominatim(user_agent="LB_FIS_capstone", country_bias="us")

# Wrapping our geolocater in a rate limiter, to automatically add delays
# between requests - Nominatim (Open Street Maps) requests no more than one
# request per second
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Also adding a progress bar, to track
tqdm.pandas()

# Ignoring the depreciation warning, because we're using a rate limiter


`country_bias` argument of the Nominatim.__init__ is deprecated and will be removed in geopy 2.0. Use Nominatim.geocode(country_codes='us') instead.



In [21]:
found_zips = none_zips["ZIP"].progress_apply(geocode)

100%|██████████| 56/56 [01:24<00:00,  1.55s/it]


In [28]:
found_zips

1       (Springfield, Missouri, 65801, USA, (37.214781...
45      (Frisco, Texas, 75033, USA, (33.1640767078435,...
169                                                  None
170     (Dallas, Texas, 75266, United States of Americ...
174                                                  None
175                                                  None
195     (Mount Pleasant, Texas, 75456, USA, (33.154831...
198                                                  None
202     (Hopkins County, Texas, 75483, USA, (33.034417...
222     (Longview, Gregg County, Texas, 75608, USA, (3...
249                                                  None
286     (Angelina County, Texas, 75915, USA, (31.23825...
326     (Arlington, Texas, 76019, USA, (32.731541, -97...
353     (Palo Pinto County, Texas, 76068, USA, (32.863...
361                                                  None
399     (Denton, Texas, 76203, USA, (33.2095465000939,...
411     (Cooke County, Texas, 76241, USA, (33.60580673...
532     (Brown

In [60]:
found_df = none_zips.copy()
found_df["LOC"] = found_zips

In [61]:
# This at least had more success.
found_df["LAT"] = found_df["LOC"].apply(
    lambda loc: loc.latitude if loc else None)
found_df["LONG"] = found_df["LOC"].apply(
    lambda loc: loc.longitude if loc else None)

In [62]:
# Still missing 19, alas
len(found_df.loc[found_df["LOC"].isna() == True])

19

In [39]:
still_missing = found_df.loc[found_df["LOC"].isna() == True]

In [43]:
# Literally all of these are in Texas, what's up with that
# Also I just googled these to fill in this dictionary
still_missing_dict = {
    75262: [32.7800, -96.8000],
    75376: [32.7100, -96.8400],
    75382: [32.8644, -96.7439],
    75461: [33.6605, -95.5515],
    75712: [32.3500, -95.3000],
    76097: [32.5393, -97.3292],
    77225: [29.6928, -95.4176],
    77238: [29.9200, -95.4400],
    77347: [30.0000, -95.2500],
    77522: [29.7698, -94.9694],
    77572: [29.6523, -95.0273],
    77574: [29.5100, -95.0900],
    77641: [29.8689, -93.9338],
    77806: [30.6717, -96.3438],
    78278: [29.5614, -98.5613],
    78427: [27.6956, -97.4148],
    79114: [35.1556, -101.8839],
    79710: [32.0632, -102.0376],
    73137: [35.4700, -97.5200]
}

In [56]:
still_missing_df = pd.DataFrame.from_dict(still_missing_dict, orient='index',
                                          columns=["LAT", "LONG"])
still_missing_df.reset_index(inplace=True)
still_missing_df.rename(columns={"index": "ZIP"}, inplace=True)

In [58]:
still_missing_df.head()

Unnamed: 0,ZIP,LAT,LONG
0,75262,32.78,-96.8
1,75376,32.71,-96.84
2,75382,32.8644,-96.7439
3,75461,33.6605,-95.5515
4,75712,32.35,-95.3


In [54]:
all_zips_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1375 entries, 0 to 1374
Data columns (total 3 columns):
ZIP     1375 non-null int64
LAT     1319 non-null float64
LONG    1319 non-null float64
dtypes: float64(2), int64(1)
memory usage: 32.3 KB


In [71]:
found_df.drop(columns="LOC", inplace=True)

In [72]:
found_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56 entries, 1 to 1363
Data columns (total 3 columns):
ZIP     56 non-null int64
LAT     37 non-null float64
LONG    37 non-null float64
dtypes: float64(2), int64(1)
memory usage: 4.2 KB


In [57]:
still_missing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 3 columns):
ZIP     19 non-null int64
LAT     19 non-null float64
LONG    19 non-null float64
dtypes: float64(2), int64(1)
memory usage: 536.0 bytes


In [75]:
found_df = found_df.append(still_missing_df, ignore_index=True)

In [76]:
found_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 3 columns):
ZIP     75 non-null int64
LAT     56 non-null float64
LONG    56 non-null float64
dtypes: float64(2), int64(1)
memory usage: 1.8 KB


In [78]:
found_df.dropna(how="any", inplace=True)

In [79]:
found_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56 entries, 0 to 74
Data columns (total 3 columns):
ZIP     56 non-null int64
LAT     56 non-null float64
LONG    56 non-null float64
dtypes: float64(2), int64(1)
memory usage: 1.8 KB


In [80]:
all_zips_df = all_zips_df.append(found_df, ignore_index=True)

In [81]:
all_zips_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1431 entries, 0 to 1430
Data columns (total 3 columns):
ZIP     1431 non-null int64
LAT     1375 non-null float64
LONG    1375 non-null float64
dtypes: float64(2), int64(1)
memory usage: 33.6 KB


In [82]:
all_zips_df.dropna(how="any", inplace=True)

In [83]:
all_zips_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1375 entries, 0 to 1430
Data columns (total 3 columns):
ZIP     1375 non-null int64
LAT     1375 non-null float64
LONG    1375 non-null float64
dtypes: float64(2), int64(1)
memory usage: 43.0 KB


In [84]:
data.head()

Unnamed: 0,REPORTER_NAME,REPORTER_ZIP,BUYER_NAME,BUYER_ZIP,DRUG_NAME,TRANSACTION_DATE,DOSAGE_UNIT
0,BELLCO DRUG CORP,11701,LONGHORN DRUG CO,75662,HYDROCODONE,2006-01-11,500.0
1,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2006-01-04,500.0
2,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2006-11-02,400.0
3,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2007-01-04,100.0
4,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2007-02-14,100.0


In [89]:
data = data.merge(all_zips_df, how="left", left_on="REPORTER_ZIP", right_on="ZIP")

In [90]:
data.head()

Unnamed: 0,REPORTER_NAME,REPORTER_ZIP,BUYER_NAME,BUYER_ZIP,DRUG_NAME,TRANSACTION_DATE,DOSAGE_UNIT,ZIP,LAT,LONG
0,BELLCO DRUG CORP,11701,LONGHORN DRUG CO,75662,HYDROCODONE,2006-01-11,500.0,11701,40.69,-73.41
1,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2006-01-04,500.0,11701,40.69,-73.41
2,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2006-11-02,400.0,11701,40.69,-73.41
3,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2007-01-04,100.0,11701,40.69,-73.41
4,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2007-02-14,100.0,11701,40.69,-73.41


In [91]:
data.rename(columns={"LAT": "REPORTER_LAT", "LONG": "REPORTER_LONG"}, 
            inplace=True)
data.drop(columns="ZIP", inplace=True)

In [92]:
data.head()

Unnamed: 0,REPORTER_NAME,REPORTER_ZIP,BUYER_NAME,BUYER_ZIP,DRUG_NAME,TRANSACTION_DATE,DOSAGE_UNIT,REPORTER_LAT,REPORTER_LONG
0,BELLCO DRUG CORP,11701,LONGHORN DRUG CO,75662,HYDROCODONE,2006-01-11,500.0,40.69,-73.41
1,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2006-01-04,500.0,40.69,-73.41
2,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2006-11-02,400.0,40.69,-73.41
3,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2007-01-04,100.0,40.69,-73.41
4,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2007-02-14,100.0,40.69,-73.41


In [93]:
data = data.merge(all_zips_df, how="left", left_on="BUYER_ZIP", right_on="ZIP")

In [94]:
data.head()

Unnamed: 0,REPORTER_NAME,REPORTER_ZIP,BUYER_NAME,BUYER_ZIP,DRUG_NAME,TRANSACTION_DATE,DOSAGE_UNIT,REPORTER_LAT,REPORTER_LONG,ZIP,LAT,LONG
0,BELLCO DRUG CORP,11701,LONGHORN DRUG CO,75662,HYDROCODONE,2006-01-11,500.0,40.69,-73.41,75662,32.4,-94.9
1,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2006-01-04,500.0,40.69,-73.41,77706,30.1,-94.17
2,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2006-11-02,400.0,40.69,-73.41,77706,30.1,-94.17
3,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2007-01-04,100.0,40.69,-73.41,77706,30.1,-94.17
4,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2007-02-14,100.0,40.69,-73.41,77706,30.1,-94.17


In [95]:
data.rename(columns={"LAT": "BUYER_LAT", "LONG": "BUYER_LONG"}, 
            inplace=True)
data.drop(columns="ZIP", inplace=True)

In [96]:
data.head()

Unnamed: 0,REPORTER_NAME,REPORTER_ZIP,BUYER_NAME,BUYER_ZIP,DRUG_NAME,TRANSACTION_DATE,DOSAGE_UNIT,REPORTER_LAT,REPORTER_LONG,BUYER_LAT,BUYER_LONG
0,BELLCO DRUG CORP,11701,LONGHORN DRUG CO,75662,HYDROCODONE,2006-01-11,500.0,40.69,-73.41,32.4,-94.9
1,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2006-01-04,500.0,40.69,-73.41,30.1,-94.17
2,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2006-11-02,400.0,40.69,-73.41,30.1,-94.17
3,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2007-01-04,100.0,40.69,-73.41,30.1,-94.17
4,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2007-02-14,100.0,40.69,-73.41,30.1,-94.17


In [98]:
data.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12108468 entries, 0 to 12108467
Data columns (total 11 columns):
REPORTER_NAME       12108468 non-null object
REPORTER_ZIP        12108468 non-null int64
BUYER_NAME          12108468 non-null object
BUYER_ZIP           12108468 non-null int64
DRUG_NAME           12108468 non-null object
TRANSACTION_DATE    12108468 non-null datetime64[ns]
DOSAGE_UNIT         12108468 non-null float64
REPORTER_LAT        12108468 non-null float64
REPORTER_LONG       12108468 non-null float64
BUYER_LAT           12108468 non-null float64
BUYER_LONG          12108468 non-null float64
dtypes: datetime64[ns](1), float64(5), int64(2), object(3)
memory usage: 1.1+ GB


In [100]:
# Writing to a csv
data.to_csv(r"data/TX_Buyers_Shipment_Location_Data.csv", index=False)

In [101]:
# Sanity check
data_test = pd.read_csv("data/TX_Buyers_Shipment_Location_Data.csv")
data_test.head()

Unnamed: 0,REPORTER_NAME,REPORTER_ZIP,BUYER_NAME,BUYER_ZIP,DRUG_NAME,TRANSACTION_DATE,DOSAGE_UNIT,REPORTER_LAT,REPORTER_LONG,BUYER_LAT,BUYER_LONG
0,BELLCO DRUG CORP,11701,LONGHORN DRUG CO,75662,HYDROCODONE,2006-01-11,500.0,40.69,-73.41,32.4,-94.9
1,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2006-01-04,500.0,40.69,-73.41,30.1,-94.17
2,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2006-11-02,400.0,40.69,-73.41,30.1,-94.17
3,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2007-01-04,100.0,40.69,-73.41,30.1,-94.17
4,BELLCO DRUG CORP,11701,MALLEY'S PHARMACY,77706,HYDROCODONE,2007-02-14,100.0,40.69,-73.41,30.1,-94.17


In [102]:
data_test.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12108468 entries, 0 to 12108467
Data columns (total 11 columns):
REPORTER_NAME       12108468 non-null object
REPORTER_ZIP        12108468 non-null int64
BUYER_NAME          12108468 non-null object
BUYER_ZIP           12108468 non-null int64
DRUG_NAME           12108468 non-null object
TRANSACTION_DATE    12108468 non-null object
DOSAGE_UNIT         12108468 non-null float64
REPORTER_LAT        12108468 non-null float64
REPORTER_LONG       12108468 non-null float64
BUYER_LAT           12108468 non-null float64
BUYER_LONG          12108468 non-null float64
dtypes: float64(5), int64(2), object(4)
memory usage: 1016.2+ MB
