# IMPORT DATA 

imports, libraries, and options

In [40]:
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from geopy.geocoders import Nominatim, GoogleV3
from geopy.extra.rate_limiter import RateLimiter

# allow up to 500 characters per cell when frames are displayed, so long text values are not cut off
pd.options.display.max_colwidth = 500

<br>
import the dataframe; the review column can be dropped immediately since this is not an NLP project

In [41]:
# read the scraped wine table, then discard the free-text `review` column
reviews = pd.read_csv('datasets/wine.csv')
reviews = reviews.drop(columns=['review'])
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 323237 entries, 0 to 323236
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   wine         323237 non-null  object
 1   winery       323237 non-null  object
 2   category     323237 non-null  object
 3   designation  229692 non-null  object
 4   varietal     302190 non-null  object
 5   appellation  320630 non-null  object
 6   alcohol      282216 non-null  object
 7   price        299282 non-null  object
 8   rating       323237 non-null  int64 
 9   reviewer     251075 non-null  object
dtypes: int64(1), object(9)
memory usage: 24.7+ MB


<br>
drop duplicates

In [42]:
# remove rows that are identical in every remaining column
reviews = reviews.drop_duplicates()
reviews.info()  # went from 323237 to 322916 wines

<class 'pandas.core.frame.DataFrame'>
Int64Index: 322916 entries, 0 to 323236
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   wine         322916 non-null  object
 1   winery       322916 non-null  object
 2   category     322916 non-null  object
 3   designation  229411 non-null  object
 4   varietal     301874 non-null  object
 5   appellation  320311 non-null  object
 6   alcohol      281939 non-null  object
 7   price        298996 non-null  object
 8   rating       322916 non-null  int64 
 9   reviewer     250834 non-null  object
dtypes: int64(1), object(9)
memory usage: 27.1+ MB


<br>
remove any rows with a null varietal (21042) or a null appellation (2605), because without them we don't know the grape or the location

In [43]:
# (missing varietals, missing appellations) before the rows are dropped
reviews['varietal'].isnull().sum(), reviews['appellation'].isnull().sum()

(21042, 2605)

In [44]:
# keep only rows where both the varietal and the appellation are present
reviews = reviews.dropna(subset=['varietal', 'appellation'])
# both counts should now be zero
reviews['varietal'].isnull().sum(), reviews['appellation'].isnull().sum()

(0, 0)

<br>
remove any rows that contain the words 'Buy Now' in the appellation (534) or varietal columns; these come from a scraping error

In [45]:
# number of rows whose appellation contains the text 'Buy Now'
len(reviews.loc[reviews['appellation'].str.contains('Buy Now')])

534

In [46]:
# https://stackoverflow.com/questions/8305199/the-tilde-operator-in-python
# `~` inverts the boolean mask, so only rows whose appellation does NOT contain 'Buy Now' are kept
reviews = reviews.loc[lambda frame: ~frame['appellation'].str.contains('Buy Now')]
# confirm none are left
len(reviews.loc[reviews['appellation'].str.contains('Buy Now')])

0

In [47]:
# number of rows whose varietal contains the text 'Buy Now'
len(reviews.loc[reviews['varietal'].str.contains('Buy Now')])

70

In [48]:
# keep only rows whose varietal does NOT contain 'Buy Now' (`~` inverts the mask)
reviews = reviews.loc[lambda frame: ~frame['varietal'].str.contains('Buy Now')]
# confirm none are left
len(reviews.loc[reviews['varietal'].str.contains('Buy Now')])

0

<br>
remove the winery name from the wine title to avoid issues with the get_vintage function and to improve readability/avoid repetition

In [49]:
# https://stackoverflow.com/questions/40121822/extracting-year-from-string-in-python
# needs to be a function because re.search returns None when the title has no year
def get_vintage(wine_name):
    """Extract the vintage from a wine title.

    Parameters
    ----------
    wine_name : str
        Wine title to scan.

    Returns
    -------
    int or str
        The first standalone four-digit number in the title as an int, or the
        string 'NV' when the title carries a standalone 'NV' token or has no
        four-digit number at all.
    """
    # match 'NV' only as its own word, so names that merely contain the
    # letters (e.g. 'CANVAS') are not treated as non-vintage
    if re.search(r'\bNV\b', wine_name):
        return 'NV'

    # exactly four digits, not part of a longer digit run (e.g. not '1234' out of '12345')
    year_match = re.search(r'(?<!\d)\d{4}(?!\d)', wine_name)
    if year_match is None:
        return 'NV'
    return int(year_match.group())

In [50]:
# `reviews` is the result of boolean filtering; take an explicit copy before adding columns
# so the assignments below are not flagged as writes to a slice of another frame
reviews = reviews.copy()

# https://stackoverflow.com/questions/47696401/replace-character-of-column-value-with-string-from-another-column-in-pandas
# remove every occurrence of the winery name from the full wine title
reviews['wine_no_winery'] = reviews.apply(lambda row: row['wine'].replace(row['winery'], ''), axis=1)

# extract the vintage (int year or 'NV') from what is left of the title
reviews['vintage'] = reviews['wine_no_winery'].apply(get_vintage)

# remove the vintage text from the title (same row-wise replace technique as above)
reviews['wine_no_winery_vin'] = reviews.apply(
    lambda row: row['wine_no_winery'].replace(str(row['vintage']), ''), axis=1
)

# https://stackoverflow.com/questions/14596884/remove-text-between-and
# remove the short appellation given in (...) or [...]; the pattern is a raw string so the
# backslashes reach `re` unchanged instead of being parsed as (invalid) string escapes.
# removing winery/vintage/appellation leaves stray blanks behind, so collapse runs of
# whitespace and trim both ends
reviews['name'] = (
    reviews['wine_no_winery_vin']
    .apply(lambda title: re.sub(r"[\(\[].*?[\)\]]", "", title))
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# drop the intermediate helper columns
reviews = reviews.drop(columns=['wine_no_winery', 'wine_no_winery_vin'])

# reorganize columns
reviews = reviews[['wine', 'name', 'vintage', 'category', 'varietal', 'winery', 'appellation', 'designation', 'alcohol', 'price', 'rating', 'reviewer']]

In [51]:
# display the frame to check the new `name` and `vintage` columns
reviews

Unnamed: 0,wine,name,vintage,category,varietal,winery,appellation,designation,alcohol,price,rating,reviewer
0,J. Lohr 2000 Hilltop Vineyard Cabernet Sauvignon (Paso Robles),Hilltop Vineyard Cabernet Sauvignon,2000,Red,Cabernet Sauvignon,J. Lohr,"Paso Robles, Central Coast, California, US",Hilltop Vineyard,,$32,87,
1,Antucura 2010 Pinot Noir (Vista Flores),Pinot Noir,2010,Red,Pinot Noir,Antucura,"Vista Flores, Mendoza Province, Argentina",,15%,$17,85,Michael Schachner
2,Quinta do Portal 1999 Quinta do Portal Reserva Red (Douro),Reserva Red,1999,Red,"Red Blends, Red Blends",Quinta do Portal,"Douro, Portugal",Quinta do Portal Reserva,,,90,Roger Voss
3,Tenuta di Ghizzano 2006 Il Ghizzano Red (Toscana),Il Ghizzano Red,2006,Red,"Red Blends, Red Blends",Tenuta di Ghizzano,"Toscana, Tuscany, Italy",Il Ghizzano,13.5%,$18,86,
4,Tenuta San Francesco 2007 Tramonti White (Campania),Tramonti White,2007,White,White Blend,Tenuta San Francesco,"Campania, Southern Italy, Italy",Tramonti,13.5%,$21,85,
...,...,...,...,...,...,...,...,...,...,...,...,...
323232,Maddalena 2017 Rosé (Paso Robles),Rosé,2017,Rose,Rosé,Maddalena,"Paso Robles, Central Coast, California, US",,12.5%,$18,87,Matt Kettmann
323233,Toscolo 2015 Vernaccia di San Gimignano,Vernaccia di San Gimignano,2015,White,"Vernaccia, Italian White",Toscolo,"Vernaccia di San Gimignano, Tuscany, Italy",,12.5%,$11,87,Kerin O’Keefe
323234,Domaine G. Metz 2017 Pinot Blanc (Alsace),Pinot Blanc,2017,White,Pinot Blanc,Domaine G. Metz,"Alsace, Alsace, France",,13%,$20,90,Anne Krebiehl MW
323235,Huston 2019 Chicken Dinner Red (Idaho),Chicken Dinner Red,2019,Red,"Red Blends, Red Blends",Huston,"Idaho, Idaho, US",Chicken Dinner,14.3%,$18,87,Sean P. Sullivan


<br>
remove redundant grape varietal names to improve readability/avoid repetition

In [52]:
# list the distinct varietal strings; the trailing `;` suppresses the cell output
reviews['varietal'].unique();

In [53]:
# trailing ", <label>" fragments that only repeat or generalise the varietal already given
conf_wines = [
    ', Red Blends',
    ', Sparkling',
    ', Other Red',
    ', Other White',
    ', White Blend',
    ', Gewürztraminer',
]

# remove each fragment in turn, then write the cleaned column back once
cleaned_varietal = reviews['varietal']
for conf_wine in conf_wines:
    cleaned_varietal = cleaned_varietal.str.replace(conf_wine, '')
reviews['varietal'] = cleaned_varietal

In [54]:
# display the frame to check the cleaned `varietal` column
reviews

Unnamed: 0,wine,name,vintage,category,varietal,winery,appellation,designation,alcohol,price,rating,reviewer
0,J. Lohr 2000 Hilltop Vineyard Cabernet Sauvignon (Paso Robles),Hilltop Vineyard Cabernet Sauvignon,2000,Red,Cabernet Sauvignon,J. Lohr,"Paso Robles, Central Coast, California, US",Hilltop Vineyard,,$32,87,
1,Antucura 2010 Pinot Noir (Vista Flores),Pinot Noir,2010,Red,Pinot Noir,Antucura,"Vista Flores, Mendoza Province, Argentina",,15%,$17,85,Michael Schachner
2,Quinta do Portal 1999 Quinta do Portal Reserva Red (Douro),Reserva Red,1999,Red,Red Blends,Quinta do Portal,"Douro, Portugal",Quinta do Portal Reserva,,,90,Roger Voss
3,Tenuta di Ghizzano 2006 Il Ghizzano Red (Toscana),Il Ghizzano Red,2006,Red,Red Blends,Tenuta di Ghizzano,"Toscana, Tuscany, Italy",Il Ghizzano,13.5%,$18,86,
4,Tenuta San Francesco 2007 Tramonti White (Campania),Tramonti White,2007,White,White Blend,Tenuta San Francesco,"Campania, Southern Italy, Italy",Tramonti,13.5%,$21,85,
...,...,...,...,...,...,...,...,...,...,...,...,...
323232,Maddalena 2017 Rosé (Paso Robles),Rosé,2017,Rose,Rosé,Maddalena,"Paso Robles, Central Coast, California, US",,12.5%,$18,87,Matt Kettmann
323233,Toscolo 2015 Vernaccia di San Gimignano,Vernaccia di San Gimignano,2015,White,"Vernaccia, Italian White",Toscolo,"Vernaccia di San Gimignano, Tuscany, Italy",,12.5%,$11,87,Kerin O’Keefe
323234,Domaine G. Metz 2017 Pinot Blanc (Alsace),Pinot Blanc,2017,White,Pinot Blanc,Domaine G. Metz,"Alsace, Alsace, France",,13%,$20,90,Anne Krebiehl MW
323235,Huston 2019 Chicken Dinner Red (Idaho),Chicken Dinner Red,2019,Red,Red Blends,Huston,"Idaho, Idaho, US",Chicken Dinner,14.3%,$18,87,Sean P. Sullivan


<br>
for this specific project we are looking for wines that are <br>
- red, white, rose <br>
- only grown in california, oregon, washington <br>
- do not have NV as vintage <br>

In [55]:
# keep only the three still-wine categories: white, red, rose
rwr_reviews = reviews[reviews['category'].isin(['White', 'Red', 'Rose'])]
rwr_reviews.shape

(277870, 12)

In [56]:
# confirm that only red, white and rose wines remain after the category filter
rwr_reviews['category'].unique()

array(['Red', 'White', 'Rose'], dtype=object)

In [57]:
# keep wines whose appellation text mentions California, Oregon or Washington
# (str.contains treats the pattern as a regex, so `|` means "any of these")
w_us_rwr_reviews = rwr_reviews.loc[rwr_reviews['appellation'].str.contains('California|Oregon|Washington')]
w_us_rwr_reviews.shape

(113083, 12)

In [58]:
# display the California / Oregon / Washington subset
w_us_rwr_reviews

Unnamed: 0,wine,name,vintage,category,varietal,winery,appellation,designation,alcohol,price,rating,reviewer
0,J. Lohr 2000 Hilltop Vineyard Cabernet Sauvignon (Paso Robles),Hilltop Vineyard Cabernet Sauvignon,2000,Red,Cabernet Sauvignon,J. Lohr,"Paso Robles, Central Coast, California, US",Hilltop Vineyard,,$32,87,
5,Las Positas 2011 Estate Barbera (Livermore Valley),Estate Barbera,2011,Red,Barbera,Las Positas,"Livermore Valley, Central Coast, California, US",Estate,15.1%,$40,89,Virginie Boone
6,Krupp Brothers 2007 The Doctor Red (Napa Valley),The Doctor Red,2007,Red,Red Blends,Krupp Brothers,"Napa Valley, Napa, California, US",The Doctor,15.1%,$60,92,
10,Merry Edwards 2011 Sauvignon Blanc (Russian River Valley),Sauvignon Blanc,2011,White,Sauvignon Blanc,Merry Edwards,"Russian River Valley, Sonoma, California, US",,14.1%,$32,88,
17,Daou 2015 Reserve Cabernet Sauvignon (Paso Robles),Reserve Cabernet Sauvignon,2015,Red,Cabernet Sauvignon,Daou,"Paso Robles, Central Coast, California, US",Reserve,14.5%,$56,92,Matt Kettmann
...,...,...,...,...,...,...,...,...,...,...,...,...
323212,Savage Grace 2018 Copeland Vineyard Cabernet Franc (Rattlesnake Hills),Copeland Vineyard Cabernet Franc,2018,Red,Cabernet Franc,Savage Grace,"Rattlesnake Hills, Columbia Valley, Washington, US",Copeland Vineyard,13%,$30,92,Sean P. Sullivan
323213,Fortino 2017 Cabernet Sauvignon (Santa Clara Valley),Cabernet Sauvignon,2017,Red,Cabernet Sauvignon,Fortino,"Santa Clara Valley, Central Coast, California, US",,14%,$30,88,Matt Kettmann
323215,Villicana 2010 Estate Grown Merlot (Paso Robles),Estate Grown Merlot,2010,Red,Merlot,Villicana,"Paso Robles, Central Coast, California, US",Estate Grown,15%,$35,89,
323223,Grgich Hills 2001 Private Reserve Style Fumé Blanc (Napa Valley),Private Reserve Style Fumé Blanc,2001,White,"Fumé Blanc, Sauvignon Blanc",Grgich Hills,"Napa Valley, Napa, California, US",Private Reserve Style,,$18,85,


In [59]:
# distinct vintage values; the column mixes int years with the string 'NV'
w_us_rwr_reviews['vintage'].unique()

array([2000, 2011, 2007, 2015, 1999, 2002, 2018, 2012, 2006, 2005, 2009,
       2010, 2013, 2003, 2001, 2017, 2004, 2019, 2016, 'NV', 2008, 2020,
       1997, 2014, 1998, 1996, 1995, 1994, 1985, 1988, 1993, 1992, 1986,
       1990, 1991, 1980, 1989, 1983, 1987, 1978], dtype=object)

In [60]:
# keep only wines with a real year, i.e. whose vintage is not 'NV'
w_us_rwr_year_reviews = w_us_rwr_reviews[w_us_rwr_reviews['vintage'].ne('NV')]
w_us_rwr_year_reviews.shape

(112267, 12)

<br>
reset index and rename to project_df for future work

In [61]:
# renumber the rows 0..n-1 now that filtering is done (the old index labels are discarded)
w_us_rwr_year_reviews = w_us_rwr_year_reviews.reset_index(drop=True)

# `project` is the name used from here on; it refers to the same frame, not a copy
project = w_us_rwr_year_reviews

# total number of wines: 112_267
# project.shape[0]

In [62]:
# display the project frame after the index reset
project

Unnamed: 0,wine,name,vintage,category,varietal,winery,appellation,designation,alcohol,price,rating,reviewer
0,J. Lohr 2000 Hilltop Vineyard Cabernet Sauvignon (Paso Robles),Hilltop Vineyard Cabernet Sauvignon,2000,Red,Cabernet Sauvignon,J. Lohr,"Paso Robles, Central Coast, California, US",Hilltop Vineyard,,$32,87,
1,Las Positas 2011 Estate Barbera (Livermore Valley),Estate Barbera,2011,Red,Barbera,Las Positas,"Livermore Valley, Central Coast, California, US",Estate,15.1%,$40,89,Virginie Boone
2,Krupp Brothers 2007 The Doctor Red (Napa Valley),The Doctor Red,2007,Red,Red Blends,Krupp Brothers,"Napa Valley, Napa, California, US",The Doctor,15.1%,$60,92,
3,Merry Edwards 2011 Sauvignon Blanc (Russian River Valley),Sauvignon Blanc,2011,White,Sauvignon Blanc,Merry Edwards,"Russian River Valley, Sonoma, California, US",,14.1%,$32,88,
4,Daou 2015 Reserve Cabernet Sauvignon (Paso Robles),Reserve Cabernet Sauvignon,2015,Red,Cabernet Sauvignon,Daou,"Paso Robles, Central Coast, California, US",Reserve,14.5%,$56,92,Matt Kettmann
...,...,...,...,...,...,...,...,...,...,...,...,...
112262,Savage Grace 2018 Copeland Vineyard Cabernet Franc (Rattlesnake Hills),Copeland Vineyard Cabernet Franc,2018,Red,Cabernet Franc,Savage Grace,"Rattlesnake Hills, Columbia Valley, Washington, US",Copeland Vineyard,13%,$30,92,Sean P. Sullivan
112263,Fortino 2017 Cabernet Sauvignon (Santa Clara Valley),Cabernet Sauvignon,2017,Red,Cabernet Sauvignon,Fortino,"Santa Clara Valley, Central Coast, California, US",,14%,$30,88,Matt Kettmann
112264,Villicana 2010 Estate Grown Merlot (Paso Robles),Estate Grown Merlot,2010,Red,Merlot,Villicana,"Paso Robles, Central Coast, California, US",Estate Grown,15%,$35,89,
112265,Grgich Hills 2001 Private Reserve Style Fumé Blanc (Napa Valley),Private Reserve Style Fumé Blanc,2001,White,"Fumé Blanc, Sauvignon Blanc",Grgich Hills,"Napa Valley, Napa, California, US",Private Reserve Style,,$18,85,


# GEOCODING

need to find the locations, latitudes, and longitudes of all 279 appellations (appellations are easier to locate than wineries, since there are no addresses for the wineries), so make a new dataframe of only the appellations to geocode

In [24]:
# one row per distinct appellation string, renumbered from 0
appellations = (
    project[['appellation']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# number of appellations: 279
# appellations.shape[0]

<br>
make a locations df with each appellation split into four columns: ava, region, state, country

In [25]:
# split "ava, region, state, country" on ', ' into separate columns and name them by position
locations = (
    appellations['appellation']
    .str.split(', ', expand=True)
    .rename(columns={0: 'ava', 1: 'region', 2: 'state', 3: 'country'})
)

<br>
check each appellation's ava against the west_coast_avas list; appellations whose ava is not in the list are removed. this is important because hyphenated or vague appellations can't be used to determine the specific location/climate of a wine

In [26]:
# AVA names accepted for this project; used with `.isin(...)` on the 'ava' column to keep
# only appellations whose first comma-separated part is one of these names.
# NOTE(review): 'Columbia Gorge', 'Columbia Valley' and 'Walla Walla Valley' are listed twice;
# this is harmless for a membership test.
# NOTE(review): 'Red Hill Douglas County, Oregon' contains ', ', but the 'ava' values come from
# splitting the appellation on ', ', so this entry can never match as written -- confirm whether
# 'Red Hill Douglas County' was intended.
west_coast_avas= ['Applegate Valley', 'Chehalem Mountains', 'Columbia Gorge', 'Columbia Valley', 'Dundee Hills','Elkton Oregon', 'Eola-Amity Hills', 
                  'Laurelwood District', 'Lower Long Tom', 'McMinnville', 'Red Hill Douglas County, Oregon', 'Ribbon Ridge', 
                  'The Rocks District of Milton-Freewater', 'Rogue Valley', 'Snake River Valley', 'Southern Oregon', 'Tualatin Hills', 'Umpqua Valley', 
                  'Van Duzer Corridor', 'Walla Walla Valley', 'Willamette Valley', 'Yamhill-Carlton', 'Ancient Lakes of the Columbia Valley', 'Candy Mountain',
                  'Columbia Gorge', 'Columbia Valley', 'Goose Gap', 'Horse Heaven Hills', 'Lake Chelan', 'Lewis-Clark Valley', 'Naches Heights', 
                  'Puget Sound', 'Rattlesnake Hills', 'Red Mountain', 'Royal Slope', 'Snipes Mountain', 'The Burn of Columbia Valley', 'Wahluke Slope',
                  'Walla Walla Valley', 'White Bluffs', 'Yakima Valley', 'Inwood Valley', 'Manton Valley', 'Adelaida District', 'Alisos Canyon', 
                  'Arroyo Grande Valley', 'Arroyo Seco', 'Ballard Canyon', 'Ben Lomond Mountain', 'Carmel Valley', 'Central Coast', 'Chalone', 'Cienega Valley', 
                  'Creston District', 'Edna Valley', 'El Pomar District', 'Hames Valley', 'Happy Canyon of Santa Barbara', 'Lamorinda', 'Lime Kiln Valley',
                  'Livermore Valley', 'Los Olivos District', 'Monterey', 'Mt. Harlan', 'Pacheco Pass', 'Paicines', 'Paso Robles', 'Paso Robles Estrella District',
                  'Paso Robles Geneseo District', 'Paso Robles Highlands District', 'Paso Robles Willow Creek District', 'San Antonio Valley', 'San Benito',
                  'San Bernabe', 'San Francisco Bay', 'San Juan Creek', 'San Lucas', 'San Miguel District', 'San Ysidro District', 'Santa Clara Valley',
                  'Santa Cruz Mountains', 'Santa Lucia Highlands', 'Santa Margarita Ranch', 'Santa Maria Valley', 'Santa Ynez Valley', 'Sta. Rita Hills',
                  'Templeton Gap District', 'York Mountain', 'Alta Mesa', 'Borden Ranch', 'Capay Valley', 'Clarksburg', 'Clements Hills', 'Cosumnes River',
                  'Diablo Grande', 'Dunnigan Hills', 'Jahant', 'Lodi', 'Madera', 'Merritt Island', 'Mokelumne River', 'River Junction', 'Salado Creek',
                  'Sloughhouse', 'Squaw Valley-Miramonte', 'Tracy Hills', 'Seiad Valley', 'Trinity Lakes', 'Willow Creek', 'Alexander Valley', 'Anderson Valley',
                  'Atlas Peak', 'Benmore Valley', 'Bennett Valley', 'Big Valley District-Lake County', 'Calistoga', 'Chalk Hill', 'Chiles Valley', 'Clear Lake',
                  'Cole Ranch', 'Coombsville', 'Covelo', 'Diamond Mountain District', 'Dos Rios', 'Dry Creek Valley', 'Eagle Peak Mendocino County', 
                  'Fort Ross-Seaview', 'Fountaingrove District', 'Green Valley of Russian River Valley', 'Guenoc Valley', 'High Valley', 'Howell Mountain',
                  'Kelsey Bench-Lake County', 'Knights Valley', 'Los Carneros', 'McDowell Valley', 'Mendocino', 'Mendocino Ridge', 
                  'Moon Mountain District Sonoma County', 'Mt. Veeder', 'Napa Valley', 'North Coast', 'Northern Sonoma', 'Oak Knoll District of Napa Valley',
                  'Oakville', 'Petaluma Gap', 'Pine Mountain-Cloverdale Peak', 'Potter Valley', 'Red Hills Lake County', 'Redwood Valley', 'Rockpile', 
                  'Russian River Valley', 'Rutherford', 'Solano County Green Valley', 'Sonoma Coast', 'Sonoma Mountain', 'Sonoma Valley', 
                  'Spring Mountain District', 'St. Helena', 'Stags Leap District', 'Suisun Valley', 'Wild Horse Valley', 'Yorkville Highlands', 'Yountville',
                  'California Shenandoah Valley', 'El Dorado', 'Fair Play', 'Fiddletown', 'North Yuba', 'Sierra Foothills', 
                  'Antelope Valley of the California High Desert', 'Cucamonga Valley', 'Leona Valley', 'Malibu Coast', 'Malibu-Newton Canyon', 'Ramona Valley',
                  'Palos Verdes Peninsula', 'Saddle Rock-Malibu', 'San Pasqual Valley', 'Sierra Pelona Valley', 'South Coast', 'Tehachapi Mountains', 
                  'Temecula Valley']                  

<br>
merge the appellations df and the locations df on index, then clean up the table and drop appellations with vague key words in the 'region' or 'ava' columns, because those appellations have unclear ava associations

In [27]:
# attach the ava / region / state / country parts to each appellation (both frames share the same index)
appellations = appellations.merge(right=locations, how='left', left_index=True, right_index=True)

# a null in any column means the appellation did not split into all four parts, which
# could be misleading about where it is, so those rows are dropped
appellations = appellations.dropna()

# keep only appellations whose ava is in the accepted west coast list
# https://www.codegrepper.com/code-examples/python/pandas+check+if+value+in+column+is+in+a+list
appellations = appellations[appellations['ava'].isin(west_coast_avas)]

# build the string that will be geocoded: 'ava, state, country'
appellations = appellations.assign(
    location=appellations['ava'] + ', ' + appellations['state'] + ', ' + appellations['country']
)

# drop appellations whose region contains 'Other'
# 8_602 wines have this 'Other' appellation, so dropping them from 112_267 wines only loses 7.66% of the wines
appellations = appellations[~appellations['region'].str.contains('Other')]

# drop the three broad "coast" avas, which are too large to stand for one location:
#   'North Coast'   -   732 wines (0.65% of 112_267)
#   'Central Coast' - 1_744 wines (1.55% of 112_267)
#   'South Coast'   -   101 wines (0.09% of 112_267)
appellations = appellations[~appellations['ava'].str.contains('North Coast|Central Coast|South Coast')]
# updated number of wines: 101_088

# the split parts are no longer needed once `location` exists; renumber the rows from 0
appellations = appellations.drop(columns=['ava', 'region', 'state', 'country']).reset_index(drop=True)

# updated number of appellations: 138
# appellations.shape[0]

<br>
some appellation locations can't be geocoded, or are geocoded incorrectly; this cell replaces the problematic part of each such location with a town that is close to the general location of the ava

In [28]:
# use https://www.ttb.gov/images/AVA/ to find the closest towns to these non-address-named avas
# key = text to find inside `location`, value = town text that replaces it.
# entries are applied in the order listed, as plain substring replacements.
# NOTE(review): the 'Applegate Valley,' key includes the comma, so the result reads
# 'Provolt Oregon, US' (no comma after the town) -- confirm this is intended.
AVA_TO_NEARBY_TOWN = {
    'Santa Lucia Highlands': 'Gonzales',
    'Sta. Rita Hills': 'Lompoc',
    'Stags Leap District': 'Yountville',
    'Paso Robles Willow Creek District': 'Paso Robles',
    'Paso Robles Highlands District': 'Paso Robles',
    'Dunnigan Hills': 'Dunnigan',
    'Moon Mountain District Sonoma County': 'Sonoma',
    'El Pomar District': 'Paso Robles',
    'Yorkville Highlands': 'Yorkville',
    'Templeton Gap District': 'Templeton',
    'Happy Canyon of Santa Barbara': 'Cachuma Village',
    'San Antonio Valley': 'Lockwood',
    'Clements Hills': 'Clements',
    'Adelaida District': 'Paso Robles',
    'Applegate Valley,': 'Provolt',
    'Los Olivos District': 'Los Olivos',
    'Lime Kiln Valley': 'Paicines',
    'Sierra Pelona Valley': 'Santa Clarita',
    'San Miguel District': 'San Miguel',
    'Fountaingrove District': 'Santa Rosa',
    'Malibu-Newton Canyon': 'Malibu',
    'Paso Robles Geneseo District': 'Paso Robles',
    'Paso Robles Estrella District': 'Paso Robles',
    'Pine Mountain-Cloverdale Peak': 'Cloverdale',
    'Antelope Valley of the California High Desert': 'Palmdale',
    'Solano County Green Valley': 'Green Valley',
    'Saddle Rock-Malibu': 'Malibu',
    'San Ysidro District': 'Gilroy',
    'Spring Mountain District': 'St. Helena',
    'Petaluma Gap': 'Petaluma',
    'Santa Maria Valley': 'Garey',
    'Chalone': 'Soledad',
    'Diamond Mountain District': 'Calistoga',
    'Ballard Canyon': 'Ballard',
    'Knights Valley': 'Kellog',
    'Rockpile': 'Oak Knolls',
    'River Junction': 'Manteca',
    'Northern Sonoma': 'Geyserville',
    'San Bernabe': 'King City',
    'Mendocino Ridge': 'Gualala Mountain',
    'Southern Oregon': 'Grants Pass',
    'Cole Ranch': 'El Roble',
}

# regex=False makes every key a literal string: the '.' in 'Sta. Rita Hills' no longer acts as
# a regex wildcard, and pandas no longer warns about the changing default of `regex`
for ava_text, town_text in AVA_TO_NEARBY_TOWN.items():
    appellations['location'] = appellations['location'].str.replace(ava_text, town_text, regex=False)

  appellations['location']= appellations['location'].str.replace('Sta. Rita Hills', 'Lompoc')


<br> 
use the Nominatim class (or whichever geocoder you choose) from geopy.geocoders to get the geocode, latitude, and longitude of each location and add them to the appellations df

In [29]:
# https://www.askpython.com/python/python-geopy-to-find-geocode-of-an-address
# Nominatim geocoder client
# NOTE(review): "example" is a placeholder user agent -- consider an application-specific value
geolocator = Nominatim(user_agent="example")

# wrap the lookup so that consecutive calls are at least one second apart
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# one lookup per location string; the raw result is kept in 'geocode'
appellations['geocode'] = appellations['location'].apply(geocode)

# split out the coordinates; a falsy lookup result (nothing usable came back) gives None
appellations['lat'] = appellations['geocode'].apply(lambda hit: None if not hit else hit.latitude)
appellations['long'] = appellations['geocode'].apply(lambda hit: None if not hit else hit.longitude)

# uncomment to confirm the geocoder worked; success is 0 nulls in every column
# appellations.isnull().sum()
# appellations[appellations.isna().any(axis=1)]

In [35]:
# display the geocoded appellations to compare each `location` with the place the geocoder returned
appellations

Unnamed: 0,appellation,location,geocode,lat,long
0,"Paso Robles, Central Coast, California, US","Paso Robles, California, US","(Paso Robles, San Luis Obispo County, California, 93447, United States, (35.6267654, -120.6912456))",35.626765,-120.691246
1,"Livermore Valley, Central Coast, California, US","Livermore Valley, California, US","(Livermore Street, Apple Valley, San Bernardino County, California, United States, (34.5881065, -117.1859148))",34.588107,-117.185915
2,"Napa Valley, Napa, California, US","Napa Valley, California, US","(Bothe-Napa Valley State Park, Napa County, California, United States, (38.540524149999996, -122.54835743348914))",38.540524,-122.548357
3,"Russian River Valley, Sonoma, California, US","Russian River Valley, California, US","(Russian River, Redwood Valley, Mendocino County, California, 95418, United States, (39.2807189, -123.2098085))",39.280719,-123.209808
4,"Arroyo Grande Valley, Central Coast, California, US","Arroyo Grande Valley, California, US","(Arroyo Grande Valley, Arroyo Grande, San Luis Obispo County, California, 93420, United States, (35.1583532, -120.5308411))",35.158353,-120.530841
...,...,...,...,...,...
133,"Creston District, Central Coast, California, US","Creston District, California, US","(Creston Street, Irvington District, Fremont, Alameda County, California, 94538, United States, (37.5229514, -121.9741587))",37.522951,-121.974159
134,"Hames Valley, Central Coast, California, US","Hames Valley, California, US","(Hames Valley Lane, Antioch, Contra Costa County, California, 94531, United States, (37.9510745, -121.7530292))",37.951074,-121.753029
135,"San Ysidro District, Central Coast, California, US","Gilroy, California, US","(Gilroy, Santa Clara County, California, 95020-5205, United States, (37.0065078, -121.5631723))",37.006508,-121.563172
136,"Tracy Hills, Central Valley, California, US","Tracy Hills, California, US","(Tracy Court, Los Altos Hills, Santa Clara County, California, 94022-1919, United States, (37.3851265, -122.1699602))",37.385126,-122.169960


<br>
save appellations to pkl to export to new notebook

In [70]:
# write the geocoded appellations frame to disk as a pickle
appellations.to_pickle('datasets/appellations.pkl')

<br>
overwrite the project df by merging the geocoded appellations into the original project df, which removes wines that had unclear ava associations

In [63]:
# (rows, columns) of project before the appellations are merged in
project.shape

(112267, 12)

In [64]:
# right join on 'appellation': only wines whose appellation is in the geocoded appellations
# frame are kept, and each kept wine gains the columns of that frame
project = project.merge(right=appellations, on='appellation', how='right')

# updated number of wines: 76_462
# project.shape[0]

(76462, 16)

<br>
save project df to pkl

In [72]:
# write the merged project frame to disk as a pickle
project.to_pickle('datasets/project.pkl')