In [1]:
# dependency
import pandas as pd
from sqlalchemy import create_engine
import json
from re import search

In [2]:
from config import postgrepass

In [3]:
# Create connection to DB
db_path = f'postgresql://postgres:{postgrepass}@localhost:5432/new_word_study'
engine = create_engine(db_path)
conn = engine.connect()

In [4]:
# read data from csv file to df
resorts_df = pd.read_csv('static/data/resorts.csv')
# fillna with unknown value for future manipulations
resorts_df.fillna('unknown')
resorts_df.head()

Unnamed: 0.1,Unnamed: 0,name,link,price,closest_town,region,total_len,easy_len,intermediate_len,difficult_len
0,0,Vail,https://www.skiresort.info/ski-resort/vail/,219.0,Vail,Colorado,234.0,57.0,84.0,93.0
1,1,Telluride,https://www.skiresort.info/ski-resort/telluride/,169.0,Mountain Village,Colorado,88.2,6.7,51.7,29.8
2,2,Beaver Creek,https://www.skiresort.info/ski-resort/beaver-c...,218.0,Beaver Creek Village,Colorado,150.0,28.5,64.5,57.0
3,3,Mammoth Mountain,https://www.skiresort.info/ski-resort/mammoth-...,209.0,Mammoth Lakes,Mammoth Lakes,89.8,18.4,48.5,22.9
4,4,Snowmass,https://www.skiresort.info/ski-resort/snowmass/,199.0,Snowmass Village,Aspen Snowmass,237.0,12.0,114.0,111.0


In [5]:
# with part should be moved to scraping.ipynb
names = list(resorts_df['name'])
newNames = []

for ind in range(len(names)):
    newNames.append(names[ind].strip())


In [6]:
# add name without space to df
resorts_df['newNames'] = newNames
resorts_df.head()

Unnamed: 0.1,Unnamed: 0,name,link,price,closest_town,region,total_len,easy_len,intermediate_len,difficult_len,newNames
0,0,Vail,https://www.skiresort.info/ski-resort/vail/,219.0,Vail,Colorado,234.0,57.0,84.0,93.0,Vail
1,1,Telluride,https://www.skiresort.info/ski-resort/telluride/,169.0,Mountain Village,Colorado,88.2,6.7,51.7,29.8,Telluride
2,2,Beaver Creek,https://www.skiresort.info/ski-resort/beaver-c...,218.0,Beaver Creek Village,Colorado,150.0,28.5,64.5,57.0,Beaver Creek
3,3,Mammoth Mountain,https://www.skiresort.info/ski-resort/mammoth-...,209.0,Mammoth Lakes,Mammoth Lakes,89.8,18.4,48.5,22.9,Mammoth Mountain
4,4,Snowmass,https://www.skiresort.info/ski-resort/snowmass/,199.0,Snowmass Village,Aspen Snowmass,237.0,12.0,114.0,111.0,Snowmass


In [7]:
resorts_df.columns

Index(['Unnamed: 0', 'name', 'link', 'price', 'closest_town', 'region',
       'total_len', 'easy_len', 'intermediate_len', 'difficult_len',
       'newNames'],
      dtype='object')

In [8]:
# remove extra column and reorder columns
resorts_df.drop(columns=['Unnamed: 0', 'name'], inplace=True)
resorts_df.rename(columns={'newNames': 'name'}, inplace=True)
resorts_df = resorts_df[['name', 'link', 'price', 'closest_town', 'region', 'total_len', 'easy_len'                                     ,'intermediate_len', 'difficult_len']]
resorts_df.head()

Unnamed: 0,name,link,price,closest_town,region,total_len,easy_len,intermediate_len,difficult_len
0,Vail,https://www.skiresort.info/ski-resort/vail/,219.0,Vail,Colorado,234.0,57.0,84.0,93.0
1,Telluride,https://www.skiresort.info/ski-resort/telluride/,169.0,Mountain Village,Colorado,88.2,6.7,51.7,29.8
2,Beaver Creek,https://www.skiresort.info/ski-resort/beaver-c...,218.0,Beaver Creek Village,Colorado,150.0,28.5,64.5,57.0
3,Mammoth Mountain,https://www.skiresort.info/ski-resort/mammoth-...,209.0,Mammoth Lakes,Mammoth Lakes,89.8,18.4,48.5,22.9
4,Snowmass,https://www.skiresort.info/ski-resort/snowmass/,199.0,Snowmass Village,Aspen Snowmass,237.0,12.0,114.0,111.0


In [9]:
# read data from ski_area.geojson 
f = open('static/data/ski_areas.geojson', encoding="utf-8")
data = json.load(f)
locations = []

for i in range(len(data['features'])):
    country = '' 
    if data['features'][i]['properties']['location'] is not None:
        country = data['features'][i]['properties']['location']['iso3166_1Alpha2']

    name = ""
    if data['features'][i]['properties']['name'] is not None:
        name = data['features'][i]['properties']['name']

    # exclude crosscountry from data
    croscountry = 'Nordic'
    if (croscountry not in name) and (country == 'US'):
        locations.append({'name': name
                            ,'geometry': data['features'][i]['geometry']['coordinates']
                            , 'state': data['features'][i]['properties']['location']['localized']['en']['region']
                            , 'website': data['features'][i]['properties']['website']})

f.close()


In [10]:
#  upload locations to df
locations_df = pd.DataFrame(locations)
locations_df = locations_df.fillna('Unknown')
locations_df.head()

Unnamed: 0,name,geometry,state,website
0,Summit Ranger District Ski Trails,"[-119.94166461319604, 38.181344668241906]",California,https://www.pinecrestnordic.org/
1,Mount Pinos Winter Sports Area,"[-119.11840321868132, 34.81349084560443]",California,https://www.nordicbase.org/
2,Great Brook Ski Touring Center,"[-71.34426978240515, 42.5568498331646]",Unknown,http://www.greatbrookski.com
3,Pajarito Mountain,"[-106.39205836478088, 35.89077152616687]",New Mexico,Unknown
4,Ski Apache,"[-105.80167005041056, 33.39417520853858]",New Mexico,Unknown


In [11]:
# merge two df in one
full_resort_data_df = pd.merge(resorts_df, locations_df, on='name', how='left')
full_resort_data_df.head()

Unnamed: 0,name,link,price,closest_town,region,total_len,easy_len,intermediate_len,difficult_len,geometry,state,website
0,Vail,https://www.skiresort.info/ski-resort/vail/,219.0,Vail,Colorado,234.0,57.0,84.0,93.0,"[-106.35805314076724, 39.60991962505997]",Unknown,http://www.vail.snow.com
1,Telluride,https://www.skiresort.info/ski-resort/telluride/,169.0,Mountain Village,Colorado,88.2,6.7,51.7,29.8,"[-107.83598933345762, 37.92046244915704]",Colorado,http://tellurideskiresort.com/
2,Beaver Creek,https://www.skiresort.info/ski-resort/beaver-c...,218.0,Beaver Creek Village,Colorado,150.0,28.5,64.5,57.0,,,
3,Mammoth Mountain,https://www.skiresort.info/ski-resort/mammoth-...,209.0,Mammoth Lakes,Mammoth Lakes,89.8,18.4,48.5,22.9,"[-119.02132555986691, 37.64096495905402]",California,http://www.mammothmountain.com/
4,Snowmass,https://www.skiresort.info/ski-resort/snowmass/,199.0,Snowmass Village,Aspen Snowmass,237.0,12.0,114.0,111.0,"[-106.95106028636194, 39.18914819590207]",Colorado,http://www.aspensnowmass.com


In [12]:
#  fillna with 'Empty' for future manipulations

full_resort_data_df = full_resort_data_df.fillna('Empty')

In [13]:
full_resort_data_df

Unnamed: 0,name,link,price,closest_town,region,total_len,easy_len,intermediate_len,difficult_len,geometry,state,website
0,Vail,https://www.skiresort.info/ski-resort/vail/,219,Vail,Colorado,234,57,84,93,"[-106.35805314076724, 39.60991962505997]",Unknown,http://www.vail.snow.com
1,Telluride,https://www.skiresort.info/ski-resort/telluride/,169,Mountain Village,Colorado,88.2,6.7,51.7,29.8,"[-107.83598933345762, 37.92046244915704]",Colorado,http://tellurideskiresort.com/
2,Beaver Creek,https://www.skiresort.info/ski-resort/beaver-c...,218,Beaver Creek Village,Colorado,150,28.5,64.5,57,Empty,Empty,Empty
3,Mammoth Mountain,https://www.skiresort.info/ski-resort/mammoth-...,209,Mammoth Lakes,Mammoth Lakes,89.8,18.4,48.5,22.9,"[-119.02132555986691, 37.64096495905402]",California,http://www.mammothmountain.com/
4,Snowmass,https://www.skiresort.info/ski-resort/snowmass/,199,Snowmass Village,Aspen Snowmass,237,12,114,111,"[-106.95106028636194, 39.18914819590207]",Colorado,http://www.aspensnowmass.com
...,...,...,...,...,...,...,...,...,...,...,...,...
527,Sierra Summit,https://www.skiresort.info/ski-resort/sierra-s...,Empty,Empty,info@skiresort-service.com,Empty,Empty,Empty,Empty,Empty,Empty,Empty
528,Lynx Creek,https://www.skiresort.info/ski-resort/lynx-creek/,Empty,Empty,info@skiresort-service.com,Empty,Empty,Empty,Empty,"[-71.68203592300397, 43.745056913683]",New Hampshire,
529,Fairmont Hot Springs – Fairmont,https://www.skiresort.info/ski-resort/fairmont...,Empty,Empty,info@skiresort-service.com,Empty,Empty,Empty,Empty,Empty,Empty,Empty
530,Plumtree Ski Hill (Fawnridge),https://www.skiresort.info/ski-resort/plumtree...,Empty,Empty,info@skiresort-service.com,Empty,Empty,Empty,Empty,Empty,Empty,Empty


In [14]:
# Set up connection to DB
db_path = f'postgresql://postgres:{postgrepass}@localhost:5432/SkiResorts'
engine = create_engine(db_path)
conn = engine.connect()


In [15]:
# Upload data to DB 
full_resort_data_df.to_sql('resorts_info', conn, if_exists='replace')
locations_df.to_sql('locations', conn, if_exists='replace')


In [19]:
# extract state for Unknown value

resorts_info_df = pd.read_sql("SELECT * FROM resorts_info ORDER BY closest_town", conn)
resorts_info_df.head()

Unnamed: 0,index,name,link,price,closest_town,region,total_len,easy_len,intermediate_len,difficult_len,geometry,state,website
0,402,Andes Tower Hills,https://www.skiresort.info/ski-resort/andes-to...,45.0,Alexandria,Minnesota,4.0,2.2,1.6,0.2,"{-95.6314374785965,45.84927221789472}",Minnesota,https://www.andestowerhills.com/
1,403,Andes Tower Hills,https://www.skiresort.info/ski-resort/andes-to...,45.0,Alexandria,Minnesota,4.0,2.2,1.6,0.2,"{-95.632209777832,45.849608274802}",Minnesota,Unknown
2,199,Bear Creek Mountain Resort,https://www.skiresort.info/ski-resort/bear-cre...,60.0,Allentown,Pennsylvania,10.0,3.0,4.0,3.0,"{-75.63017408235291,40.47496905588234}",Unknown,Unknown
3,305,Blue Mountain,https://www.skiresort.info/ski-resort/blue-mou...,65.0,Allentown,Pennsylvania,18.0,10.0,6.0,2.0,"{-109.43329095840002,37.86220832212802}",Utah,
4,409,Big Boulder,https://www.skiresort.info/ski-resort/big-boul...,65.0,Allentown,Pennsylvania,4.0,3.0,1.0,0.0,"{-75.60104986984125,41.04715227936506}",Unknown,https://jfbb.com/


In [22]:
state_unknown = pd.read_sql("SELECT * FROM resorts_info WHERE state = 'Unknown'", conn)
state_unknown.head()

Unnamed: 0,index,name,link,price,closest_town,region,total_len,easy_len,intermediate_len,difficult_len,geometry,state,website
0,0,Vail,https://www.skiresort.info/ski-resort/vail/,219.0,Vail,Colorado,234.0,57.0,84.0,93.0,"{-106.35805314076724,39.60991962505997}",Unknown,http://www.vail.snow.com
1,20,Buttermilk Mountain,https://www.skiresort.info/ski-resort/buttermi...,199.0,Aspen,Aspen Snowmass,34.0,11.9,13.3,8.8,"{-106.8698646901408,39.1908717267606}",Unknown,http://adf.ly/1UwDWa
2,28,Yellowstone Club,https://www.skiresort.info/ski-resort/yellowst...,Empty,Yellowstone Club,Montana,80.0,30.0,30.0,20.0,"{-111.41113042831006,45.24162672369199}",Unknown,http://www.theyellowstoneclub.com
3,31,Schweitzer Mountain Resort,https://www.skiresort.info/ski-resort/schweitz...,95.0,Sandpoint,Idaho,95.0,20.0,40.0,35.0,"{-116.62285366870098,48.38072336697773}",Unknown,http://www.schweitzer.com/
4,62,Gore Mountain,https://www.skiresort.info/ski-resort/gore-mou...,94.0,North Creek,The Adirondacks,58.2,6.7,28.2,23.3,"{-74.02378163740998,43.67956294100719}",Unknown,http://www.goremountain.com


In [27]:
state_known = pd.read_sql("SELECT * FROM resorts_info WHERE state NOT LIKE 'Unknown'", conn)
state_known.head()

Unnamed: 0,index,name,link,price,closest_town,region,total_len,easy_len,intermediate_len,difficult_len,geometry,state,website
0,1,Telluride,https://www.skiresort.info/ski-resort/telluride/,169.0,Mountain Village,Colorado,88.2,6.7,51.7,29.8,"{-107.83598933345762,37.92046244915704}",Colorado,http://tellurideskiresort.com/
1,3,Mammoth Mountain,https://www.skiresort.info/ski-resort/mammoth-...,209.0,Mammoth Lakes,Mammoth Lakes,89.8,18.4,48.5,22.9,"{-119.02132555986691,37.64096495905402}",California,http://www.mammothmountain.com/
2,4,Snowmass,https://www.skiresort.info/ski-resort/snowmass/,199.0,Snowmass Village,Aspen Snowmass,237.0,12.0,114.0,111.0,"{-106.95106028636194,39.18914819590207}",Colorado,http://www.aspensnowmass.com
3,7,Killington,https://www.skiresort.info/ski-resort/killington/,165.0,Killington,Vermont,117.0,19.9,46.7,50.2,"{-72.79985765343031,43.61034153060775}",Vermont,http://www.killington.com
4,10,Winter Park Resort,https://www.skiresort.info/ski-resort/winter-p...,164.0,Winter Park,Colorado,143.0,11.0,53.0,79.0,"{-105.7766544513478,39.871873051869514}",Colorado,http://www.winterparkresort.com/


In [44]:
for ind in state_unknown.index:
    town = state_unknown['closest_town'][ind]
    region = state_unknown['region'][ind]
    temp_index = 0
    if state_known.loc[state_known['closest_town'] == town] and state_known.loc[state_known['region'] == region]:
        temp_index = state_known[state_known['closest_town'] == town].index
        state_unknown['state'][ind] = state_known['state'][temp_index]

    
state_unknown.head()


ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [48]:
one_town = resorts_info_df[resorts_info_df['closest_town'] == 'Allentown']
state = one_town[one_town['state'] != 'Unknown']
state_name = state['state']
state_name

3    Utah
Name: state, dtype: object

In [53]:
test = resorts_info_df[resorts_info_df['closest_town'] == 'Allentown']
test
test['index'][2]

199

In [56]:
for each_town in state_unknown["closest_town"]:
    town = resorts_info_df[resorts_info_df['closest_town'] == each_town]
    # print(town)
    state = town[town['state'] != 'Unknown']
    # print(state)
    state_name = state['state']
    print(state_name)
    # temp_ind = []

    # for i in range(len(town['state'])):
    #     if town[town['state'] != 'Unknown']:
    #         temp_ind.append(town['index'])


    # for i in range(len(temp_ind)):
    #     resorts_info_df['state'][temp_ind[i]] = state_name

    # resorts_info_df

      link  price closest_town  \
386  https://www.skiresort.info/ski-resort/pinnacle...  Empty   Pittsfield   

    region total_len easy_len intermediate_len difficult_len geometry state  \
386  Maine       0.7      0.3              0.2           0.2     None  None   

    website  
386    None  
Empty DataFrame
Columns: [index, name, link, price, closest_town, region, total_len, easy_len, intermediate_len, difficult_len, geometry, state, website]
Index: []
Empty DataFrame
Columns: [index, name, link, price, closest_town, region, total_len, easy_len, intermediate_len, difficult_len, geometry, state, website]
Index: []
Empty DataFrame
Columns: [index, name, link, price, closest_town, region, total_len, easy_len, intermediate_len, difficult_len, geometry, state, website]
Index: []
Empty DataFrame
Columns: [index, name, link, price, closest_town, region, total_len, easy_len, intermediate_len, difficult_len, geometry, state, website]
Index: []
Empty DataFrame
Columns: [index, name, link,