# Data and Library Imports

In [1]:
#Python libraries needed
import numpy as np
import pandas as pd

# opens and previews the file cocreated with ChatGPT
df = pd.read_csv('wildernesses.csv')
print(df.head())

                      Name                       Park Type  \
0  Jennie Lakes Wilderness  U.S. Forest Service Wilderness   
1    Desolation Wilderness  U.S. Forest Service Wilderness   
2     Mokelumne Wilderness  U.S. Forest Service Wilderness   
3      Emigrant Wilderness  U.S. Forest Service Wilderness   
4   Silver Peak Wilderness  U.S. Forest Service Wilderness   

        Elevation Range                                         Activities  \
0  6,640 ft – 10,365 ft  Hiking, backpacking, horsepacking, fishing, cr...   
1   8,200 ft – 9,900 ft  Hiking, backpacking, fishing, camping, horseba...   
2  3,700 ft – 10,381 ft     Hiking, backpacking, fishing, horseback riding   
3  3,500 ft – 11,067 ft              Hiking, backpacking, fishing, camping   
4     600 ft – 3,200 ft                 Hiking, backpacking, bird watching   

                                        Access Rules            Summer Traffic  
0  No overnight camping permit required unless en...  Medium-heavy to moderat

# Feature Engineering
### Column: camping_permit
Changing text values into a binary value

In [2]:
# Prints the permit categories with counts
print(df['Access Rules'].value_counts())

Access Rules
No overnight camping permit required                                           60
Overnight camping permit required                                              28
No overnight camping permit required unless entering adjacent national park     1
Name: count, dtype: int64


In [4]:
# Fills a new column with a value of 1 if the camping permit is always required, otherwise 0.
df['camping_permit'] = df['Access Rules'].apply(lambda x: '1' if x == 'Overnight camping permit required' else 0)

### Column: summer_traffic_ordinal
Changing text categories to ordinal numbers

In [5]:
# Prints the traffic categories to engineer as a reference
print(df['Summer Traffic'].value_counts())

Summer Traffic
Low                         30
Moderate                    27
Medium-heavy to moderate    12
Medium-heavy                10
Low to moderate              9
Medium                       1
Name: count, dtype: int64


In [6]:
# Maps the categories to their ordinal value with a dictionary. I decide Moderate and Medium are equivalent.
traffic_mapping = {
    'Low': 0, 
    'Low to moderate': 1, 
    'Moderate': 2, 'Medium': 2, 
    'Medium-heavy to moderate': 3, 
    'Medium-heavy': 4
}

In [7]:
# Replaces the traffic catgories into their ordinal values
df['summer_traffic_ordinal'] = df['Summer Traffic'].replace(traffic_mapping)

  df['summer_traffic_ordinal'] = df['Summer Traffic'].replace(traffic_mapping)


### Columns: Activity Categories
Manually One Hot Encoding activity list strings
<br>Source data contains a string of activities related to the wilderness. For example: backpacking, bird watching, horsepacking
<br>I employed a lambda function to check if one of the substrings are in the parent string
<br>
<br> Here is a list of the categories I created and their related source values:
<br>Category: values from ChatGPT
<br>backpacking: backpacking 
<br>camping: camping
<br>wildlife: wildlife viewing, bird watching, birdwatching
<br>geology: geology, geological exploration, lava tube exploration, rockhounding
<br>solitude: solitude, access solitude seeking
<br>photography: photography, nature photography
<br>fishing: fishing
<br>rafting: rafting, rafting nearby
<br>sand_dunes: sand dunes exploration, sand dune exploration
<br>horses: horseback riding, horsepacking
<br>nature_study: nature study, botanizing
<br>climbing: mountaineering, climbing
<br>beach: beach access
<br>skiing: cross-country skiing, skiing

In [8]:
# Creates the one hot encoded columns by category as described above
df['backpacking'] = df['Activities'].apply(lambda x: 1 if 'ackpack' in x else 0)
df['camping'] = df['Activities'].apply(lambda x: 1 if 'amping' in x else 0)
df['wildlife'] = df['Activities'].apply(lambda x: 1 if 'wildlife' in x else (1 if 'bird' in x else 0))
df['geology'] = df['Activities'].apply(lambda x: 1 if 'geolog' in x else (1 if 'lava tube' in x else(1 if 'rockhounding' in x else 0)))
df['solitude'] = df['Activities'].apply(lambda x: 1 if 'solitude' in x else 0)
df['photography'] = df['Activities'].apply(lambda x: 1 if 'photography' in x else 0)
df['fishing'] = df['Activities'].apply(lambda x: 1 if 'fishing' in x else 0)
df['rafting'] = df['Activities'].apply(lambda x: 1 if 'rafting' in x else 0)
df['sand_dunes'] = df['Activities'].apply(lambda x: 1 if 'sand dune' in x else 0)
df['horses'] = df['Activities'].apply(lambda x: 1 if 'horse' in x else 0)
df['nature_study'] = df['Activities'].apply(lambda x: 1 if 'study' in x else (1 if 'botan' in x else(1 if 'watching' in x else 0)))
df['climbing'] = df['Activities'].apply(lambda x: 1 if 'climbing' in x else (1 if 'mountaineering' in x else 0))
df['beach'] = df['Activities'].apply(lambda x: 1 if 'beach' in x else 0)
df['skiing'] = df['Activities'].apply(lambda x: 1 if 'skiing' in x else 0)

### Columns: min_elevation and max_elevation
Source format: 6,640 ft – 10,365 ft
<br>Uses Regex to parse the elevation range string and extract both numbers

In [9]:
# saves the Regex pattern of the elevation range string
pattern = r'(\d[\d,]*)\s*ft\s*–\s*(\d[\d,]*)\s*ft' 

# Extracts the numbers using the Regex pattern. Generates two columns with the min value in the first column and the max value in the second column
extracted_numbers = df['Elevation Range'].str.extract(pattern)

#Saves the new columns to the main dataframe
df['min_elevation'] = extracted_numbers[0]
df['max_elevation'] = extracted_numbers[1]

## Final Data Prep

In [11]:
# saves the list of park names as a separate dataframe
name_df = df['Name']

# Removes the columns that will not be used in the model and transforms the dataframe into a numpy array.
columns_to_remove = ['Elevation Range','Activities','Access Rules','Summer Traffic', 'Name','Park Type','min_elevation','max_elevation']
model_array = df.drop(columns=columns_to_remove, inplace=False).to_numpy()

## Modeling

In [16]:
from sklearn.neighbors import NearestNeighbors

# Creates an instance of the NearestNeighbors class
recommender = NearestNeighbors(metric='cosine')

# Creates a model by fitting the features to the instance
recommender.fit(model_array)


### My recommended Wildernesses

In [18]:
# Saves the Jennie Lakes Wilderness values 
jennie_lakes = model_array[0].reshape(1, -1)

# Saves the row number of the recommended wildernesses
recommendations = recommender.kneighbors(X=jennie_lakes, n_neighbors=5, return_distance=False)

# Finds the wilderness names from the row numbers
recommended_wildernesses = df.iloc[recommendations[0]]['Name']

# prints the recommendations
print(recommended_wildernesses)

0        Jennie Lakes Wilderness
18    Marble Mountain Wilderness
21     Carson-Iceberg Wilderness
14           Domeland Wilderness
77           Emigrant Wilderness
Name: Name, dtype: object


## Future work
Bin max and min elevation
<br>Normalize elevation ranges
<br>remove Park Type until I include other park types
<br>consider removing significantly empty activity types like solitude
Add terrain types as a feature