In [1]:
# First we set the stage: import all required packages and load the data for the analysis.
# All data sets are loaded into a dictionary in which each key (a geography)
# holds a list of dataframes, one per state, with the route information of that geography.
# At the same time, the name of each geography is extracted from its folder name and the name of each state
# from its file name.

# Load the required packages

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import glob

# Collect folder paths of the data.
# DATA_ROOT is the only machine-specific value in this cell; point it at the folder that
# contains the extracted OpenBeta exports.
DATA_ROOT = "G:/My Drive/Trainings/Python/Trainings_file/climbing project"

all_dir = DATA_ROOT + "/openbeta-usa-routes-aug-2020"
ca_dir = DATA_ROOT + "/openbeta-routes-ca"
midwest_dir = DATA_ROOT + "/openbeta-routes-midwest"
northeast_dir = DATA_ROOT + "/openbeta-routes-northeast"
southeast_dir = DATA_ROOT + "/openbeta-routes-southeast"
westcoast_dir = DATA_ROOT + "/openbeta-routes-westcoast"
m1_dir = DATA_ROOT + "/openbeta-routes-mountains1"
m2_dir = DATA_ROOT + "/openbeta-routes-mountain2"
ms2_dir = DATA_ROOT + "/openbeta-routes-mountains2"

# Load the file in all_dir that holds all route information of the USA from 2020.
# NOTE: the name `all` shadows Python's builtin all() for the rest of the notebook. It is kept
# because later cells refer to it; do not call all(...) as a function after this point.
all = pd.read_json(all_dir + "/openbeta-usa-routes-aug-2020.jsonlines", lines=True)

# Create a list of all folder path strings, excluding all_dir
folder_paths = [ca_dir, midwest_dir, northeast_dir, southeast_dir, westcoast_dir, m1_dir, m2_dir, ms2_dir]

# Extract the other geographies:

geographies_data = {}    # geography key ("<name>_data")  -> list of dataframes, one per state file
geographies_states = {}  # geography key ("<name>_state") -> list of state names, same order as the dataframes

# Fill both dictionaries. Each dataframe corresponds to one state file of the geography.
for folder_path in folder_paths:
    # The geography name is the part of the folder name after "routes-".
    # It is derived once per folder, before looking at the files, so that a folder without
    # any files can never reuse the name of the previously processed geography.
    folder_name = os.path.basename(folder_path).split('routes-', 1)[-1]

    # glob does not guarantee any order, so sort the file names alphabetically.
    file_list = sorted(glob.glob(folder_path + '/*.jsonlines'))

    if not file_list:
        # Nothing to load: report it instead of silently storing (or overwriting) an entry.
        print("Warning: no .jsonlines files found in " + folder_path + " - skipping this geography.")
        continue

    # State name = file name without its directory and without the "-routes.jsonlines" suffix.
    # os.path.basename works regardless of which path separator glob returned.
    file_list_trunc = [
        os.path.basename(file).split('-routes.jsonline', 1)[0]
        for file in file_list
    ]

    # One dataframe per state file, in the same order as file_list_trunc.
    folder_data_frames = [pd.read_json(file_name, lines=True) for file_name in file_list]

    # Add the list of dataframes and state names to the dictionaries with the folder name as the key.
    # The names need to be unique for later variable extraction, so we add _data or _state to the string.
    geographies_data[folder_name + "_data"] = folder_data_frames
    geographies_states[folder_name + "_state"] = file_list_trunc

In [2]:
# Having the data imported, we now want to wrangle the data.
# This code chunk saves each geography in its own variable, then gives each state its own
# variable holding all the route information from that state.

# Look at the keys generated for the two dictionaries
print(geographies_data.keys())

print(geographies_states.keys())

# Expose every dictionary entry as a global variable named after its key
# (e.g. ca_data, ca_state) and collect the values in two parallel lists.

list_of_state_names = []
list_of_dataframes = []

for key, value in geographies_data.items():
    globals()[key] = value  # dynamically create a global variable named after the key
    list_of_dataframes.append(value)

for key, value in geographies_states.items():
    globals()[key] = value
    list_of_state_names.append(value)

# After having each region as a variable with a list of dataframes or a list of its state names,
# we combine them: each state name becomes a variable for its corresponding dataframe.
# States that appear in more than one geography are tracked and saved separately so that the
# first dataframe of a state is never overwritten.

state_counts = {}                 # state name -> number of times it has been seen so far
duplicates = {}                   # state name -> final count, only for states seen more than once
list_of_all_states_and_data = {}  # state name -> dataframe of its first occurrence (no duplicates)

# Both lists are in the same geography order, and within each geography the state names and
# dataframes are in the same file order, so two nested zips pair every state with its dataframe.

for states, dataframes in zip(list_of_state_names, list_of_dataframes):
    for state, dataframe in zip(states, dataframes):
        # First occurrence is decided by the dictionary built in this cell, not by globals():
        # globals() still contains the state variables from a previous run of this cell (and any
        # unrelated variable that happens to share the name), which would misclassify every
        # state as a duplicate and leave list_of_all_states_and_data empty.
        if state not in list_of_all_states_and_data:
            # NOTE: a state code that equals a builtin name (e.g. "id") shadows that builtin.
            globals()[state] = dataframe
            state_counts[state] = 1
            list_of_all_states_and_data[state] = dataframe
        else:
            # The state already exists: store this copy as <state>_<count> and record the count.
            count = state_counts[state] + 1
            state_counts[state] = count
            state_i = state + "_" + str(count)
            globals()[state_i] = dataframe
            duplicates[state] = count

dict_keys(['ca_data', 'midwest_data', 'northeast_data', 'southeast_data', 'westcoast_data', 'mountains1_data', 'mountain2_data', 'mountains2_data'])
dict_keys(['ca_state', 'midwest_state', 'northeast_state', 'southeast_state', 'westcoast_state', 'mountains1_state', 'mountain2_state', 'mountains2_state'])


In [25]:
# This code chunk compares the duplicated state dataframes, adds a State column to each state
# dataframe (to later identify which state a route is from), and merges all state dataframes
# into a single dataframe.

# First we check if the dataframes of duplicated states are identical.

print(duplicates)

# The State column is added further down in this cell, in place, to the first copy of each state
# only. It is therefore excluded from the comparison; otherwise every re-run of this cell would
# report False for data that is in fact identical.
are_equal = mi.drop(columns="State", errors="ignore").equals(mi_2)

print(are_equal)

# Run the same check for every duplicated state, not only for mi.
# (An explicit loop is used because the builtin all() is shadowed by the dataframe `all`.)
duplicate_check = {}
for state, count in duplicates.items():
    first_copy = globals()[state].drop(columns="State", errors="ignore")
    identical = True
    for i in range(2, count + 1):
        other_copy = globals()[state + "_" + str(i)].drop(columns="State", errors="ignore")
        if not first_copy.equals(other_copy):
            identical = False
    duplicate_check[state] = identical

print(duplicate_check)

# NOTE(review): the analysis below keeps only the first copy of each duplicated state. That is
# only safe for states reported as True above; a False means the copies differ and should be
# inspected (and possibly merged) before relying on df_all_states.

print(len(list_of_all_states_and_data))  # How many states

# Add the key (state) as a separate column to each dataframe.
# This modifies the state dataframes in place, so the state variables created earlier
# (which are the same objects) gain the column as well.
for state, df in list_of_all_states_and_data.items():
    df['State'] = state

# Merge all dataframes of the individual states into a single dataframe containing all routes
df_all_states = pd.concat(list_of_all_states_and_data.values(), ignore_index=True)



{'mi': 2, 'az': 2, 'id': 2, 'mt': 2, 'nm': 2, 'nv': 2, 'wy': 2}
False
183253
Index(['route_name', 'grade', 'safety', 'type', 'fa', 'description',
       'location', 'protection', 'metadata'],
      dtype='object')
209808
Index(['route_name', 'grade', 'safety', 'type', 'fa', 'description',
       'location', 'protection', 'metadata', 'mp_sector_id', 'mp_route_id',
       'State'],
      dtype='object')


In [None]:
# This code chunk compares the two dataframes: the published dataframe containing all data from
# August 2020 (`all`) and our generated dataframe of all routes from all states (`df_all_states`).

# drop_duplicates returns a new dataframe and leaves the original untouched, so the result has
# to be stored to have any effect. The de-duplicated frames get their own names; `all` and
# `df_all_states` stay unchanged for the rest of the notebook.
# NOTE(review): this treats rows with the same route_name as duplicates. Presumably different
# routes can share a name (e.g. "Unknown") - confirm whether a stricter key is needed.
all_unique = all.drop_duplicates(subset='route_name')
df_all_states_unique = df_all_states.drop_duplicates(subset='route_name')

# Compare the length and the columns of the two dataframes.
print(len(all))
print(all.columns)
print(len(df_all_states))
print(df_all_states.columns)

# Compare the number of rows that remain after removing repeated route names.
print(len(all_unique))
print(len(df_all_states_unique))


Unnamed: 0,route_name,grade,safety,type,fa,description,location,protection,metadata
0,Wheres Waldo?,"{'YDS': 'V2', 'Font': '5+'}",,{'boulder': True},unknown,[Sit Start on the crack. Pull a big move to a ...,,[Pads],"{'left_right_seq': '999999', 'parent_lnglat': ..."
1,Unknown,{},,"{'tr': True, 'ice': True}",Unkown,[Just a general entry for the routes. Usually ...,[Can't miss the silo with a giant sheet of ice...,[No gear needed. All supplied and is Top Rope],"{'left_right_seq': '0', 'parent_lnglat': [-92...."
2,Vanished Edens,"{'YDS': 'V4', 'Font': '6B'}",,{'boulder': True},"Joe Feldman, 2019",[Start right hand in a sidepull slot and left ...,[Hot Stuff Camp Roof],[pad - good landing],"{'left_right_seq': '1', 'parent_lnglat': [-91...."
3,Stairway to Heaven,"{'YDS': '5.7', 'French': '5a', 'Ewbanks': '15'...",,"{'trad': True, 'tr': True}",unknown,[Climb the large flake right of Slot Machine t...,,"[SR, tricams are handy.]","{'left_right_seq': '5', 'parent_lnglat': [-91...."
4,Shagadelic Humper Bumper,"{'YDS': '5.8', 'French': '5b', 'Ewbanks': '16'...",,{'tr': True},unknown,[Climb the buttress left of Cake Walk.],,[Build a TR anchor on off of trees above.],"{'left_right_seq': '999999', 'parent_lnglat': ..."
...,...,...,...,...,...,...,...,...,...
183248,State of Delusion,"{'YDS': '5.10b', 'French': '6a+', 'Ewbanks': '...",,"{'trad': True, 'sport': True}","Ron Cotman, Gordon Briordy",[Start on the thin seam climbers left of Halcy...,[Starts on the thin seam in between Halcyon an...,"[Gear to 2"". Small nuts are great. 4 bolts]","{'left_right_seq': '2', 'parent_lnglat': [-120..."
183249,Pistachio Pillar,"{'YDS': '5.10c', 'French': '6b', 'Ewbanks': '2...",,{'sport': True},Ron Cotman 2003,[A tricky route following the green streak up ...,[Center of The Nuthouse],[7 bolts],"{'left_right_seq': '1', 'parent_lnglat': [-120..."
183250,Halcyon Daze,"{'YDS': '5.11d', 'French': '7a', 'Ewbanks': '2...",,{'sport': True},"Alec Gibbons, Brian Behle, 2005",[A very long sport route with multiple roof pu...,[Right where the trail comes into The Nuthouse...,[13 bolts],"{'left_right_seq': '3', 'parent_lnglat': [-120..."
183251,Commited,"{'YDS': '5.11c', 'French': '6c+', 'Ewbanks': '...",,{'trad': True},Tony Bentley and Ron Cotman,[The obvious sweeping corner on the wall. Basi...,[Uphill from Halcyon Daze on the far left side...,"[Tips to 3""]","{'left_right_seq': '0', 'parent_lnglat': [-120..."
