In [1]:
import requests
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from fuzzywuzzy import process


# List of raw URLs to the .pickle files
urls = [
    'https://github.com/mamaOcoder/highspeedrail/raw/dcb2607fc164695b682e18bebaf35923e127094f/data/pickled/distance_df.pickle',
    'https://github.com/mamaOcoder/highspeedrail/raw/dcb2607fc164695b682e18bebaf35923e127094f/data/pickled/flights_df.pickle',
    'https://github.com/mamaOcoder/highspeedrail/raw/dcb2607fc164695b682e18bebaf35923e127094f/data/pickled/flights_df_all_fields.pickle',
    'https://github.com/mamaOcoder/highspeedrail/raw/dcb2607fc164695b682e18bebaf35923e127094f/data/pickled/geos_df.pickle',
    'https://github.com/mamaOcoder/highspeedrail/raw/dcb2607fc164695b682e18bebaf35923e127094f/data/pickled/msa_df.pickle',
    'https://github.com/mamaOcoder/highspeedrail/raw/88761104ba44f569e5fadde614eb5ed6a54900ba/data/pickled/tti_df.pickle',
    'https://github.com/mamaOcoder/highspeedrail/raw/dcb2607fc164695b682e18bebaf35923e127094f/data/pickled/city_pair_df.pickle',
]

loaded_data = []

for url in urls:
    response = requests.get(url)
    if response.status_code == 200:
        # Load the content of the file
        data = pickle.loads(response.content)
        loaded_data.append(data)
    else:
        print(f"Failed to retrieve {url}")

Failed to retrieve https://github.com/mamaOcoder/highspeedrail/raw/dcb2607fc164695b682e18bebaf35923e127094f/data/pickled/city_pair_df.pickle


In [2]:
# Convert loaded data to DataFrames
distance_df, flights_df, flights_df_all_fields, geos_df, msa_df, tti_df = map(pd.DataFrame, loaded_data)


In [3]:
print(tti_df.columns)

Index(['AreaGroup', 'MetroArea', 'StateCode', 'PopulationGroup', 'Year',
       'Population_thousands', 'PopulationRank', 'AutoCommuters',
       'FreewayDailyVehicle_miles_thousands',
       'ArterialStreetDailyVehicle_miles_thousands', 'ValueOfTime',
       'CommercialValueOfTime', 'AverageStateGasCost',
       'AverageStateDieselCost', 'CongestedTravel', 'CongestedSystem',
       'NumberOfRushHours', 'TotalGallons_thousands', 'TotalGallonsRank',
       'GallonsPerAutoCommuter', 'GallonsPerAutoCommuterRank', 'TotalDelay',
       'TotalDelayRank', 'DelayPerAutoCommuter', 'DelayPerAutoCommuterRank',
       'TravelTimeIndexValue', 'TravelTimeIndexRank',
       'CommuterStressIndexValue', 'CommuterStressIndexRank',
       'FreewayPlanningTimeIndexValue', 'FreewayPlanningTimeIndexRank',
       'AnnualCongestionCostTotalDollars_millions', 'AnnualCongestionCostRank',
       'AnnualCongestionCostPerAutoCommuter',
       'AnnualCongestionCostPerAutoCommuterRank', 'TruckTotalDelay_thousands',


In [3]:
city_column_name = 'MetroArea'  

def get_best_match(city, choices):
    match, score = process.extractOne(city, choices)
    return match if score > 80 else None 

# Fuzzy matching to align TTI city names with other datasets
all_city_names = set(msa_df['MainCity']).union(set(distance_df['Origin']), set(distance_df['Destination']))
tti_df['Matched_City'] = tti_df[city_column_name].apply(lambda x: get_best_match(x, all_city_names))

# Convert TTI DataFrame to a dictionary for quick lookup
tti_data = tti_df.set_index('Matched_City')['TravelTimeIndexValue'].to_dict()


In [4]:
# Define scoring functions
def score_population(pop):
    pop = int(pop.replace(',', '')) if isinstance(pop, str) else pop
    return (pop / max(city_population.values())) * 100

def score_distance(dist):
    if dist <= 150:
        return ((dist / 100) + 1) * 10
    elif 150 < dist <= 300:
        return 100  # Optimal distance score
    elif 300 < dist <= 350:
        return (((500 - dist) / 100) + 0.5) * 10
    elif 350 < dist <= 1000:
        return ((1000 - dist) / 100) * 10
    else:
        return 0

def score_gdp(gdp):
    return (gdp / max(city_gdp.values())) * 100

def score_congestion(ttis, city):
    return (ttis.get(city, 0) / max(ttis.values())) * 100

def score_emissions(distance):
    # Emission factors in kgCO2e per passenger km
    hsr_emissions_factor = 0.0045
    air_travel_emissions_factor = 0.115
    hsr_emissions = distance * hsr_emissions_factor
    air_emissions = distance * air_travel_emissions_factor
    emissions_savings = air_emissions - hsr_emissions
    max_savings = air_travel_emissions_factor * 1000 - hsr_emissions_factor * 1000
    return (emissions_savings / max_savings) * 100

In [11]:
# Ensure the necessary dataframes are loaded
msa_merged_df = pd.merge(msa_df, geos_df, left_on='MainCity', right_on='MainCity')

# Create dictionaries for quick lookup of population and GDP
city_population = msa_merged_df.set_index('MainCity')['Population'].to_dict()
city_gdp = msa_merged_df.set_index('MainCity')['GDP_thousands_dollars'].to_dict()

# Define the function to score city pairs
def score_city_pairs(city_pairs):
    # Calculate scores for each city pair
    city_pairs['Population_Score_Origin'] = city_pairs['Origin'].map(lambda x: score_population(city_population.get(x, 0)))
    city_pairs['Population_Score_Destination'] = city_pairs['Destination'].map(lambda x: score_population(city_population.get(x, 0)))
    city_pairs['Distance_Score'] = city_pairs['Distance_miles'].map(score_distance)
    city_pairs['GDP_Score_Origin'] = city_pairs['Origin'].map(lambda x: score_gdp(city_gdp.get(x, 0)))
    city_pairs['GDP_Score_Destination'] = city_pairs['Destination'].map(lambda x: score_gdp(city_gdp.get(x, 0)))
    city_pairs['Congestion_Score_Origin'] = city_pairs['Origin'].map(lambda x: score_congestion(tti_data, x))
    city_pairs['Congestion_Score_Destination'] = city_pairs['Destination'].map(lambda x: score_congestion(tti_data, x))
    city_pairs['Emissions_Score'] = city_pairs['Distance_miles'].map(lambda x: score_emissions(x * 1.60934))  # Convert miles to km

    # Sum the scores to get a total score for each city pair
    city_pairs['Total_Score'] = city_pairs[['Population_Score_Origin', 'Population_Score_Destination', 
                                            'Distance_Score', 'GDP_Score_Origin', 'GDP_Score_Destination', 
                                            'Congestion_Score_Origin', 'Congestion_Score_Destination', 
                                            'Emissions_Score']].sum(axis=1)

    # Normalize the total score to a scale of 100
    city_pairs['Normalized_Score'] = (city_pairs['Total_Score'] / city_pairs['Total_Score'].max()) * 100

    return city_pairs

# Apply the scoring to the city pairs and filter based on distance
city_pairs = score_city_pairs(distance_df[['Origin', 'Destination', 'Distance_miles']].copy())
filtered_city_pairs = city_pairs[
    ~city_pairs['Origin'].str.contains(', AK|, HI') &
    ~city_pairs['Destination'].str.contains(', AK|, HI') &
    (city_pairs['Distance_miles'] <= 1000)
]

In [12]:
# Rank and display the top 50 city pairs
top_50_filtered_city_pairs = filtered_city_pairs.nlargest(50, 'Normalized_Score')
print(top_50_filtered_city_pairs[['Origin', 'Destination', 'Distance_miles', 
                                  'Population_Score_Origin', 'Population_Score_Destination', 
                                  'Distance_Score', 'GDP_Score_Origin', 
                                  'GDP_Score_Destination', 'Congestion_Score_Origin', 
                                  'Congestion_Score_Destination', 'Emissions_Score', 
                                  'Total_Score', 'Normalized_Score']])

                  Origin       Destination  Distance_miles  \
25629        Chicago, IL      New York, NY      789.945845   
93337       New York, NY       Chicago, IL      789.789260   
92838       New York, NY       Atlanta, GA      873.701064   
6339         Atlanta, GA      New York, NY      864.373664   
96326       New York, NY    Washington, DC      228.846590   
141429    Washington, DC      New York, NY      224.769153   
17859         Boston, MA      New York, NY      216.211632   
93132       New York, NY        Boston, MA      215.271497   
118229     St. Louis, MO      New York, NY      952.860622   
95722       New York, NY     St. Louis, MO      949.868721   
37179        Detroit, MI      New York, NY      614.315332   
93634       New York, NY       Detroit, MI      614.145698   
94337       New York, NY  Jacksonville, FL      940.045467   
64229   Jacksonville, FL      New York, NY      932.546140   
14039     Birmingham, AL      New York, NY      962.446513   
93036   

In [13]:
# Display the top 50 city pairs
print(top_50_filtered_city_pairs[['Origin', 'Destination',  
                         'Total_Score', 'Normalized_Score']])

                  Origin       Destination  Total_Score  Normalized_Score
25629        Chicago, IL      New York, NY   593.602926         56.288361
93337       New York, NY       Chicago, IL   593.593385         56.287456
92838       New York, NY       Atlanta, GA   565.977585         53.668790
6339         Atlanta, GA      New York, NY   565.409229         53.614896
96326       New York, NY    Washington, DC   555.785558         52.702332
141429    Washington, DC      New York, NY   555.129360         52.640108
17859         Boston, MA      New York, NY   540.505472         51.253398
93132       New York, NY        Boston, MA   540.354172         51.239051
118229     St. Louis, MO      New York, NY   530.759952         50.329280
95722       New York, NY     St. Louis, MO   530.577644         50.311993
37179        Detroit, MI      New York, NY   526.524071         49.927613
93634       New York, NY       Detroit, MI   526.513734         49.926633
94337       New York, NY  Jacksonville

In [10]:
# Filter city pairs to exclude distances greater than 1000 miles
filtered_city_pairs = city_pairs[city_pairs['Distance_miles'] <= 1000].copy()

# Apply the scoring functions to the filtered dataset
filtered_city_pairs['Population_Score_Origin'] = filtered_city_pairs['Origin'].map(lambda x: score_population(city_population.get(x, 0)))
filtered_city_pairs['Population_Score_Destination'] = filtered_city_pairs['Destination'].map(lambda x: score_population(city_population.get(x, 0)))
filtered_city_pairs['Distance_Score'] = filtered_city_pairs['Distance_miles'].map(score_distance)
filtered_city_pairs['GDP_Score_Origin'] = filtered_city_pairs['Origin'].map(lambda x: score_gdp(city_gdp.get(x, 0)))
filtered_city_pairs['GDP_Score_Destination'] = filtered_city_pairs['Destination'].map(lambda x: score_gdp(city_gdp.get(x, 0)))
filtered_city_pairs['Congestion_Score_Origin'] = filtered_city_pairs['Origin'].map(lambda x: score_congestion(tti_data, x))
filtered_city_pairs['Congestion_Score_Destination'] = filtered_city_pairs['Destination'].map(lambda x: score_congestion(tti_data, x))
filtered_city_pairs['Emissions_Score'] = filtered_city_pairs['Distance_miles'].map(lambda x: score_emissions(x * 1.60934))  # Convert miles to km

# Calculate total and normalized scores for filtered city pairs
filtered_city_pairs['Total_Score'] = filtered_city_pairs[['Population_Score_Origin', 'Population_Score_Destination', 
                                                          'Distance_Score', 'GDP_Score_Origin', 'GDP_Score_Destination', 
                                                          'Congestion_Score_Origin', 'Congestion_Score_Destination', 
                                                          'Emissions_Score']].sum(axis=1)

filtered_city_pairs['Normalized_Score'] = (filtered_city_pairs['Total_Score'] / filtered_city_pairs['Total_Score'].max()) * 100

# Rank the filtered city pairs by their normalized scores
top_50_filtered_city_pairs = filtered_city_pairs.nlargest(50, 'Normalized_Score')

# Display the top 50 city pairs
print(top_50_filtered_city_pairs[['Origin', 'Destination', 'Distance_miles', 
                                  'Population_Score_Origin', 'Population_Score_Destination', 
                                  'Distance_Score', 'GDP_Score_Origin', 
                                  'GDP_Score_Destination', 'Congestion_Score_Origin', 'Congestion_Score_Destination', 
                                  'Emissions_Score', 'Total_Score', 'Normalized_Score']])


                   Origin        Destination  Distance_miles  \
25629         Chicago, IL       New York, NY      789.945845   
93337        New York, NY        Chicago, IL      789.789260   
92838        New York, NY        Atlanta, GA      873.701064   
6339          Atlanta, GA       New York, NY      864.373664   
96326        New York, NY     Washington, DC      228.846590   
141429     Washington, DC       New York, NY      224.769153   
17859          Boston, MA       New York, NY      216.211632   
93132        New York, NY         Boston, MA      215.271497   
23632         Chicago, IL         Boston, MA      983.803034   
16057          Boston, MA        Chicago, IL      983.560078   
139627     Washington, DC        Chicago, IL      696.210166   
26826         Chicago, IL     Washington, DC      696.055445   
118229      St. Louis, MO       New York, NY      952.860622   
95722        New York, NY      St. Louis, MO      949.868721   
4537          Atlanta, GA        Chicago

In [17]:
city_pair_weights_df = pd.read_pickle('city_pair_weights_df.pickle')
msa_weights_df = pd.read_pickle('msa_weights_df.pickle')

print(city_pair_weights_df.head())
print(msa_weights_df.head())

         City1            City2  drive_distance  drive_duration  \
0  Abilene, TX        Akron, OH     1327.729086          1159.0   
1  Abilene, TX       Albany, GA      982.946785           910.0   
2  Abilene, TX       Albany, NY     1812.063858          1578.5   
3  Abilene, TX       Albany, OR     1912.769767          1746.5   
4  Abilene, TX  Albuquerque, NM      487.537939           448.5   

   flight_distance  flight_duration_ramp  flight_duration_air  \
0      1169.268938                   0.0                  0.0   
1       915.647527                   0.0                  0.0   
2      1581.642374                   0.0                  0.0   
3      1511.136717                   0.0                  0.0   
4       437.507433                   0.0                  0.0   

   flight_duration_total  num_passengers  total_seats  ...  \
0                    0.0             0.0          0.0  ...   
1                    0.0             0.0          0.0  ...   
2                   

In [18]:
print("\nCity Pair Weights DataFrame Info:")
print(city_pair_weights_df.info())

print("\nMSA Weights DataFrame Info:")
print(msa_weights_df.info())

print("\nCity Pair Weights DataFrame Description:")
print(city_pair_weights_df.describe())

print("\nMSA Weights DataFrame Description:")
print(msa_weights_df.describe())


City Pair Weights DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69378 entries, 0 to 69377
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   City1                       69378 non-null  object 
 1   City2                       69378 non-null  object 
 2   drive_distance              69378 non-null  float64
 3   drive_duration              69378 non-null  float64
 4   flight_distance             69378 non-null  float64
 5   flight_duration_ramp        69378 non-null  float64
 6   flight_duration_air         69378 non-null  float64
 7   flight_duration_total       69378 non-null  float64
 8   num_passengers              69378 non-null  float64
 9   total_seats                 69378 non-null  float64
 10  flight_frequency            69378 non-null  float64
 11  scheduled                   69378 non-null  float64
 12  num_carriers                69378 non-null  float64
 

In [5]:
# Assuming `city_pair_weights_df` contains the weights and `distance_df` contains the base data
# Merge the weights into the `city_pairs` DataFrame
city_pairs = pd.merge(distance_df[['Origin', 'Destination', 'Distance_miles']], 
                      city_pair_weights_df[['City1', 'City2', 'gaus_dist_weight', 'demand_weight']],
                      left_on=['Origin', 'Destination'], 
                      right_on=['City1', 'City2'],
                      how='left')

# Merge the MSA weights (if necessary)
city_pairs = pd.merge(city_pairs, 
                      msa_weights_df[['MetroArea', 'demand_score', 'proximity_score', 'cluster_id']],
                      left_on='Origin', 
                      right_on='MetroArea',
                      how='left')
city_pairs = pd.merge(city_pairs, 
                      msa_weights_df[['MetroArea', 'demand_score', 'proximity_score', 'cluster_id']],
                      left_on='Destination', 
                      right_on='MetroArea',
                      how='left',
                      suffixes=('_Origin', '_Destination'))

# Now proceed with scoring using the `score_city_pairs_updated` function


NameError: name 'city_pair_weights_df' is not defined

In [None]:
# Define the function to score city pairs with updated criteria
def score_city_pairs_updated(city_pairs):
    # Calculate scores for each city pair
    city_pairs['Distance_Score'] = city_pairs['Distance_miles'].map(score_distance)
    city_pairs['GDP_Score_Origin'] = city_pairs['Origin'].map(lambda x: score_gdp(city_gdp.get(x, 0)))
    city_pairs['GDP_Score_Destination'] = city_pairs['Destination'].map(lambda x: score_gdp(city_gdp.get(x, 0)))
    city_pairs['Congestion_Score_Origin'] = city_pairs['Origin'].map(lambda x: score_congestion(tti_data, x))
    city_pairs['Congestion_Score_Destination'] = city_pairs['Destination'].map(lambda x: score_congestion(tti_data, x))
    city_pairs['Emissions_Score'] = city_pairs['Distance_miles'].map(lambda x: score_emissions(x * 1.60934))  # Convert miles to km
    
    # Include the additional weights in the scoring
    city_pairs['Gaussian_Score'] = city_pairs['gaus_dist_weight']
    city_pairs['Demand_Score'] = city_pairs['demand_weight'] + city_pairs['demand_score_Origin'] + city_pairs['demand_score_Destination']
    city_pairs['Proximity_Score'] = city_pairs['proximity_score_Origin'] + city_pairs['proximity_score_Destination']
    
    # Add cluster-based scoring (more weight towards cluster alignment)
    city_pairs['Cluster_Score'] = np.where(city_pairs['cluster_id_Origin'] == city_pairs['cluster_id_Destination'], 15, 7)
    city_pairs['Cluster_Score'] = np.where(city_pairs['Demand_Score'] > 5, 15, city_pairs['Cluster_Score'])
    
    # Sum the scores to get a total score for each city pair with adjusted weights
    city_pairs['Total_Score'] = (
        0.4 * (city_pairs['Distance_Score']) +  # Reduced weight on distance
        0.8 * (city_pairs['GDP_Score_Origin'] + city_pairs['GDP_Score_Destination']) + 
        1.2 * (city_pairs['Congestion_Score_Origin'] + city_pairs['Congestion_Score_Destination']) + 
        1.5 * city_pairs['Emissions_Score'] + 
        1.8 * city_pairs['Gaussian_Score'] + 
        1.8 * city_pairs['Demand_Score'] + 
        1.5 * city_pairs['Proximity_Score'] + 
        2.0 * city_pairs['Cluster_Score']
    )

    # Normalize the total score to a scale of 100
    max_score = city_pairs['Total_Score'].max()
    city_pairs['Normalized_Score'] = (city_pairs['Total_Score'] / max_score) * 100

    return city_pairs

# Apply the updated scoring to the city pairs and filter based on distance
city_pairs = score_city_pairs_updated(city_pairs)
filtered_city_pairs = city_pairs[
    ~city_pairs['Origin'].str.contains(', AK|, HI') &
    ~city_pairs['Destination'].str.contains(', AK|, HI') &
    (city_pairs['Distance_miles'] <= 1000)
]

# Rank the filtered city pairs by their normalized scores
top_50_filtered_city_pairs = filtered_city_pairs.nlargest(50, 'Normalized_Score')

# Display the top 50 city pairs
print(top_50_filtered_city_pairs[['Origin', 'Destination', 'Distance_miles', 
                                  'Distance_Score', 'GDP_Score_Origin', 
                                  'GDP_Score_Destination', 'Congestion_Score_Origin', 'Congestion_Score_Destination', 
                                  'Emissions_Score', 'Gaussian_Score', 'Demand_Score', 
                                  'Proximity_Score', 'Cluster_Score', 'Total_Score', 'Normalized_Score']])
