In [1]:
import plotly.express as px
import json
import pandas as pd
import dash
import os
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
import pydeck as pdk



In [2]:
#load the data from the sentiment_full_updated.xlsx file and load it into a dataframe

series_type = type(pd.Series([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]))

dtype_dict = {
    'date': str,
    'Hiker trail name': str,
    'Hiker Journal Link': str,
    'Journal Story': str,
    'Start location': str,
    'Destination': str,
    'Today Miles': str,
    'Latitude': str,
    'Longitude': str,
    'State': str,
    'Total Shelters': str,
    'Occurrence': str,
    'year': str,
    'month': str,
    'label': str,
    # pandas serial object for type of emotion score
    'Unnamed: 0': str


}


df = pd.read_excel('../sentiment_full_updated_with_coords.xlsx', sheet_name='Sheet1', dtype=dtype_dict)

print(f'length of df: {len(df)}')

length of df: 33307


In [3]:
print(f'columsn in df are {df.columns}')
print(f'number of columns in df is {len(df.columns)}')



columsn in df are Index(['Unnamed: 0', 'date', 'Hiker trail name', 'Hiker Journal Link',
       'Journal Story', 'Start location', 'Destination', 'Today Miles',
       'Latitude', 'Longitude', 'State', 'Total Shelters', 'Occurrence',
       'year', 'month', 'label', 'Emotion_scores'],
      dtype='object')
number of columns in df is 17


In [4]:
# clean the data frame by dropping the column called 'Unnamed: 0'
df = df.drop(columns=['Unnamed: 0'])
print(f'columsn in df are {df.columns}')
print(f'number of columns in df is {len(df.columns)}')

columsn in df are Index(['date', 'Hiker trail name', 'Hiker Journal Link', 'Journal Story',
       'Start location', 'Destination', 'Today Miles', 'Latitude', 'Longitude',
       'State', 'Total Shelters', 'Occurrence', 'year', 'month', 'label',
       'Emotion_scores'],
      dtype='object')
number of columns in df is 16


In [5]:
#print the data type of each column
print(df.dtypes)
print(df.head()["Hiker trail name"].values[0])

date                  object
Hiker trail name      object
Hiker Journal Link    object
Journal Story         object
Start location        object
Destination           object
Today Miles           object
Latitude              object
Longitude             object
State                 object
Total Shelters        object
Occurrence            object
year                  object
month                 object
label                 object
Emotion_scores        object
dtype: object
Blue


In [6]:
# clean the data frame by dropping the specifiying the data type of each column


# print out a random sample of each of the columns alongside its data type
for col in df.columns:
    print(f'column {col} has data type {type(df[col].values[0])}')
    temp_sample = df[col].sample(1)
    print(temp_sample.values[0])
    print('')

# cast all elemnts in Start location to string
df['Start location'] = df['Start location'].astype(str)

column date has data type <class 'str'>
1969-12-31 19:00:43.311000

column Hiker trail name has data type <class 'str'>
Butch and Sundance

column Hiker Journal Link has data type <class 'str'>
https://www.trailjournals.com//journal/entry/591693

column Journal Story has data type <class 'str'>
Rained hard most of night, but eased early.  I packed up at 6:30 and was first out of camp.  Tread-way continued to be excellent, although muddy and slippery in many places due to prior evening's rain.  The day could be broken down in thirds.  First, we had tough climbs into and out of Horse Gap and Cooper Gap.  The middle third was pleasant rolling trail with a rest/lunch/water filtration stop at Justus Creek.  The last third was sociable finished off by a brutal climb up to Rampart Mountain (followed by new friends enjoying sunset together). [Pack felt heavy second half of day.] [Checked in with Mighty Blue for podcast.]
(Friends: Pritchett, Haillie, Livvy, Scars, El, Vapor (blind), Boss, and 

In [7]:
# grab a row called test_row_with_nan_coords from this link https://www.trailjournals.com//journal/entry/518909
sample = df[df['Hiker Journal Link'] == 'https://www.trailjournals.com//journal/entry/518909']

# print out the sample with the format column names: values
for column in sample.columns:
    print("-------------------------")
    print(column + ': ' + str(sample[column].values[0]))
    print("")
    print(f'type of {column} is {type(sample[column].values[0])}')

    

-------------------------
date: 1969-12-31 19:00:42.370000

type of date is <class 'str'>
-------------------------
Hiker trail name: Blue

type of Hiker trail name is <class 'str'>
-------------------------
Hiker Journal Link: https://www.trailjournals.com//journal/entry/518909

type of Hiker Journal Link is <class 'str'>
-------------------------
Journal Story: I've been saving my first post for years.I had contemplated a thru-hike in 2010, belatedly realizing after much planning and emotional investment that the timing wasn't right. I was left feeling very bummed, thinking I had somehow missed my window. The shakedown hikes I had done convinced me that my dream was real, that I indeed could see myself on the trail for six months....thus making it all the more worse when I called off the hunt.
In 2014, my wife and I took the family to Patagonia. In preparing for that trip, my thru-hiking spark was relit and I quickly realized that 2016, devoid of significant family events, was the pe

In [8]:
# print(df.head()['Emotion_scores'].values[0][3])
# print(df.head()['Emotion_scores'].values[0])
# new_emo = eval(df.head()['Emotion_scores'].values[0])
# print(type(new_emo))

emotion_nan_counter = 0
# loop over all the values in the Emotion_scores column and convert them to dictionaries
for i in range(len(df['Emotion_scores'].values)):
    super_value = df['Emotion_scores'].values[i]
    # print(f'type of super_value is {type(super_value)}')
    # print(f'super_value is {super_value}')
    if type(super_value) != type("string"):
        emotion_nan_counter += 1
        continue
    value = super_value[0]
    # print(value)
    # print(f'type of value is {type(value)}')

print(f'number of nan values in Emotion_scores is {emotion_nan_counter}')
print(f'number of non-nan values in Emotion_scores is {len(df["Emotion_scores"].values) - emotion_nan_counter}')
print(f'total number of values in Emotion_scores is {len(df["Emotion_scores"].values)}')

number of nan values in Emotion_scores is 7558
number of non-nan values in Emotion_scores is 25749
total number of values in Emotion_scores is 33307


In [9]:
# drop any rows that have do not have objects of type string in the Emotion_scores column in their df['Emotion_scores'].values[i]
df = df[df['Emotion_scores'].apply(lambda x: type(x) == type("string"))]

print(f'number of rows in df is {len(df)}')
# using eval to convert the string to a dictionary do that on the entire colum of Emotion_scores

df['Emotion_scores'] = df['Emotion_scores'].apply(lambda x: eval(x))



# print the type of the first element in the Emotion_scores column
print(type(df['Emotion_scores'].values[0]))





number of rows in df is 25749
<class 'list'>


In [10]:
new_thing = df.head(1)['Emotion_scores'][0][0]['score']
print(type(new_thing))
print(new_thing)

<class 'float'>
0.0031451925169676542


In [11]:
sample_new_lat = sample['Latitude'].values[0]
print(sample_new_lat)
print(type(sample_new_lat))

sample = df[df['Hiker Journal Link'] == 'https://www.trailjournals.com//journal/entry/519033']
sample_new_lat = sample['Latitude'].values[0]
print(sample_new_lat)
print(type(sample_new_lat))


nan
<class 'float'>
34.65569448
<class 'str'>


In [12]:
# print the values of the latitude column in the df if the type of the value is not a float
count = 0
for i in range(len(df['Latitude'].values)):
    value = df['Latitude'].values[i]
    if type(value) != type('string'):
        count += 1
        # print(value)
        # print(type(value))
print(f'number of float values in Latitude is {count}')
print(f'number of non-float values in Latitude is {len(df["Latitude"].values) - count}')
print(f'total number of values in Latitude is {len(df["Latitude"].values)}')

number of float values in Latitude is 13561
number of non-float values in Latitude is 12188
total number of values in Latitude is 25749


In [13]:
# loop over every row in the df and print the value of the lat and long columns
import math

woah_count = 0
not_woah_count = 0
invalid_coords = []
for i in range(len(df['Latitude'].values)):
    lat = df['Latitude'].values[i]
    long = df['Longitude'].values[i]
    if type(lat) != type('string'):
        if math.isnan(lat) or math.isnan(long):
            woah_count += 1
            
            invalid_coords.append(i)
            continue
            # print("woah")
        not_woah_count += 1
    # print(f'lat is {lat} and long is {long}')
    # print(f'type of lat is {type(lat)} and type of long is {type(long)}')
    # print('')

print(f'number of nan values in Latitude is {woah_count}')
print(f'not nan values in Latitude is {not_woah_count}')
print(f'len of invalid_coords is {len(invalid_coords)}')
invalid_coords.sort(reverse=True)

number of nan values in Latitude is 13561
not nan values in Latitude is 0
len of invalid_coords is 13561


In [14]:
from thefuzz import fuzz
from thefuzz import process

exclude_words = [
    "shelter",
    "shelters",
    "SHELTER",
    "sheltered",
    "Shelter",
    "Shelters",
    "SHELTERED",
    "SHELTERS",
]

def custom_scorer(s1, s2):
    s1_tokens = [token for token in s1.split() if token not in exclude_words]
    s2_tokens = [token for token in s2.split() if token not in exclude_words]
    return fuzz.token_set_ratio(s1_tokens, s2_tokens)

In [15]:
# open the weather data file and load it into a dictionary
with open('../weather_data.json') as json_file:
    weather_data = json.load(json_file)
print(f'type of weather_data is {type(weather_data)}')
print(f'len of weather_data is {len(weather_data)}')

type of weather_data is <class 'dict'>
len of weather_data is 39025


In [16]:
found_in_weather_data = 0
bad_weather_data = 0
not_found = 0
# get first key in weather_data
# first_key = list(weather_data.keys())[0]
# print(f'first key in weather_data is {first_key}')
# weather_data_dict = weather_data[first_key]
# print(f'weather_data_dict is {weather_data_dict}')
# print(f'type of weather_data_dict is {type(weather_data_dict)}')
# print(f'leys in weather_data_dict are {weather_data_dict.keys()}')

for idx in invalid_coords:
    # grab 3 values from the df, the journal link, the latitude, and the longitude
    journal_link = df['Hiker Journal Link'].values[idx]
    lat = df['Latitude'].values[idx]
    long = df['Longitude'].values[idx]
    if journal_link in weather_data:
        curr_dict = weather_data[journal_link]
        curr_dict_keys = curr_dict.keys()
        if 'cod' in curr_dict_keys:
            bad_weather_data += 1
        else:
            # replace the "Latitude" and "Longitude" in the df with the values in the curr dict from keys with "lat" and "lon" keys
            df['Latitude'].values[idx] = str(curr_dict['lat'])
            df['Longitude'].values[idx] = str(curr_dict['lon'])
            # print(f'found new lat {curr_dict["lat"]} in weather data')
            # print(f'type of "Latitude" is {type(df["Latitude"].values[idx])}')
            found_in_weather_data += 1
    else:
        not_found += 1


print(f'amount found in weather data is {found_in_weather_data}')
print(f'searched every invalid coord: {found_in_weather_data + bad_weather_data + not_found == len(invalid_coords)}')
print(f'difference between invalid_coords and found_in_weather_data is {len(invalid_coords) - (found_in_weather_data + bad_weather_data + not_found)}')


amount found in weather data is 2254
searched every invalid coord: True
difference between invalid_coords and found_in_weather_data is 0


In [17]:
# drop all the rows that have values in Latitude that are not type string
len_before_drop = len(df)
df = df[df['Latitude'].apply(lambda x: type(x) == type("string"))]
len_after_drop = len(df)
print(f'len_before_drop is {len_before_drop}')
print(f'len_after_drop is {len_after_drop}')
print(f'difference between len_before_drop and len_after_drop is {len_before_drop - len_after_drop}')

len_before_drop is 25749
len_after_drop is 14442
difference between len_before_drop and len_after_drop is 11307


In [18]:
print(f'amount added to the df is {found_in_weather_data}')
print(f'amount not found in weather data is {woah_count - found_in_weather_data}')

amount added to the df is 2254
amount not found in weather data is 11307


In [19]:
# print the unique values in the year column
print(f'unique values in year column are {df["year"].unique()}')
len_before_drop_year = len(df)
df = df[df['year'].apply(lambda x: type(x) == type("string"))]
len_after_drop_year = len(df)
df = df[df['year'].apply(lambda x: len(x) == len("2016"))]
len_after_drop_bad_year = len(df)
print(f'len_before_drop_year is {len_before_drop_year}')
print(f'len_after_drop_year is {len_after_drop_year}')
print(f'difference between len_before_drop_year and len_after_drop_year is {len_before_drop_year - len_after_drop_year}')
print(f'amount of bad year values is {len_after_drop_year - len_after_drop_bad_year}')
print(f'unique values in year column after drop are {df["year"].unique()}')

unique values in year column are ['2016' '2017' '2018' '2019' "701048'" '2020' '2021' '2022' '2023' nan]
len_before_drop_year is 14442
len_after_drop_year is 14437
difference between len_before_drop_year and len_after_drop_year is 5
amount of bad year values is 1
unique values in year column after drop are ['2016' '2017' '2018' '2019' '2020' '2021' '2022' '2023']


In [20]:
print(f'unique values in month column {df["month"].unique()}')

unique values in month column ['1' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' '12']


In [21]:
#helper methods
def count_entries_within_radius(
    target_lon, target_lat, radius_miles, currYear, currMonth
):
    # Convert miles to degrees (roughly, considering Earth's radius)
    degrees_per_mile = 1 / 69  # Approximately
    radius_deg = radius_miles * degrees_per_mile

    # Calculate latitude and longitude bounds for the square around the target point
    min_lon = target_lon - radius_deg
    max_lon = target_lon + radius_deg
    min_lat = target_lat - radius_deg
    max_lat = target_lat + radius_deg

    # Filter the DataFrame to get entries within the specified latitude and longitude bounds
    filtered_df = df[
        (df["lon"] >= min_lon)
        & (df["lon"] <= max_lon)
        & (df["lat"] >= min_lat)
        & (df["lat"] <= max_lat)
        & (df["year"] == currYear)
        & (df["month"] == currMonth)
    ]

    # Get the count of entries within the radius
    count_within_radius = len(filtered_df)

    return count_within_radius


def monthToNum(shortMonth):
    return {
        "Jan": 1,
        "Feb": 2,
        "Mar": 3,
        "Apr": 4,
        "May": 5,
        "Jun": 6,
        "Jul": 7,
        "Aug": 8,
        "Sep": 9,
        "Oct": 10,
        "Nov": 11,
        "Dec": 12,
    }[shortMonth]


def numTofullMonthName(num):
    return {
        1: "January",
        2: "Febuary",
        3: "March",
        4: "April",
        5: "May",
        6: "June",
        7: "July",
        8: "Augest",
        9: "September",
        10: "October",
        11: "November",
        12: "December",
    }[num]

In [22]:
# cast the year column to type int
df['year'] = df['year'].astype(int)
# cast the month column to type int
df['month'] = df['month'].astype(int)

# cast the lat and long columns to type float
df['Latitude'] = df['Latitude'].astype(float)
df['Longitude'] = df['Longitude'].astype(float)

In [23]:
# print the count of the number of entries in the df per year
print(df['year'].value_counts())

year
2016    3024
2017    2730
2019    2244
2022    2210
2021    1619
2018    1612
2020     531
2023     466
Name: count, dtype: int64


In [24]:
# use plotly and the us-terrain map box to plot all of the data just on a graph so we can see where all the data points are
# Plotting the map

# filter df for the year 2017
filtered_df = df[df['year'] == 2017]
fig = px.scatter_geo(filtered_df, 
                     lat='Latitude', 
                     lon='Longitude',
                     scope='usa',  # Set the map scope to USA
                     title='Points on Map of USA',
                     centXer=dict(lat=41.90722, lon=-70.0369))


# show the fig
fig.show()

TypeError: scatter_geo() got an unexpected keyword argument 'centXer'

In [None]:
# grab the emotion scores for the first row in the df
emotion_scores = df['Emotion_scores'].values[0]
# pretty prtnt the emotion scores
print(json.dumps(emotion_scores, indent=4, sort_keys=True))

In [None]:
# print the columns of the df
print(df.columns)

# print the unique values in the label column
print(df['label'].unique())

In [None]:
def labelToInt(label):
    return {
        "surprise": 2,
        "joy": 1,
        "neutral": 0,
        "sadness": -1,
        "anger": -2,
        "disgust": -3,
        "fear": -4,

    }[label]

In [None]:
# add a new column to the df called label_int that is the label column but with the values converted to ints
df['label_int'] = df['label'].apply(lambda x: labelToInt(x))

# print out the counts of the label_int column
print(df['label_int'].value_counts())

In [None]:
fig_2=px.density_mapbox(
        df,
        lat="Latitude",
        lon="Longitude",
        z="label_int",
        radius=2,  # Adjust the radius as needed
        center=dict(lat=41.90722, lon=-70.0369),  # Center of the United States
        zoom=4,  # Adjust the zoom level as needed
        mapbox_style="stamen-terrain",  # You can choose a different map style
        title=f"Temperature Heatmap for everything",
        color_continuous_scale="Inferno",  # Adjust the color scale
        range_color=[-4, 2],  # Set the color scale range
        hover_data={
            "Latitude": False,
            "Longitude": False,
            "label_int": True,
        },  # Include temperature in hover data
    )

fig_2.show()

In [None]:
avg_df = df.groupby(['Latitude', 'Longitude'], as_index=False)['label_int'].mean()

# drop all the rows that have label_int == 0
avg_df = avg_df[avg_df['label_int'] != 0]

# Plotting the density map
fig_3 = px.density_mapbox(
    avg_df,  # Use the grouped DataFrame
    lat="Latitude",
    lon="Longitude",
    z="label_int",
    radius=6,
    center=dict(lat=avg_df['Latitude'].mean(), lon=avg_df['Longitude'].mean()),
    zoom=4,
    mapbox_style="stamen-terrain",
    title="Average Emotion on the Trail",
    color_continuous_scale="Inferno",
    range_color=[avg_df['label_int'].min(), avg_df['label_int'].max()],
    hover_data={
        "Latitude": False,
        "Longitude": False,
        "label_int": True,
    }
)

fig_3.show()