In [6]:
"""
clements.ipynb
part 1: Avocado prices
part 2: Sea surface temperatures in departure bay
"""

# Import libraries
import altair as alt
import pandas as pd
import vegafusion as vf

# Define file paths
avocado_csv = 'data/avocado_prices.csv'
temp_csv = 'data/departure_bay_temperature.csv'



In [14]:
# Part 1: Avocado Prices

"""
1. Load Dataset
"""
# Read dataset from CSV file into Pandas DF
df_avocado = pd.read_csv(avocado_csv)
# Print first few rows of dataset
print("Avocado Prices CSV file:")
print(df_avocado)


"""
2. Visulaization
"""
# Enable vegafusion to optimize Altair charts for large datasets
alt.data_transformers.enable('vegafusion')

# Create scatter plot of average price over time
scatter = alt.Chart(df_avocado).mark_point().encode(
    x=alt.X('Date', title="Date"),
    y=alt.Y('average_price', title="Average Price ($)")
).properties(
    title='Average Prices vs. Date'
)

scatter # Display chart



Avocado Prices CSV file:
             Date  average_price  small_hass_volume  large_hass_volume  \
0      2015-12-27           1.33            1036.74           54454.85   
1      2015-12-20           1.35             674.28           44638.81   
2      2015-12-13           0.93             794.70          109149.67   
3      2015-12-06           1.08            1132.00           71976.41   
4      2015-11-29           1.28             941.48           43838.39   
...           ...            ...                ...                ...   
17906  2018-02-04           1.63            2046.96            1529.20   
17907  2018-01-28           1.71            1191.70            3431.50   
17908  2018-01-21           1.87            1191.92            2452.79   
17909  2018-01-14           1.93            1527.63            2981.04   
17910  2018-01-07           1.62            2894.77            2356.13   

       extra_l_hass_volume          type    yr            region  wk  
0              

In [8]:
"""
3. Aggregate data by week
"""
# Group avo data by 'wk' and calc mean for numeric columns
df_weekly = df_avocado.groupby("wk", as_index=False).mean(numeric_only=True)

# print first few rows of aggregated dataset
print("Aggregated data by week:")
print(df_weekly.head())


Aggregated data by week:
   wk  average_price  small_hass_volume  large_hass_volume  \
0   1       1.286887      191342.390307      218561.677642   
1   2       1.330519      180736.856368      205410.353231   
2   3       1.341415      186900.340259      210345.864906   
3   4       1.315047      183383.858160      206762.362052   
4   5       1.253608      250123.511250      258856.288561   

   extra_l_hass_volume      yr  
0         14571.720660  2016.5  
1         13433.732241  2016.5  
2         14890.573939  2016.5  
3         14990.596557  2016.5  
4         19160.686274  2016.5  


In [35]:
"""
4. Plot Aggregated Data"
"""
# Create scatter plot to visualize weekly avg price of avos
scatter_weekly = alt.Chart(df_weekly).mark_point().encode(
    x=alt.X('wk', title="Week"),
    y=alt.Y('average_price', title="Average Price ($)")
            .title("Year")
            .scale(zero=False)
).properties(
    title='Average Prices vs. Date'
)

scatter_weekly.show() # Display chart. I had to force with .show() because chart was not rendering and displaying without it



In [19]:
"""
5. Compute the total volume per week
"""
# Calculate total volume of avos sold per entry by adding all size categories
df_avocado['total_volume'] = (
    df_avocado['small_hass_volume'] + df_avocado['large_hass_volume'] + df_avocado['extra_l_hass_volume'] 
)

# Print updated DF to verify total_volume column has been added
print("Updated Avocado DataFrame with total_volume:")
print(df_avocado)

Updated Avocado DataFrame with total_volume:
             Date  average_price  small_hass_volume  large_hass_volume  \
0      2015-12-27           1.33            1036.74           54454.85   
1      2015-12-20           1.35             674.28           44638.81   
2      2015-12-13           0.93             794.70          109149.67   
3      2015-12-06           1.08            1132.00           71976.41   
4      2015-11-29           1.28             941.48           43838.39   
...           ...            ...                ...                ...   
17906  2018-02-04           1.63            2046.96            1529.20   
17907  2018-01-28           1.71            1191.70            3431.50   
17908  2018-01-21           1.87            1191.92            2452.79   
17909  2018-01-14           1.93            1527.63            2981.04   
17910  2018-01-07           1.62            2894.77            2356.13   

       extra_l_hass_volume          type    yr            region  

In [23]:
"""
6. Create another reduced/aggregated version
"""
# Group dataset by 'wk' and calc mean for each numeric column. The way I previously implemented did not work so had to use this method
avocado_aggregate_2 = df_avocado.groupby("wk", as_index=False).agg({
    'average_price': 'mean',
    'small_hass_volume': 'mean',
    'large_hass_volume': 'mean',
    'extra_l_hass_volume': 'mean',
    'total_volume': 'mean'
})

# Print first few rows of aggregated dataset to verify results
print("Aggregated Avocado DataFrame (avocado_aggregate_2):")
print(avocado_aggregate_2.head())

Aggregated Avocado DataFrame (avocado_aggregate_2):
   wk  average_price  small_hass_volume  large_hass_volume  \
0   1       1.286887      191342.390307      218561.677642   
1   2       1.330519      180736.856368      205410.353231   
2   3       1.341415      186900.340259      210345.864906   
3   4       1.315047      183383.858160      206762.362052   
4   5       1.253608      250123.511250      258856.288561   

   extra_l_hass_volume   total_volume  
0         14571.720660  424475.788608  
1         13433.732241  399580.941840  
2         14890.573939  412136.779104  
3         14990.596557  405136.816769  
4         19160.686274  528140.486085  


In [36]:
"""
7. Create another scatter plot
"""
# Create scatter plot to visualize avg total avo volume by week
scatter_aggregate_2 = alt.Chart(avocado_aggregate_2).mark_point().encode(
    x=alt.X('wk', title='Week'),
    y=alt.Y('total_volume')
            .title("Total Volume (lbs)")
            .scale(zero=False)
).properties(
    title="Average Total Volume of Avocados by Week"
)

scatter_aggregate_2 # Display chart

In [25]:
# Part 2: Sea surface temperatures in departure bay

"""
1. Load Dataset
"""
# Load sea surface temp dataset while skipping first two metadata rows
df_temp = pd.read_csv(temp_csv, skiprows=2) #skip first two rows without data

# Print first few rows of dataset to verify it loaded correctly
print("Sea Surface Temperatures in Departure Bay")
print(df_temp)

Sea Surface Temperatures in Departure Bay
     Year  Jan  Feb  Mar  Apr   May   Jun   Jul   Aug   Sep   Oct  Nov  Dec
0    1914  7.2  NaN  NaN  NaN   NaN   NaN   NaN   NaN  11.1  10.0  7.3  6.3
1    1915  5.6  6.6  7.5  9.0   9.9  12.5  14.7  15.8  14.0   8.2  4.4  4.1
2    1916  1.2  0.1  3.5  6.5   8.0  12.0  13.1  14.0  11.4   7.6  5.4  3.5
3    1917  3.8  2.8  4.4  5.4   8.3  11.0  13.7  12.2  10.0   8.6  7.0  4.9
4    1918  3.7  3.9  4.6  6.0   9.3  11.2  13.1  14.5  13.8   9.1  6.7  5.0
..    ...  ...  ...  ...  ...   ...   ...   ...   ...   ...   ...  ...  ...
100  2014  4.4  3.1  3.7  7.7  10.1  12.0  15.9  16.8  11.7  10.6  7.1  7.1
101  2015  6.3  8.0  8.0  8.9  11.0  15.5  13.8  13.4  11.6  11.3  8.1  6.8
102  2016  6.0  7.1  8.4  9.8  13.0  14.2  14.6  14.6  12.6  10.8  8.2  5.5
103  2017  5.6  4.8  7.1  7.9  10.5  12.4  15.3  15.3  13.1  10.2  8.8  6.9
104  2018  6.2  6.0  7.1  8.2   NaN   NaN   NaN   NaN   NaN   NaN  NaN  NaN

[105 rows x 13 columns]


In [26]:
"""
2. Make Tidy
"""
# Reshape dataset from wide format to long format using melt fxn
tidy_temp = df_temp.melt(id_vars=["Year"], var_name="Month", value_name='Temperature')

# Print the first few rows of the tidy dataset to verify the transformation
print("Data in tidy format:")
print(tidy_temp)

Data in tidy format:
      Year Month  Temperature
0     1914   Jan          7.2
1     1915   Jan          5.6
2     1916   Jan          1.2
3     1917   Jan          3.8
4     1918   Jan          3.7
...    ...   ...          ...
1255  2014   Dec          7.1
1256  2015   Dec          6.8
1257  2016   Dec          5.5
1258  2017   Dec          6.9
1259  2018   Dec          NaN

[1260 rows x 3 columns]


In [33]:
"""
3. Plot to compare
"""
# Filter for November data only
november_temp = tidy_temp[tidy_temp["Month"] == "Nov"]

# Create November scatter plot for visualization
scatter_november = alt.Chart(november_temp).mark_point().encode(
    x=alt.X("Year")
            .title("Year")
            .scale(zero=False),
    y=alt.Y("Temperature")
            .title("Temperature (Celsius)")
            .scale(zero=False)
).properties(
    title="November Sea Surface Temperature Over Time"
)

scatter_november # Display chart

In [31]:
"""
4. Faceted Visualization
"""
# Create faceted scatter plot to visualize temperature trends for each month
all_temp_plot = alt.Chart(tidy_temp).mark_point().encode(
    x=alt.X("Year")
            .title("Year")
            .scale(zero=False),
    y=alt.Y("Temperature")
            .title("Temperature (Celsius)")
            .scale(zero=False)
).facet(
    'Month',
    columns=4
)

all_temp_plot # Display faceted plot