# Question 4: Data Visualization

CS 5304 - Data Science in the Wild, Assignment 2

**Author**: Yufan Zhang (yz2894)


In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

In [2]:
# Load the raw observations, then in one pipeline:
#   - parse the "time" strings into datetimes (the column keeps its position)
#   - append "Ftemp", the Kelvin reading converted to degrees Fahrenheit
weather_df = pd.read_csv('data/weather.csv').assign(
    time=lambda frame: pd.to_datetime(frame["time"]),
    Ftemp=lambda frame: (frame["Ktemp"] - 273.15) * 9/5 + 32,
)

# Preview the first rows
weather_df.head()

Unnamed: 0,time,longitude,latitude,Ktemp,Ftemp
0,1950-01-01 09:00:00,286,40.75,274.39734,34.245212
1,1950-01-02 09:00:00,286,40.75,277.07593,39.066674
2,1950-01-03 09:00:00,286,40.75,280.8009,45.77162
3,1950-01-04 09:00:00,286,40.75,287.1401,57.18218
4,1950-01-05 09:00:00,286,40.75,285.65262,54.504716


## Part A

In [3]:
# Split each timestamp into calendar month and year columns
timestamp_parts = weather_df['time'].dt
weather_df['month'] = timestamp_parts.month
weather_df['year'] = timestamp_parts.year

# Mean Fahrenheit temperature for every (year, month) pair, flattened back
# into ordinary columns
monthly_avg_temp = (
    weather_df
    .groupby(['year', 'month'])['Ftemp']
    .mean()
    .reset_index()
)

# Summary statistics of the monthly means
print(monthly_avg_temp['Ftemp'].describe())

monthly_avg_temp

count    864.000000
mean      53.603423
std       16.155813
min       20.699340
25%       38.853501
50%       53.466899
75%       69.194660
max       80.360747
Name: Ftemp, dtype: float64


Unnamed: 0,year,month,Ftemp
0,1950,1,39.994274
1,1950,2,31.838770
2,1950,3,35.444411
3,1950,4,47.381065
4,1950,5,57.668071
...,...,...,...
859,2021,8,76.812806
860,2021,9,70.573598
861,2021,10,61.972971
862,2021,11,45.969692


In [11]:
# Tick positions (month numbers 1-12) and the names shown in their place
month_numbers = list(range(1, 13))
month_abbreviations = [
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]
axis_labels = {"Ftemp": "Average Temperature (°F)", "month": "Month", "year": "Year"}

# Monthly averages as a line, animated with one frame per year
fig = px.line(
    monthly_avg_temp,
    x="month",
    y="Ftemp",
    animation_frame="year",
    labels=axis_labels,
    title="Average Monthly Temperature Near Cornell Tech",
)

# Replace the numeric month ticks with month names
fig.update_xaxes(tickvals=month_numbers, ticktext=month_abbreviations)

# Use one fixed y-axis range instead of letting the axis autoscale
fig.update_yaxes(range=[20, 81])

fig.update_layout(width=800, height=500)

# Export a static PNG and an interactive HTML copy, then render inline
fig.write_image("plots/partA.png")
fig.write_html("plots/partA.html")
fig.show()

To view the interactive plot, please visit the following link: [yufanbruce.com/dsw/posts/a2](https://yufanbruce.com/dsw/posts/a2/#Question4).

## Part B

In [5]:
# Group by year to calculate average annual temperature
annual_avg_temp = weather_df.groupby("year")["Ftemp"].mean().reset_index()

# Years whose average temperature exceeds 55 degrees Fahrenheit.
# Selecting only the "year" column keeps its integer dtype instead of pulling
# a mixed int/float row and converting the year back from a float.
years_over_55 = annual_avg_temp.loc[annual_avg_temp["Ftemp"] > 55, "year"]

# Fail with an explicit message rather than an opaque IndexError when the
# threshold is never reached.
if years_over_55.empty:
    raise ValueError("No year has an average temperature above 55°F")

# "First" means the earliest such year; min() makes that independent of the
# row order of annual_avg_temp.
first_year_over_55 = int(years_over_55.min())

first_year_over_55

1953

In [10]:
# Yearly mean temperature as a single line
fig = px.line(
    annual_avg_temp,
    x="year",
    y="Ftemp",
    title="Average Annual Temperature Near Cornell Tech",
    labels={"Ftemp": "Average Temperature (°F)", "year": "Year"},
)

# Dotted horizontal reference line at the 55°F threshold
threshold_line = dict(
    y=55,
    line_dash="dot",
    annotation_text="55°F",
    annotation_position="bottom right",
)
fig.add_hline(**threshold_line)

# Dotted vertical reference line at the first year above the threshold
first_year_line = dict(
    x=first_year_over_55,
    line_dash="dot",
    annotation_text=f"{first_year_over_55}",
    annotation_position="top right",
)
fig.add_vline(**first_year_line)

fig.update_layout(width=1000, height=500)

# Export a static PNG and an interactive HTML copy, then render inline
fig.write_image("plots/partB.png")
fig.write_html("plots/partB.html")
fig.show()

### What is the first year in which the annual average temperature exceeds 55°F?

- Answer: 1953

## Part C

In [12]:
# Seasons are approximated by whole calendar months for simplicity
def season(month):
    """Return the season name for a calendar month number.

    Months 12, 1, 2 map to 'Winter'; 3-5 to 'Spring'; 6-8 to 'Summer'.
    Every other value (including 9-11) maps to 'Fall'.
    """
    months_by_season = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for label, months in months_by_season:
        if month in months:
            return label
    # Anything not matched above is treated as Fall
    return 'Fall'

# Tag every observation with the season of its calendar month
observation_month = weather_df['time'].dt.month
weather_df['Season'] = observation_month.apply(season)

# Mean temperature per (calendar year, season), pivoted so that each season
# becomes its own column and each year its own row.
# NOTE(review): grouping is by calendar year, so a year's "Winter" value mixes
# its January/February with its December.
calendar_year = weather_df['time'].dt.year
seasonal_means = weather_df.groupby([calendar_year, 'Season'])['Ftemp'].mean()
seasonal_avg = seasonal_means.unstack()


seasonal_avg.head()

Season,Fall,Spring,Summer,Winter
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1950,56.313759,46.825205,72.040273,35.589882
1951,55.635367,51.548033,72.651252,35.065961
1952,55.928324,50.52305,74.990355,35.864975
1953,57.691166,51.529162,73.912527,37.691326
1954,57.033496,50.525571,72.430418,33.46723


In [9]:
# Single-panel figure that holds one line per season
fig = make_subplots(rows=1, cols=1)

# Line colour used for each season's trace
colors = {"Winter": "blue", "Spring": "green", "Summer": "red", "Fall": "orange"}

# The loop variable is deliberately not called `season`: that name is the
# month-to-season helper function defined in this notebook, and rebinding it
# to a string here would break any later call to `season(...)`.
for season_name, line_color in colors.items():
    fig.add_trace(
        go.Scatter(
            x=seasonal_avg.index,
            y=seasonal_avg[season_name],
            mode="lines",
            name=season_name,
            line=dict(color=line_color),
        ),
        row=1,
        col=1,
    )

# Titles and figure size in a single layout update
fig.update_layout(
    title_text="Average Seasonal Temperatures over Years at Cornell Tech",
    xaxis_title="Year",
    yaxis_title="Temperature (°F)",
    width=1000,
    height=500,
)

# Export a static PNG and an interactive HTML copy, then render inline
fig.write_image("plots/partC.png")
fig.write_html("plots/partC.html")
fig.show()

### Write-up:

The visualization presents a single line chart with four lines, one for each season (Winter, Spring, Summer, and Fall), showing the average seasonal temperature near Cornell Tech for each year from 1950 through 2021. The four seasonal lines run roughly parallel to one another, suggesting that while there may be some warming or cooling trends, the seasonal temperature patterns maintain a consistent relationship to each other over the decades.