In [2]:
import streamlit as st
import pandas as pd
import plotly.express as px

# Read the two input tables from CSV files in the working directory:
# 'total.csv' -> customized_report, 'per_student.csv' -> normalized_data.
customized_report, normalized_data = (
    pd.read_csv(csv_name) for csv_name in ('total.csv', 'per_student.csv')
)

In [3]:
# Name of the value column that the cells below select, convert to float, and sum.
# NOTE(review): 'GHG MTCDE' presumably stands for greenhouse-gas emissions in metric
# tons of CO2 equivalent — confirm against the data source.
main_unit = 'GHG MTCDE'

In [4]:
# Keep only the columns the rest of the notebook works with.
# `.copy()` makes each result an independent frame, so the column assignments in
# later cells modify it directly instead of triggering pandas'
# SettingWithCopyWarning for writing into a slice of the loaded data.
columns_of_interest = ['Fiscal Year', 'Scope', 'Source', main_unit]
customized_report = customized_report[columns_of_interest].copy()
normalized_data = normalized_data[columns_of_interest].copy()


In [5]:
# Show the dtype of the emissions column in each table, one per line
# (totals first, then the per-student table).
print(
    customized_report[main_unit].dtype,
    normalized_data[main_unit].dtype,
    sep='\n',
)

object
float64


In [6]:
# The totals column is read as text because its numbers contain thousands
# separators (e.g. "1,234.5"). Strip the commas and convert to float.
# The dtype guard makes this cell safe to re-run: once the column is numeric,
# the `.str` accessor would raise AttributeError, so the conversion is skipped.
if not pd.api.types.is_numeric_dtype(customized_report[main_unit]):
    customized_report[main_unit] = (
        customized_report[main_unit]
        .str.replace(',', '', regex=False)  # literal comma, not a regex pattern
        .astype(float)
    )


In [7]:
# Combine all sources with 'commuting' in their name into a single source named 'Commuting'
def _to_commuting(source):
    """Return 'Commuting' for any string containing 'commuting' (case-insensitive).

    Non-string values (for example NaN produced by a blank CSV cell) are returned
    unchanged rather than raising AttributeError on `.lower()`.
    """
    if isinstance(source, str) and 'commuting' in source.lower():
        return 'Commuting'
    return source

customized_report['Source'] = customized_report['Source'].apply(_to_commuting)
normalized_data['Source'] = normalized_data['Source'].apply(_to_commuting)

# Combine all sources with 'co-gen' in their name into a single source named 'Co-gen Plant'
def combine_sources(source):
    """Map any co-gen source name onto the single label 'Co-gen Plant'.

    Args:
        source: A source name. Values that are not strings are passed through.

    Returns:
        'Co-gen Plant' if `source` is a string containing 'co-gen'
        (case-insensitive); otherwise `source` unchanged.
    """
    is_cogen = isinstance(source, str) and 'co-gen' in source.lower()
    return 'Co-gen Plant' if is_cogen else source

# Apply the co-gen merge to the 'Source' column of both tables.
for emissions_frame in (customized_report, normalized_data):
    emissions_frame['Source'] = emissions_frame['Source'].apply(combine_sources)



In [8]:
# Collapse rows that share the same (Fiscal Year, Scope, Source) key into one row
# by summing the emissions column. This is what folds the merged 'Commuting' and
# 'Co-gen Plant' entries into a single row per year and scope.
grouping_keys = ['Fiscal Year', 'Scope', 'Source']
customized_report = customized_report.groupby(grouping_keys, as_index=False)[main_unit].sum()
normalized_data = normalized_data.groupby(grouping_keys, as_index=False)[main_unit].sum()


In [9]:
# Give the aggregated frames the names used by the following cells.
# These are plain references, not copies: columns added or changed through
# cleaned_totals / cleaned_normalized are also visible through
# customized_report / normalized_data.
cleaned_totals, cleaned_normalized = customized_report, normalized_data


In [10]:
# Check how 'Fiscal Year' is stored in the totals table.
print(cleaned_totals.dtypes['Fiscal Year'])

int64


In [11]:
# How many of the highest-emitting sources are flagged as top-level.
n_top_sources = 6

# Sources with the highest combined emissions over all years, computed
# separately for the totals table and the per-student table.
top_sources_totals = (
    cleaned_totals.groupby('Source')[main_unit].sum().nlargest(n_top_sources).index
)
top_sources_normalized = (
    cleaned_normalized.groupby('Source')[main_unit].sum().nlargest(n_top_sources).index
)

# Add a 'source_level' column to each dataframe:
# 1 if the row's source is among that table's top sources, 0 otherwise.
# `isin` does the membership test for the whole column at once (no per-row
# lambda) and always yields an integer column, even for an empty frame.
cleaned_totals['source_level'] = (
    cleaned_totals['Source'].isin(top_sources_totals).astype('int64')
)
cleaned_normalized['source_level'] = (
    cleaned_normalized['Source'].isin(top_sources_normalized).astype('int64')
)


In [12]:
# Distinct source names present in the totals table, in order of first appearance
sources = cleaned_totals['Source'].unique()

# Order the names from shortest to longest; the sort is stable, so names of
# equal length keep their order of appearance
sorted_sources = list(sources)
sorted_sources.sort(key=len)

# Print one source name per line
for source_name in sorted_sources:
    print(source_name)


FERA
Commuting
T&D Losses
Solid Waste
Co-gen Plant
Fertilizer & Animals
Direct Transportation
Purchased Electricity
Other On-Campus Stationary
Directly Financed Air Travel
Other Directly Financed Travel


In [13]:
# Old source name -> new source name. Defined once and applied to both
# dataframes so the two tables cannot drift apart; names that are not listed
# are left unchanged.
# NOTE(review): 'Co-gen steam' contains 'co-gen', which combine_sources already
# maps to 'Co-gen Plant', so this entry only matters if that step was skipped.
source_renames = {
    'Fertilizer & Animals': 'Fertilizer',
    'Co-gen steam': 'Co-gen Plant',
    'Direct Transportation': 'University Fleet',
    'Other On-Campus Stationary': 'Propane & Natural Gas',
    'Directly Financed Air Travel': 'Air Travel',
    'Solid Waste': 'Landfill Waste',
    'Other Directly Financed Travel': 'Bus Travel',
}

cleaned_totals['Source'] = cleaned_totals['Source'].replace(source_renames)
cleaned_normalized['Source'] = cleaned_normalized['Source'].replace(source_renames)



In [14]:
# Create a DataFrame with the unique sources as they are currently named.
# The names are re-read from cleaned_totals instead of reusing `sorted_sources`:
# that list was built before the renaming cell ran, so it still holds the old
# labels (e.g. 'Solid Waste'), which no longer match the 'Source' values in the
# cleaned dataframes. Shortest-first ordering is kept.
current_sources = sorted(cleaned_totals['Source'].unique(), key=len)
sources_df = pd.DataFrame(current_sources, columns=['Source'])

# Save the DataFrame to a csv file (overwrites any existing 'blurbs.csv')
sources_df.to_csv('blurbs.csv', index=False)
