## Capstone 
Goal: reshape the files that store the 24 hourly values as horizontal columns (wide format) into a vertical (long) format with one row per hour.

## Weather File

In [None]:
import pandas as pd
import warnings

# Suppress warnings
# NOTE(review): this silences every warning for the rest of the session, not only noise
# from reading the workbook - consider narrowing the filter once the intended warning is known.
warnings.filterwarnings("ignore")

# File path
file_path = 'Weather_BFY25.xlsx'

# Read every needed sheet in a single call so the workbook is opened and parsed once
# instead of once per sheet. With a list for `sheet_name`, read_excel returns a dict
# of DataFrames keyed by sheet name.
workbook_sheets = pd.read_excel(
    file_path,
    sheet_name=['Temp', 'Humidity', 'WindSpeed', 'THI', 'WindChill', 'Rainfall', 'DailyWthr'],
)

# Sheets that are reshaped from horizontal to vertical format later in the notebook
temp_data = workbook_sheets['Temp']
humidity_data = workbook_sheets['Humidity']
windspeed_data = workbook_sheets['WindSpeed']
thi_data = workbook_sheets['THI']
windchill_data = workbook_sheets['WindChill']

# The other sheets (Rainfall and DailyWthr) - NOT CURRENTLY MODIFIED SINCE NOT IN HORIZONTAL FORMAT
rainfall_data = workbook_sheets['Rainfall']
daily_weather_data = workbook_sheets['DailyWthr']

# Store all sheets in a dictionary for easy reference and analysis later.
# Note the keys "Windspeed" / "Windchill" differ in capitalisation from the
# workbook sheet names 'WindSpeed' / 'WindChill'.
all_weather_sheets = {
    "Temp": temp_data,
    "Humidity": humidity_data,
    "Windspeed": windspeed_data,
    "THI": thi_data,
    "Windchill": windchill_data,
    "Rainfall": rainfall_data,
    "DailyWthr": daily_weather_data
}

# Print the first two rows of every sheet for verification
sheet_previews = [
    ("Temp", temp_data),
    ("Humidity", humidity_data),
    ("Windspeed", windspeed_data),
    ("THI", thi_data),
    ("Windchill", windchill_data),
    ("Rainfall", rainfall_data),
    ("Daily Weather", daily_weather_data),
]
for position, (title, frame) in enumerate(sheet_previews):
    # Blank line between previews, but not before the first one
    separator = "\n" if position else ""
    print(f"{separator}{title} Data:")
    print(frame.head(2))


In [10]:
# List of sheets that require transformation (keys of `all_weather_sheets`)
sheets_to_transform = ['Temp', 'Humidity', 'Windspeed', 'THI', 'Windchill']

# Column layout used for every sheet in the list above
id_columns = ['StationID', 'ConceptID', 'Dt']      # kept as-is on every output row
hour_columns = [f'H{i}' for i in range(1, 25)]     # H1 ... H24, unpivoted into rows

# Dictionary to store transformed DataFrames, keyed by sheet name
transformed_weather_sheets = {}

# Transform the data for each specified sheet
for sheet_name in sheets_to_transform:
    # Access the sheet from the previously loaded dictionary
    data = all_weather_sheets[sheet_name]

    # Reshape the data from wide to long format: one row per (id columns, hour)
    data_long = data.melt(
        id_vars=id_columns,
        value_vars=hour_columns,
        var_name='Hour',        # Name of the new column for hours
        value_name=sheet_name   # The value column is named after the sheet
    )

    # Convert the 'Hour' column from 'H1', 'H2', etc., to integers (1-24).
    # Raw string: '\d' is not a valid escape in a normal string literal.
    # expand=False: return a Series rather than a one-column DataFrame.
    data_long['Hour'] = data_long['Hour'].str.extract(r'H(\d+)', expand=False).astype(int)

    # Sort the data by date (`Dt`) and hour (`Hour`)
    data_long = data_long.sort_values(by=['Dt', 'Hour']).reset_index(drop=True)

    # Store the transformed DataFrame
    transformed_weather_sheets[sheet_name] = data_long

    # Print a preview of the transformed data for verification
    print(f"\nTransformed Data for {sheet_name}:")
    print(data_long.head(10))




Transformed Data for Temp:
  StationID ConceptID         Dt  Hour  Temp
0       LAK       DBT 1989-01-01     1  65.0
1       LAK       DBT 1989-01-01     2  66.0
2       LAK       DBT 1989-01-01     3  65.0
3       LAK       DBT 1989-01-01     4  64.0
4       LAK       DBT 1989-01-01     5  64.0
5       LAK       DBT 1989-01-01     6  64.0
6       LAK       DBT 1989-01-01     7  65.0
7       LAK       DBT 1989-01-01     8  65.0
8       LAK       DBT 1989-01-01     9  67.0
9       LAK       DBT 1989-01-01    10  75.0

Transformed Data for Humidity:
  StationID ConceptID         Dt  Hour  Humidity
0       LAK       DBT 1999-01-01     1      89.0
1       LAK       DBT 1999-01-01     2      92.0
2       LAK       DBT 1999-01-01     3      95.5
3       LAK       DBT 1999-01-01     4      97.5
4       LAK       DBT 1999-01-01     5      98.5
5       LAK       DBT 1999-01-01     6      98.5
6       LAK       DBT 1999-01-01     7      98.5
7       LAK       DBT 1999-01-01     8      98.5
8   

## Merging the reshaped variables together (from 2000 onwards)

In [12]:
# Import necessary libraries
import pandas as pd

# Earliest date (inclusive) kept in the merged output
start_date = "2000-01-01"

# Columns that identify one hourly observation; every merge below joins on these
merge_keys = ['StationID', 'ConceptID', 'Dt', 'Hour']

# Replace each transformed sheet with only its rows dated on or after `start_date`
for sheet_name, hourly_frame in list(transformed_weather_sheets.items()):
    on_or_after_start = hourly_frame['Dt'] >= start_date
    transformed_weather_sheets[sheet_name] = hourly_frame[on_or_after_start]

# Start from Temp and inner-join the remaining variables one at a time,
# so only observations present in every sheet remain
merged_data = transformed_weather_sheets['Temp']
for sheet_name in ['Humidity', 'Windspeed', 'THI', 'Windchill']:
    merged_data = merged_data.merge(
        transformed_weather_sheets[sheet_name],
        on=merge_keys,
        how='inner'
    )

# The identifier columns are not wanted in the saved file
merged_data = merged_data.drop(columns=['StationID', 'ConceptID'])

# Show the first few rows of the result
print(merged_data.head())

# Write the merged table to disk without the index column
merged_data.to_csv('merged_weather_data_verticalFormat_cleaned.csv', index=False)



  StationID ConceptID         Dt  Hour       Temp   Humidity Windspeed  \
0       LAK       DBT 2000-01-01     1  58.871429  88.142857  0.833333   
1       LAK       DBT 2000-01-01     2  58.257143  88.142857      0.75   
2       LAK       DBT 2000-01-01     3  58.071429  89.714286  0.833333   
3       LAK       DBT 2000-01-01     4  58.300000  90.000000  0.833333   
4       LAK       DBT 2000-01-01     5  58.371429  89.857143  0.666667   

         THI  Windchill  
0  59.650000  57.383333  
1  59.216667  56.683333  
2  58.950000  56.133333  
3  59.266667  56.300000  
4  59.266667  56.233333  


### Combining and cleaning the Rainfall and DailyWthr sheets (daily values only)

In [14]:
import pandas as pd

# This cell expects `rainfall_data` and `daily_weather_data` to have been loaded
# from the workbook by an earlier cell.

# First date (inclusive) kept in the daily output
daily_start_date = '2000-01-01'

# Rainfall: parse `Dt` and keep the parsed values in the filtered copy (not only in the
# filter mask), so the merge key below is datetime on both sides even if the sheet
# was read with text dates. `assign` returns a new frame; `rainfall_data` is untouched.
rainfall_data_filtered = (
    rainfall_data
    .assign(Dt=pd.to_datetime(rainfall_data['Dt']))
    .loc[lambda frame: frame['Dt'] >= daily_start_date]
    .copy()
)

# DailyWthr: build `Dt` from the separate Year / Month / Day columns.
# Note: this adds the `Dt` column to `daily_weather_data` itself.
daily_weather_data['Dt'] = pd.to_datetime(daily_weather_data[['Year', 'Month', 'Day']])

# Keep rows from the start date onwards and only the relevant DailyWthr columns
daily_columns = ['Dt', 'AvgDB', 'HDD55', 'HDD60', 'HDD65', 'CDD65', 'CDD70', 'CDD75']
daily_weather_data_filtered = daily_weather_data.loc[
    daily_weather_data['Dt'] >= daily_start_date, daily_columns
].copy()

# Merge the two DataFrames on the 'Dt' column
merged_weather_data = pd.merge(
    rainfall_data_filtered,
    daily_weather_data_filtered,
    on='Dt',
    how='inner'  # Use 'inner' to ensure only dates that exist in both datasets are included
)

# Reset index for better readability (new frame rather than inplace mutation)
merged_weather_data = merged_weather_data.reset_index(drop=True)

# Display the merged DataFrame
print(merged_weather_data.head())

# Save the data
merged_weather_data.to_csv("merged_weather_data_rain-daily.csv", index=False)



          Dt  Rainfall  dbinRain      AvgDB  HDD55     HDD60     HDD65  \
0 2000-01-01   0.00500         1  65.441667    0.0  0.000000  0.000000   
1 2000-01-02   0.00125         1  65.688690    0.0  0.000000  0.000000   
2 2000-01-03   0.00125         1  65.799405    0.0  0.000000  0.000000   
3 2000-01-04   0.00125         1  69.077381    0.0  0.000000  0.000000   
4 2000-01-05   0.00000         0  58.308929    0.0  1.691071  6.691071   

      CDD65  CDD70  CDD75  
0  0.441667    0.0    0.0  
1  0.688690    0.0    0.0  
2  0.799405    0.0    0.0  
3  4.077381    0.0    0.0  
4  0.000000    0.0    0.0  
