In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the CSV, using its "Fecha" column as the row index.
weather = pd.read_csv("aysa_data.csv", index_col="Fecha")
weather

## Cleaning data

In [None]:
# Fraction of missing values in each column (mean of the boolean null mask).
weather.isna().mean()

### Copy just the relevant columns

In [None]:
# Work on an independent copy that holds only the temperature and humidity columns.
core_weather = weather.loc[:, ["Temperatura", "Humedad"]].copy()
core_weather

In [552]:
# Rows where either measurement is missing.
# Only the last expression of a cell is displayed, so the two per-column
# checks are combined into a single expression instead of discarding the first.
core_weather[core_weather[["Temperatura", "Humedad"]].isna().any(axis=1)]



Unnamed: 0_level_0,Temperatura,Humedad
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1


In [551]:
# Fill NaN values in 'Temperatura' and 'Humedad' with the previous reading.
# The filled columns are assigned back explicitly: calling
# fillna(..., inplace=True) on a selected column is chained assignment (it may
# act on a temporary), and fillna(method=...) is deprecated in favour of ffill().
core_weather[["Temperatura", "Humedad"]] = core_weather[["Temperatura", "Humedad"]].ffill()

# Rows still missing a value after the fill (leading gaps have no earlier
# reading to copy). Both columns are checked in one expression because only
# the last expression of a cell is displayed.
core_weather[core_weather[["Temperatura", "Humedad"]].isna().any(axis=1)]



Unnamed: 0_level_0,Temperatura,Humedad
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1


In [None]:
core_weather.dtypes

In [None]:
# Both columns hold text with a decimal comma: swap the comma for a dot and
# cast the result to float.
core_weather = core_weather.assign(
    Temperatura=lambda frame: frame["Temperatura"].str.replace(',', '.').astype(float),
    Humedad=lambda frame: frame["Humedad"].str.replace(',', '.').astype(float),
)

# Confirm the resulting column types
core_weather.dtypes
#core_weather


In [None]:
core_weather.index

In [None]:
# Replace the index labels with parsed datetimes, then show the month number
# of every entry as a quick check of the parse.
# NOTE(review): no explicit format/dayfirst is passed, so pandas infers the
# layout. If the file stores dates as dd/mm/yyyy, confirm they are not being
# read month-first.
core_weather.index = pd.to_datetime(core_weather.index)
core_weather.index.month


## Start Analysis

In [None]:
# One temperature/humidity reading per calendar day: keep only the rows
# stamped 14:00, bucket them by day and take the first row of each bucket.
daily_temp = (
    core_weather
    .between_time("14:00", "14:00")
    .resample('D')
    .first()
)
daily_temp


In [None]:
# Line chart of the daily 14:00 temperature series.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(daily_temp.index, daily_temp['Temperatura'], label='Temperature at 14:00', color='blue')
ax.set_xlabel('Date')
ax.set_ylabel('Temperature (°C)')
ax.set_title('Daily Temperatures at 14:00')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Line chart of the daily 14:00 humidity series.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(daily_temp.index, daily_temp['Humedad'], label='Humedad at 14:00', color='red')
ax.set_xlabel('Date')
ax.set_ylabel('Humedad')
ax.set_title('Daily Humedad at 14:00')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
daily_temp.index.year.value_counts().sort_index()

In [None]:
# Average the daily readings per calendar month, then tag every monthly row
# with its year and month number.
monthly_avg_temp = daily_temp.resample('M').mean().assign(
    Year=lambda frame: frame.index.year,
    Month=lambda frame: frame.index.month,
)

# Same values indexed by (Year, Month); used for the printed table only.
monthly_avg_grouped = monthly_avg_temp.groupby(['Year', 'Month']).mean()
print(monthly_avg_grouped)


# One line per year, with the month number along the x axis.
fig, ax = plt.subplots(figsize=(12, 6))

for year, yearly_rows in monthly_avg_temp.groupby('Year'):
    ax.plot(yearly_rows.index.month, yearly_rows['Temperatura'], label=str(year))

ax.set_xticks(range(1, 13))
ax.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
ax.set_xlabel('Month')
ax.set_ylabel('Temperature (°C)')
ax.set_title('Monthly Average Temperature for Each Year')
ax.legend(title='Year')
ax.grid(True)
plt.show()

In [None]:
# Pair every day with the following day's temperature ("target").
# The final row has no following day, so it is dropped.
daily_temp = (
    daily_temp
    .assign(target=lambda frame: frame["Temperatura"].shift(-1))
    .iloc[:-1]
    .copy()
)
daily_temp

### Train the model

In [None]:
from sklearn.linear_model import Ridge

reg = Ridge(alpha=.1)

In [None]:
predictors = ["Temperatura", "Humedad"]


In [None]:
# Training window: every day up to and including 31 Aug 2021.
# The bound is written as an ISO date because pandas reads slash-separated
# strings month-first; '31/08/2021' only worked because 31 cannot be a month.
# .ffill() returns a new, forward-filled frame, so nothing is written back
# into daily_temp (the in-place fill on a .loc slice could do that, and
# fillna(method=...) is deprecated).
train = daily_temp.loc[:'2021-08-31'].ffill()


In [None]:
# Test window: every day from 1 Sep 2021 onwards.
# The bound must be an ISO date: pandas reads '01/09/2021' month-first
# (9 Jan 2021), which made the test window overlap the training window.
# .ffill() returns a new, forward-filled test frame instead of filling a
# .loc slice in place.
test = daily_temp.loc['2021-09-01':].ffill()


In [None]:
# Show the training rows that still contain a NaN, then remove them.
print(train[train.isna().any(axis=1)])
# Rebind instead of dropna(inplace=True): train was derived from a slice of
# daily_temp, and in-place edits on such a frame trigger SettingWithCopyWarning.
train = train.dropna()


In [None]:
reg.fit(train[predictors], train["target"])

In [None]:
predictions = reg.predict(test[predictors])

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
mean_absolute_error(test["target"], predictions)

In [None]:
# Side-by-side table: observed next-day temperature vs. the model's prediction.
combined = pd.concat(
    [
        test["target"].rename("actual"),
        pd.Series(predictions, index=test.index, name="predicted"),
    ],
    axis=1,
)
combined

In [None]:
combined.plot()

In [None]:
reg.coef_

## Create future predictions

In [None]:
print(daily_temp)

In [None]:
def create_predictions(predictors, daily_temp, reg,
                       train_end='2021-12-31', test_start='2022-01-01'):
    """Fit ``reg`` on the early part of ``daily_temp`` and score it on the rest.

    Parameters
    ----------
    predictors : list of column names used as model inputs.
    daily_temp : DataFrame with a datetime index, the predictor columns and
        a "target" column.
    reg : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place by this call.
    train_end : last date (inclusive) of the training window.
    test_start : first date (inclusive) of the test window.

    Returns
    -------
    (error, combined) : the mean absolute error on the test window, and a
        DataFrame with columns "actual" and "predicted" indexed like the
        test window.
    """
    # ISO dates are used for the bounds because pandas reads slash-separated
    # strings month-first. The former '31/12/2021' / '01/01/2022' resolved to
    # these same days only because 31 is not a month and 01/01 is symmetric.
    train = daily_temp.loc[:train_end]
    test = daily_temp.loc[test_start:]
    reg.fit(train[predictors], train["target"])
    predictions = reg.predict(test[predictors])
    error = mean_absolute_error(test["target"], predictions)
    combined = pd.concat([test["target"], pd.Series(predictions, index=test.index)], axis=1)
    combined.columns = ["actual", "predicted"]
    return error, combined


In [None]:
# Highest and lowest temperature over a 30-row rolling window
# (each window ends at, and includes, the current row).
daily_temp = daily_temp.assign(
    month_max=lambda frame: frame["Temperatura"].rolling(30).max(),
    month_min=lambda frame: frame["Temperatura"].rolling(30).min(),
)

#daily_temp


In [None]:
daily_temp["month_day_max"] = daily_temp["month_max"] / daily_temp["Temperatura"]

In [None]:
#daily_temp["max_min"] = daily_temp

In [None]:
daily_temp = daily_temp.iloc[30:,:].copy()
daily_temp

In [None]:
predictors = ["Temperatura", "Humedad", "month_max", "month_day_max"]

In [None]:
error, combined = create_predictions(predictors, daily_temp, reg)
error

In [None]:
combined.plot()

In [None]:
# Helper: running (expanding) mean of one group of values
def expanding_monthly_mean(group):
    """Return the cumulative mean of ``group``.

    Element i of the result is the mean of elements 0..i of ``group``.
    """
    running_window = group.expanding()
    return running_window.mean()

# Running mean of the temperature within each calendar month (year-month period).
# transform() returns a Series aligned to the original index, so no
# index-level juggling is needed. The previous apply() + droplevel(0) relied
# on apply() prepending the group key to the index, which depends on the
# pandas version.
monthly_avg = (
    daily_temp["Temperatura"]
    .groupby(daily_temp.index.to_period('M'))
    .transform(expanding_monthly_mean)
)

# Add the calculated monthly average as a new column in the daily_temp DataFrame
daily_temp["monthly_avg"] = monthly_avg

# Display the result
print(daily_temp.head(15))


In [None]:
# Helper: running (expanding) mean of the values sharing one day of the year
def expanding_day_of_year_mean(group):
    """Return the cumulative mean of ``group``.

    Element i of the result is the mean of elements 0..i of ``group``.
    """
    cumulative = group.expanding()
    return cumulative.mean()

# Running mean of the temperature over all rows that share the same day of
# the year (the groups span years).
# transform() returns a Series aligned to the original index. The previous
# apply() + droplevel(0) relied on apply() prepending the group key to the
# index, which depends on the pandas version.
day_of_year_avg = (
    daily_temp["Temperatura"]
    .groupby(daily_temp.index.day_of_year)
    .transform(expanding_day_of_year_mean)
)

# Add the calculated day of the year average as a new column in the daily_temp DataFrame
daily_temp["day_of_year_avg"] = day_of_year_avg

# Display the result
#print(daily_temp.head(15))


In [None]:
predictors = ["Temperatura", "Humedad", "month_max", "monthly_avg", "day_of_year_avg"]

In [None]:
error, combined = create_predictions(predictors, daily_temp, reg)

error

## Diagnosis

In [None]:
reg.coef_

In [None]:
daily_temp.corr()["target"]

In [None]:
combined["diff"] = (combined["actual"] - combined["predicted"]).abs()
combined.sort_values("diff", ascending=False).head()

In [None]:
# Estimate a future week's average temperature from the stored "monthly_avg"
# values of the same calendar month
def predict_weekly_average_future(date_str, daily_temp):
    """Return the mean of "monthly_avg" over all rows in the month of ``date_str``.

    ``date_str`` must be formatted dd/mm/yyyy; only its month is used.
    ``daily_temp`` needs a datetime index and a "monthly_avg" column.
    Raises ValueError when ``date_str`` does not match the expected format.
    """
    try:
        parsed = pd.to_datetime(date_str, format='%d/%m/%Y')
    except ValueError:
        raise ValueError("Invalid date format. Please use dd/mm/yyyy.")

    # Rows from every year that fall in the requested month.
    same_month = daily_temp.index.month == parsed.month

    # The stored monthly average stands in for the weekly average.
    return daily_temp.loc[same_month, "monthly_avg"].mean()

# Interactive entry point: ask for a date and report the estimate
def main():
    """Prompt for a dd/mm/yyyy date and print the estimated weekly average.

    Reads the module-level ``daily_temp`` frame. A ValueError raised while
    computing or formatting the estimate is printed instead of propagated.
    """
    prompt = "Enter a date (dd/mm/yyyy) to predict the average temperature for the following week: "
    date_str = input(prompt)
    try:
        weekly_avg = predict_weekly_average_future(date_str, daily_temp)
        message = f"The predicted average temperature for the week starting on {date_str} is: {weekly_avg:.2f} °C"
        print(message)
    except ValueError as err:
        print(err)

# Uncomment this line to test directly
#main()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
import datetime

# Function to predict temperature for a specific date using the trained model
def predict_specific_date(date_str, predictors, daily_temp, reg):
    """Predict a temperature with ``reg`` from the most recent known predictors.

    Parameters
    ----------
    date_str : date formatted dd/mm/yyyy. It is validated and used as the
        row label of the frame passed to the model; the predictor values do
        not depend on it.
    predictors : list of column names fed to the model.
    daily_temp : DataFrame holding the predictor columns.
    reg : fitted estimator exposing ``predict(X)``.

    Returns
    -------
    The first element of ``reg.predict`` for the single-row input.

    Raises
    ------
    ValueError : if ``date_str`` is not formatted dd/mm/yyyy.
    """
    try:
        # Parse the input date
        input_date = pd.to_datetime(date_str, format='%d/%m/%Y')
    except ValueError as err:
        raise ValueError("Invalid date format. Please use dd/mm/yyyy.") from err

    # Forward-fill over the history BEFORE taking the last row, so that a gap
    # in the final row is replaced by the latest earlier value. Filling the
    # one-row frame afterwards (as before) had no previous row to copy from
    # and left the NaN in place.
    future_df = daily_temp[predictors].ffill().iloc[[-1]].copy()
    future_df.index = [input_date]

    # Predict the temperature using the trained model
    predicted_temp = reg.predict(future_df[predictors])[0]

    return predicted_temp

# Historical monthly average prediction function
# (this name is also defined in an earlier cell; once this cell runs, this
# definition is the one in effect)
def predict_weekly_average_future(date_str, daily_temp):
    """Average the "monthly_avg" column over every row in ``date_str``'s month.

    ``date_str`` is expected as dd/mm/yyyy, otherwise ValueError is raised.
    ``daily_temp`` needs a datetime index and a "monthly_avg" column.
    """
    try:
        input_date = pd.to_datetime(date_str, format='%d/%m/%Y')
    except ValueError:
        raise ValueError("Invalid date format. Please use dd/mm/yyyy.")

    # Boolean mask of the rows, across all years, in the requested month.
    month_of_interest = input_date.month
    in_month = daily_temp.index.month == month_of_interest

    monthly_series = daily_temp["monthly_avg"]
    return monthly_series[in_month].mean()

# Print the model's prediction next to the historical-average estimate
def compare_predictions(date_str, predictors, daily_temp, reg):
    """Compute both estimates for ``date_str`` and print them, one per line.

    Nothing is returned. Both values are computed before anything is printed.
    """
    from_model = predict_specific_date(date_str, predictors, daily_temp, reg)
    from_history = predict_weekly_average_future(date_str, daily_temp)

    print(f"Prediction using the model: {from_model:.2f} °C")
    print(f"Prediction using historical monthly averages: {from_history:.2f} °C")

# Example usage
date_str = "19/12/2026"
predictors = ["Temperatura", "Humedad", "month_max", "monthly_avg", "day_of_year_avg"]
compare_predictions(date_str, predictors, daily_temp, reg)


In [None]:
daily_temp

In [None]:
def examine_last_predictors(predictors, daily_temp):
    """Print the predictor values found in the last row of ``daily_temp``."""
    latest_row = daily_temp.iloc[-1]
    # A single print with a newline separator emits the heading and the values
    # on consecutive lines.
    print("Last known predictors used:", latest_row[predictors], sep="\n")

# Example usage
examine_last_predictors(predictors, daily_temp)


In [None]:
# Print the stored "monthly_avg" values for one month number
def review_monthly_averages(month, daily_temp):
    """Print every "monthly_avg" value whose index falls in month ``month``.

    Rows from all years are included. Nothing is returned.
    """
    in_month = daily_temp.index.month == month
    selected = daily_temp.loc[in_month, "monthly_avg"]
    print(f"Historical monthly averages for month {month}:", selected, sep="\n")

# Example usage for December
review_monthly_averages(12, daily_temp)


In [None]:
# Recalculate the monthly average
def calculate_monthly_avg(daily_temp):
    """Add a "monthly_avg" column: the running mean of "Temperatura" within each month.

    Rows are grouped by year-month period of the index; within each group,
    row i receives the mean of that group's rows 0..i. The column is written
    into ``daily_temp`` itself, and the same object is returned.
    """
    month_periods = daily_temp.index.to_period('M')
    # transform() returns a Series aligned to the original index, so the result
    # can be assigned directly. The previous apply() + droplevel(0) relied on
    # apply() prepending the group key to the index, which depends on the
    # pandas version.
    daily_temp["monthly_avg"] = (
        daily_temp["Temperatura"]
        .groupby(month_periods)
        .transform(lambda temps: temps.expanding().mean())
    )
    return daily_temp

daily_temp = calculate_monthly_avg(daily_temp)
daily_temp


In [None]:
def retrain_model(daily_temp, predictors, train_end='2021-08-31', alpha=0.1):
    """Fit and return a fresh Ridge model on the rows up to ``train_end``.

    Parameters
    ----------
    daily_temp : DataFrame with a datetime index, the predictor columns and
        a "target" column. It is not modified.
    predictors : list of column names used as model inputs.
    train_end : last date (inclusive) of the training window.
    alpha : regularisation strength passed to ``Ridge``.

    Returns
    -------
    The fitted ``Ridge`` instance.
    """
    # .ffill() returns a new frame. The earlier in-place fill ran on a .loc
    # slice, which can write through to the caller's frame and triggers
    # SettingWithCopyWarning.
    train = daily_temp.loc[:train_end].ffill()
    # Rows at the very start have nothing to forward-fill from; drop whatever
    # is still missing in the columns the model uses, as the notebook's
    # earlier training cell does.
    train = train.dropna(subset=list(predictors) + ["target"])
    reg = Ridge(alpha=alpha)
    reg.fit(train[predictors], train["target"])
    return reg

# Retrain the model with the refined features
reg = retrain_model(daily_temp, predictors)


In [None]:
# Re-comparison after retraining
compare_predictions(date_str, predictors, daily_temp, reg)
