## Requisitos 10-12: 

10. **Modelos de Clasificación**: Desarrollar y optimizar modelos de clasificación (como árboles de decisión, SVM, k-NN), utilizando los métodos adecuados de validación y evaluación.

11. **Validación de Modelos**: Seleccionar los mejores modelos mediante validación cruzada con k-fold, para asegurar la robustez y generalización de los modelos creados.
    
12. **Uso de Scraping para Variables Exógenas**: El proyecto debe incluir el uso de técnicas de web scraping para obtener variables adicionales de fuentes externas que aporten valor a los datos originales del proyecto.

In [1]:

import pandas as pd
import os

#cargar datos
current_path = os.getcwd()
aguacate_index = current_path.find("uoc-proyecto3")
if aguacate_index != -1:
        project_root = current_path[:aguacate_index + len("uoc-proyecto3")]
else:
    raise FileNotFoundError("The directory 'uoc-proyecto3' was not found in the path.")
data_path = os.path.join(project_root, 'data', 'Avocado_HassAvocadoBoard_20152023v1.0.1.csv')
dataset_avocado_original_df = pd.read_csv(data_path)

region_classification = {
    'Albany': 'City',
    'Atlanta': 'City',
    'BaltimoreWashington': 'Region',
    'BirminghamMontgomery': 'Region',
    'Boise': 'City',
    'Boston': 'City',
    'BuffaloRochester': 'Region',
    'California': 'GreaterRegion',
    'Charlotte': 'City',
    'Chicago': 'City',
    'CincinnatiDayton': 'Region',
    'Columbus': 'City',
    'DallasFtWorth': 'Region',
    'Denver': 'City',
    'Detroit': 'City',
    'GrandRapids': 'City',
    'GreatLakes': 'GreaterRegion',
    'HarrisburgScranton': 'Region',
    'HartfordSpringfield': 'Region',
    'Houston': 'City',
    'Indianapolis': 'City',
    'Jacksonville': 'City',
    'LasVegas': 'City',
    'LosAngeles': 'City',
    'Louisville': 'City',
    'Miami': 'City',
    'MiamiFtLauderdale': 'Region',
    'Midsouth': 'GreaterRegion',
    'Nashville': 'City',
    'NewOrleans': 'City',
    'NewYork': 'City',
    'Northeast': 'GreaterRegion',
    'NorthernNewEngland': 'Region',
    'Orlando': 'City',
    'PeoriaSpringfield': 'Region',
    'Philadelphia': 'City',
    'PhoenixTucson': 'Region',
    'Pittsburgh': 'City',
    'Plains': 'GreaterRegion',
    'Portland': 'City',
    'Providence': 'City',
    'RaleighGreensboro': 'Region',
    'RichmondNorfolk': 'Region',
    'Roanoke': 'City',
    'Sacramento': 'City',
    'SanDiego': 'City',
    'SanFrancisco': 'City',
    'Seattle': 'City',
    'SouthCarolina': 'Region',
    'SouthCentral': 'GreaterRegion',
    'Southeast': 'GreaterRegion',
    'Spokane': 'City',
    'StLouis': 'City',
    'Syracuse': 'City',
    'Tampa': 'City',
    'Toledo': 'City',
    'TotalUS': 'TotalUS',
    'West': 'GreaterRegion',
    'WestTexNewMexico': 'Region',
    'Wichita': 'City'
}

def map_regions(original_data: pd.DataFrame, region_map: dict, guardar: bool = False) -> pd.DataFrame:
    """
    Asigna la clasificación de regiones y ciudades al dataframe original de aguacate
    en una nueva columna region_type

    Parametros:
    - original_data: pd.DataFrame-  Datos originales avocado.csv
    - region_map: Dict[str, str] - El mapping de agrupaciones
    - guardar: Boolean - True or False para guardar nuevo csv o no

    Regresa:
    - pd.DataFrame: Dataframe actualizado de los datos originales
    """

    path_salida = "data/avocado_with_region_types.csv"
    nuevo_aguacate_df = original_data.copy()
    nuevo_aguacate_df['region_type'] = nuevo_aguacate_df['region'].map(region_map)
    if guardar:
        print(f"Guardando archivo .csv en /data/  ...")
        nuevo_aguacate_df.to_csv(path_salida, index=False)
    else:
        pass

    return nuevo_aguacate_df

def obtener_nuevo_avocado()-> pd.DataFrame:
    """
    Función que devuelve dataframe con columna region_type

    Regresa:
    - pd.DataFrame: DataFrame con columna nueva que agrupa regiones para análisis
    """

    nuevo_avocado_df = map_regions(dataset_avocado_original_df, region_classification, guardar=False)
    return nuevo_avocado_df

def imputar_fechas()-> pd.DataFrame:
    """
    Función que imouta por promedio las tres entradas faltantes en el avocado.csv original

    regresa:
    - pd.DataFrame: con las tres fechas para el type organic en WestTexNewMexico
    """

    df = obtener_nuevo_avocado()
    df['Date'] = pd.to_datetime(df['Date'])
    # Fechas y parámetros específicos para imputación
    missing_dates = ['2015-12-06', '2017-06-18', '2017-06-25']
    region = 'WestTexNewMexico'
    avocado_type = 'organic'

    # Iterar sobre las fechas faltantes para imputar valores
    for date in missing_dates:
        # Convertir la fecha a datetime
        date = pd.to_datetime(date)

        # Filtrar las filas previas y posteriores a la fecha faltante
        prev_row = df[(df['Date'] < date) &
                      (df['region'] == region) &
                      (df['type'] == avocado_type)].sort_values(by='Date').iloc[-1]
        next_row = df[(df['Date'] > date) &
                      (df['region'] == region) &
                      (df['type'] == avocado_type)].sort_values(by='Date').iloc[0]

        # Calcular el promedio de los valores numéricos entre las dos fechas
        imputed_values = prev_row.copy()
        for col in df.select_dtypes(include='number').columns:
            imputed_values[col] = (prev_row[col] + next_row[col]) / 2

        # Asignar la fecha, región y tipo específico a la fila imputada
        imputed_values['Date'] = date
        imputed_values['region'] = region
        imputed_values['type'] = avocado_type

        # Añadir la fila imputada al DataFrame
        df = pd.concat([df, pd.DataFrame([imputed_values])], ignore_index=True)

        # Ordenar el DataFrame por fecha para mantener el orden cronológico
        df = df.sort_values(by='Date').reset_index(drop=True)

    return df

In [2]:
# Cargar librerias
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
import pandas as pd


df_orig = obtener_nuevo_avocado()

#df_orig = pd.read_csv("avocado_with_region_types.csv")

# guardamos el original y trabajamos con una copia
df = df_orig.copy()



In [7]:
import requests
import pandas as pd
from datetime import datetime

# Define the API and your API key
api_url = "http://api.openweathermap.org/data/2.5/onecall/timemachine"
api_key = "d3679ce88231385b5b6b479378f87095"

# Latitude and Longitude for a region like California
lat = 36.7783
lon = -119.4179

# Example for a range of years: 2015 to 2023
years = range(2015, 2024)

# Placeholder for weather data
weather_data = []

for year in years:
    for month in range(1, 13):  # Loop through months 1-12
        for day in range(1, 32):  # Loop through days (we'll filter out invalid dates later)
            try:
                # Create a date object
                date = datetime(year, month, day)
                timestamp = int(date.timestamp())
                
                # Make the request
                params = {
                    "lat": lat,
                    "lon": lon,
                    "dt": timestamp,
                    "appid": api_key
                }
                response = requests.get(api_url, params=params)
                if response.status_code == 200:
                    data = response.json()
                    weather_data.append({
                        "date": date,
                        "temperature": data['current']['temp'],
                        "humidity": data['current']['humidity'],
                        "rain": data['current'].get('rain', {}).get('1h', 0),
                    })
            except Exception as e:
                print(f"Skipping invalid date {year}-{month}-{day}: {e}")

# Convert the collected weather data into a DataFrame
weather_df = pd.DataFrame(weather_data)

# Display the weather data for the first few rows
print(weather_df.head())


Skipping invalid date 2015-2-29: day is out of range for month
Skipping invalid date 2015-2-30: day is out of range for month
Skipping invalid date 2015-2-31: day is out of range for month
Skipping invalid date 2015-4-31: day is out of range for month
Skipping invalid date 2015-6-31: day is out of range for month
Skipping invalid date 2015-9-31: day is out of range for month
Skipping invalid date 2015-11-31: day is out of range for month
Skipping invalid date 2016-2-30: day is out of range for month
Skipping invalid date 2016-2-31: day is out of range for month
Skipping invalid date 2016-4-31: day is out of range for month
Skipping invalid date 2016-6-31: day is out of range for month
Skipping invalid date 2016-9-31: day is out of range for month
Skipping invalid date 2016-11-31: day is out of range for month
Skipping invalid date 2017-2-29: day is out of range for month
Skipping invalid date 2017-2-30: day is out of range for month
Skipping invalid date 2017-2-31: day is out of range 

In [11]:
import requests
from datetime import datetime

# Replace with your API key and coordinates
api_key = "2e5a5a74e6b1270a7c7233dbea1f67ec"
lat = 36.7783  # Example: latitude for California
lon = -119.4179  # Example: longitude for California

# Request weather data for a specific day (timestamp for testing)
timestamp = int(datetime(2020, 1, 1).timestamp())  # Example date: January 1, 2020

# API URL
api_url = "http://api.openweathermap.org/data/2.5/onecall/timemachine"

# Parameters for the API request
params = {
    "lat": lat,
    "lon": lon,
    "dt": timestamp,
    "appid": api_key
}

# Make the API request
response = requests.get(api_url, params=params)

# Check if the response is successful (status code 200)
if response.status_code == 200:
    data = response.json()
    print("API Response:", data)  # Check the contents of the response
else:
    print(f"Failed to fetch data. Status code: {response.status_code}")


Failed to fetch data. Status code: 401


In [9]:
# Assuming 'weather_df' is your weather DataFrame

# Specify the file path where you want to save the CSV file
file_path = 'data/weather_data.csv'

# Write the DataFrame to a CSV file
weather_df.to_csv(file_path, index=False)

# Confirmation message
print(f"Weather data has been saved to {file_path}")
weather_df.info()


Weather data has been saved to data/weather_data.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame


In [16]:
#import os
#!pip3 install pytrends

import time
from pytrends.request import TrendReq

# Initialize pytrends object
pytrends = TrendReq(hl='en-US', tz=360)

# Define keywords
keywords = ['avocados']

# Get interest over time data with time delays
pytrends.build_payload(keywords, cat=0, timeframe='2015-01-01 2023-12-31', geo='US', gprop='')

# Use a delay between requests to avoid hitting the rate limit
time.sleep(5)  # Add a delay of 5 seconds between requests

# Retrieve the data
interest_over_time_df = pytrends.interest_over_time()

# Save the data to a CSV file
interest_over_time_df.to_csv('avocado_trends.csv')



TooManyRequestsError: The request failed: Google returned a response with code 429

In [6]:
# Assuming you have loaded your avocado dataset into df2
#df_avocados = pd.read_csv("avocados_kaggle.csv")
df_avocados=df.copy()

# Merge weather data (based on date or region)
df_enriched = pd.merge(df_avocados, weather_df, left_on="Date", right_on="date", how="left")


# Now df_enriched contains avocado data along with exogenous variables
print(df_enriched.head())


KeyError: 'date'

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming df_enriched is the dataset with avocado data and weather data

# Plotting price vs. temperature, humidity, and rain
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Price vs Temperature
sns.scatterplot(x=df_enriched['temperature'], y=df_enriched['AveragePrice'], ax=axes[0])
axes[0].set_title('Price vs Temperature')

# Price vs Humidity
sns.scatterplot(x=df_enriched['humidity'], y=df_enriched['AveragePrice'], ax=axes[1])
axes[1].set_title('Price vs Humidity')

# Price vs Rain
sns.scatterplot(x=df_enriched['rain'], y=df_enriched['AveragePrice'], ax=axes[2])
axes[2].set_title('Price vs Rain')

plt.tight_layout()
plt.show()

# Plotting volume vs. weather variables
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Volume vs Temperature
sns.scatterplot(x=df_enriched['temperature'], y=df_enriched['TotalVolume'], ax=axes[0])
axes[0].set_title('Volume vs Temperature')

# Volume vs Humidity
sns.scatterplot(x=df_enriched['humidity'], y=df_enriched['TotalVolume'], ax=axes[1])
axes[1].set_title('Volume vs Humidity')

# Volume vs Rain
sns.scatterplot(x=df_enriched['rain'], y=df_enriched['TotalVolume'], ax=axes[2])
axes[2].set_title('Volume vs Rain')

plt.tight_layout()
plt.show()
