#### Libraries

In [1]:
import os
import csv
from io import StringIO
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport

# -- Settings --
# Remove pandas' display limits for both columns and rows (None = unlimited).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

  from .autonotebook import tqdm as notebook_tqdm


#### Load Data

In [4]:
# Read the transformed launch records and preview the first rows.
launches_csv_path = '../data/transformed/launch/usa_launches.csv'
usa_launches = pd.read_csv(launches_csv_path)
usa_launches.head()

Unnamed: 0,name,status,provider,rocket,mission,date,location,pad
0,Vanguard | Vanguard,Launch Failure,US Navy,Vanguard,Vanguard,1957-12-06 16:44:00,"Cape Canaveral, FL, USA",Launch Complex 18A
1,Juno-I | Explorer 1,Launch Successful,Army Ballistic Missile Agency,Juno-I,Explorer 1,1958-02-01 03:47:00,"Cape Canaveral, FL, USA",Launch Complex 26A
2,Vanguard | Vanguard,Launch Failure,US Navy,Vanguard,Vanguard,1958-02-05 07:33:00,"Cape Canaveral, FL, USA",Launch Complex 18A
3,Juno-I | Explorer 2,Launch Failure,Army Ballistic Missile Agency,Juno-I,Explorer 2,1958-03-05 18:27:00,"Cape Canaveral, FL, USA",Launch Complex 26A
4,Vanguard | Vanguard,Launch Successful,US Navy,Vanguard,Vanguard,1958-03-17 12:15:00,"Cape Canaveral, FL, USA",Launch Complex 18A


In [2]:
# Read the hourly weather table: 'time' is parsed as datetime and the WMO
# weather code is kept as a string rather than being inferred as a number.
hourly_weather_csv_path = '../data/transformed/weather/cape_canaveral_usa_hourly.csv'
hourly_read_options = {
    'parse_dates': ['time'],
    'dtype': {'weather_code (wmo code)': str},
}
df = pd.read_csv(hourly_weather_csv_path, **hourly_read_options)
df.head()

Unnamed: 0,time,temperature_2m (°F),relative_humidity_2m (%),dew_point_2m (°F),apparent_temperature (°F),precipitation (inch),rain (inch),snowfall (inch),snow_depth (ft),weather_code (wmo code),pressure_msl (hPa),surface_pressure (hPa),cloud_cover (%),cloud_cover_low (%),cloud_cover_mid (%),cloud_cover_high (%),et0_fao_evapotranspiration (inch),vapour_pressure_deficit (kPa),wind_speed_10m (mp/h),wind_speed_100m (mp/h),wind_direction_10m (°),wind_direction_100m (°),wind_gusts_10m (mp/h),soil_temperature_0_to_7cm (°F),soil_temperature_7_to_28cm (°F),soil_temperature_28_to_100cm (°F),soil_temperature_100_to_255cm (°F),soil_moisture_0_to_7cm (m³/m³),soil_moisture_7_to_28cm (m³/m³),soil_moisture_28_to_100cm (m³/m³),soil_moisture_100_to_255cm (m³/m³),is_day (),sunshine_duration (s),shortwave_radiation (W/m²),direct_radiation (W/m²),diffuse_radiation (W/m²),direct_normal_irradiance (W/m²),global_tilted_irradiance (W/m²),terrestrial_radiation (W/m²),shortwave_radiation_instant (W/m²),direct_radiation_instant (W/m²),diffuse_radiation_instant (W/m²),direct_normal_irradiance_instant (W/m²),global_tilted_irradiance_instant (W/m²),terrestrial_radiation_instant (W/m²)
0,1957-10-01 00:00:00,77.9,85.0,73.0,86.8,0.0,0.0,0.0,0.0,3,1013.7,1013.4,93.0,62.0,14.0,97.0,0.001,0.49,1.9,4.2,225.0,234.0,9.6,79.6,82.3,82.2,81.1,0.272,0.145,0.094,0.122,0,0.0,0.0,0.0,0.0,0.0,0.0,4.9,0.0,0.0,0.0,0.0,0.0,0.0
1,1957-10-01 01:00:00,78.1,84.0,72.9,85.8,0.035,0.035,0.0,0.0,53,1014.3,1014.0,88.0,45.0,29.0,100.0,0.0,0.52,4.4,5.6,221.0,230.0,13.0,78.9,82.0,82.2,81.1,0.269,0.147,0.094,0.122,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1957-10-01 02:00:00,76.2,88.0,72.3,82.7,0.031,0.031,0.0,0.0,53,1014.6,1014.3,51.0,8.0,24.0,99.0,0.0,0.37,6.6,8.4,225.0,228.0,13.2,78.2,81.8,82.2,81.1,0.281,0.15,0.094,0.122,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1957-10-01 03:00:00,75.6,89.0,72.1,82.4,0.02,0.02,0.0,0.0,53,1014.3,1014.0,52.0,13.0,17.0,100.0,0.0,0.33,5.4,7.2,218.0,219.0,14.3,77.7,81.5,82.2,81.1,0.303,0.153,0.094,0.122,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1957-10-01 04:00:00,75.1,89.0,71.6,81.4,0.016,0.016,0.0,0.0,51,1014.4,1014.1,42.0,5.0,13.0,99.0,0.0,0.33,6.0,8.1,195.0,194.0,14.5,77.4,81.2,82.2,81.1,0.298,0.16,0.094,0.122,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Per-column overview of the weather frame: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586248 entries, 0 to 586247
Data columns (total 45 columns):
 #   Column                                   Non-Null Count   Dtype         
---  ------                                   --------------   -----         
 0   time                                     586248 non-null  datetime64[ns]
 1   temperature_2m (°F)                      586201 non-null  float64       
 2   relative_humidity_2m (%)                 586201 non-null  float64       
 3   dew_point_2m (°F)                        586201 non-null  float64       
 4   apparent_temperature (°F)                586201 non-null  float64       
 5   precipitation (inch)                     586201 non-null  float64       
 6   rain (inch)                              586201 non-null  float64       
 7   snowfall (inch)                          586201 non-null  float64       
 8   snow_depth (ft)                          585624 non-null  float64       
 9   weather_code (wmo code)   

In [4]:
# Descriptive statistics (count, mean, min, quartiles, max, std) of the weather frame.
df.describe()

Unnamed: 0,time,temperature_2m (°F),relative_humidity_2m (%),dew_point_2m (°F),apparent_temperature (°F),precipitation (inch),rain (inch),snowfall (inch),snow_depth (ft),pressure_msl (hPa),surface_pressure (hPa),cloud_cover (%),cloud_cover_low (%),cloud_cover_mid (%),cloud_cover_high (%),et0_fao_evapotranspiration (inch),vapour_pressure_deficit (kPa),wind_speed_10m (mp/h),wind_speed_100m (mp/h),wind_direction_10m (°),wind_direction_100m (°),wind_gusts_10m (mp/h),soil_temperature_0_to_7cm (°F),soil_temperature_7_to_28cm (°F),soil_temperature_28_to_100cm (°F),soil_temperature_100_to_255cm (°F),soil_moisture_0_to_7cm (m³/m³),soil_moisture_7_to_28cm (m³/m³),soil_moisture_28_to_100cm (m³/m³),soil_moisture_100_to_255cm (m³/m³),is_day (),sunshine_duration (s),shortwave_radiation (W/m²),direct_radiation (W/m²),diffuse_radiation (W/m²),direct_normal_irradiance (W/m²),global_tilted_irradiance (W/m²),terrestrial_radiation (W/m²),shortwave_radiation_instant (W/m²),direct_radiation_instant (W/m²),diffuse_radiation_instant (W/m²),direct_normal_irradiance_instant (W/m²),global_tilted_irradiance_instant (W/m²),terrestrial_radiation_instant (W/m²)
count,586248,586201.0,586201.0,586201.0,586201.0,586201.0,586201.0,586201.0,585624.0,586201.0,586201.0,586201.0,586201.0,586201.0,586201.0,586201.0,586201.0,586201.0,586201.0,586201.0,586201.0,586201.0,586201.0,586201.0,586201.0,586201.0,586201.0,586201.0,586201.0,586201.0,586248.0,586201.0,586201.0,586201.0,586201.0,586201.0,586201.0,586248.0,586201.0,586201.0,586201.0,586201.0,586201.0,586248.0
mean,1991-03-10 11:18:57.094199424,73.127858,75.30254,64.511153,75.349972,0.004316,0.004316,1.415897e-07,0.0,1017.429233,1017.073339,35.625642,18.683006,15.532326,37.060822,0.006231,0.682831,9.558863,12.744797,170.515664,170.453757,16.045744,76.554109,76.285698,76.032533,75.827106,0.120117,0.137593,0.127598,0.135617,0.413912,1488.764139,202.738506,138.765222,63.973284,219.207987,202.738506,373.219083,202.464822,138.758894,63.706026,217.431345,201.731276,371.935353
min,1957-10-01 00:00:00,29.7,14.0,12.6,15.1,0.0,0.0,0.0,0.0,979.3,979.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.9,37.8,45.1,56.8,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1974-06-20 17:45:00,68.7,67.0,58.6,67.8,0.0,0.0,0.0,0.0,1014.8,1014.4,10.0,0.0,0.0,0.0,0.001,0.43,6.3,8.6,84.0,84.0,11.2,69.2,70.4,71.1,72.4,0.062,0.093,0.087,0.113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1991-03-10 11:30:00,74.6,78.0,67.1,76.8,0.0,0.0,0.0,0.0,1017.4,1017.0,30.0,8.0,5.0,15.0,0.003,0.63,9.1,12.3,156.0,156.0,15.0,76.7,77.1,76.5,76.1,0.099,0.128,0.119,0.132,0.0,0.0,9.0,1.0,7.0,11.0,9.0,61.7,0.0,0.0,0.0,0.0,0.0,8.1
75%,2007-11-28 05:15:00,79.1,85.0,73.2,85.5,0.0,0.0,0.0,0.0,1020.1,1019.7,53.0,24.0,21.0,85.0,0.01,0.88,12.3,16.3,261.0,260.0,19.9,82.6,82.8,81.6,79.7,0.166,0.173,0.162,0.159,1.0,3600.0,395.0,237.0,122.0,454.7,395.0,786.4,392.6,237.4,123.3,453.7,392.6,804.3
max,2024-08-16 23:00:00,95.3,100.0,82.9,108.4,1.28,1.28,0.055,0.0,1037.0,1036.6,100.0,100.0,100.0,100.0,0.033,4.08,57.2,82.5,360.0,360.0,102.7,127.8,102.5,93.8,86.6,0.403,0.403,0.39,0.257,1.0,3600.0,1040.0,918.0,451.0,991.6,1040.0,1318.0,1038.1,915.6,460.5,991.6,1038.1,1318.3
std,,8.009339,12.447233,10.530224,12.723568,0.019992,0.019992,8.06087e-05,0.0,4.282804,4.263664,29.82085,25.899392,23.08235,40.780586,0.006636,0.340434,4.472709,5.911476,103.964873,103.513666,6.733969,11.079913,8.066305,6.16835,4.229628,0.070571,0.056964,0.051732,0.038761,0.492533,1724.946541,274.933526,211.371133,81.755597,284.191434,274.933526,453.660307,276.76436,212.531633,82.234821,285.115,277.127196,456.552031


#### Data Cleaning

In [None]:
def split_dataset(input_file, output_folder, encoding='utf-8'):
    """
    Split one combined weather export into an hourly and a daily CSV file.

    The first two lines of ``input_file`` are skipped
    (NOTE(review): presumably a metadata preamble - confirm against the raw
    export). The remaining text is divided at blank lines (two consecutive
    newlines): the first section is written to ``<base_name>_hourly.csv`` and
    every remaining section, re-joined with blank lines, is written to
    ``<base_name>_daily.csv``. Both files are created inside ``output_folder``.

    Parameters:
    input_file (str): Path of the CSV file to split
    output_folder (str): Folder that receives the two output files; it is
        created when it does not exist
    encoding (str): Text encoding used to read the input and write the
        outputs. Defaults to UTF-8 so the result does not depend on the
        platform's locale encoding (the column headers contain non-ASCII
        characters such as the degree sign).

    Returns:
    None. Progress and diagnostics are printed. When the content cannot be
    split into at least two sections, a diagnostic is printed and no file is
    written.
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Base name of the input file (without extension), reused for both outputs
    base_name = os.path.splitext(os.path.basename(input_file))[0]

    with open(input_file, 'r', encoding=encoding) as file:
        # Skip the first two lines. The default of None keeps a file shorter
        # than two lines from raising StopIteration, so it reaches the
        # diagnostic branch below instead of crashing here.
        for _ in range(2):
            next(file, None)

        # Read the rest of the file
        data = file.read()

    # Sections are separated by a blank line
    parts = data.split('\n\n')

    if len(parts) < 2:
        print("Error: Could not split the file into hourly and daily parts.")
        print(f"The file contains {len(parts)} part(s) separated by double newlines.")
        print("File structure (after skipping first two rows):")
        for i, part in enumerate(parts, 1):
            print(f"Part {i}:")
            # Show at most the first 200 characters of each section
            preview = part[:200] + "..." if len(part) > 200 else part
            print(preview)
            print()
        return

    hourly_data = parts[0]
    daily_data = '\n\n'.join(parts[1:])  # Join all remaining parts for daily data

    def write_to_csv(data, filename):
        # Re-parse the text as CSV and write it back out row by row.
        # newline='' lets the csv module control line endings itself.
        with open(filename, 'w', newline='', encoding=encoding) as f:
            csv.writer(f).writerows(csv.reader(StringIO(data)))

    # Write hourly data to a CSV file
    hourly_filename = os.path.join(output_folder, f"{base_name}_hourly.csv")
    write_to_csv(hourly_data, hourly_filename)

    # Write daily data to a CSV file
    daily_filename = os.path.join(output_folder, f"{base_name}_daily.csv")
    write_to_csv(daily_data, daily_filename)

    print("Dataset has been split into:")
    print(f"1. '{hourly_filename}'")
    print(f"2. '{daily_filename}'")

# -- Usage --
# Both paths are relative to the notebook's working directory.
input_file = '../data/raw/weather/cape_canaveral_usa.csv'
output_folder = '../data/transformed/weather'  # Specify your desired output folder here

# Split the dataset
# NOTE: the call below is commented out, so running this cell only defines
# the function and the two path variables; no file is read or written.
# split_dataset(input_file, output_folder)

In [None]:
def process_folder(input_folder, output_folder):
    """
    Run split_dataset on every '.csv' file found directly in a folder.

    Parameters:
    input_folder (str): Folder to scan; sub-folders are not descended into
    output_folder (str): Folder handed to split_dataset for the results

    Returns:
    None
    """
    # Ensure input folder path is absolute
    input_folder = os.path.abspath(input_folder)

    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing (and the printed log) reproducible between runs.
    for filename in sorted(os.listdir(input_folder)):
        input_file = os.path.join(input_folder, filename)
        # Skip anything that is not a regular file, e.g. a directory whose
        # name happens to end in '.csv', which could not be opened for reading.
        if filename.endswith('.csv') and os.path.isfile(input_file):
            split_dataset(input_file, output_folder)

# -- Usage --
# Both paths are relative to the notebook's working directory.
input_folder = '../data/raw/weather'
output_folder = '../data/transformed/weather'

# Process all CSV files in the input folder
# NOTE: the call below is commented out, so running this cell only defines
# the function and the two path variables; no file is read or written.
# process_folder(input_folder, output_folder)

In [4]:
def convert_to_float(df, exclude_columns=None):
    """
    Coerce every non-excluded column of a DataFrame to a numeric dtype.

    Each selected column is passed through ``pd.to_numeric`` with
    ``errors='coerce'``, so values that cannot be parsed become NaN. The
    resulting dtype is whatever ``pd.to_numeric`` picks: a column holding only
    whole numbers stays an integer column rather than becoming float.

    The columns are reassigned on the DataFrame that was passed in; the same
    object is returned.

    Parameters:
    df (pandas.DataFrame): The DataFrame to convert (modified in place)
    exclude_columns (list): Column names to leave untouched; None means
        no column is excluded

    Returns:
    pandas.DataFrame: The same DataFrame, with the selected columns coerced
    """
    skipped = [] if exclude_columns is None else exclude_columns

    # Decide up front which columns get converted, then coerce them one by one.
    targets = [name for name in df.columns if name not in skipped]
    for name in targets:
        df[name] = pd.to_numeric(df[name], errors='coerce')

    return df

# -- Usage --
# The timestamp and the weather-code column are left out of the numeric coercion.
non_numeric_columns = ['time', 'weather_code (wmo code)']
df = convert_to_float(df, exclude_columns=non_numeric_columns)

In [None]:
def convert_time_to_datetime(df, time_format='%Y-%m-%dT%H:%M'):
    """
    Convert the 'time' column in the DataFrame to datetime format.

    Values that do not match ``time_format`` are coerced to NaT. When any
    NaT is present after the conversion, a warning is printed together with
    the first few *original* values of the affected rows, so the offending
    input can actually be inspected.

    The column is reassigned on the DataFrame that was passed in; the same
    object is returned.

    Parameters:
    df (pandas.DataFrame): The input DataFrame with a 'time' column
    time_format (str): strftime-style format the values are parsed with.
        Defaults to '%Y-%m-%dT%H:%M'.

    Returns:
    pandas.DataFrame: DataFrame with 'time' column converted to datetime
    """
    # Keep the untouched values: once the column is overwritten, the rows
    # that failed only contain NaT and the original text would be lost.
    raw_time = df['time'].copy()

    df['time'] = pd.to_datetime(raw_time, format=time_format, errors='coerce')

    # Check for any rows where conversion failed
    failed_mask = df['time'].isnull()
    if failed_mask.any():
        print(f"Warning: {int(failed_mask.sum())} rows failed to convert. First few problematic values:")
        print(raw_time[failed_mask].head().to_string())

    return df

# Convert the 'time' column to datetime
df = convert_time_to_datetime(df)

# Preview the 'time' column to verify the conversion. Only the first rows are
# shown: this notebook sets 'display.max_rows' to None, so a bare df['time']
# would render every row of the column.
df['time'].head()