In [2]:
import gzip
import pandas as pd
import os
import tarfile
import re

## Combining all flight_data .csv files into one, 'flight_data.csv'

In [4]:
# Path to the folder containing CSV files
folder_path = r'C:\Users\hopeh\Desktop\DS_Bootcamp\Flight_times_project\flight_data'

# Collect the CSV file names in sorted order: os.listdir() returns entries in
# arbitrary order, which would make the row order of the combined file differ
# from run to run.
csv_files = sorted(name for name in os.listdir(folder_path) if name.endswith('.csv'))

# Fail with a clear message instead of pandas' "No objects to concatenate".
if not csv_files:
    raise FileNotFoundError(f"No .csv files found in {folder_path}")

# List to hold individual DataFrames
dataframes = []

# Read each CSV file and append it to the list
for filename in csv_files:
    file_path = os.path.join(folder_path, filename)
    df = pd.read_csv(file_path, low_memory=False)
    dataframes.append(df)

# Concatenate all DataFrames in the list into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame to a new CSV file (relative to the working directory)
combined_df.to_csv('flight_data.csv', index=False)

In [7]:
# Reload the combined flight data that the previous cell wrote to disk
df = pd.read_csv('flight_data.csv', low_memory=False)

# df.shape is (rows, columns); pull out each dimension by position
num_rows = df.shape[0]
num_columns = df.shape[1]

# Report the size of the combined dataset
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_columns}")

Number of rows: 13662099
Number of columns: 44


## Read weather data file ghcnd_all.tar.gz

In [3]:
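# # Read the gzip-compressed CSV file into a DataFrame
# # NOTE: this yearly file (2024.csv.gz) has no header row. In the output below the
# # first record ("AE000041196 20240101 TMAX 278 ...") was consumed as the column
# # names, so pass header=None (and explicit names=) if this cell is re-enabled.
# gzip_file_path = r'C:\Users\hopeh\Desktop\DS_Bootcamp\Flight_times_project\weather_data\2024.csv.gz'
# df = pd.read_csv(gzip_file_path, compression='gzip')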

# # Display the first few rows of the DataFrame
# print(df.head(20))

    AE000041196  20240101  TMAX  278 Unnamed: 4 Unnamed: 5  S  Unnamed: 7
0   AE000041196  20240101  TMIN  182        NaN        NaN  S         NaN
1   AE000041196  20240101  PRCP    0          D        NaN  S         NaN
2   AE000041196  20240101  TAVG  236          H        NaN  S         NaN
3   AEM00041194  20240101  TMAX  277        NaN        NaN  S         NaN
4   AEM00041194  20240101  TMIN  208        NaN        NaN  S         NaN
5   AEM00041194  20240101  PRCP    0        NaN        NaN  S         NaN
6   AEM00041194  20240101  TAVG  246          H        NaN  S         NaN
7   AEM00041217  20240101  TMAX  271        NaN        NaN  S         NaN
8   AEM00041217  20240101  TMIN  206        NaN        NaN  S         NaN
9   AEM00041217  20240101  TAVG  238          H        NaN  S         NaN
10  AEM00041218  20240101  TMAX  275        NaN        NaN  S         NaN
11  AEM00041218  20240101  TMIN  179        NaN        NaN  S         NaN
12  AEM00041218  20240101  TAVG  221  

In [11]:

# Path to weather data file: ghcnd_all.tar.gz
file_path = r'C:\Users\hopeh\Desktop\DS_Bootcamp\Flight_times_project\weather_data\ghcnd_all.tar.gz'
extract_path = r'C:\Users\hopeh\Desktop\DS_Bootcamp\Flight_times_project\weather_data\ghcnd_all'

# Extract the tar.gz file.
# Where the interpreter supports extraction filters, use the 'data' filter so
# archive members cannot escape extract_path (absolute paths, '..', links to
# outside targets) or create special files. It also avoids the
# DeprecationWarning that Python 3.12+ emits when no filter is given.
# Interpreters without filter support fall back to the plain call.
with tarfile.open(file_path, 'r:gz') as tar:
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(path=extract_path, filter='data')
    else:
        tar.extractall(path=extract_path)

print(f"Files extracted to {extract_path}")


Files extracted to C:\Users\hopeh\Desktop\DS_Bootcamp\Flight_times_project\weather_data\ghcnd_all


## Convert weather data to .csv file

In [None]:
import pandas as pd
import os

data_dir = "C:/Users/hopeh/Desktop/DS_Bootcamp/Flight_times_project/weather_data/ghcnd_all/ghcnd_all"

# Inclusive date window of observations to keep
start_date = pd.to_datetime('2022-05-01')
end_date = pd.to_datetime('2024-04-30')

# List all .dly files in the directory, sorted so the output order is
# reproducible (os.listdir() order is arbitrary). A missing directory yields an
# empty list and a message instead of an exception.
if os.path.isdir(data_dir):
    all_dly_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.dly'))
else:
    print(f"Data directory not found: {data_dir}")
    all_dly_files = []

# Fixed-width layout of one .dly record, per the GHCN-Daily readme:
#   ID (11 chars), YEAR (4), MONTH (2), ELEMENT (4), followed by 31 per-day
#   groups of VALUE (5), MFLAG (1), QFLAG (1), SFLAG (1).
# Names and widths are interleaved day by day so they line up with the file.
colnames = ["ID", "YEAR", "MONTH", "ELEMENT"] + [
    f"{field}{day}"
    for day in range(1, 32)
    for field in ("VALUE", "MFLAG", "QFLAG", "SFLAG")
]
col_widths = [11, 4, 2, 4] + [5, 1, 1, 1] * 31

# Elements exported as output columns, and the sentinel the format uses for
# "no observation on this day".
_ELEMENTS = ["TMAX", "TMIN", "TAVG", "PRCP", "SNOW", "AWND"]
_MISSING_VALUE = -9999


def process_file(file_path):
    """Parse one GHCN-Daily ``.dly`` file into one row per station and day.

    Only records whose ELEMENT is one of TMAX, TMIN, TAVG, PRCP, SNOW or AWND
    and whose date lies within ``start_date``..``end_date`` (inclusive) are
    kept. Values are the raw integers stored in the file (no unit conversion);
    the -9999 missing-value sentinel and blank fields are skipped.

    NOTE(review): values are kept regardless of QFLAG; confirm whether
    observations that failed quality checks should be dropped.

    Parameters
    ----------
    file_path : str
        Path to the ``.dly`` file.

    Returns
    -------
    pandas.DataFrame
        Columns STATION, DATE ('YYYY-MM-DD'), TMAX, TMIN, TAVG, PRCP, SNOW,
        AWND, sorted by station and date. Elements not reported for a day are
        left missing. An empty DataFrame is returned when the file cannot be
        parsed or holds nothing inside the window.
    """
    print(f"Processing file: {file_path}")
    output_columns = ['STATION', 'DATE'] + _ELEMENTS

    try:
        data = pd.read_fwf(file_path, widths=col_widths, names=colnames,
                           header=None, skip_blank_lines=True)
    except (pd.errors.ParserError, pd.errors.EmptyDataError) as e:
        print(f"ParserError: {e}")
        return pd.DataFrame()  # Return an empty DataFrame if there's a parsing error

    if data.empty:
        return pd.DataFrame(columns=output_columns)

    print(f"File {file_path} loaded, processing rows...")

    # Coerce the numeric fields explicitly so a malformed line becomes NaN
    # (and is skipped) instead of turning a whole column into strings.
    value_cols = [f"VALUE{day}" for day in range(1, 32)]
    numeric = data[['YEAR', 'MONTH'] + value_cols].apply(pd.to_numeric, errors='coerce')

    # Cheap vectorised pre-filter: keep only the wanted elements and only the
    # months that can overlap the date window. Files span many decades, so this
    # removes almost every row before the per-day Python loop below.
    first_key = start_date.year * 12 + start_date.month
    last_key = end_date.year * 12 + end_date.month
    month_key = numeric['YEAR'] * 12 + numeric['MONTH']
    keep = (
        data['ID'].notna()
        & data['ELEMENT'].isin(_ELEMENTS)
        & numeric['MONTH'].between(1, 12)
        & month_key.between(first_key, last_key)
    )
    selected = pd.concat([data.loc[keep, ['ID', 'ELEMENT']], numeric.loc[keep]], axis=1)

    # (station, date) -> output record; each element record of the same day
    # fills its own column of the shared row.
    by_day = {}
    for rec in selected.to_dict('records'):
        station_id = rec['ID']
        element = rec['ELEMENT']
        year = int(rec['YEAR'])
        month = int(rec['MONTH'])

        # Only visit days that exist in this month; slots such as Feb 30 are
        # padding in the fixed-width record.
        days_in_month = pd.Timestamp(year=year, month=month, day=1).days_in_month
        for day in range(1, days_in_month + 1):
            value = rec[f"VALUE{day}"]
            if pd.isna(value) or value == _MISSING_VALUE:
                continue

            date = pd.Timestamp(year=year, month=month, day=day)
            if not (start_date <= date <= end_date):
                continue

            key = (station_id, date)
            if key not in by_day:
                record = {'STATION': station_id, 'DATE': date.strftime('%Y-%m-%d')}
                for name in _ELEMENTS:
                    record[name] = None
                by_day[key] = record
            by_day[key][element] = int(value)

    records = [by_day[key] for key in sorted(by_day)]
    return pd.DataFrame(records, columns=output_columns)

# Run every station file through process_file, keeping only non-empty results
all_data = []
for file in all_dly_files:
    df = process_file(os.path.join(data_dir, file))
    if df.empty:
        continue
    all_data.append(df)

# Write one combined CSV, or report that nothing fell inside the date window
if not all_data:
    print("No data to save.")
else:
    all_data_combined = pd.concat(all_data, ignore_index=True)
    all_data_combined.to_csv('ghcn_daily_filtered.csv', index=False)
