In [1]:
import pandas as pd
import os
import numpy as np

Combine all files in weather_data folder into one .csv file titled combined_weather.csv

In [3]:

# Define the folder containing the CSV files and the output file path
input_folder = r'C:\Users\hopeh\Desktop\data_science_bootcamp\flight_times_capstone\weather_data'
output_file = r'C:\Users\hopeh\Desktop\data_science_bootcamp\flight_times_capstone\combined_weather.csv'


# List to hold dataframes
dataframes = []

# Iterate over all files in the folder
for file_name in os.listdir(input_folder):
    if file_name.endswith('.csv'):
        file_path = os.path.join(input_folder, file_name)
        # Read the CSV file
        df = pd.read_csv(file_path, low_memory=False)
        # Add an ID column with the filename as the ID
        df['ID'] = file_name
        # Append the dataframe to the list
        dataframes.append(df)

# Concatenate all dataframes
combined_df = pd.concat(dataframes, ignore_index=True, sort=False)

# Save the combined dataframe to a new CSV file
combined_df.to_csv(output_file, index=False)


In [4]:
# Define the path to the combined CSV file
combined_weather_path = r'C:\Users\hopeh\Desktop\data_science_bootcamp\flight_times_capstone\combined_weather.csv'

# Read the CSV file into a DataFrame
weather_df = pd.read_csv(combined_weather_path, low_memory=False)

# Display the first few rows of the DataFrame
print(weather_df.head())

ParserError: Error tokenizing data. C error: out of memory

In [None]:
# Display the column names
print(weather_df.columns.tolist())

In [None]:
# Display information about the DataFrame
print(weather_df.info())

In [None]:
# Reduce memory usage

# Convert float64 to float32
float_cols = weather_df.select_dtypes(include=['float64']).columns
weather_df[float_cols] = weather_df[float_cols].astype('float32')


In [None]:
# Check for duplicates

# Find all duplicate rows where all cells are identical
duplicate_rows = weather_df[weather_df.duplicated(keep=False)]

# Display the first 10 unique duplicate rows
unique_duplicate_rows = duplicate_rows.drop_duplicates()
print(unique_duplicate_rows.head(10))

# Count the number of unique duplicate rows
unique_duplicate_count = unique_duplicate_rows.shape[0]
print(f"Total unique duplicate rows: {unique_duplicate_count}")

In [None]:
# Reformat column titles

# Convert all column names to lowercase
weather_df.columns = weather_df.columns.str.lower()

# # Rename specific columns if needed
# weather_df.rename(columns={
#     'fl_date': 'date', 
#     'origin_city_name': 'origin_city',
#     'dest_city_name' : 'dest_city'
# }, inplace=True)

In [None]:
# Display column names
print(weather_df.columns)

In [None]:
# Check to see difference
print(weather_df.info())

In [None]:
# Check for missing values
print(weather_df.isnull().sum())

In [None]:
# # Define the missing value threshold to determine which attributes to remove (20%)
# threshold = 0.20

# # Calculate the proportion of missing values
# missing_proportion = combined_weather_df.isnull().mean()

# # Calculate the proportion of zero values
# zero_proportion = (combined_weather_df == 0).mean()

# # Filter columns where missing or zero values are more than the threshold
# high_missing_cols = missing_proportion[missing_proportion > threshold].index
# high_zero_cols = zero_proportion[zero_proportion > threshold].index

# # Print the results
# print("Columns with more than 20% missing values:")
# print(high_missing_cols)

# print("\nColumns with more than 20% zero values:")
# print(high_zero_cols)

# # Combine both lists to see columns with either high missing or zero values
# high_issue_cols = set(high_missing_cols).union(set(high_zero_cols))
# print("\nColumns with either high missing or zero values:")
# print(high_issue_cols)

In [None]:
# Calculate the percentage of NaN values for each column
nan_percentage = weather_df.isna().mean() * 100

# Adjust pandas display options
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', None)     # Show all rows

# Print the results
print(nan_percentage)


In [None]:
# Select the subset of columns to KEEP
columns_to_keep = ['station', 'id', 'name', 'latitude', 'longitude', 'elevation', 'date', 'prcp', 'snow', 'snwd', 'tmax', 'tmin', 'tobs']
weather_df = weather_df[columns_to_keep]


In [None]:
# # Drop Columns that provide no useful information for analysis (e.g., unique
# # identifiers if not needed)

# columns_to_drop = [
#     'origin_city_market_id',
#     'dest_city_market_id',
#     'origin',
#     'dest',
#     'origin_state_nm',
#     'dest_state_nm', 
#     'origin_airport_id',
#     'dest_airport_id'
# ]

# # Drop the identified columns
# flight_data = flight_data.drop(columns=columns_to_drop)

# # Verify the changes
# print(flight_data.info())

In [None]:
# Calculate the number of NaN values per row
num_nans_per_row = weather_df.isnull().sum(axis=1)

# Calculate the total number of columns in the DataFrame
total_columns = weather_df.shape[1]

# Calculate the proportion of NaN values per row
nan_proportion_per_row = num_nans_per_row / total_columns

In [None]:
# Define the threshold (62%)
threshold = 0.62

# Identify rows where the proportion of NaN values is more than the threshold
rows_with_high_nan = weather_df[nan_proportion_per_row > threshold]

# Save the rows with high NaN proportions to a new CSV file
rows_with_high_nan.to_csv(r'C:\Users\hopeh\Desktop\data_science_bootcamp\flight_times_capstone\rows_with_high_nan.csv', index=False)

In [None]:
# Display the number of rows with more than 62% NaN values
print(f"Number of rows with more than 62% NaN values: {len(rows_with_high_nan)}")

In [None]:
# Read the CSV file into a DataFrame
rows_w_high_nan_df = pd.read_csv(r'C:\Users\hopeh\Desktop\data_science_bootcamp\flight_times_capstone\rows_with_high_nan.csv', low_memory=False)

# Display the first few rows of the DataFrame
print(rows_w_high_nan_df.head())

In [None]:
# Display the column names
print(rows_w_high_nan_df.columns.tolist())

In [None]:
# Define the columns to check for NaN
columns_to_check = ['latitude', 'longitude', 'elevation', 'prcp', 'snow', 'snwd', 'tmax', 'tmin', 'tobs']

# Remove rows where all specified columns are NaN
weather_df = weather_df.dropna(subset=columns_to_check, how='all')

In [None]:
# Split the 'name' column into 'city' and 'state'
weather_df[['city', 'state']] = weather_df['name'].str.split(',', n=1, expand=True)

# Clean up the 'city' and 'state' columns
weather_df.loc[:, 'city'] = weather_df['city'].str.replace(r'\d+\.?\d*\s+[NSEW]+', '', regex=True).str.strip()
weather_df.loc[:, 'state'] = weather_df['state'].str.extract(r'([A-Z]{2})')[0]

# Display the updated DataFrame
print(weather_df.head())

In [None]:
# Check the shape of the cleaned DataFrame
print(f'DataFrame shape: {weather_df.shape}')


In [None]:
# Drop the name column
weather_df.drop(columns=['name'], inplace=True)

In [None]:
# Display the updated DataFrame
print(weather_df.head())

In [None]:
# Define the date range
start_date = '2023-04-30'
end_date = '2024-04-30'

# Filter the DataFrame
weather_df = weather_df[(weather_df['date'] >= start_date) & (weather_df['date'] <= end_date)]


In [None]:
print(weather_df.head())

In [None]:
# Convert all string values in the DataFrame to lowercase using apply
weather_df = weather_df.apply(lambda col: col.str.lower() if col.dtype == "object" else col)

# Check the updated DataFrame
print(weather_df.head())


In [None]:
# save the cleaned DataFrame to a new CSV file
weather_df.to_csv('cleaned_weather_data.csv', index=False)

In [None]:
# Display the column names
print(weather_df.columns)