In [2]:

# STEP 1: Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Inline plotting and styling
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" style to the plots created below.
sns.set(style="whitegrid")


# STEP 2: Load CSV and Assign Headers
#
# Reads the data file, keeps the first len(column_names) columns by position,
# and labels them with column_names. Any failure is re-raised as RuntimeError.

file_path = "sarp-mrg1_p3b_20240626_RA_20250709T180453.csv"
column_names = [
    'Time', 'Latitude', 'Longitude', 'Altitude',
    'Organics', 'Nitrates', 'Sulfates', 'Ammonium', 'Chloride'
]

# Number of leading lines skipped before the data rows begin.
# NOTE(review): presumably the file's metadata/header block -- confirm against the file.
header_line_count = 46

# NOTE(review): columns are named purely by position, so this assumes the file's
# first nine columns are in the same order as column_names -- confirm.
# NOTE(review): no `na_values` is passed, so if the file marks missing
# measurements with a sentinel number it is NOT converted to NaN here -- confirm.
try:
    aero_df = pd.read_csv(file_path, skiprows=header_line_count, header=None)

    # Fail with a clear message instead of pandas' generic length-mismatch
    # error when the file has fewer columns than expected. The ValueError is
    # caught below and wrapped like every other load failure.
    found_columns = aero_df.shape[1]
    if found_columns < len(column_names):
        raise ValueError(
            f"expected at least {len(column_names)} columns, found {found_columns}"
        )

    # .copy() detaches the slice from the full frame so later column
    # assignments operate on an independent DataFrame.
    aero_df = aero_df.iloc[:, :len(column_names)].copy()
    aero_df.columns = column_names
    print(" CSV loaded successfully with custom headers.")
except Exception as e:
    # `from e` keeps the original exception as the explicit cause in the traceback.
    raise RuntimeError(f" Failed to load or parse file: {e}") from e


# STEP 3: Preview Dataset
#
# Show the first rows, the frame's dimensions, its column labels, and the
# per-column dtype / non-null summary.

preview_rows = aero_df.head()
frame_shape = aero_df.shape
column_list = aero_df.columns.tolist()

print("\n Preview Data:")
display(preview_rows)
print("\n Dataset shape:", frame_shape)
print("\n Columns:", column_list)
aero_df.info()


# STEP 4: Clean Column Names & Duplicates

# Strip leading/trailing whitespace from the column labels.
aero_df.columns = aero_df.columns.str.strip()

# Count fully duplicated rows before dropping them so the number can be reported.
duplicate_count = aero_df.duplicated().sum()
# Use "\n" here: the previous "\ " was an invalid escape sequence that printed
# a literal backslash, while the notebook's other reports start with "\n".
print(f"\n Duplicate rows found: {duplicate_count}")
aero_df = aero_df.drop_duplicates()


# STEP 5: Handle Missing Values
#
# Report null counts, drop rows where every aerosol column is null, then fill
# the remaining nulls in each aerosol column with that column's median.

print("\n Missing values per column:")
missing_per_column = aero_df.isnull().sum()
print(missing_per_column)

aerosol_cols = ['Organics', 'Nitrates', 'Sulfates', 'Ammonium', 'Chloride']

# how='all' removes a row only when all of the aerosol columns are missing.
aero_df = aero_df.dropna(how='all', subset=aerosol_cols)

for col in aerosol_cols:
    # Columns without any nulls are left untouched.
    if not aero_df[col].isnull().any():
        continue
    aero_df[col] = aero_df[col].fillna(aero_df[col].median())


# STEP 6: Parse Time Column
#
# Convert 'Time' to datetimes. Values that cannot be parsed become NaT
# (errors='coerce'); the number of such values is reported instead of being
# silently hidden. If the conversion itself fails, the column is left
# unchanged and the reason is printed.
#
# NOTE(review): pd.to_datetime treats plain numeric input as nanoseconds since
# the Unix epoch unless `unit=` is given -- confirm how this file encodes time.

try:
    parsed_time = pd.to_datetime(aero_df['Time'], errors='coerce')
except Exception as exc:
    # Best-effort step: keep the notebook running, but say why parsing failed.
    # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    print(f" Time column could not be parsed: {exc}")
else:
    # Values that were present before parsing but are NaT afterwards were coerced.
    coerced_count = int((parsed_time.isna() & aero_df['Time'].notna()).sum())
    aero_df['Time'] = parsed_time
    if coerced_count:
        print(f" {coerced_count} Time value(s) could not be parsed and were set to NaT.")


# STEP 7: Descriptive Stats
#
# Display the summary table produced by DataFrame.describe().

summary_stats = aero_df.describe()

print("\n Summary Statistics:")
display(summary_stats)


# STEP 8: Correlation Matrix
#
# Annotated heatmap of the pairwise correlations between the aerosol columns.

aerosol_corr = aero_df[aerosol_cols].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(aerosol_corr, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Between Aerosol Components")
fig.tight_layout()
plt.show()


# STEP 9: Aerosol vs Altitude
#
# One scatter plot per aerosol column, with Altitude on the x-axis.

for col in aerosol_cols:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(data=aero_df, x='Altitude', y=col, alpha=0.6, ax=ax)
    ax.set_title(f"{col} Concentration vs Altitude")
    ax.set_xlabel("Altitude (m)")
    ax.set_ylabel(f"{col} Concentration")
    ax.grid(True)
    fig.tight_layout()
    plt.show()


# STEP 10: Flight Path Colored by Organics
#
# Longitude/Latitude scatter where each point's color is the value of the
# column named by color_var.

color_var = 'Organics'

fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(
    aero_df['Longitude'],
    aero_df['Latitude'],
    c=aero_df[color_var],
    cmap='viridis',
    alpha=0.8,
)
fig.colorbar(scatter, ax=ax, label=f'{color_var} Concentration')
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.set_title(f"Flight Path Colored by {color_var}")
ax.grid(True)
fig.tight_layout()
plt.show()


# STEP 11: Time-Series Trend (if Time is valid)
#
# One line plot per aerosol column against Time. Rows whose Time is missing
# (NaN/NaT) are left out of the plots rather than disabling the whole step,
# and the notebook reports how many rows were skipped.

has_valid_time = aero_df['Time'].notna()
timed_df = aero_df.loc[has_valid_time]
skipped_rows = int((~has_valid_time).sum())

if timed_df.empty:
    print(" No valid Time values; skipping time-series plots.")
else:
    if skipped_rows:
        print(f" Skipping {skipped_rows} row(s) with invalid Time in time-series plots.")
    for col in aerosol_cols:
        fig, ax = plt.subplots(figsize=(12, 4))
        ax.plot(timed_df['Time'], timed_df[col], label=col)
        ax.set_title(f"{col} Trend Over Time")
        ax.set_xlabel("Time")
        ax.set_ylabel(f"{col} Concentration")
        ax.grid(True)
        fig.tight_layout()
        plt.show()


# STEP 12: Boxplots for Outlier Detection
#
# Side-by-side boxplots, one per aerosol column.

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=aero_df[aerosol_cols], ax=ax)
ax.set_title("Boxplots of Aerosol Components")
ax.set_ylabel("Concentration")
ax.grid(True)
fig.tight_layout()
plt.show()


# STEP 13: Pairplot of Aerosol Variables
#
# Lower-triangle (corner=True) grid of pairwise plots for the aerosol columns.

aerosol_only = aero_df[aerosol_cols]
sns.pairplot(aerosol_only, corner=True)

# The title and layout are applied to the current figure, i.e. the one
# pairplot has just created.
pair_fig = plt.gcf()
pair_fig.suptitle("Pairwise Relationships Between Aerosol Components", y=1.02)
pair_fig.tight_layout()
plt.show()


# STEP 14: Save Cleaned Data
#
# Write the cleaned frame to CSV without the index column.

# Single source of truth for the output file name, so the file written and the
# confirmation message cannot drift apart.
cleaned_output_path = "Aerosol_Cleaned_Output.csv"

aero_df.to_csv(cleaned_output_path, index=False)
print(f" Cleaned data saved to '{cleaned_output_path}'")


# STEP 15: Display Final Frame (for Review)
#
# Show the first rows of the cleaned frame.

print("\n Final cleaned dataset preview:")
# Use display() rather than print() so the frame is rendered the same way as
# the earlier previews in this notebook.
display(aero_df.head())


RuntimeError:  Failed to load or parse file: [Errno 2] No such file or directory: 'sarp-mrg1_p3b_20240626_RA_20250709T180453.csv'