In [8]:
import pandas as pd
import numpy as np

# List of CSV files provided by the user
csv_files = [
    'data/glass.csv',
    'data/new-thyroid.csv',
    'data/pima-indians-diabetes.csv',
     'data/pollution.csv'
]

# Dictionary to store DataFrames
dataframes = {}

for file_name in csv_files:
    try:
        # Load the CSV file into a DataFrame
        df = pd.read_csv(file_name)
        dataframes[file_name] = df

        print(f"\n--- Exploratory Data Analysis for {file_name} ---")

        # Display the first 5 rows
        print("\nHead of the DataFrame:")
        print(df.head())

        # Display column information (data types, non-null counts)
        print("\nDataFrame Info:")
        df.info()

        # Display descriptive statistics
        print("\nDescriptive Statistics:")
        print(df.describe())

    except Exception as e:
        print(f"Error loading or processing {file_name}: {e}")


--- Exploratory Data Analysis for data/glass.csv ---

Head of the DataFrame:
   1.52101  13.64  4.49  1.10  71.78  0.06  8.75  0.00  0.00.1  1
0  1.51761  13.89  3.60  1.36  72.73  0.48  7.83   0.0    0.00  1
1  1.51618  13.53  3.55  1.54  72.99  0.39  7.78   0.0    0.00  1
2  1.51766  13.21  3.69  1.29  72.61  0.57  8.22   0.0    0.00  1
3  1.51742  13.27  3.62  1.24  73.08  0.55  8.07   0.0    0.00  1
4  1.51596  12.79  3.61  1.62  72.97  0.64  8.07   0.0    0.26  1

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213 entries, 0 to 212
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   1.52101  213 non-null    float64
 1   13.64    213 non-null    float64
 2   4.49     213 non-null    float64
 3   1.10     213 non-null    float64
 4   71.78    213 non-null    float64
 5   0.06     213 non-null    float64
 6   8.75     213 non-null    float64
 7   0.00     213 non-null    float64
 8   0.00.1   213 non-nu

In [7]:
# Load the pollution dataset
pollution_df = pd.read_csv('data/pollution.csv')

print("--- Feature Engineering for pollution.csv ---")

# 1. Handle missing pm2.5 values: Impute with the mean
# It's important to do this before creating new features that might depend on pm2.5,
# or if pm2.5 itself is a target or feature.
pollution_df['pm2.5'] = pollution_df['pm2.5'].fillna(pollution_df['pm2.5'].mean())
print(f"\nMissing values in 'pm2.5' after imputation: {pollution_df['pm2.5'].isnull().sum()}")

# 2. Combine date/time columns into a single datetime column
pollution_df['datetime'] = pd.to_datetime(pollution_df[['year', 'month', 'day', 'hour']])

# 3. Extract new features from the datetime column
pollution_df['day_of_week'] = pollution_df['datetime'].dt.dayofweek # Monday=0, Sunday=6
pollution_df['day_of_year'] = pollution_df['datetime'].dt.dayofyear
pollution_df['week_of_year'] = pollution_df['datetime'].dt.isocalendar().week.astype(int) # Using isocalendar for week number
pollution_df['quarter'] = pollution_df['datetime'].dt.quarter
pollution_df['is_weekend'] = (pollution_df['day_of_week'] >= 5).astype(int) # 1 for weekend, 0 for weekday
pollution_df['hour_sin'] = np.sin(2 * np.pi * pollution_df['hour']/23.0) # Sine transformation for cyclical hour
pollution_df['hour_cos'] = np.cos(2 * np.pi * pollution_df['hour']/23.0) # Cosine transformation for cyclical hour

# You might want to drop the original year, month, day, hour columns if 'datetime' replaces them entirely
# pollution_df = pollution_df.drop(columns=['year', 'month', 'day', 'hour'])

# Display the head and info of the DataFrame with new features
print("\nDataFrame head with new features:")
print(pollution_df.head())

print("\nDataFrame info with new features:")
pollution_df.info()

# Save the transformed dataframe for potential future use (optional)
# pollution_df.to_csv('pollution_engineered.csv', index=False)

--- Feature Engineering for pollution.csv ---

Missing values in 'pm2.5' after imputation: 0

DataFrame head with new features:
   No  year  month  day  hour      pm2.5  DEWP  TEMP    PRES cbwd  ...  Is  \
0   1  2010      1    1     0  98.613215   -21 -11.0  1021.0   NW  ...   0   
1   2  2010      1    1     1  98.613215   -21 -12.0  1020.0   NW  ...   0   
2   3  2010      1    1     2  98.613215   -21 -11.0  1019.0   NW  ...   0   
3   4  2010      1    1     3  98.613215   -21 -14.0  1019.0   NW  ...   0   
4   5  2010      1    1     4  98.613215   -20 -12.0  1018.0   NW  ...   0   

   Ir            datetime day_of_week  day_of_year  week_of_year  quarter  \
0   0 2010-01-01 00:00:00           4            1            53        1   
1   0 2010-01-01 01:00:00           4            1            53        1   
2   0 2010-01-01 02:00:00           4            1            53        1   
3   0 2010-01-01 03:00:00           4            1            53        1   
4   0 2010-01-01 0