### Data Preparation: Handling Missing and Duplicate Data
# Answering Question: Data Preparation - Handle missing and duplicate data from the project document.

In [4]:
import pandas as pd

# Load Dataset
filepath = "data\Solar.csv"  # Adjust the path if needed

df = pd.read_csv(filepath)
print(f"Dataset Loaded. Shape: {df.shape}")

# Handling Missing Values
# Checking for missing data
missing_summary = df.isnull().sum()
print("Missing Values Summary:\n", missing_summary)

# Dropping columns with more than 50% missing values
threshold = 0.5 * len(df)
df = df.dropna(axis=1, thresh=threshold)

# Filling remaining missing values with the median
df = df.fillna(df.median(numeric_only=True))

# Removing Duplicates
duplicate_count = df.duplicated().sum()
print(f"Found {duplicate_count} duplicate records.")

df = df.drop_duplicates()

# Saving the cleaned dataset
df.to_csv("data\Solar.csv", index=False)
print("Data preprocessing completed and saved as cleaned_solar_pv_data.csv")


Dataset Loaded. Shape: (8760, 8)
Missing Values Summary:
 Date-Hour(NMT)         0
WindSpeed              0
Sunshine               0
AirPressure            0
Radiation              0
AirTemperature         0
RelativeAirHumidity    0
SystemProduction       0
dtype: int64
Found 0 duplicate records.
Data preprocessing completed and saved as cleaned_solar_pv_data.csv


### Seasonal Feature Extraction
# Answering Question: Use date to extract seasonal information (Season, Month, and Day of the month) from the project document.


In [5]:

# Assuming the dataset has a 'date' column
# Convert the date column to datetime format
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'])

    # Extracting month and day
    df['month'] = df['date'].dt.month
    df['day_of_month'] = df['date'].dt.day

    # Defining seasons based on month
    def get_season(month):
        if month in [12, 1, 2]:
            return 'Winter'
        elif month in [3, 4, 5]:
            return 'Spring'
        elif month in [6, 7, 8]:
            return 'Summer'
        else:
            return 'Autumn'

    df['season'] = df['month'].apply(get_season)

    print("Seasonal features extracted successfully.")
else:
    print("No 'date' column found in the dataset.")

# Saving the dataset with seasonal features
df.to_csv("data\Solar.csv", index=False)
print("Seasonal features saved as seasonal_features_solar_pv_data.csv")


No 'date' column found in the dataset.
Seasonal features saved as seasonal_features_solar_pv_data.csv
