In [1]:
import pandas as pd
import numpy as np

# --- 1. Load the Clean Dataset ---
# The file name is bound before the `try` so the error message below can
# always reference it.
file_name = 'company_esg_financial_dataset.csv'
try:
    df = pd.read_csv(file_name)
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() ends a plain script with a
    # success status, and in a notebook it asks the kernel to shut down rather
    # than failing the cell with a readable traceback. Re-raising stops the
    # run right here and keeps the original error chained for debugging.
    raise FileNotFoundError(
        f"❌ Error: Make sure '{file_name}' is in your CarbonProject folder."
    ) from err

# Only reached when the read succeeded.
print(f"✅ Original dataset '{file_name}' loaded successfully with {len(df)} rows.")


# --- 2. Artificially Remove Data ---

# Column to blank out, the share of rows to affect, and the sampling seed.
# Kept as named constants so the experiment is easy to re-tune and reproduce.
TARGET_COLUMN = 'CarbonEmissions'
MISSING_FRACTION = 0.30
RANDOM_SEED = 42

# Fail fast if the column is absent: assigning through .loc to a column label
# that does not exist would silently CREATE a new all-NaN column and leave the
# real data untouched, while the script still reported success.
if TARGET_COLUMN not in df.columns:
    raise KeyError(
        f"Column '{TARGET_COLUMN}' not found in the dataset; "
        f"available columns: {list(df.columns)}"
    )

# Randomly pick MISSING_FRACTION of the rows; a fixed random_state makes the
# same rows get selected on every run.
rows_to_remove = df.sample(frac=MISSING_FRACTION, random_state=RANDOM_SEED).index

# Use .loc to select those rows in the target column and set them to NaN
# (Not a Number), which is Pandas' standard marker for missing data.
df.loc[rows_to_remove, TARGET_COLUMN] = np.nan

print(f"✅ Removed emissions data from {len(rows_to_remove)} random rows.")


# --- 3. Save the New Dataset ---
# Write the modified frame to its own CSV so the original file stays intact;
# index=False keeps the row index out of the output.
new_file_name = 'dataset_with_missing_emissions.csv'
df.to_csv(new_file_name, index=False)

# Confirmation: a blank line, then two message lines.
print(
    f"\n🎉 Success! A new file named '{new_file_name}' has been created.",
    "Please use THIS file for your Week 3 model building.",
    sep="\n",
)

# --- 4. Verification (Optional but Recommended) ---
# Re-read the file that was just written and count the missing emission values.
print("\n--- Verifying the new file ---")
new_df = pd.read_csv(new_file_name)
missing_count = new_df['CarbonEmissions'].isnull().sum()
print(f"The new file now has {missing_count} missing emission values.")

# Actually verify, rather than only report: the count read back from disk must
# match the count in the in-memory frame that was saved. A mismatch means the
# CSV round trip did not preserve the missing-value pattern.
expected_missing = df['CarbonEmissions'].isnull().sum()
assert missing_count == expected_missing, (
    f"Expected {expected_missing} missing emission values in "
    f"'{new_file_name}', found {missing_count}."
)

✅ Original dataset 'company_esg_financial_dataset.csv' loaded successfully with 11000 rows.
✅ Removed emissions data from 3300 random rows.

🎉 Success! A new file named 'dataset_with_missing_emissions.csv' has been created.
Please use THIS file for your Week 3 model building.

--- Verifying the new file ---
The new file now has 3300 missing emission values.
