In [2]:
# --- Part 1: Import Libraries and Load Data ---
import pandas as pd
from sklearn.linear_model import LinearRegression

print("✅ Libraries imported successfully!")

# Load the dataset whose emissions column contains artificially missing data.
try:
    # Only the read itself is guarded, so an unrelated error is never
    # mistaken for a missing file.
    df = pd.read_csv('dataset_with_missing_emissions.csv')
except FileNotFoundError:
    print("❌ Error: Make sure 'dataset_with_missing_emissions.csv' is in your CarbonProject folder.")
    # Stop with a non-zero status so the failure is visible to whatever ran
    # this code. The `exit()` helper is intended for the interactive shell
    # and, called without an argument, reports success (status 0).
    raise SystemExit(1) from None
else:
    print(f"✅ Dataset loaded successfully with {len(df)} rows.")


# --- Part 2: Prepare Data by Separating Known and Unknown Emissions ---

# Build one boolean mask marking the rows whose emissions value is absent,
# then use it (and its inverse) to split the dataframe into two copies.
emissions_missing = df['CarbonEmissions'].isna()

# Rows with a known emissions value: these are used to train the model.
train_df = df.loc[~emissions_missing].copy()

# Rows with no emissions value: these are the ones we will predict.
predict_df = df.loc[emissions_missing].copy()

print(f"Separated data: {len(train_df)} rows for training, {len(predict_df)} rows for prediction.")


# --- Part 3: Prepare and Train the Model ---

# Expand 'Industry' into indicator columns; drop_first leaves the first
# category out so it acts as the baseline.
train_df_processed = pd.get_dummies(train_df, columns=['Industry'], drop_first=True)

# The model inputs are 'Revenue' plus every column whose name contains
# 'Industry_'; the value being predicted is 'CarbonEmissions'.
target = 'CarbonEmissions'
industry_columns = [name for name in train_df_processed.columns if 'Industry_' in name]
features = ['Revenue', *industry_columns]

# Slice the encoded training frame into inputs (X) and target (y).
y_train = train_df_processed[target]
X_train = train_df_processed.loc[:, features]

# fit() returns the estimator itself, so construct and train in one step.
model = LinearRegression().fit(X_train, y_train)

print("\n✅ Model training complete!")


# --- Part 4: Prepare Prediction Data and Predict Missing Values ---

# One-Hot Encode the prediction data WITHOUT drop_first. drop_first removes
# whichever category comes first in the frame being encoded; if the
# prediction rows do not contain the same first category as the training
# rows, a different column would be dropped here and those rows would be
# silently encoded as the training baseline industry.
predict_df_processed = pd.get_dummies(predict_df, columns=['Industry'])

# Warn about industries the model was never trained on: they have no dummy
# column in `features`, so their rows end up with all industry indicators 0.
industries_to_predict = predict_df['Industry'].dropna()
unseen_industries = industries_to_predict[
    ~industries_to_predict.isin(train_df['Industry'])
].unique()
if len(unseen_industries) > 0:
    print(f"⚠️ Industries not present in the training data: {list(unseen_industries)}. "
          "Their rows are predicted as the baseline industry.")

# Keep exactly the training feature columns, in the training order.
# Dummy columns missing from the prediction data are added and filled with 0;
# columns that are not model features (including the baseline dummy) are dropped.
X_predict_aligned = predict_df_processed.reindex(columns=features, fill_value=0)

# Use our trained model to PREDICT the emissions.
if len(X_predict_aligned) > 0:
    predicted_emissions = model.predict(X_predict_aligned)
else:
    # Nothing is missing: skip predict(), which rejects an input with no rows,
    # and use an empty float array so the later steps still work.
    predicted_emissions = pd.Series(dtype='float64').to_numpy()

print(f"✅ Predicted emissions for {len(predicted_emissions)} companies.")


# --- Part 5: Fill in the Missing Values ---

# Put the predictions back into our prediction dataframe
predict_df['CarbonEmissions'] = predicted_emissions

# Combine the original training data and the now-completed prediction data.
# Both halves kept the row labels of the loaded dataframe, so sorting by the
# index puts every row back in its original position instead of listing all
# known rows first and all predicted rows last.
final_df = pd.concat([train_df, predict_df]).sort_index()

print("✅ Missing values have been filled with predictions.")


# --- Part 6: Verify the Result ---
# Count the emissions values that are still absent after the fill.
missing_after_fill = final_df['CarbonEmissions'].isna().sum()

print("\n--- Verification ---")
if missing_after_fill != 0:
    print(f"⚠️ Something went wrong. There are still {missing_after_fill} missing values.")
else:
    print("🎉 Success! There are now 0 missing carbon emission values.")

# Write the completed dataset (without the index column) for Week 4.
final_df.to_csv(path_or_buf='companies_with_full_emissions_data.csv', index=False)
print("✅ Final, completed dataset saved to 'companies_with_full_emissions_data.csv'")

✅ Libraries imported successfully!
✅ Dataset loaded successfully with 11000 rows.
Separated data: 7700 rows for training, 3300 rows for prediction.

✅ Model training complete!
✅ Predicted emissions for 3300 companies.
✅ Missing values have been filled with predictions.

--- Verification ---
🎉 Success! There are now 0 missing carbon emission values.
✅ Final, completed dataset saved to 'companies_with_full_emissions_data.csv'
