In [3]:
import pandas as pd

# Read the CSV from the working directory, keep only columns whose header
# does not start with "Unnamed", then discard every row that still has a
# missing value in any column.
df = (
    pd.read_csv("greenhouse_gas.csv")
    .loc[:, lambda frame: ~frame.columns.str.contains("^Unnamed")]
    .dropna()
)

# One-hot encode the two categorical columns; drop_first=True leaves out the
# first level of each so the dummy columns are not perfectly collinear.
df_encoded = pd.get_dummies(
    df,
    columns=["Industry Name", "Substance"],
    drop_first=True,
)

# Target is the emission-factor column; features are every other column of
# the encoded frame.
target = df_encoded["Supply Chain Emission Factors with Margins"]
features = df_encoded.drop(columns="Supply Chain Emission Factors with Margins")


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# Baseline model: one-hot encode three categorical columns, fit a random
# forest on an 80/20 split, and print hold-out R² and MSE.
from sklearn.metrics import mean_squared_error, r2_score

# Load the data from the working directory, exactly like the other cells in
# this notebook. A machine-specific absolute path makes the cell fail on any
# other machine and breaks Restart & Run All.
df = pd.read_csv("greenhouse_gas.csv")

# Drop columns whose header starts with "Unnamed"
df = df.loc[:, ~df.columns.str.contains("^Unnamed")]

# Drop only the rows whose target is missing; rows with gaps in other
# columns are kept.
df = df.dropna(subset=["Supply Chain Emission Factors with Margins"])

# Define features and target
X = df[["Industry Code", "Industry Name", "Substance"]]
y = df["Supply Chain Emission Factors with Margins"]

# Split the data: 20% held out, fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create pipeline. Every selected feature column is one-hot encoded, so the
# encoder's column list is taken from X instead of being spelled out twice.
# handle_unknown='ignore' makes categories unseen during fit encode as all
# zeros instead of raising at predict time.
pipeline = Pipeline(steps=[
    ("preprocessor", ColumnTransformer(transformers=[
        ("cat", OneHotEncoder(handle_unknown='ignore'), list(X.columns))
    ])),
    ("model", RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train on the training split only
pipeline.fit(X_train, y_train)

# Predict on the held-out split
preds = pipeline.predict(X_test)

# Evaluate
print("R² score:", r2_score(y_test, preds))
print("MSE:", mean_squared_error(y_test, preds))





R² score: 0.30212269647239076
MSE: 0.015234219803773577


In [6]:
import joblib

# Persist the fitted pipeline (preprocessing step and regressor together) so
# it can be reloaded later without retraining.
joblib.dump(value=pipeline, filename="greenhouse_model.pkl")


['greenhouse_model.pkl']

In [7]:
import joblib
# Writes the current `pipeline` object to greenhouse_model.pkl, overwriting
# any existing file of that name.
# NOTE(review): this repeats the dump performed in the previous cell.
joblib.dump(value=pipeline, filename="greenhouse_model.pkl")


['greenhouse_model.pkl']

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
import joblib

# Fit the one-hot + random-forest pipeline on every complete row (no
# hold-out split in this cell) and persist it together with the list of
# input column names.

# Load your dataset
df = pd.read_csv("greenhouse_gas.csv")

# Drop unnamed columns if any
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Drop rows with missing values. Assigning the result instead of using
# inplace=True avoids mutating the frame in place, keeps the step chainable,
# and matches how the other cells in this notebook clean their data.
df = df.dropna()

# Select features and target
features = ["Industry Code", "Industry Name", "Substance"]
target = "Supply Chain Emission Factors with Margins"

X = df[features]
y = df[target]

# Column transformer: one-hot encode all three feature columns; categories
# unseen during fit are encoded as all zeros instead of raising.
preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(handle_unknown="ignore"), features)
])

# Create pipeline
model = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train on the full cleaned dataset
model.fit(X, y)

# Save the model and the raw input column names.
# NOTE(review): the later "Step 6" cell writes the encoder's output feature
# names to the same model_features.pkl, so whichever cell runs last decides
# what that file contains -- confirm which form the consumer expects.
joblib.dump(model, "greenhouse_model.pkl")
joblib.dump(features, "model_features.pkl")


['model_features.pkl']

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import joblib

# Step 1: Load the CSV, strip columns whose header starts with "Unnamed",
# and drop every row containing a missing value.
df = (
    pd.read_csv("greenhouse_gas.csv")
    .loc[:, lambda frame: ~frame.columns.str.contains("^Unnamed")]
    .dropna()
)

# Step 2: Features are the three categorical columns; the target is the
# emission-factor column.
categorical_cols = ["Industry Code", "Industry Name", "Substance"]
X = df[categorical_cols]
y = df["Supply Chain Emission Factors with Margins"]

# Step 3: One-hot encode every feature column; categories unseen at fit time
# are ignored rather than raising.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ]
)

# Step 4: Chain the encoder with a 100-tree random forest (fixed seed).
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(n_estimators=100, random_state=42)),
    ]
)

# Step 5: Hold out 20% of the rows and fit on the remaining 80%.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
pipeline.fit(X_train, y_train)

# Step 6: Persist the fitted pipeline and the encoder's output feature names.
joblib.dump(pipeline, "greenhouse_model.pkl")
joblib.dump(
    pipeline.named_steps["preprocessor"].get_feature_names_out(),
    "model_features.pkl",
)

print("✅ Model trained and saved as 'greenhouse_model.pkl'")


✅ Model trained and saved as 'greenhouse_model.pkl'
