In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import PoissonRegressor


In [None]:
# Load the training and validation splits.
# Forward slashes are used on purpose: inside a regular string literal "\t" is
# a TAB and "\v" a vertical tab, so a backslash-separated path such as
# "data_given\train.csv" does not name the intended file. Forward slashes are
# accepted on Windows as well as POSIX systems.
train = pd.read_csv("data_given/train.csv")  # read-in training dataset
validation = pd.read_csv("data_given/validation.csv")  # read-in validation dataset

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Fit a Poisson regression on the training split and report MAE / MSE on the
# validation split.

# Load the training and validation datasets
train_data = pd.read_csv("data_given/train.csv")
validation_data = pd.read_csv("data_given/validation.csv")

# Preprocessing: Drop irrelevant columns and separate features and target.
# "claim_amount" is listed here because it is the target and must not leak
# into the feature matrix.
irrelevant_columns = ["Unnamed: 0", "id_policy", "claim_amount"]
categorical_columns = ["pol_pay_freq", "pol_usage", "drv_sex1", "vh_fuel", "vh_type"]

# Features and target for training data
X_train = train_data.drop(columns=irrelevant_columns)
y_train = train_data["claim_amount"]

# Features and target for validation data
X_validation = validation_data.drop(columns=irrelevant_columns)
y_validation = validation_data["claim_amount"]

# Convert categorical columns to numeric using one-hot encoding
X_train = pd.get_dummies(X_train, columns=categorical_columns, drop_first=True)
X_validation = pd.get_dummies(X_validation, columns=categorical_columns, drop_first=True)

# Align the training and validation datasets (ensure matching feature columns).
# Dummy columns seen only in training are added to validation filled with 0;
# columns seen only in validation are discarded.
X_validation = X_validation.reindex(columns=X_train.columns, fill_value=0)

# Convert binary columns like "Yes"/"No" to numeric.
# NOTE(review): Series.map yields NaN for any value other than "No"/"Yes" —
# confirm the column only holds those two labels.
binary_columns = ["pol_payd"]
for col in binary_columns:
    X_train[col] = X_train[col].map({"No": 0, "Yes": 1})
    X_validation[col] = X_validation[col].map({"No": 0, "Yes": 1})

# Convert the target to numeric (ensuring it's suitable for Poisson regression).
# NOTE(review): Poisson regression also requires a non-negative target.
y_train = pd.to_numeric(y_train, errors='coerce')
y_validation = pd.to_numeric(y_validation, errors='coerce')

# errors='coerce' replaces unparseable targets with NaN, and neither
# PoissonRegressor.fit nor the sklearn metrics accept NaN in y. Drop those rows
# from X and y together so they stay aligned. Positional boolean masks are used
# so the filtering does not depend on index labels.
train_has_target = y_train.notna().to_numpy()
validation_has_target = y_validation.notna().to_numpy()
n_train_dropped = int((~train_has_target).sum())
n_validation_dropped = int((~validation_has_target).sum())
if n_train_dropped or n_validation_dropped:
    print(
        f"Dropped rows with a missing/non-numeric claim_amount: "
        f"{n_train_dropped} train, {n_validation_dropped} validation"
    )
X_train, y_train = X_train[train_has_target], y_train[train_has_target]
X_validation, y_validation = (
    X_validation[validation_has_target],
    y_validation[validation_has_target],
)

# Initialize the Poisson Regressor (alpha is the L2 penalty strength)
poisson_model = PoissonRegressor(alpha=0.5, max_iter=1000)

# Train the model
poisson_model.fit(X_train, y_train)

# Predict on the validation data
y_pred = poisson_model.predict(X_validation)

# Evaluate the model
mae = mean_absolute_error(y_validation, y_pred)
mse = mean_squared_error(y_validation, y_pred)

# Display results
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")

In [None]:
import numpy as np
import pandas as pd

# Build a multivariable dataset: read the source CSV, append columns that are
# fixed multiples of column 'X', and write the result to a new CSV.

# Read the source data
synthetic_single_poisson = pd.read_csv("synthetic_data_poisson.csv")

# How many derived columns to append
num_new_var = 5

# Seed the global NumPy RNG so the same multipliers are drawn on every run
np.random.seed(42)
random_vector = np.random.rand(num_new_var)

# Append X_1 .. X_<num_new_var>, each equal to 'X' times one drawn multiplier
for position, multiplier in enumerate(random_vector, start=1):
    synthetic_single_poisson[f'X_{position}'] = synthetic_single_poisson['X'] * multiplier

# Persist the widened table (row index is not written)
output_file_path = "multivariable_poisson_data.csv"
synthetic_single_poisson.to_csv(output_file_path, index=False)

print(f"Multivariable dataset saved to {output_file_path}")


Multivariable dataset saved to multivariable_poisson_data.csv


In [None]:
#updated poisson scaling dataset
import numpy as np
import pandas as pd

# Build a scaled multivariable dataset: for each of several integer scale
# factors, draw a Poisson sample whose rate is column 'X' times that factor,
# append it as a new column, and write the result to a new CSV.

# Source and destination files
input_file_path = "synthetic_data_poisson.csv"  # Update with your actual file path
output_file_path = "multivariable_scaled_poisson_data.csv"  # Output file name
synthetic_single_poisson = pd.read_csv(input_file_path)

# How many derived columns to append
num_new_var = 5

# Seed the global NumPy RNG once; the scale factors and every Poisson draw
# below consume this same stream, so the order of the calls matters.
np.random.seed(42)
# randint's upper bound is exclusive, so the factors are integers from 1 to 9
random_integers = np.random.randint(1, 10, size=num_new_var)

# Show which scale factors were drawn
print("Random integers used for scaling:", random_integers)

# Append X_1 .. X_<num_new_var>: one Poisson draw per row with rate X * factor
for column_number in range(1, num_new_var + 1):
    scale_factor = random_integers[column_number - 1]
    poisson_rate = synthetic_single_poisson['X'] * scale_factor
    synthetic_single_poisson[f'X_{column_number}'] = np.random.poisson(lam=poisson_rate)

# Persist the widened table (row index is not written)
synthetic_single_poisson.to_csv(output_file_path, index=False)

print(f"Scaled multivariable dataset saved to {output_file_path}")


Random integers used for scaling: [7 4 8 5 7]
Scaled multivariable dataset saved to multivariable_scaled_poisson_data.csv
