# data cleaning 

In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Load your data
data = pd.read_csv('../assets/data/MachineLearningRating_v3.txt', delimiter= '|')

# Check for missing data
missing_data = data.isnull().sum()
print(f"Missing data per column:\n{missing_data}")

# Define a threshold for removing columns with too many missing values (e.g., more than 50%)
threshold = 0.5
columns_to_remove = missing_data[missing_data / len(data) > threshold].index
data_cleaned = data.drop(columns=columns_to_remove)

# For columns with less missing data, handle them differently:
# Impute numerical columns with the mean
numerical_columns = data_cleaned.select_dtypes(include=['float64', 'int64']).columns
numerical_imputer = SimpleImputer(strategy='mean')
data_cleaned[numerical_columns] = numerical_imputer.fit_transform(data_cleaned[numerical_columns])

# Impute categorical columns with the most frequent value (mode)
categorical_columns = data_cleaned.select_dtypes(include=['object']).columns
categorical_imputer = SimpleImputer(strategy='most_frequent')
data_cleaned[categorical_columns] = categorical_imputer.fit_transform(data_cleaned[categorical_columns])

# Optional: Save the cleaned data
data_cleaned.to_csv('../assets/data/cleaned_dataset.csv', index=False)


  data = pd.read_csv('../assets/data/MachineLearningRating_v3.txt', delimiter= '|')


Missing data per column:
UnderwrittenCoverID               0
PolicyID                          0
TransactionMonth                  0
IsVATRegistered                   0
Citizenship                       0
LegalType                         0
Title                             0
Language                          0
Bank                         145961
AccountType                   40232
MaritalStatus                  8259
Gender                         9536
Country                           0
Province                          0
PostalCode                        0
MainCrestaZone                    0
SubCrestaZone                     0
ItemType                          0
mmcode                          552
VehicleType                     552
RegistrationYear                  0
make                            552
Model                           552
Cylinders                       552
cubiccapacity                   552
kilowatts                       552
bodytype                        552
Num

# feature engineering 

In [3]:
import pandas as pd
# Load your data
data = pd.read_csv('../assets/data/MachineLearningRating_v3.txt', delimiter= '|')
# Create new features
data['ClaimFrequency'] = data['TotalClaims'] / (data['TotalPremium'] + 1)  # Added 1 to avoid division by zero
data['LossRatio'] = data['TotalClaims'] / (data['TotalPremium'] + 1)  # Added 1 for safety
data['ClaimsToPremiumDiff'] = data['TotalPremium'] - data['TotalClaims']
# Display the new features
print(data[['ClaimFrequency', 'LossRatio', 'ClaimsToPremiumDiff']])


  data = pd.read_csv('../assets/data/MachineLearningRating_v3.txt', delimiter= '|')


         ClaimFrequency  LossRatio  ClaimsToPremiumDiff
0                   0.0        0.0            21.929825
1                   0.0        0.0            21.929825
2                   0.0        0.0             0.000000
3                   0.0        0.0           512.848070
4                   0.0        0.0             0.000000
...                 ...        ...                  ...
1000093             0.0        0.0           347.235175
1000094             0.0        0.0           347.235175
1000095             0.0        0.0           347.235175
1000096             0.0        0.0             2.315000
1000097             0.0        0.0             2.315000

[1000098 rows x 3 columns]


In [1]:
import pandas as pd

# File paths
txt_file_path = "../assets/data/MachineLearningRating_v3.txt"  
csv_file_path = "../assets/data/MachineLearningRating_v3.csv"  

df = pd.read_csv(txt_file_path, delimiter='|')

# Save as a .csv file
df.to_csv(csv_file_path, index=False)

print(f"File has been converted and saved as: {csv_file_path}")


  df = pd.read_csv(txt_file_path, delimiter='|')


File has been converted and saved as: ../assets/data/MachineLearningRating_v3.csv


# Encoding categorical column 

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('../assets/data/MachineLearningRating_v3.csv')

# Display the first few rows of the dataset
print("Original Dataset:")
print(df.head())

# Step 2: Identify categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
print("\nCategorical Columns:", categorical_cols)

# Step 3: Apply One-Hot Encoding
df_one_hot = pd.get_dummies(df, columns=categorical_cols)

# Save the one-hot encoded dataset to a new file (optional)
df_one_hot.to_csv("../assets/data/dataset_one_hot_encoded.csv", index=False)

print("\nOne-Hot Encoded Dataset:")
print(df_one_hot.head())




  df = pd.read_csv('../assets/data/MachineLearningRating_v3.txt', delimiter= '|')


Original Dataset:
   UnderwrittenCoverID  PolicyID     TransactionMonth  IsVATRegistered  \
0               145249     12827  2015-03-01 00:00:00             True   
1               145249     12827  2015-05-01 00:00:00             True   
2               145249     12827  2015-07-01 00:00:00             True   
3               145255     12827  2015-05-01 00:00:00             True   
4               145255     12827  2015-07-01 00:00:00             True   

  Citizenship          LegalType Title Language                 Bank  \
0              Close Corporation    Mr  English  First National Bank   
1              Close Corporation    Mr  English  First National Bank   
2              Close Corporation    Mr  English  First National Bank   
3              Close Corporation    Mr  English  First National Bank   
4              Close Corporation    Mr  English  First National Bank   

       AccountType  ...                    ExcessSelected CoverCategory  \
0  Current account  ...      

In [None]:
# Step 4: Apply Label Encoding
label_encoder = LabelEncoder()

# Create a copy of the dataset for label encoding
df_label_encoded = df.copy()

# Apply label encoding to each categorical column
for col in categorical_cols:
    df_label_encoded[col + "_encoded"] = label_encoder.fit_transform(df_label_encoded[col])

# Save the label encoded dataset to a new file (optional)
df_label_encoded.to_csv("../assets/data/dataset_label_encoded.csv", index=False)

print("\nLabel Encoded Dataset:")
print(df_label_encoded.head())

# Data Splitting 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
file_path = "../assets/data/MachineLearningRating_v3.csv"
data = pd.read_csv(file_path)


# Split the data into training and testing sets (80% train, 20% test)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Check the shapes of the resulting datasets
print("Training data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

# Save the datasets separately
train_data.to_csv("../assets/data/train_data.csv", index=False)
test_data.to_csv("../assets/data/test_data.csv", index=False)

print("Training and test data saved as 'train_data.csv' and 'test_data.csv'.")

  data = pd.read_csv(file_path)


Training data shape: (800078, 52)
Test data shape: (200020, 52)


# Modeling 

# Linear Regression

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import time

# Load the dataset
file_path = "../assets/data/train_data.csv"
data = pd.read_csv(file_path)

# Use only 10% of the data to reduce resource usage
data = data.sample(frac=0.1, random_state=42)

# Ensure only numeric columns are included for generating the synthetic target
numeric_data = data.select_dtypes(include=['number'])

# Generate a synthetic target column (Mean of numeric columns for each row)
data['synthetic_target'] = numeric_data.mean(axis=1)

# Drop rows with NaN values (if any were created during this process)
data = data.dropna()


# Check how many rows remain after dropping NaN values
print(f"Remaining rows after dropping NaN values: {len(data)}")

# Features and target
X = data.drop(columns=['synthetic_target'])
y = data['synthetic_target']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression
start_time = time.time()
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)
print("Linear Regression:")
print("MSE:", mean_squared_error(y_test, lr_preds))
print("R2 Score:", r2_score(y_test, lr_preds))
print(f"Elapsed Time: {time.time() - start_time:.2f} seconds\n")


Remaining rows after dropping NaN values: 0


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

# Random Forest 

In [4]:
# Random Forest
start_time = time.time()
rf_model = RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42)  # Limited depth and estimators
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)
print("Random Forest:")
print("MSE:", mean_squared_error(y_test, rf_preds))
print("R2 Score:", r2_score(y_test, rf_preds))
print(f"Elapsed Time: {time.time() - start_time:.2f} seconds\n")

NameError: name 'X_train' is not defined

XGBoost 

In [None]:
# XGBoost
start_time = time.time()
xgb_model = xgb.XGBRegressor(n_estimators=50, max_depth=10, verbosity=1, random_state=42)  # Reduced complexity
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
print("XGBoost:")
print("MSE:", mean_squared_error(y_test, xgb_preds))
print("R2 Score:", r2_score(y_test, xgb_preds))
print(f"Elapsed Time: {time.time() - start_time:.2f} seconds")

# model evaluation 

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
        start_time = time.time()
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        
        mse = mean_squared_error(y_test, preds)
        r2 = r2_score(y_test, preds)
        mae = mean_absolute_error(y_test, preds)
        
        elapsed_time = time.time() - start_time
        return mse, r2, mae, elapsed_time

    # Linear Regression Model Evaluation
lr_model = LinearRegression()
print("Linear Regression Evaluation:")lr_mse, lr_r2, lr_mae, lr_elapsed_time = evaluate_model(lr_model, X_train, X_test, y_train, y_test)print(f"MSE: {lr_mse}, R2 Score: {lr_r2}, MAE: {lr_mae}, Elapsed Time: {lr_elapsed_time:.2f} seconds\n")

    # Random Forest Model Evaluation
rf_model = RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42)  # Limited depth and estimators
print("Random Forest Evaluation:")
rf_mse, rf_r2, rf_mae, rf_elapsed_time = evaluate_model(rf_model, X_train, X_test, y_train, y_test)
print(f"MSE: {rf_mse}, R2 Score: {rf_r2}, MAE: {rf_mae}, Elapsed Time: {rf_elapsed_time:.2f} seconds\n")
