# data cleaning 

In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Load your data
data = pd.read_csv('../assets/data/MachineLearningRating_v3.txt', delimiter= '|')

# Check for missing data
missing_data = data.isnull().sum()
print(f"Missing data per column:\n{missing_data}")

# Define a threshold for removing columns with too many missing values (e.g., more than 50%)
threshold = 0.5
columns_to_remove = missing_data[missing_data / len(data) > threshold].index
data_cleaned = data.drop(columns=columns_to_remove)

# For columns with less missing data, handle them differently:
# Impute numerical columns with the mean
numerical_columns = data_cleaned.select_dtypes(include=['float64', 'int64']).columns
numerical_imputer = SimpleImputer(strategy='mean')
data_cleaned[numerical_columns] = numerical_imputer.fit_transform(data_cleaned[numerical_columns])

# Impute categorical columns with the most frequent value (mode)
categorical_columns = data_cleaned.select_dtypes(include=['object']).columns
categorical_imputer = SimpleImputer(strategy='most_frequent')
data_cleaned[categorical_columns] = categorical_imputer.fit_transform(data_cleaned[categorical_columns])

# Optional: Save the cleaned data
data_cleaned.to_csv('../assets/data/cleaned_dataset.csv', index=False)


  data = pd.read_csv('../assets/data/MachineLearningRating_v3.txt', delimiter= '|')


Missing data per column:
UnderwrittenCoverID               0
PolicyID                          0
TransactionMonth                  0
IsVATRegistered                   0
Citizenship                       0
LegalType                         0
Title                             0
Language                          0
Bank                         145961
AccountType                   40232
MaritalStatus                  8259
Gender                         9536
Country                           0
Province                          0
PostalCode                        0
MainCrestaZone                    0
SubCrestaZone                     0
ItemType                          0
mmcode                          552
VehicleType                     552
RegistrationYear                  0
make                            552
Model                           552
Cylinders                       552
cubiccapacity                   552
kilowatts                       552
bodytype                        552
Num

# feature engineering 

In [3]:
import pandas as pd
# Load your data
data = pd.read_csv('../assets/data/MachineLearningRating_v3.txt', delimiter= '|')
# Create new features
data['ClaimFrequency'] = data['TotalClaims'] / (data['TotalPremium'] + 1)  # Added 1 to avoid division by zero
data['LossRatio'] = data['TotalClaims'] / (data['TotalPremium'] + 1)  # Added 1 for safety
data['ClaimsToPremiumDiff'] = data['TotalPremium'] - data['TotalClaims']
# Display the new features
print(data[['ClaimFrequency', 'LossRatio', 'ClaimsToPremiumDiff']])


  data = pd.read_csv('../assets/data/MachineLearningRating_v3.txt', delimiter= '|')


         ClaimFrequency  LossRatio  ClaimsToPremiumDiff
0                   0.0        0.0            21.929825
1                   0.0        0.0            21.929825
2                   0.0        0.0             0.000000
3                   0.0        0.0           512.848070
4                   0.0        0.0             0.000000
...                 ...        ...                  ...
1000093             0.0        0.0           347.235175
1000094             0.0        0.0           347.235175
1000095             0.0        0.0           347.235175
1000096             0.0        0.0             2.315000
1000097             0.0        0.0             2.315000

[1000098 rows x 3 columns]


# Encoding categorical column 

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('../assets/data/MachineLearningRating_v3.txt', delimiter= '|')

# Display the first few rows of the dataset
print("Original Dataset:")
print(df.head())

# Step 2: Identify categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
print("\nCategorical Columns:", categorical_cols)

# Step 3: Apply One-Hot Encoding
df_one_hot = pd.get_dummies(df, columns=categorical_cols)

# Save the one-hot encoded dataset to a new file (optional)
df_one_hot.to_csv("../assets/data/dataset_one_hot_encoded.csv", index=False)

print("\nOne-Hot Encoded Dataset:")
print(df_one_hot.head())

# Step 4: Apply Label Encoding
label_encoder = LabelEncoder()

# Create a copy of the dataset for label encoding
df_label_encoded = df.copy()

# Apply label encoding to each categorical column
for col in categorical_cols:
    df_label_encoded[col + "_encoded"] = label_encoder.fit_transform(df_label_encoded[col])

# Save the label encoded dataset to a new file (optional)
df_label_encoded.to_csv("../assets/data/dataset_label_encoded.csv", index=False)

print("\nLabel Encoded Dataset:")
print(df_label_encoded.head())


  df = pd.read_csv('../assets/data/MachineLearningRating_v3.txt', delimiter= '|')


Original Dataset:
   UnderwrittenCoverID  PolicyID     TransactionMonth  IsVATRegistered  \
0               145249     12827  2015-03-01 00:00:00             True   
1               145249     12827  2015-05-01 00:00:00             True   
2               145249     12827  2015-07-01 00:00:00             True   
3               145255     12827  2015-05-01 00:00:00             True   
4               145255     12827  2015-07-01 00:00:00             True   

  Citizenship          LegalType Title Language                 Bank  \
0              Close Corporation    Mr  English  First National Bank   
1              Close Corporation    Mr  English  First National Bank   
2              Close Corporation    Mr  English  First National Bank   
3              Close Corporation    Mr  English  First National Bank   
4              Close Corporation    Mr  English  First National Bank   

       AccountType  ...                    ExcessSelected CoverCategory  \
0  Current account  ...      

# Data Splitting 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
file_path = "path/to/your/dataset.csv"  # Replace with your actual file path
data = pd.read_csv(file_path)

# Perform the train-test split
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Save the train and test sets (optional)
train_data.to_csv("train_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)

print("Train and test sets created successfully!")
print(f"Training set size: {len(train_data)}")
print(f"Test set size: {len(test_data)}")
