<a href="https://colab.research.google.com/github/ibitoladgr8/Machine-Learning/blob/main/regression_with_insurance_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train_df = pd.read_csv("/content/train.csv")
test_df = pd.read_csv("/content/test.csv")


In [3]:
train_df.dropna(subset=['Premium Amount'], inplace=True)

In [None]:
train_df.shape

(104437, 21)

In [4]:


# Assuming your DataFrame is named 'df'
train_df['Occupation'] = train_df['Occupation'].fillna('Unemployed')
test_df['Occupation'] = test_df['Occupation'].fillna('Unemployed')

train_df['Marital Status'] = train_df['Marital Status'].fillna('Single')
test_df['Marital Status'] = test_df['Marital Status'].fillna('Single')


#test_df['Gender'] = test_df['Gender'].fillna('Male')

In [None]:
for column in train_df.columns[train_df.dtypes == 'object']:
    unique_elements = train_df[column].unique()
    print(f"Unique elements in column '{column}': {unique_elements}")

Unique elements in column 'Gender': ['Female' 'Male']
Unique elements in column 'Marital Status': ['Married' 'Divorced' 'Single']
Unique elements in column 'Education Level': ["Bachelor's" "Master's" 'High School' 'PhD']
Unique elements in column 'Occupation': ['Self-Employed' 'Unemployed' 'Employed']
Unique elements in column 'Location': ['Urban' 'Rural' 'Suburban']
Unique elements in column 'Policy Type': ['Premium' 'Comprehensive' 'Basic']
Unique elements in column 'Policy Start Date': ['2023-12-23 15:21:39.134960' '2023-06-12 15:21:39.111551'
 '2023-09-30 15:21:39.221386' ... '2021-02-12 15:21:39.172097'
 '2020-11-10 15:21:39.251142' '2024-08-01 15:21:39.288099']
Unique elements in column 'Customer Feedback': ['Poor' 'Average' 'Good' nan]
Unique elements in column 'Smoking Status': ['No' 'Yes']
Unique elements in column 'Exercise Frequency': ['Weekly' 'Monthly' 'Daily' 'Rarely']
Unique elements in column 'Property Type': ['House' 'Apartment' 'Condo']


In [5]:
train_df['Policy Start Date'] = pd.to_datetime(train_df['Policy Start Date']) # Now the column is in datetime format print(train_df['Policy Start Date'

In [None]:
train_df.dtypes

Unnamed: 0,0
id,int64
Age,float64
Gender,object
Annual Income,float64
Marital Status,object
Number of Dependents,float64
Education Level,object
Occupation,object
Health Score,float64
Location,object


In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

In [7]:
X= train_df.drop(['Premium Amount','id'], axis=1)
y= train_df['Premium Amount']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train.columns
                    if X_train[cname].dtype == "object"
                    and cname not in ['Educational Level', 'Location']]

ordinal_cols = ['Education Level', 'Location']

# Select numerical columns
numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols+ordinal_cols




In [None]:
X_train.head()

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
137672,24.0,Male,15774.0,Divorced,4.0,PhD,Unemployed,28.368531,Urban,Basic,2.0,11.0,733.0,5.0,2020-06-10 15:21:39.148733,Average,Yes,Monthly,Apartment
786837,50.0,Male,11986.0,Divorced,4.0,Master's,Unemployed,51.441975,Rural,Premium,1.0,17.0,,8.0,2019-09-21 15:21:39.272456,Good,No,Monthly,Apartment
559710,40.0,Male,71818.0,Single,0.0,PhD,Employed,10.007516,Rural,Basic,,10.0,446.0,3.0,2023-07-08 15:21:39.131192,,Yes,Rarely,Apartment
127788,38.0,Male,39146.0,Married,4.0,Master's,Unemployed,16.569715,Urban,Basic,,2.0,622.0,8.0,2020-08-14 15:21:39.244069,Good,No,Daily,Condo
244109,18.0,Female,31002.0,Divorced,3.0,High School,Employed,42.735419,Rural,Premium,,1.0,462.0,8.0,2020-08-23 15:21:39.147735,Poor,Yes,Rarely,Condo


In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('scaler', RobustScaler()),  # RobustScaler for outlier handling
    ('feature_selection', SelectKBest(score_func=f_classif, k=5)),  # Keep top 5 features
    ('pca', PCA(n_components=3))  # Dimensionality reduction to 3 components
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        ('ord', ordinal_transformer, ordinal_cols)
    ])

preprocessor


In [9]:
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train.values.reshape(-1, 1)) # Access the NumPy array using .values
y_valid = scaler.transform(y_valid.values.reshape(-1, 1)) # Access the NumPy array using .values

In [14]:
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestRegressor(random_state=42))
])

In [None]:
param_grid = {
    'classifier__n_estimators': [100, 200, 300],  # Number of trees
    'classifier__max_depth': [None, 10, 20],  # Tree depth
    'classifier__min_samples_split': [2, 5, 10],  # Minimum samples to split
    'classifier__min_samples_leaf': [1, 2, 4]  # Minimum samples at a leaf node
}

# Use GridSearchCV to find the best parameters
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

# Fit the GridSearchCV pipeline on the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

# Evaluate the model on the test set
best_model = grid_search.best_estimator_
test_score = best_model.score(X_valid, y_valid)
print("Test Set Accuracy:", test_score)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [None]:
from sklearn.metrics import mean_absolute_error
model1 = RandomForestRegressor()
pipeline1 = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model1)
                             ])
pipeline1.fit(X_train, y_train)
pred1 =pipeline1.predict(X_valid)

score1 = mean_absolute_error(y_valid, pred1)
print('MAE:', score1)

  return fit_method(estimator, *args, **kwargs)


MAE: 0.76291356291854


In [None]:
test_df['Policy Start Date'] = pd.to_datetime(test_df['Policy Start Date']) # Now the column is in datetime format print(train_df['Policy Start Date'

prediction=pipeline1.predict(test_df)

In [None]:
prediction_rf = scaler.inverse_transform(prediction.reshape(-1, 1))

In [None]:
submissionrf = pd.DataFrame({'id': test_df['id'], 'Premium Amount': prediction_rf.flatten()})
submissionrf.to_csv('submission.csv', index=False)

In [12]:
from xgboost import XGBRegressor

model2 = XGBRegressor(n_estimators=1000, learning_rate=0.05)
pipeline2 = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model2)
                             ])
pipeline2.fit(X_train, y_train)
pred2 =pipeline2.predict(X_valid)



  y = column_or_1d(y, warn=True)


NameError: name 'mean_absolute_error' is not defined

In [13]:
from sklearn.metrics import mean_absolute_error
score2 = mean_absolute_error(y_valid, pred2)
print('MAE:', score2)

MAE: 0.7688952892537215


In [None]:
xgbpred = pipeline2.predict(test_df)

In [None]:
pd.DataFrame({'id': test_df['id'], 'Premium Amount': xgbpred}).to_csv('submission2.csv', index=False)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2


In [None]:
feature_names = preprocessor.get_feature_names_out()

# Calculate the number of features after preprocessing
num_features_after_preprocessing = len(feature_names)

# Update the input shape of your model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=[num_features_after_preprocessing]), # Updated input shape
    tf.keras.layers.Dropout(0.2),
    layers.BatchNormalization(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    layers.BatchNormalization(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Re-compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
from sklearn.metrics import mean_absolute_error

early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model
my_pipeline.named_steps['model'].fit(
    my_pipeline.named_steps['preprocessor'].transform(X_train),
    y_train,
    batch_size=512,  # Add batch size
    epochs=1000,      # Add epochs
    callbacks=[early_stopping],  # Add callbacks
    verbose=0         # Add verbose
)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

  current = self.get_monitor_value(logs)


KeyboardInterrupt: 