In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('/Users/mac/Developer/Data Science Project/data/customer_training_dataset.csv',nrows=10000)
df = df.iloc[:,2:]

In [3]:
df.head()

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,19.0,Woman,10049.0,Spouse Present,1.0,Undergraduate,Freelancer,22.598761,City,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,Detached Home,2869.0
1,39.0,Woman,31678.0,Formerly Married,3.0,Graduate,,15.569731,Country,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,Detached Home,1483.0
2,23.0,Man,25602.0,Formerly Married,3.0,Secondary Education,Freelancer,47.177549,Countryside,Premium,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,Detached Home,567.0
3,21.0,Man,141855.0,Spouse Present,2.0,Undergraduate,,10.938144,Country,Basic,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Flat,765.0
4,21.0,Man,39651.0,Not Married,1.0,Undergraduate,Freelancer,20.376094,Country,Premium,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,Detached Home,2022.0


In [4]:
df = df[['Age','Gender','Annual Income','Education Level','Location','Policy Type','Customer Feedback','Property Type','Premium Amount']]
df

Unnamed: 0,Age,Gender,Annual Income,Education Level,Location,Policy Type,Customer Feedback,Property Type,Premium Amount
0,19.0,Woman,10049.0,Undergraduate,City,Premium,Poor,Detached Home,2869.0
1,39.0,Woman,31678.0,Graduate,Country,Comprehensive,Average,Detached Home,1483.0
2,23.0,Man,25602.0,Secondary Education,Countryside,Premium,Good,Detached Home,567.0
3,21.0,Man,141855.0,Undergraduate,Country,Basic,Poor,Flat,765.0
4,21.0,Man,39651.0,Undergraduate,Country,Premium,Poor,Detached Home,2022.0
...,...,...,...,...,...,...,...,...,...
9995,33.0,Woman,37223.0,Graduate,Country,Basic,Poor,Flat,675.0
9996,25.0,Man,34757.0,Secondary Education,Country,Basic,Good,Flat,736.0
9997,36.0,Woman,,Undergraduate,City,Comprehensive,Good,Detached Home,100.0
9998,41.0,Woman,2750.0,Secondary Education,Country,Basic,Average,Flat,464.0


In [5]:

x = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [6]:
numerical_feature = [feature for feature in x.columns if x[feature].dtype != "O"]
categorical_feature = [feature for feature in x.columns if x[feature].dtype == "O"]

print(f"numerical_feature len is:->[ {len(numerical_feature)} ] - columns are:->{numerical_feature}")
print(f"numerical_feature len is:->[ {len(categorical_feature)} ] - columns are:->{categorical_feature}")

numerical_feature len is:->[ 2 ] - columns are:->['Age', 'Annual Income']
numerical_feature len is:->[ 6 ] - columns are:->['Gender', 'Education Level', 'Location', 'Policy Type', 'Customer Feedback', 'Property Type']


In [7]:
df.isna().sum()

Age                  143
Gender                 0
Annual Income        384
Education Level        0
Location               0
Policy Type            0
Customer Feedback    713
Property Type          0
Premium Amount         0
dtype: int64

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder,LabelEncoder,RobustScaler,MinMaxScaler

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)
print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)

(8000, 8) (8000,) (2000, 8) (2000,)


In [10]:
num_pipe = Pipeline(steps=[
    ('num_impute',SimpleImputer(strategy='median')),
    ('scaler',MinMaxScaler())
])

cat_pipe = Pipeline(steps=[
    ('cat_impute',SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder(handle_unknown='ignore',drop='first',sparse_output=False))
])

tranformer = ColumnTransformer([
    ('num_trans',num_pipe,numerical_feature),
    ('cat_trans',cat_pipe,categorical_feature)
],remainder='passthrough',n_jobs=-1)

tranformer.fit(X_train)

x_train = tranformer.transform(X_train)
x_test = tranformer.transform(X_test)

In [11]:
from sklearn.svm import SVR
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

rf = SVR()

rf.fit(x_train,y_train)

y_pred = rf.predict(x_test)

rscore = r2_score(y_test,y_pred)
mse = mean_squared_error(y_test,y_pred)
mae = mean_absolute_error(y_test,y_pred)


print(f"r2_score is : {rscore}")
print(f"mean_squared_error is : {mse}")
print(f"mean_absolute_error is : {mae}")


r2_score is : -0.06949477062406073
mean_squared_error is : 749478.3328061312
mean_absolute_error is : 628.4206949274791


In [12]:
from sklearn.model_selection import KFold,cross_val_score
from sklearn.ensemble import RandomForestRegressor

kf = KFold(n_splits=5,shuffle=True,random_state=42)
cv_result = cross_val_score(estimator=RandomForestRegressor(),X=x_train,y=y_train,cv=kf,n_jobs=-1)
cv_result

array([-0.07139304, -0.0900425 , -0.08297589, -0.11058815, -0.0900608 ])

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from src.logger import logging

logging.info('Proccess are ready to work')
df = pd.read_csv('/Users/mac/Developer/Data Science Project/data/customer_training_dataset.csv',nrows=50000)
df = df[['Age','Gender','Annual Income','Education Level','Location','Policy Type','Customer Feedback','Property Type','Premium Amount']]

logging.info('data are load successfull.')

df.drop_duplicates(inplace=True)

x = df.drop('Premium Amount',axis=1)
y = df['Premium Amount']

logging.info('spliting train and test.')
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

numerical_columns = ['Age',
                     'Annual Income']

categorical_columns = ['Gender',
                       'Location',
                       'Property Type']

num_pipe = Pipeline(steps=[
    ('impute',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
])

cat_pipe = Pipeline(steps=[
    ('ipute',SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder(handle_unknown='ignore',drop='first',sparse_output=False))
])


education_level_pipe = Pipeline(steps=[
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('education_level_encode',OrdinalEncoder(categories=['Secondary Education','Undergraduate','Graduate','PhD']))
])

policy_type_pipe = Pipeline(steps=[
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('policy_type_encode',OrdinalEncoder(categories=['Basic','Comprehensive','Premium']))
])

Customer_Feedback_pipe = Pipeline(steps=[
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('Customer_Feedback_encode',OrdinalEncoder(categories=['Poor','Average','Good']))
])

transformer = ColumnTransformer(transformers=[
    ('num_pipe',num_pipe,numerical_columns),
    ('cat_pipe',cat_pipe,categorical_columns),
    ('education_level_pipe',education_level_pipe,[3]),
    ('policy_type_pipe',policy_type_pipe,[5]),
    ('Customer_Feedback_pipe',Customer_Feedback_pipe,[6])
],
remainder='passthrough',
n_jobs=-1)

logging.info('transformation is going on.')

transformer.fit(x_train)

X_train = transformer.transform(x_train)
X_test = transformer.transform(x_test)

logging.info('transformation is done.')

rf = RandomForestRegressor()
rf.fit(X_train,y_train)
logging.info('model fit successfully.')
y_pred = rf.predict(X_test)

r2_s = r2_score(y_test,y_pred)
mse = mean_squared_error(y_test,y_pred)
mae = mean_absolute_error(y_test,y_pred)
logging.info(f'all metrics are r2_score: {r2_s} - mean_squared_error: {mse} - mean_absolute_error: {mae}')

print('OK')


2025-05-18 15:44:33,127 - 1572159124 - INFO - Proccess are ready to work
2025-05-18 15:44:33,219 - 1572159124 - INFO - data are load successfull.
2025-05-18 15:44:33,235 - 1572159124 - INFO - spliting train and test.
2025-05-18 15:44:33,239 - 1572159124 - INFO - transformation is going on.


ValueError: Shape mismatch: if categories is an array, it has to be of shape (n_features,).