# Random Forest, KNN, and Linear Regression for Car Insurance Policy Dataset

In [46]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score, precision_score, accuracy_score, recall_score, confusion_matrix, f1_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns  
from IPython.display import display
import matplotlib.pyplot as plt
from matplotlib import pyplot
import os
from pathlib import Path


# Data Preprocessing

In [49]:
def preprocess_data(df):
    """Clean a raw policy table and turn its categorical columns into numbers.

    Steps, in order:
      1. drop the ``engine_type`` and ``is_power_steering`` columns;
      2. drop every row that contains a missing value;
      3. replace the values 'Yes' / 'No' with 1 / 0 across the whole frame;
      4. parse ``age_of_car``, ``max_power`` and ``max_torque`` into numbers and
         turn ``rear_brakes_type`` / ``transmission_type`` into 0/1 flags;
      5. one-hot encode ``area_cluster``, ``model``, ``fuel_type``,
         ``steering_type``, ``make`` and ``segment`` and drop the raw columns.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table containing at least the columns named above.

    Returns
    -------
    pandas.DataFrame
        The cleaned and encoded table.
    """
    def extract_month(age):
        # "<Y> ... and <M> ..." -> Y * 12 + M; anything else falls back to the
        # leading integer of the string.
        try:
            parts = age.split(' and ')
            years = int(parts[0].split()[0])
            months = int(parts[1].split()[0])
        except (ValueError, IndexError):
            # NOTE(review): a value without an " and " part is used as a month
            # count whatever unit it names - confirm the data never holds a
            # years-only value.
            return int(age.split()[0])
        return months + 12 * years

    def extract_power(power_rpm):
        # Numeric text in front of 'bhp@'.
        return float(power_rpm.split('bhp@')[0])

    def extract_torque(torque_rpm):
        # Numeric text in front of 'Nm@'.
        return float(torque_rpm.split('Nm@')[0])

    # (source column, output prefix, [(output suffix, raw value), ...]).
    # The order here is the order of the generated columns.
    one_hot_spec = [
        ('area_cluster', 'area_cluster',
         [(f'C{i}', f'C{i}') for i in range(1, 23)]),
        ('model', 'model', [(f'M{i}', f'M{i}') for i in range(1, 12)]),
        ('fuel_type', 'fuel_type',
         [('Petrol', 'Petrol'), ('Diesel', 'Diesel'), ('CNG', 'CNG')]),
        ('steering_type', 'steering_type',
         [('Manual', 'Manual'), ('Power', 'Power'), ('Electric', 'Electric')]),
        # 'make' is compared against integers, not strings.
        ('make', 'make', [(str(i), i) for i in range(1, 6)]),
        ('segment', 'segment',
         [('A', 'A'), ('B1', 'B1'), ('B2', 'B2'), ('C1', 'C1'), ('C2', 'C2'),
          ('U', 'Utility')]),
    ]

    # Non in-place operations so the caller's frame is not modified.
    cleaned = df.drop(columns=['engine_type', 'is_power_steering']).dropna()

    # Convert Yes to 1 and No to 0
    cleaned = cleaned.replace('Yes', 1).replace('No', 0)

    # Convert columns to numerical and One-Hot Encoding
    encoded = {
        'age_of_car': cleaned['age_of_car'].apply(extract_month),
        'max_power': cleaned['max_power'].apply(extract_power),
        'max_torque': cleaned['max_torque'].apply(extract_torque),
        'rear_brakes_type': np.where(cleaned['rear_brakes_type'] == 'Disc', 1, 0),
        'transmission_type': np.where(
            cleaned['transmission_type'] == 'Automatic', 1, 0),
    }
    for column, prefix, categories in one_hot_spec:
        for suffix, raw_value in categories:
            encoded[f'{prefix}_{suffix}'] = np.where(
                cleaned[column] == raw_value, 1, 0)

    cleaned = cleaned.assign(**encoded)

    # The raw categorical columns are fully represented by their dummies now.
    return cleaned.drop(
        columns=[column for column, _, _ in one_hot_spec])

    

In [50]:
def drop_features_clf(df):
    """Return a copy of ``df`` without the columns excluded from the is_claim models.

    Every listed column must be present in ``df``; the input frame itself is
    not modified.
    """
    vehicle_specs = [
        'population_density', 'max_torque', 'max_power', 'cylinder',
        'length', 'width', 'height', 'airbags', 'rear_brakes_type',
        'displacement', 'transmission_type', 'gear_box', 'turning_radius',
        'gross_weight', 'ncap_rating',
    ]
    make_dummies = ['make_1', 'make_2', 'make_3', 'make_4', 'make_5']
    segment_dummies = [
        'segment_A', 'segment_B1', 'segment_B2',
        'segment_C1', 'segment_C2', 'segment_U',
    ]
    equipment_flags = [
        'is_esc', 'is_adjustable_steering', 'is_tpms', 'is_parking_sensors',
        'is_parking_camera', 'is_front_fog_lights', 'is_rear_window_wiper',
        'is_rear_window_washer', 'is_rear_window_defogger', 'is_brake_assist',
        'is_power_door_locks', 'is_central_locking',
        'is_driver_seat_height_adjustable', 'is_day_night_rear_view_mirror',
        'is_ecw', 'is_speed_alert',
    ]
    policyholder = ['age_of_policyholder']

    unwanted = (vehicle_specs + make_dummies + segment_dummies
                + equipment_flags + policyholder)
    return df.drop(columns=unwanted)
    

In [51]:
def drop_features_reg(df):
    """Return a copy of ``df`` without the columns excluded from the age regressions.

    Every listed column must be present in ``df``; the input frame itself is
    not modified.
    """
    vehicle_specs = [
        'population_density', 'max_torque', 'max_power', 'cylinder',
        'length', 'width', 'height', 'airbags', 'rear_brakes_type',
        'displacement', 'transmission_type', 'gear_box', 'turning_radius',
        'gross_weight', 'ncap_rating',
    ]
    make_dummies = ['make_1', 'make_2', 'make_3', 'make_4', 'make_5']
    segment_dummies = [
        'segment_A', 'segment_B1', 'segment_B2',
        'segment_C1', 'segment_C2', 'segment_U',
    ]
    equipment_flags = [
        'is_esc', 'is_adjustable_steering', 'is_tpms', 'is_parking_sensors',
        'is_parking_camera', 'is_front_fog_lights', 'is_rear_window_wiper',
        'is_rear_window_washer', 'is_rear_window_defogger', 'is_brake_assist',
        'is_power_door_locks', 'is_central_locking',
        'is_driver_seat_height_adjustable', 'is_day_night_rear_view_mirror',
        'is_ecw', 'is_speed_alert',
    ]
    fuel_dummies = ['fuel_type_Petrol', 'fuel_type_Diesel', 'fuel_type_CNG']
    steering_dummies = [
        'steering_type_Manual', 'steering_type_Power', 'steering_type_Electric',
    ]

    unwanted = (vehicle_specs + make_dummies + segment_dummies
                + equipment_flags + fuel_dummies + steering_dummies)
    return df.drop(columns=unwanted)
    

In [52]:
def find_correlated_features(df, threshold, target_variable):
    """List feature pairs whose absolute correlation is at least ``threshold``.

    The target column is excluded before correlating. Each pair is reported
    once, as a row of ``feature1``, ``feature2``, ``corr_coef``, with the
    strongest correlation first.
    """
    # Work on the features only.
    features = df.drop(columns=target_variable).copy()
    abs_corr = features.corr().abs()

    # Keep the strict upper triangle so every pair shows up exactly once and
    # the diagonal (self-correlation) is ignored.
    upper_mask = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    pairs = abs_corr.where(upper_mask).stack().reset_index()
    pairs.columns = ['feature1', 'feature2', 'corr_coef']

    strong_pairs = pairs[pairs['corr_coef'] >= threshold]
    return strong_pairs.sort_values('corr_coef', ascending=False)

In [53]:
def remove_correlated_feature(df, threshold, target_variable, removed_features):
    """Iteratively drop one feature from each strongly correlated feature pair.

    On every pass the pair with the highest absolute correlation (at least
    ``threshold``) is picked, and the member that is less correlated with
    ``target_variable`` is dropped. The loop ends when no pair reaches the
    threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Features plus the target column. Modified in place.
    threshold : float
        Minimum absolute correlation for a pair to count as correlated.
    target_variable : str
        Name of the target column.
    removed_features : list
        Output parameter: the name of every dropped column is appended to it.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the dropped columns removed.

    Side effects
    ------------
    Each pass that drops a feature rewrites ``target_corr_age.csv`` in the
    working directory with the current feature-to-target correlations.
    NOTE(review): the file name is fixed whatever ``target_variable`` is.
    """
    while True:
        # One correlation matrix per pass serves both purposes: its target
        # column gives the feature-to-target scores, and the rest gives the
        # feature-to-feature scores.
        abs_corr = df.corr().abs()

        target_corr = abs_corr.reset_index()[['index', target_variable]]
        target_corr.columns = ['feature', 'corr_coef']

        features_corr = abs_corr.drop(index=target_variable, columns=target_variable)

        # Take half of the matrix to prevent doubling results
        features_corr = features_corr.where(np.triu(np.ones(features_corr.shape), k=1).astype(bool))

        # Restructure correlation matrix to dataframe
        features_corr = features_corr.stack().reset_index()
        features_corr.columns = ['feature1', 'feature2', 'corr_coef']

        # Strongest correlated pair first
        features_corr = features_corr[features_corr.corr_coef >= threshold].sort_values('corr_coef', ascending=False)

        # Exit loop if no more correlated features
        if features_corr.empty:
            break
        top_pair = features_corr.iloc[0]

        # Feature-to-target correlations, saved for reference
        target_corr = target_corr.sort_values('corr_coef', ascending=False)
        target_corr.to_csv('target_corr_age.csv', index=False)

        # Drop the member of the pair that is less correlated with the target
        feature1 = top_pair['feature1']
        feature2 = top_pair['feature2']

        feature1_target = float(target_corr[target_corr.feature == feature1]['corr_coef'].iloc[0])
        feature2_target = float(target_corr[target_corr.feature == feature2]['corr_coef'].iloc[0])

        weaker = feature2 if feature1_target > feature2_target else feature1
        df.drop(columns=weaker, inplace=True)
        removed_features.append(weaker)

    return df


In [363]:
def get_sample_weight(df, target_variable):
    """Return one inverse-frequency weight per row of ``df``.

    A row whose ``target_variable`` value occurs ``n_c`` times among ``N`` rows
    and ``k`` distinct values gets the weight ``(1 / k) / (n_c / N)``. The
    result is a Series named ``sample_weight`` that shares ``df``'s index.
    """
    row_count = df.shape[0]
    class_sizes = df.groupby(target_variable).size().to_frame('observations')
    class_count = len(class_sizes)

    # Attach each row's class size, then weight by the inverse class share.
    with_sizes = df.join(class_sizes, on=target_variable)
    class_share = with_sizes['observations'] / row_count
    weights = (1 / class_count) / class_share
    return weights.rename('sample_weight')
    

In [364]:
def histogram(data, feature_name, num_bins='auto'):
    """Draw a histogram of ``data`` and save it as ``<feature_name>_histogram.png``.

    ``feature_name`` is used for the title, the x-axis label and the file
    name. The current figure is cleared after saving.
    """
    plt.hist(data, bins=num_bins)

    # Labelling
    plot_title = feature_name + ' Histogram'
    plt.title(plot_title)
    plt.xlabel(feature_name)
    plt.ylabel('Count')

    # Persist, then reset the figure for the next plot
    output_file = feature_name + '_histogram.png'
    plt.savefig(output_file)
    plt.clf()


# Age of Policy Holder Linear Regression

In [387]:
# Load the train and test splits; the first CSV column becomes the row index.
df_train = pd.read_csv('train.csv', index_col=0)
df_test = pd.read_csv('test.csv', index_col=0)

In [388]:
# Clean and numerically encode both splits with the same preprocessing.
df_train = preprocess_data(df_train)
df_test = preprocess_data(df_test)

In [389]:
# Drop one feature from every pair with |correlation| >= 0.6, keeping the one
# more correlated with age_of_policyholder. The names of the dropped columns
# are collected in removed_features.
removed_features = []
df_train_age = remove_correlated_feature(df_train.drop(columns=['policy_id']), 0.6, 'age_of_policyholder', removed_features)

In [391]:
# Remove the same columns from the test split. A non in-place drop keeps
# df_test intact for the cells that still read its full set of columns.
df_test_age = df_test.drop(columns=removed_features)

In [219]:
# Rebuild both age frames from the hand-picked column list. This reassigns
# df_train_age / df_test_age, replacing whatever they held before.
df_train_age = drop_features_reg(df_train)
df_test_age = drop_features_reg(df_test)

# Target: age_of_policyholder. Features: everything else except policy_id.
y_age_train = df_train_age['age_of_policyholder']
x_age_train = df_train_age.drop(columns=['policy_id', 'age_of_policyholder'])

y_age_test = df_test_age['age_of_policyholder']
x_age_test = df_test_age.drop(columns=['policy_id', 'age_of_policyholder'])

In [1005]:
# Degree-1 polynomial expansion. The transformer is fitted on the training
# features only and then applied unchanged to the test features.
poly_age = PolynomialFeatures(1)
x_age_train = poly_age.fit_transform(x_age_train)
x_age_test = poly_age.transform(x_age_test)

In [220]:
# Create a linear regression model
reg = LinearRegression()

# Train the model using the training data
reg.fit(x_age_train, y_age_train)

# Predict the policyholder age for the test data
y_pred = reg.predict(x_age_test)
# One row per test policy: its id next to the predicted age. The ids are taken
# as a plain array so they pair with y_pred by position rather than by index.
y_pred_df = pd.DataFrame({'policy_id': df_test['policy_id'].values, 'age': y_pred})

# Evaluate the performance of the model using mean squared error and R-squared values
mse = mean_squared_error(y_age_test, y_pred)
r2 = r2_score(y_age_test, y_pred)

print("Mean squared error: ", mse)
print("R-squared value: ", r2)
print("Coefficients:", reg.coef_)

Mean squared error:  91.88670078355715
R-squared value:  0.051080473531282644
Coefficients: [ 0.04378134 -0.0184691   0.48825327 -0.1740999  -0.34986747 -0.73324617
 -1.15086932  1.08002324  0.04936346 -2.39841338  1.06303974  0.7496838
  0.45813433  1.70373299  0.83160529 -0.5219798   0.82371444 -0.7164149
 -0.12416363 -0.25045231 -1.10580691  0.88774998 -0.43512274  0.50096439
 -0.18757513  2.10630244 -0.44999139 -0.46714715  0.54698415  0.73957187
  1.41378531 -0.19227775 -1.81557825 -1.29635982 -0.68774191  0.10245249]


# Age of Policy Holder Random Forest Regression

In [475]:
# Reload and re-preprocess both splits so this section starts from fresh frames.
df_train = pd.read_csv('train.csv', index_col=0)
df_test = pd.read_csv('test.csv', index_col=0)
df_train = preprocess_data(df_train)
df_test = preprocess_data(df_test)

In [476]:
# Keep only the columns used by the age regression.
df_train_age = drop_features_reg(df_train)
df_test_age = drop_features_reg(df_test)

In [481]:
# Target: age_of_policyholder. Features: everything else except policy_id.
y_age_train = df_train_age['age_of_policyholder']
x_age_train = df_train_age.drop(columns=['policy_id', 'age_of_policyholder'])

y_age_test = df_test_age['age_of_policyholder']
x_age_test = df_test_age.drop(columns=['policy_id', 'age_of_policyholder'])

In [483]:
# Inverse-frequency weight for every training row, grouped by the value of
# age_of_policyholder.
# NOTE(review): sample_weight is not passed to any fit() call in the cells
# shown here - confirm whether it is still needed.
sample_weight = get_sample_weight(df_train_age, 'age_of_policyholder')
sample_weight

15194    0.607903
26329    0.760688
1174     0.607903
55371    0.607903
45259    0.319163
           ...   
5264     0.607903
23758    0.308414
29765    0.926441
23048    2.006421
34779    0.634357
Name: sample_weight, Length: 40000, dtype: float64

In [485]:
def get_optimum_depth_reg(x_age_train, y_age_train, x_age_test, y_age_test):
    """Try ``max_depth`` 1..100 for the age random forest and pick the best one.

    For each depth a ``RandomForestRegressor`` (``max_features=12``,
    ``random_state=0``) is fitted on the training data and scored by mean
    squared error on the evaluation data.

    Returns
    -------
    (int, list)
        The depth with the lowest MSE (the smallest such depth on ties) and
        the list of scores. ``mse[d]`` is the score for ``max_depth=d``;
        ``mse[0]`` is a placeholder (99999) that only keeps the indices
        aligned with the depths.

    NOTE(review): callers pass the test split as the evaluation data, so the
    selected depth is tuned on the same data the final model is reported on.
    """
    mse = [99999]
    for n in range(1, 101):
        rfr = RandomForestRegressor(max_depth=n, max_features=12, n_jobs=-1, random_state=0)

        # Train the model using the training data
        rfr.fit(x_age_train, y_age_train)

        # Predict the age for the test data
        y_pred = rfr.predict(x_age_test)

        mse.append(mean_squared_error(y_age_test, y_pred))

    # Choose among the real scores only. The placeholder must never win, even
    # when every MSE is larger than it; depth 0 is not a valid answer.
    real_scores = mse[1:]
    optimum_depth = real_scores.index(min(real_scores)) + 1

    return optimum_depth, mse
    

In [486]:
def get_optimum_max_features_reg(x_age_train, y_age_train, x_age_test, y_age_test):
    """Try ``max_features`` 1..35 for the age random forest and pick the best one.

    For each value a ``RandomForestRegressor`` (``max_depth=10``,
    ``random_state=0``) is fitted on the training data and scored by mean
    squared error on the evaluation data.

    Returns
    -------
    (int, list)
        The ``max_features`` value with the lowest MSE (the smallest such
        value on ties) and the list of scores. ``mse[k]`` is the score for
        ``max_features=k``; ``mse[0]`` is a placeholder (99999) that only
        keeps the indices aligned with the values tried.

    NOTE(review): callers pass the test split as the evaluation data, so the
    selected value is tuned on the same data the final model is reported on.
    """
    mse = [99999]
    for n in range(1, 36):
        rfr = RandomForestRegressor(max_depth=10, n_jobs=-1, random_state=0, max_features=n)

        # Train the model using the training data
        rfr.fit(x_age_train, y_age_train)

        # Predict the age for the test data
        y_pred = rfr.predict(x_age_test)

        mse.append(mean_squared_error(y_age_test, y_pred))

    # Choose among the real scores only. The placeholder must never win, even
    # when every MSE is larger than it; 0 is not a valid max_features value.
    real_scores = mse[1:]
    optimum_mf = real_scores.index(min(real_scores)) + 1

    return optimum_mf, mse

In [376]:
# Search max_features for the age regressor; the bare name on the last line
# shows the selected value as the cell output.
opt_mf_reg, mse_reg_f = get_optimum_max_features_reg(x_age_train, y_age_train, x_age_test, y_age_test)
opt_mf_reg

12

In [377]:
# Save the MSE per max_features value. The default integer index lines up with
# the value tried (row 0 is the placeholder entry) and is written as the first
# CSV column.
mf_reg_df = pd.DataFrame({'mse': mse_reg_f})
mf_reg_df.to_csv('mse_max_features.csv', index=True)

In [385]:
# Search max_depth for the age regressor; the bare name on the last line shows
# the selected value as the cell output.
opt_depth_reg, mse_reg_d = get_optimum_depth_reg(x_age_train, y_age_train, x_age_test, y_age_test)
opt_depth_reg

10

In [386]:
# Save the MSE per max_depth value. The default integer index lines up with
# the depth tried (row 0 is the placeholder entry) and is written as the first
# CSV column.
depth_reg_df = pd.DataFrame({'mse': mse_reg_d})
depth_reg_df.to_csv('mse_depth.csv', index=True)

In [488]:
# Create a random forest regression model
rfr = RandomForestRegressor(max_depth=10, max_features=12, n_jobs=-1, random_state=0)

# Train the model using the training data
rfr.fit(x_age_train, y_age_train)

# Predict the age for the test data
y_pred = rfr.predict(x_age_test)
# One row per test policy: its id next to the predicted age. The ids are taken
# as a plain array so they pair with y_pred by position rather than by index.
y_pred_df = pd.DataFrame({'policy_id': df_test['policy_id'].values, 'age': y_pred})
# y_pred_df.to_csv(f'{studentid}.PART1.output.csv', index=False)


# Evaluate the performance of the model using mean squared error and R-squared values
mse = mean_squared_error(y_age_test, y_pred)
r2 = r2_score(y_age_test, y_pred)

print("Mean squared error: ", mse)
print("R-squared value: ", r2)

Mean squared error:  91.10831876231225
R-squared value:  0.05911887182736819


# is_claim KNN Classification

In [721]:
# Drop one feature from every pair with |correlation| >= 0.6, keeping the one
# more correlated with is_claim. The names of the dropped columns are
# collected in removed_features_claim.
removed_features_claim = []
df_train_claim = remove_correlated_feature(df_train.drop(columns=['policy_id']), 0.6, 'is_claim', removed_features_claim)

In [723]:
# Remove from the test split the columns that were dropped from training.
df_test_claim = df_test.drop(columns=removed_features_claim)

In [724]:
# Target: is_claim. policy_id was already removed from the training frame
# before the correlation filter, so it only needs dropping on the test side.
y_claim_train = df_train_claim['is_claim']
x_claim_train = df_train_claim.drop(columns=['is_claim'])

y_claim_test = df_test_claim['is_claim']
x_claim_test = df_test_claim.drop(columns=['policy_id', 'is_claim'])

In [599]:
# Degree-1 polynomial expansion. The transformer is fitted on the training
# features only and then applied unchanged to the test features.
poly_claim = PolynomialFeatures(1)
x_claim_train = poly_claim.fit_transform(x_claim_train)
x_claim_test = poly_claim.transform(x_claim_test)

In [600]:
# train a classifier (default KNeighborsClassifier settings)
knn = KNeighborsClassifier()
knn.fit(x_claim_train, y_claim_train)

# predict the test set
pred_claim = knn.predict(x_claim_test)

# The column is named after the predicted target, as in the random-forest cell.
pred_claim_df = pd.DataFrame(pred_claim, columns=['predicted_claim'])
# pred_claim_df.to_csv(f'z{id}.PART2.output.csv', index=False)


# Macro averaging weights both classes equally regardless of their size.
print("confusion_matrix:\n", confusion_matrix(y_claim_test, pred_claim))
print("precision:\t", precision_score(y_claim_test, pred_claim, average='macro'))
print("recall:\t\t", recall_score(y_claim_test, pred_claim, average='macro'))
print("accuracy:\t", accuracy_score(y_claim_test, pred_claim))
print("f1:\t", f1_score(y_claim_test, pred_claim, average='macro'))

confusion_matrix:
 [[4664   17]
 [ 317    2]]
precision:	 0.5208106594532909
recall:		 0.5013189449244226
accuracy:	 0.9332
f1:	 0.48863295359481845


# is_claim Random Forest Classification

In [10]:
# Reload and re-preprocess both splits so this section starts from fresh frames.
df_train = pd.read_csv('train.csv', index_col=0)
df_test = pd.read_csv('test.csv', index_col=0)

df_train = preprocess_data(df_train)
df_test = preprocess_data(df_test)

In [11]:
# Variant using every preprocessed column except policy_id as a feature.
# NOTE(review): the following cell assigns the same variables again from the
# drop_features_clf() subset, so run in order this split is overwritten.
df_train_claim = df_train.drop(columns=['policy_id'])
df_test_claim = df_test.drop(columns=['policy_id'])

y_claim_train = df_train_claim['is_claim']
x_claim_train = df_train_claim.drop(columns=['is_claim'])

y_claim_test = df_test_claim['is_claim']
x_claim_test = df_test_claim.drop(columns=['is_claim'])

In [11]:
# Variant using the reduced column set from drop_features_clf(), minus policy_id.
df_train_claim = drop_features_clf(df_train).drop(columns=['policy_id'])
df_test_claim =  drop_features_clf(df_test).drop(columns=['policy_id'])

# Target: is_claim. Features: everything else.
y_claim_train = df_train_claim['is_claim']
x_claim_train = df_train_claim.drop(columns=['is_claim'])

y_claim_test = df_test_claim['is_claim']
x_claim_test = df_test_claim.drop(columns=['is_claim'])

In [12]:
# Degree-2 polynomial expansion (squares and pairwise products). The
# transformer is fitted on the training features only and then applied
# unchanged to the test features.
poly_claim = PolynomialFeatures(2)
x_claim_train = poly_claim.fit_transform(x_claim_train)
x_claim_test = poly_claim.transform(x_claim_test)

In [14]:
def get_optimum_depth(x_claim_train, y_claim_train, x_claim_test, y_claim_test):
    """Try ``max_depth`` 1..100 for the is_claim random forest and pick the best one.

    Each candidate is a ``RandomForestClassifier`` with ``max_features=10``,
    ``class_weight='balanced_subsample'`` and ``random_state=0``, scored by
    macro-averaged F1 on the evaluation data.

    Returns the depth with the highest score together with the list of
    scores. ``f1_scores[d]`` belongs to ``max_depth=d``; ``f1_scores[0]`` is
    a 0 placeholder that keeps the indices aligned with the depths.
    """
    def macro_f1_at(depth):
        # Fit one forest at the given depth and score it on the evaluation data.
        forest = RandomForestClassifier(
            max_depth=depth,
            n_jobs=-1,
            class_weight='balanced_subsample',
            random_state=0,
            max_features=10,
        )
        forest.fit(x_claim_train, y_claim_train)
        predictions = forest.predict(x_claim_test)
        return f1_score(y_claim_test, predictions, average='macro')

    f1_scores = [0] + [macro_f1_at(depth) for depth in range(1, 101)]
    best_score = max(f1_scores)
    optimum_depth = f1_scores.index(best_score)

    return optimum_depth, f1_scores
    

In [15]:
def get_optimum_max_features(x_claim_train, y_claim_train, x_claim_test, y_claim_test):
    """Try ``max_features`` 1..902 for the is_claim random forest and pick the best one.

    Each candidate is a ``RandomForestClassifier`` with
    ``class_weight='balanced_subsample'`` and ``random_state=0``, scored by
    macro-averaged F1 on the evaluation data.

    Returns the value with the highest score together with the list of
    scores. ``f1_scores[k]`` belongs to ``max_features=k``; ``f1_scores[0]``
    is a 0 placeholder that keeps the indices aligned with the values tried.
    """
    def macro_f1_with(feature_count):
        # Fit one forest with the given max_features and score it.
        forest = RandomForestClassifier(
            n_jobs=-1,
            class_weight='balanced_subsample',
            random_state=0,
            max_features=feature_count,
        )
        forest.fit(x_claim_train, y_claim_train)
        predictions = forest.predict(x_claim_test)
        return f1_score(y_claim_test, predictions, average='macro')

    f1_scores = [0] + [macro_f1_with(count) for count in range(1, 903)]
    best_score = max(f1_scores)
    optimum_mf = f1_scores.index(best_score)

    return optimum_mf, f1_scores
    

In [25]:
# Search max_depth for the is_claim classifier (100 forests are fitted).
opt_depth, f1_depth = get_optimum_depth(x_claim_train, y_claim_train, x_claim_test, y_claim_test)

In [26]:
# Bare name: shows the selected depth as the cell output.
opt_depth

20

In [816]:
# Macro-F1 per depth as a one-column frame; the row index equals the depth tried.
f1_depth_df = pd.DataFrame({'score':f1_depth})

In [820]:
# Save the scores. The frame's index is written as the first CSV column, so no
# separate reset_index() step is needed.
f1_depth_df.to_csv('f1_depth.csv', index=True)

In [13]:
# train a classifier
# NOTE(review): max_depth is 21 here while the depth search output shown above
# is 20 - confirm which value is intended.
rf = RandomForestClassifier(max_depth=21, n_jobs=-1, class_weight='balanced_subsample', random_state=0, max_features=10)
rf.fit(x_claim_train, y_claim_train)

# predict the test set
pred_claim = rf.predict(x_claim_test)

# Predictions as a one-column frame, ready for export.
pred_claim_df = pd.DataFrame(pred_claim, columns=['predicted_claim'])
# pred_rating_df.to_csv(f'z{id}.PART2.output.csv', index=False)


# Macro averaging weights both classes equally regardless of their size.
print("confusion_matrix:\n", confusion_matrix(y_claim_test, pred_claim))
print("precision:\t", precision_score(y_claim_test, pred_claim, average='macro'))
print("recall:\t\t", recall_score(y_claim_test, pred_claim, average='macro'))
print("accuracy:\t", accuracy_score(y_claim_test, pred_claim))
print("f1:\t", f1_score(y_claim_test, pred_claim, average='macro'))

confusion_matrix:
 [[4195  486]
 [ 239   80]]
precision:	 0.5437205436308895
recall:		 0.5734798649111092
accuracy:	 0.855
f1:	 0.5506258696938987


In [491]:
def get_feature_importance(model, data):
    """Rank features by the importance a fitted model assigns to them.

    ``data`` holds the feature names, in the same order as
    ``model.feature_importances_``. The result has the columns ``feature``
    and ``importance``, sorted from most to least important, with a fresh
    0..n-1 index.
    """
    ranking = pd.DataFrame({
        'feature': data,
        'importance': model.feature_importances_,
    })
    ranking = ranking.sort_values('importance', ascending=False)
    return ranking.reset_index(drop=True)

# Feature Importance for Age and is_claim (Random Forest)

In [501]:
# Rank the polynomial features of the is_claim forest; names come from the
# fitted PolynomialFeatures transformer. Print the top 20 and save the full
# ranking.
feature_importance_claim_df = get_feature_importance(rf, poly_claim.get_feature_names_out())
print(feature_importance_claim_df.head(20))
feature_importance_claim_df.to_csv('feature_importance_claim.csv', index=False)

                                 feature  importance
0                        policy_tenure^2    0.079063
1                          policy_tenure    0.079051
2               policy_tenure age_of_car    0.077680
3                           age_of_car^2    0.045591
4      policy_tenure steering_type_Power    0.045333
5                             age_of_car    0.044985
6   policy_tenure steering_type_Electric    0.032595
7            policy_tenure fuel_type_CNG    0.026651
8         policy_tenure fuel_type_Petrol    0.026272
9         age_of_car steering_type_Power    0.024901
10                policy_tenure model_M1    0.021730
11        policy_tenure fuel_type_Diesel    0.021549
12         policy_tenure area_cluster_C8    0.018871
13                policy_tenure model_M6    0.018529
14     age_of_car steering_type_Electric    0.018202
15                policy_tenure model_M4    0.017848
16           age_of_car fuel_type_Petrol    0.016737
17              age_of_car fuel_type_CNG    0.

In [502]:
# Rank the features of the age forest (x_age_train must still be a DataFrame
# here so its column names are available). Print the top 20 and save the full
# ranking to its own file so the is_claim ranking is not overwritten.
feature_importance_age_df = get_feature_importance(rfr, list(x_age_train.columns))
print(feature_importance_age_df.head(20))
feature_importance_age_df.to_csv('feature_importance_age.csv', index=False)

             feature  importance
0      policy_tenure    0.442683
1         age_of_car    0.196429
2           model_M1    0.044932
3           model_M8    0.041639
4           model_M6    0.030875
5    area_cluster_C7    0.029006
6           is_claim    0.022675
7    area_cluster_C8    0.021274
8    area_cluster_C5    0.018711
9   area_cluster_C11    0.015663
10          model_M9    0.014221
11          model_M4    0.012315
12   area_cluster_C3    0.012038
13  area_cluster_C13    0.009212
14          model_M3    0.007926
15   area_cluster_C9    0.007878
16   area_cluster_C2    0.007030
17  area_cluster_C14    0.006323
18          model_M7    0.005462
19  area_cluster_C12    0.005352
