In [2]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from pandas.plotting import scatter_matrix
import seaborn as sns
from IPython.display import set_matplotlib_formats, HTML
from matplotlib.dates import DateFormatter
import matplotlib_inline 
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from matplotlib import colors as mcolors
from pandas.plotting import register_matplotlib_converters
import plotly.express as px
%matplotlib inline
%config InlineBackend.figure_format = 'png'
import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore') 

In [3]:
# Formatting plots
# default styles
def set_sns_format(width=14, height=8):
    """Apply the notebook-wide plotting defaults.

    Sets the seaborn theme (pastel palette, 'notebook' context, saved
    figures at 300 dpi), switches inline figure output to the 'retina'
    format and stores ``(width, height)`` as the default figure size.

    Parameters
    ----------
    width, height : number
        Values written to matplotlib's ``figure.figsize`` setting.

    Returns
    -------
    None
    """
    theme_overrides = {'savefig.dpi': 300}
    sns.set_theme(palette='pastel', context='notebook', rc=theme_overrides)
    matplotlib_inline.backend_inline.set_matplotlib_formats('retina')
    # Set after the theme call so the requested size is the one left in rcParams.
    matplotlib.rcParams['figure.figsize'] = (width, height)
set_sns_format(width=14, height=8)

In [4]:
def add_value_labels(ax, typ, spacing=5):
    """Annotate every bar or line point on ``ax`` with its rounded value.

    Parameters
    ----------
    ax : matplotlib Axes (or any object with the same members)
        Must expose ``annotate`` plus ``patches`` (for ``typ='bar'``) or
        ``lines`` (for ``typ='line'``).
    typ : str
        ``'bar'`` labels each patch at the centre of its top edge;
        ``'line'`` labels every (x, y) point of every line.
        Any other value leaves the axes untouched.
    spacing : number, default 5
        Gap, in points, between the data point and its label.

    Returns
    -------
    None
    """
    def _label(x_value, y_value):
        # Negative values get their label below the point; placing it above
        # (as for positive values) would draw the text over the bar/marker.
        if y_value < 0:
            offset, va = -spacing, 'top'
        else:
            offset, va = spacing, 'bottom'
        ax.annotate("{:.0f}".format(y_value), (x_value, y_value), xytext=(0, offset),
                    textcoords="offset points", ha='center', va=va, fontsize=10)

    if typ == 'bar':
        for patch in ax.patches:
            y_value = patch.get_height()
            x_value = patch.get_x() + patch.get_width() / 2
            _label(x_value, y_value)
    elif typ == 'line':
        for line in ax.lines:
            for x_value, y_value in zip(line.get_xdata(), line.get_ydata()):
                _label(x_value, y_value)

In [5]:
# Location of the lab's customer-value CSV.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact file exists; consider a path relative to the notebook.
DATA_FILE = r"C:\Users\ssai\OneDrive\Data_26-07\labs\lab-comparing-regression-models\files_for_lab/we_fn_use_c_marketing_customer_value_analysis.csv"
df = pd.read_csv(DATA_FILE)

In [6]:
# Drop the "Customer" column, then lower-case the headers and replace spaces
# with underscores so the columns can be referenced uniformly below.
df = (
    df.drop(columns="Customer")
      .rename(columns=lambda name: name.lower().replace(' ', '_'))
)
df['effective_to_date'] = pd.to_datetime(df['effective_to_date'])

# Split the columns by dtype: numeric columns vs. object (string) columns.
numerical_df = df.select_dtypes(include=np.number)
categorical_df = df.select_dtypes(include=object)

In [7]:
def drop_outlier(data, column, whisker=1.5):
    """Return ``data`` without the rows whose ``column`` value is an IQR outlier.

    A row is dropped when its value lies below ``Q1 - whisker * IQR`` or above
    ``Q3 + whisker * IQR``. Rows for which neither comparison is true (this
    includes NaN values) are kept. The frame's shape is printed before and
    after filtering.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column used to detect outliers.
    whisker : float, default 1.5
        Multiple of the inter-quartile range that defines the whiskers.

    Returns
    -------
    pandas.DataFrame
        The remaining rows with a fresh 0..n-1 index, so later column-wise
        concatenations align row by row.
    """
    print("old shape: ", data.shape)                    # dataframe shape before drops

    q1 = data[column].quantile(0.25)  # first quartile
    q3 = data[column].quantile(0.75)  # third quartile
    iqr = q3 - q1                     # inter-quartile range
    lower, upper = q1 - whisker * iqr, q3 + whisker * iqr

    is_outlier = (data[column] < lower) | (data[column] > upper)
    # reset_index returns a new frame instead of mutating the filtered slice in place.
    kept = data[~is_outlier].reset_index(drop=True)

    print("new shape: ", kept.shape)                    # data frame shape after drops
    return kept

In [8]:
# Remove the IQR outliers of total_claim_amount (used as the target further down).
# NOTE: this rebinds `df`, so the cell is not idempotent: re-running it filters
# the already-filtered frame again with freshly computed quartiles.
df = drop_outlier(df,"total_claim_amount")

old shape:  (9134, 23)
new shape:  (8681, 23)


### Create a copy of the dataset

In [9]:
# Work on an independent copy so the cleaned `df` is not altered by the encoding steps.
df2 = df.copy()
# Numeric columns and object (string) columns are processed separately below.
numerical_df2, categorical_df2 = (
    df2.select_dtypes(include=kind) for kind in (np.number, object)
)

### Normalize continuous variables

In [10]:
from sklearn.preprocessing import MinMaxScaler
# Scaler for the numeric columns; with default arguments MinMaxScaler maps each column to [0, 1].
min_max_transformer = MinMaxScaler()

In [11]:
# Min-max scale every numeric column of the working copy.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# and the numeric block still contains the target `total_claim_amount`, so the
# target is scaled as well and test rows influence the scaling parameters.
x = numerical_df2.values
x_scaled = min_max_transformer.fit_transform(x)
# Carry the original row index over: the column-wise concat later aligns on the
# index, so a rebuilt default index would only be correct if the source index
# already happened to be 0..n-1.
numerical_df2 = pd.DataFrame(
    x_scaled, columns=numerical_df2.columns, index=numerical_df2.index
)

In [12]:
# Integer codes for the columns that are encoded by explicit mapping.
# Values missing from a mapping become NaN (behaviour of Series.map with a dict).
_ordinal_codes = {
    "coverage": {"Basic" : 0, "Extended" : 1, "Premium" : 2},
    "employmentstatus": {"Employed" : 0, "Unemployed" : 1, "Medical Leave" : 2, "Disabled" : 3, "Retired" : 4},
    "location_code": {"Suburban" : 0, "Rural" : 1, "Urban" : 2},
    "vehicle_size": {"Small" : 0, "Medsize" : 1, "Large" : 2},
}
for _column, _codes in _ordinal_codes.items():
    categorical_df2[_column] = categorical_df2[_column].map(_codes)

In [13]:

# Columns of categorical_df2 that get one-hot encoded in the next cell.
dummies_list = [
    "state",
    "marital_status",
    "policy_type",
    "policy",
    "renew_offer_type",
    "sales_channel",
    "vehicle_size",
]

In [14]:
# One-hot encode the listed columns; drop_first omits the first level of each
# column so the dummy columns are not perfectly collinear.
categorical_df2 = pd.get_dummies(
    data=categorical_df2,
    columns=dummies_list,
    prefix=dummies_list,
    drop_first=True,
)

In [15]:
# Break the effective date into separate day / month / year columns.
_effective_dates = df2['effective_to_date'].dt
time_df2 = pd.DataFrame({
    "day": _effective_dates.day,
    "month": _effective_dates.month,
    "year": _effective_dates.year,
})


In [16]:
# Put the encoded, scaled and date-derived columns side by side; rows are aligned on the index.
_feature_blocks = [categorical_df2, numerical_df2, time_df2]
df2 = pd.concat(_feature_blocks, axis="columns")


In [17]:
# One-hot encode four further categorical columns of the combined frame.
# NOTE: this rebinds `dummies_list`, which an earlier cell also defines.
dummies_list = ["response", "education", "gender", "vehicle_class"]
df2 = pd.get_dummies(
    data=df2,
    columns=dummies_list,
    prefix=dummies_list,
    drop_first=True,
)

##  Train Test Split


In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Target is total_claim_amount; every other column is a feature.
y = df2["total_claim_amount"]
X = df2.drop(columns="total_claim_amount")
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

## Linear Regression Model

In [20]:
from sklearn.linear_model import LinearRegression
# Baseline model: linear regression with scikit-learn's default settings.
reg = LinearRegression()

In [21]:
# Fit the baseline on the training split only.
reg.fit(X_train, y_train)


In [22]:
from sklearn.metrics import r2_score
# Predict on both splits with the fitted baseline model.
predictions_train, predictions_test = reg.predict(X_train), reg.predict(X_test)

# R2 on train and test; a large gap between the two would indicate overfitting.
r2_train = r2_score(y_true=y_train, y_pred=predictions_train)
r2_test = r2_score(y_true=y_test, y_pred=predictions_test)

print('R2 value for train: {}'.format(r2_train))
print('R2 value for test: {}'.format(r2_test))

R2 value for train: 0.5210723066631118
R2 value for test: 0.49631211040256396


In [31]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
# Candidate regressors for the comparison below.
lin = LinearRegression()
knn = KNeighborsRegressor(n_neighbors = 3)
# Seed the MLP: its weight initialisation and batch shuffling are random, so
# without a seed the scores change on every run. 42 matches the seed used for
# the train/test split.
mlp = MLPRegressor(max_iter = 2000, random_state = 42)

In [33]:
from sklearn.preprocessing import MinMaxScaler
# Fresh scaler for the model comparison; model_inplace fits it on the training split only.
scaler = MinMaxScaler()

In [34]:
def model_inplace(scaler, models, X_train, X_test, y_train, y_test):
    """Fit each model on scaled features and tabulate its train/test score.

    The scaler is fitted on ``X_train`` only and then applied to ``X_test``,
    so no information from the test split enters the scaling step here.
    ``scaler`` and every entry of ``models`` are fitted in place.

    Parameters
    ----------
    scaler : object with ``fit_transform`` and ``transform``
    models : iterable of objects with ``fit(X, y)`` and ``score(X, y)``
    X_train, X_test : feature sets for the two splits
    y_train, y_test : matching targets

    Returns
    -------
    pandas.DataFrame
        One row per model, in input order, with the columns ``model`` (class
        name), ``train_score`` and ``test_score`` (values of ``model.score``).
        An empty ``models`` yields an empty frame with those same columns.
    """
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    scores = []
    for model in models:
        model.fit(X_train, y_train)
        train_score = model.score(X_train, y_train)
        test_score = model.score(X_test, y_test)
        scores.append({
            'model': type(model).__name__,
            'train_score': train_score,
            'test_score': test_score,
        })

    # Explicit columns keep the schema stable even when `models` is empty, so
    # callers can always sort or select by 'test_score'.
    return pd.DataFrame(scores, columns=['model', 'train_score', 'test_score'])

In [35]:
# Score all three candidates on the same split, sharing one scaler.
models = [lin, knn, mlp]
scores_df = model_inplace(
    scaler=scaler,
    models=models,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)


In [37]:
# Rank the models by their test-split score, best first.
scores_df.sort_values(by='test_score', ascending=False)

Unnamed: 0,model,train_score,test_score
2,MLPRegressor,0.841199,0.776406
0,LinearRegression,0.521058,0.496613
1,KNeighborsRegressor,0.616259,0.232048


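* MLPRegressor is the best model for this problem: it has the highest test score (0.776), ahead of LinearRegression (0.497) and KNeighborsRegressor (0.232).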