In [325]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats 
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import statsmodels.api as sm
from sklearn.metrics import r2_score

In [326]:
# Read the CSV (path is relative to the notebook's working directory) into `data`.
data = pd.read_csv(filepath_or_buffer='marketing_customer_analysis.csv')

In [330]:
def _boxcox_column(values, make_positive=False):
    """Return the Box-Cox transform of one numeric column as a NumPy array.

    Parameters
    ----------
    values : pandas.Series
        Numeric column to transform.
    make_positive : bool, default False
        When True, negative entries are first set to 0 and every 0 is then
        replaced by the column mean (computed after the clipping), because
        Box-Cox is only defined for strictly positive data.

    Raises
    ------
    ValueError
        Propagated from ``scipy.stats.boxcox`` if the data is still not
        strictly positive.
    """
    values = values.astype(float)
    if make_positive:
        values = values.clip(lower=0)
        values = values.mask(values == 0, values.mean())
    transformed, _ = stats.boxcox(values.to_numpy())
    return transformed


def clean(data):
    """Clean ``data``, fit an OLS model for 'total claim amount', return test R².

    Steps:
      1. Lower-case the column names, drop columns with >= 60% nulls, drop
         rows with any remaining null, and drop the 'effective to date' and
         'customer' columns. These steps modify ``data`` IN PLACE.
      2. Show a correlation heatmap of the numeric columns.
      3. Box-Cox transform the continuous predictors, label-encode the two
         count-like columns, and one-hot encode the categorical columns.
      4. Split 60/40 (``random_state=100``), fit ``statsmodels`` OLS with an
         intercept on the training part and score it on the test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw customer table. After lower-casing it must contain the columns
        referenced below (e.g. 'income', 'coverage', 'total claim amount').

    Returns
    -------
    float
        R² of the fitted model on the held-out test split.

    Raises
    ------
    ValueError
        If no rows remain after dropping missing values.
    KeyError
        If a required column is missing.
    """
    # --- in-place cleaning of the caller's frame -------------------------
    data.columns = data.columns.str.lower()

    null_share = data.isna().mean()
    mostly_null = null_share[null_share >= 0.6].index
    data.drop(columns=mostly_null, inplace=True)
    data.dropna(inplace=True)
    # errors='ignore' keeps the function re-runnable on an already-cleaned frame.
    data.drop(columns=['effective to date', 'customer'], errors='ignore', inplace=True)

    if data.empty:
        raise ValueError("no rows left after dropping missing values")

    # Correlate numeric columns only: string columns cannot be correlated.
    sns.heatmap(data.select_dtypes(include='number').corr(), annot=True, annot_kws={"size": 6})
    plt.show()

    # --- feature groups (explicit copies so `data` is not written through) ---
    # Maps each continuous column to whether it needs the clip/fill step
    # before Box-Cox (columns that may contain zeros or negatives).
    continuous = {
        'months since policy inception': True,
        'customer lifetime value': False,
        'income': True,
        'monthly premium auto': False,
        'months since last claim': True,
    }
    numerical = data[list(continuous)].copy()
    for column, make_positive in continuous.items():
        numerical[column] = _boxcox_column(numerical[column], make_positive)

    categorical = data[['coverage', 'gender', 'location code', 'sales channel', 'vehicle class', 'vehicle size']]
    # One-hot encode so the design matrix is numeric; one level per variable
    # is dropped because the model below includes an intercept.
    categorical_encoded = pd.get_dummies(categorical, drop_first=True, dtype=float)

    hybrid = data[['number of open complaints', 'number of policies']].copy()
    for column in hybrid.columns:
        hybrid[column] = LabelEncoder().fit_transform(hybrid[column])

    # pd.concat keeps column labels and the row index, unlike np.concatenate.
    X = pd.concat([numerical, categorical_encoded, hybrid], axis=1).astype(float)
    # statsmodels OLS does not add an intercept on its own.
    X = sm.add_constant(X, has_constant='add')
    Y = data['total claim amount'].astype(float)

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=100)

    model = sm.OLS(Y_train, X_train).fit()
    predictions = model.predict(X_test)

    # Return the computed score, not the `r2_score` function itself.
    return r2_score(Y_test, predictions)
    