# Import libraries

In [None]:
import pandas as pd  # keywords are highlighted in green, other strings in red, etc.
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#for plots
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#for regression
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler # do not use the function Normalise() - it does something entirely different
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
#categorical Variables
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

# Round 2 - Dealing with the data

In [None]:
# Load the raw marketing data. The file can be re-exported without an index
# column via data_mca.to_csv('./marketing_customer_analysis.csv', index=False).
data_mca = pd.read_csv('./marketing_customer_analysis.csv')

# 'Unnamed: 0' is presumably a leftover index column from an earlier export.
# errors='ignore' keeps this cell re-runnable once the file has been saved
# without that column; a plain drop would raise KeyError in that case.
data_mca = data_mca.drop(columns=['Unnamed: 0'], errors='ignore')
data_mca.head()

### Round 2 - #1
Show the dataframe shape

In [None]:
data_mca.shape

In [None]:
data_mca.columns

### Round 2 - #2
Standardize header names

In [None]:
# 'EmploymentStatus' is the only header written without a word separator.
# Insert the space here so that the later lower-casing / underscore steps turn
# it into 'employment_status' like every other multi-word header.
column_renames = {'EmploymentStatus': 'Employment Status'}
data_mca = data_mca.rename(columns=column_renames)

In [None]:
# Lower-case every header with the vectorised string accessor (the same
# accessor the underscore replacement uses) instead of a manual append loop.
data_mca.columns = data_mca.columns.str.lower()

In [None]:
# Replace every space in the header names by "_" and show the result.
data_mca.columns = [header.replace(' ', '_') for header in data_mca.columns]
data_mca.columns

### Round 2 - #3 , #4
Which columns are numerical? Which columns are categorical?

In [None]:
data_mca.dtypes

### Round 2 - #5 
Check and deal with NaN values.

In [None]:
# Check for duplicate rows in the data and remove them if there are any.
# A bare expression in the middle of a cell is not displayed, so the count is
# printed explicitly.
print("duplicated rows:", data_mca.duplicated().sum())
data_mca = data_mca.drop_duplicates()

# Remaining missing values per column.
data_mca.isna().sum()

In [None]:
# Share of missing values per column, in percent (fraction rounded to 4
# decimals before scaling), as a one-column DataFrame.
null_fraction = data_mca.isna().sum() / len(data_mca)
data_mca_percent = pd.DataFrame(null_fraction.round(4) * 100)
data_mca_percent

In [None]:
# Strategy: drop rows that have null values (only if there are very few).
# display() is used because a bare expression in the middle of a cell is
# not shown.
display(data_mca[data_mca['state'].isna()].head(60))

# 'state' and 'response' seem to be missing in the same rows --> drop the rows.
data_mca = data_mca.dropna(subset=['state'])
display(data_mca.isna().sum())
data_mca.shape

In [None]:
# display() is used because a bare expression in the middle of a cell is
# not shown.
display(data_mca[data_mca['months_since_last_claim'].isna()].head(60))

# 'months_since_last_claim' and 'number_of_open_complaints' seem to be missing
# in the same rows --> drop the rows.
data_mca = data_mca.dropna(subset=['months_since_last_claim'])
data_mca.isna().sum()

In [None]:
# vehicle_class: fill the missing entries with the most frequent category
# (the first index entry of value_counts, which sorts by descending count).
vehicle_class_counts = data_mca['vehicle_class'].value_counts(dropna=True)
most_common_class = vehicle_class_counts.index[0]
data_mca['vehicle_class'] = data_mca['vehicle_class'].fillna(most_common_class)

data_mca.isna().sum()

In [None]:
# vehicle_size: a categorical column, so it is filled with the most frequent
# category (the mode), not with a mean.
vehicle_size_counts = data_mca['vehicle_size'].value_counts(dropna=True)
most_common_size = vehicle_size_counts.index[0]
data_mca['vehicle_size'] = data_mca['vehicle_size'].fillna(most_common_size)

data_mca.isna().sum()

In [None]:
# vehicle_type --> nearly 50% is missing!
# Strategy: fill the missing entries with a value that we choose.

# The `np.object` alias was removed from NumPy; the builtin `object` selects
# the same columns on every NumPy version.
data_mca.describe(include=[object]).T  # --> only 1 unique value and NaN!
data_mca['vehicle_type'].unique()

# Assumption: 'A' stands for an automatic car, so a missing value is treated as
# "not automatic". Otherwise the column could simply be dropped.
data_mca['vehicle_type'] = data_mca['vehicle_type'].fillna('not A')
data_mca.isna().sum()

### Round 2 - #6
Datetime format - Extract the months from the dataset and store in a separate column. 
Then filter the data to show only the information for the first quarter , 
ie. January, February and March. Hint: If data from March does not exist, consider only January and February.

In [None]:
data_mca

In [None]:
data_mca.columns

In [None]:
# None of the columns has a datetime dtype yet, so 'effective_to_date' is
# parsed here. errors='coerce' turns unparseable entries into NaT instead of
# raising.
parsed_dates = pd.to_datetime(data_mca['effective_to_date'], errors='coerce')
data_mca['effective_to_date'] = parsed_dates
data_mca.head()

In [None]:
# Store the month number of the effective-to date in its own column.
month_numbers = data_mca['effective_to_date'].dt.month
data_mca['effective_to_date_month'] = month_numbers
data_mca

In [None]:
# Author's observation: the month column only contains first-quarter months,
# so this filter is expected to keep every row.
first_quarter_months = [1, 2, 3]
in_first_quarter = data_mca['effective_to_date_month'].isin(first_quarter_months)
data_mca_Q1 = data_mca[in_first_quarter]
data_mca_Q1

### Round 2 - #7
#BONUS: Put all the previously mentioned data transformations into a function.

In [None]:
# Now define a function to clean the dataframe

def clean_dataframe(x):
    """Return a cleaned copy of the marketing customer DataFrame.

    Steps, in order:
      1. drop the 'Unnamed: 0' column if it is present,
      2. standardize headers (lower case, "_" instead of spaces,
         'EmploymentStatus' becomes 'employment_status'),
      3. drop duplicated rows,
      4. drop rows with missing 'state' or 'months_since_last_claim',
      5. fill missing 'vehicle_class' / 'vehicle_size' with the most frequent
         value and missing 'vehicle_type' with 'not A',
      6. parse 'effective_to_date' and add 'effective_to_date_month'.

    Parameters
    ----------
    x : pandas.DataFrame
        Raw data with the original (not yet standardized) headers.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame; the input is left untouched.
    """
    # Work on a copy so that neither the caller's data nor its column labels
    # are modified.
    x = x.copy()
    x = x.drop(columns=['Unnamed: 0'], errors='ignore')

    ## Standardize headers
    # The rename has to happen BEFORE lower-casing: afterwards the key
    # 'EmploymentStatus' no longer matches any column and the header would
    # stay 'employmentstatus'.
    x = x.rename(columns={'EmploymentStatus': 'Employment Status'})
    x.columns = [name.lower().replace(' ', '_') for name in x.columns]

    ## Remove duplicated rows
    x = x.drop_duplicates()

    ## Missing values
    # 'state'/'response' and 'months_since_last_claim'/
    # 'number_of_open_complaints' were observed to be missing in the same rows,
    # so those rows are dropped. copy() makes the assignments below operate
    # on an independent frame rather than on a slice.
    x = x.dropna(subset=['state', 'months_since_last_claim']).copy()

    # Categorical columns: fill with the most frequent value (the mode).
    for column in ('vehicle_class', 'vehicle_size'):
        counts = x[column].value_counts(dropna=True)
        if not counts.empty:  # nothing to impute from if the column is all-NaN
            x[column] = x[column].fillna(counts.index[0])

    # About half of 'vehicle_type' is missing; the only other value is 'A'.
    x['vehicle_type'] = x['vehicle_type'].fillna('not A')

    ## Dates
    # errors='coerce' turns unparseable dates into NaT instead of raising.
    x['effective_to_date'] = pd.to_datetime(x['effective_to_date'], errors='coerce')
    x['effective_to_date_month'] = x['effective_to_date'].dt.month
    return x

# Round 3 - EDA (Exploratory Data Analysis)

###  Round 3 - #1 , #2
Show DataFrame info, Describe Data Frame

In [None]:
# DataFrame info: info() prints its report itself and returns None, so there
# is nothing useful to assign or display.
data_mca.info()

# Describe the numerical columns only. Restricting to np.number keeps the
# datetime column out of the table (newer pandas versions summarise datetime
# columns by default), which would otherwise break the rounding below.
summary_numerical = data_mca.describe(include=[np.number]).T

# Additionally add the interquartile range and the range.
summary_numerical['iqr'] = summary_numerical['75%'] - summary_numerical['25%']
summary_numerical['range'] = summary_numerical['max'] - summary_numerical['min']


def roundforme(x):
    """Return x rounded to 2 decimals (a bare round() would use 0 decimals)."""
    return round(x, 2)


# Round the whole table in one go; apply() hands each column to roundforme.
summary_numerical = summary_numerical.apply(roundforme)
summary_numerical

In [None]:
# Describe the object (string) columns. The `np.object` alias was removed from
# NumPy; the builtin `object` selects the same columns.
summary_objects = data_mca.describe(include=[object]).T
summary_objects

In [None]:
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

###  Round 3 - #3
Show a plot of the total number of responses.

In [None]:
# Bar chart with the number of rows per value of 'response'.
response_axes = sns.countplot(x=data_mca['response'])
response_axes.set_ylabel('Total number of responses')
plt.show()

###  Round 3 - #4
Show a plot of the response rate by the sales channel.

In [None]:
import matplotlib.pyplot as plt

# Encode the response as 0 / 100 so that the mean per group (which barplot
# draws by default) reads directly as a response rate in percent.
yes_no_as_number = {'Yes': 1, 'No': 0}
data_mca['numerical_response'] = data_mca['response'].map(yes_no_as_number) * 100

channel_axes = sns.barplot(data=data_mca, x="sales_channel", y="numerical_response")
channel_axes.set_xlabel("Sales Channel")
channel_axes.set_ylabel("Numerical Response in [%]")
channel_axes.set_title("Response rate by Sales Channel")
plt.show()


###  Round 3 - #5
Show a plot of the response rate by the total claim amount.

In [None]:
# Bin the total claim amount into 10 quantile bins (deciles).
data_mca['claims_bin'] = pd.qcut(data_mca['total_claim_amount'], q=10)

# Response rate per bin: 'numerical_response' is 0/100, so its mean per bin
# (barplot's default estimator) is the response rate in percent. The previous
# version plotted the claim amount against its own bin instead.
sns.barplot(x='claims_bin', y='numerical_response', data=data_mca)
plt.xticks(rotation=45, ha='right')
plt.xlabel('Total claim amount (decile bins)')
plt.ylabel('Response rate in [%]')
plt.title('Response rate by Total Claim Amount')
plt.show()

###  Round 3 - #6
Show a plot of the response rate by income

In [None]:
# Bin the income into (up to) 10 quantile bins. duplicates='drop' merges bins
# whose edges coincide, so fewer than 10 bins may remain.
data_mca['income_bin'] = pd.qcut(data_mca['income'], q=10, duplicates='drop')

# Response rate per bin: 'numerical_response' is 0/100, so its mean per bin
# (barplot's default estimator) is the response rate in percent. The previous
# version plotted the income against its own bin instead.
sns.barplot(x='income_bin', y='numerical_response', data=data_mca)
plt.xticks(rotation=45, ha='right')
plt.xlabel('Income (quantile bins)')
plt.ylabel('Response rate in [%]')
plt.title('Response rate by Income')
plt.show()

# Round 4 - Processing Data

###  Round 4 - #1
Check the data types of the columns. Get the numeric data into dataframe called numerical and categorical columns in a dataframe called categoricals. (You can use np.number and np.object to select the numerical data types and categorical data types respectively)

In [None]:
# Categorical (object dtype) columns. The `np.object` alias was removed from
# NumPy; the builtin `object` selects the same columns.
categoricals = data_mca.select_dtypes(include=object)
categoricals

In [None]:
numerical = data_mca.select_dtypes(include=np.number)
numerical

###  Round 4 - #2
Now we will try to check the normality of the numerical variables visually.

#### Round 4 - #2.1
Use seaborn library to construct distribution plots for the numerical variables

In [None]:
numerical.columns

In [None]:
# One distribution plot (histogram plus KDE curve) per numerical column.
# histplot replaces the deprecated distplot; stat='density' keeps the
# normalised y-axis that distplot drew. Only the single column is passed,
# the whole-DataFrame positional argument was redundant.
for column_name in numerical.columns:
    sns.histplot(x=numerical[column_name], kde=True, stat='density')
    plt.xlabel(column_name)
    plt.ylabel('Distribution')
    plt.show()

#### Round 4 - #2.2
Use Matplotlib to construct histograms

In [None]:
# https://matplotlib.org/stable/gallery/pyplots/pyplot_text.html#sphx-glr-gallery-pyplots-pyplot-text-py

# One 20-bin matplotlib histogram per numerical column.
for column_name, column_values in numerical.items():
    plt.hist(column_values, bins=20)
    plt.xlabel(column_name)
    plt.ylabel('Frequency')
    plt.show()


#### Round 4 - #2.3
Do the distributions of the different numerical variables look like a normal distribution?

The Distribution of the Customer Lifetime Value looks normally distributed. 
Total Claim Amount looks somewhat normally distributed

#### Round 4 - #3 - #4
For the numerical variables, check the multicollinearity between the features. Please note that we will use the column total_claim_amount later as the target variable.

Drop one of the two features that show a high correlation between them (greater than 0.9). Write code for both the correlation matrix and for seaborn heatmap. If there is no pair of features that have a high correlation, then do not drop any features

In [None]:
correlations_matrix = numerical.corr()
correlations_matrix

In [None]:
#heatmap
sns.heatmap(correlations_matrix, annot=True)
plt.show()

In [None]:
# 0.9 and 1.0 very highly correlated
# 0.7 and 0.9 highly correlated
# 0.5 and 0.7 moderately correlated
# 0.3 and 0.5 low correlation
# less than 0.3 little if any (linear) correlation. 

# Target Value: Total Claim Amount 
# is moderately correlated with monthly_premium_auto (0.63)
# is weakly correlated with income (-0.35)

# There is practically no correlation between monthly premium auto and income (-0.0014)

In [None]:
# reduced_numerical = numerical[['total_claim_amount', 'income', 'monthly_premium_auto']]
# reduced_numerical.columns

# Round 5 - Processing Data

#### Round 5 - #1 X-y split

In [None]:
# Split into target (y) and features (X, everything except the target).
target_column = 'total_claim_amount'
X = data_mca.drop(columns=[target_column])
y = data_mca[target_column]
display(X)

In [None]:
# Drop columns that are not used as features: the customer id, the date and
# the helper columns created during the EDA.
columns_to_drop = [
    'effective_to_date_month',
    'numerical_response',
    'customer',
    'effective_to_date',
    'income_bin',
    'claims_bin',
]
X = X.drop(columns=columns_to_drop)
X.shape

In [None]:
X.dtypes

In [None]:
X_num = X.select_dtypes(np.number)
X_cat = X.select_dtypes(object)

In [None]:
X_num.shape

In [None]:
X_cat.shape

#### Round 5 - #2 Normalize (numerical)

Normalization or Min-Max Scaling is used to transform features to be on a similar scale. 

The new point is calculated as:
X_new = (X - X_min)/(X_max - X_min)

This scales the range to [0, 1] or sometimes [-1, 1]. 
Geometrically speaking, the transformation squishes the n-dimensional data into an n-dimensional 
unit hypercube. Normalization is useful when there are no outliers, as it cannot cope with 
them. Usually, we would scale age and not incomes, because only a few people have high incomes 
while age is close to uniformly distributed.

In [None]:
# Min-max scaling is the scaling that is active in this notebook; the
# StandardScaler alternative further down is commented out.
# Every numerical feature is mapped to the range 0 - 1.
# NOTE(review): the scaler is fitted on the full data set, i.e. before the
# train-test split.
transformer = MinMaxScaler()
scaled_values = transformer.fit_transform(X_num)
print(scaled_values.shape)

# The scaler returns a plain array, so the column names are restored here.
# The resulting frame gets a fresh 0..n-1 index.
X_num_normalized = pd.DataFrame(scaled_values, columns=X_num.columns)
X_num_normalized

Standardization or Z-Score Normalization is the transformation of features 
by subtracting the mean and dividing by the standard deviation. The result is often called the Z-score.

X_new = (X - mean)/Std

Standardization can be helpful in cases where the data follows a Gaussian distribution. 
However, this does not necessarily have to be true. Geometrically speaking, it translates 
the data so that the mean vector of the original data lies at the origin, and squishes or 
expands the points so that the standard deviation becomes 1. We are only changing the mean 
and the standard deviation: a normal distribution becomes a standard normal distribution, 
so the shape of the distribution is not affected.

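Standardization is less affected by outliers than min-max scaling because there is no predefined range for the transformed features; note, however, that the mean and the standard deviation themselves are still influenced by extreme values.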

In [None]:
# # Standardization or Z-Score Normalization
# transformer = StandardScaler().fit(X_num)
# X_num_normalized = transformer.transform(X_num)
# print(X_num_normalized.shape)

# X_num_normalized= pd.DataFrame(X_num_normalized, columns=X_num.columns)
# X_num_normalized

# Round 6 - Processing Data, Linear Regression, Model Validation

#### Round 6 - Processing Data - #1 One Hot/Label Encoding (categorical).

In [None]:
X_cat.head()

In [None]:
# One hot encoding turns each categorical variable into several 0/1 columns.
# drop='first' removes the first category of every feature (first in the
# encoder's category order, not the most frequent one) to avoid redundant,
# perfectly collinear columns.
encoder = OneHotEncoder(drop='first').fit(X_cat)
encoded = encoder.transform(X_cat).toarray()

# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement. The fallback keeps older installations working.
if hasattr(encoder, 'get_feature_names_out'):
    cols = encoder.get_feature_names_out(X_cat.columns)
else:
    cols = encoder.get_feature_names(input_features=X_cat.columns)

onehot_encoded = pd.DataFrame(encoded, columns=cols)
onehot_encoded.shape

#### Round 6 - Processing Data - #2 Concat DataFrames

In [None]:
X = pd.concat([X_num_normalized, onehot_encoded], axis=1)
X

#### Round 6 - Linear Regression - #3 Train-test split.

In [None]:
y = y.reset_index(drop=True)
#onehot_encoded=onehot_encoded.reset_index(drop=True)
X = X.reset_index(drop=True)

In [None]:
# Scale used to read the coefficients (absolute value):
#   0.9 - 1.0  very highly correlated
#   0.7 - 0.9  highly correlated
#   0.5 - 0.7  moderately correlated
#   0.3 - 0.5  low correlation
#   below 0.3  little if any (linear) correlation

# Correlation of the target with all features, including the encoded
# categoricals. The target is placed first, so row 0 holds its correlations.
Collinearity = pd.concat((y, X), axis=1)
corr_matrix_cat = Collinearity.corr()  # 44x44-matrix!

# Keep only coefficients of at least 0.3 in absolute value; all other cells
# become NaN.
is_relevant = (corr_matrix_cat >= .3) | (corr_matrix_cat <= -.3)
filtered_matrix = corr_matrix_cat.where(is_relevant)
print(filtered_matrix.iloc[0])

plt.figure(figsize=(30,10))
sns.heatmap(filtered_matrix, annot=True, cmap="Reds")
plt.show()

# Recorded results, numericals - target 'total_claim_amount':
#   moderately correlated with monthly_premium_auto (0.63)
#   weakly correlated with income (-0.35)
#   no correlation between monthly_premium_auto and income (-0.0014)
#
# Recorded results, categoricals - target 'total_claim_amount':
#   weakly correlated with employment_status_Employed (-0.33)
#   weakly correlated with employment_status_Unemployed (-0.32)
#   moderately correlated with location_code_Suburban (0.57)
#   weakly correlated with vehicle_class_Luxury Car (0.31)
#   weakly correlated with vehicle_class_Luxury SUV (0.31)

In [None]:
## From Round 7: use the concept of multicollinearity and remove insignificant variables
# so check if there is a correlation between the 7 variables.

In [None]:
filtered_matrix['monthly_premium_auto']

#lowly correlated with "vehicle_class_Luxury Car" (0.451538)
#lowly correlated with "vehicle_class_Luxury SUV" (0.484589)

#makes sense... 

In [None]:
filtered_matrix['income']

#highly correlated with "employment_status_Employed" (0.794671)
#highly correlated with "employment_status_Unemployed" (-0.722468)
#lowly correlated with "location_code_Suburban" (-0.450730)

#--> toss the employment status Employed and Unemployed and keep only income!

In [None]:
filtered_matrix['location_code_Suburban']
#lowly correlated with "income" (-0.450730)

In [None]:
filtered_matrix['vehicle_class_Luxury Car']
#lowly correlated with "monthly_premium_auto" (0.451538)

#--> toss the vehicle_class Luxury Car/SUV

In [None]:
# y is still the 'total_claim_amount' target from above.
# Keep only the features that showed a correlation with the target.
selected_features = [
    'monthly_premium_auto',
    'income',
    'employment_status_Employed',
    'employment_status_Unemployed',
    'location_code_Suburban',
    'vehicle_class_Luxury Car',
    'vehicle_class_Luxury SUV',
]
X = X[selected_features]

# Round 7 experiment (disabled): without the "insignificant variables"
# employment_status and vehicle_class Luxury Car/SUV:
# X = X[['monthly_premium_auto','location_code_Suburban']]

In [None]:
# We do the Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Fit the linear regression on the training data only.
lm = linear_model.LinearRegression().fit(X_train, y_train)

# Intercept, followed by one coefficient per feature.
print("b0 = ", lm.intercept_)
print("b1 = ", lm.coef_)

#### Round 6 - Linear Regression -  #4 Apply linear regression.

In [None]:
# We compare the predicted y-data (applied linear regression on x_train) and compare it with our y_train data.
predictions = lm.predict(X_train)
r2_score(y_train, predictions)
# our r2 score is good.

#### Round 6 - Model Validation -  #5 Description R2

In [None]:
# Predict on the held-out test features and score against y_test.
predictions_test = lm.predict(X_test)
R2 = r2_score(y_true=y_test, y_pred=predictions_test)
print("R2-score is ", R2)
# Author's observation: the test R2 came out slightly higher than the train R2.
# NOTE(review): with a small test split this can plausibly happen by chance;
# try other random_state values or cross-validation to confirm.

In [None]:
y_test[:5]

In [None]:
predictions_test[:5]

#### Round 6 - Model Validation -  #6 Description MSE

In [None]:
mse = mean_squared_error(y_true=y_test, y_pred=predictions_test)
print("mean squared error (MSE) is ", mse)

# The MSE is in squared units of the target, so it looks extremely large;
# the RMSE is easier to relate to the data.

# Recorded results:
# Round 6: 21780.55771867215
# Round 7 without Employment Status: 22531.810977831443
# Round 7 without Employment Status + Luxury Car/SUV: 22641.020956354456
# Round 7 only 'monthly_premium_auto','location_code_Suburban': 23960.230603560052
# Round 7 with MIN-MAX-Scale: 21780.55771867215 --> no difference: rescaling the
#   features only rescales the fitted coefficients of a plain least-squares
#   regression, the predictions stay the same.
# Round 7 with 0.15: 20785.98606442319 !!! Better !!!
# Round 7 with 0.25: 22890.035134861595, worse...
# Round 7 with 0.10: 21348.528873555086 !!! Better !!!
# Round 7 with 0.05: 20300.42282084946 !!! Better !!!

# --> removing variables did not reduce the error
# --> we keep all the variables

#### Round 6 - Model Validation -  #7 Description RMSE

In [None]:
# RMSE: square root of the MSE, i.e. back in the units of the target.
squared_error = mean_squared_error(y_test, predictions_test)
rmse = np.sqrt(squared_error)
print("root mean squared error (RMSE) is ", rmse)

# Author's observation: compared with the five sample pairs (y_test[:5] vs.
# predictions_test[:5]) the RMSE is roughly three times larger than expected.

# Recorded results:
# Round 6: 147.58237604359184
# Round 7 without Employment Status: 150.10599914004584 --> not very helpful...
# Round 7 without Employment Status + Luxury Car/SUV: 150.4693356014921
# Round 7 only 'monthly_premium_auto','location_code_Suburban': 154.7909254561134
# Round 7 with MIN-MAX-Scale: 147.58237604359184 --> no difference (rescaling
#   the features does not change the predictions of a plain least-squares fit)

In [None]:
#Have a look at another sample, to get an impression of the error...
abs(y_test[50:60]-predictions_test[50:60])

#### Round 6 - Model Validation -  #8 Description MAE

In [None]:
# MAE: average absolute difference between y_test and the test predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=predictions_test)
print("mean absolute error (MAE) is ", mae)

# Recorded results:
# Round 6: 109.21649592507937
# Round 7 without Employment Status: 113.20024356020065
# Round 7 without Employment Status + Luxury Car/SUV: 113.62662388135328
# Round 7 only 'monthly_premium_auto','location_code_Suburban': 118.55317357733347
# Round 7 with MIN-MAX-Scale: 109.21649592507936 --> no difference (rescaling
#   the features does not change the predictions of a plain least-squares fit)

# Round 7

Build a function, from round 2 and round 7, to clean and process the data.

In [None]:
# Now define a function to clean the dataframe

def clean_and_process(x):
    """Return a cleaned copy of the marketing customer DataFrame.

    Currently this covers the Round 2 cleaning steps only:
      1. drop the 'Unnamed: 0' column if it is present,
      2. standardize headers (lower case, "_" instead of spaces,
         'EmploymentStatus' becomes 'employment_status'),
      3. drop duplicated rows,
      4. drop rows with missing 'state' or 'months_since_last_claim',
      5. fill missing 'vehicle_class' / 'vehicle_size' with the most frequent
         value and missing 'vehicle_type' with 'not A',
      6. parse 'effective_to_date' and add 'effective_to_date_month'.

    Parameters
    ----------
    x : pandas.DataFrame
        Raw data with the original (not yet standardized) headers.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame; the input is left untouched.
    """
    ## Round 2
    # Work on a copy so that neither the caller's data nor its column labels
    # are modified.
    x = x.copy()
    x = x.drop(columns=['Unnamed: 0'], errors='ignore')

    ## Standardize headers
    # The rename has to happen BEFORE lower-casing: afterwards the key
    # 'EmploymentStatus' no longer matches any column and the header would
    # stay 'employmentstatus'.
    x = x.rename(columns={'EmploymentStatus': 'Employment Status'})
    x.columns = [name.lower().replace(' ', '_') for name in x.columns]

    ## Remove duplicated rows
    x = x.drop_duplicates()

    ## Missing values
    # 'state'/'response' and 'months_since_last_claim'/
    # 'number_of_open_complaints' were observed to be missing in the same rows,
    # so those rows are dropped. copy() makes the assignments below operate
    # on an independent frame rather than on a slice.
    x = x.dropna(subset=['state', 'months_since_last_claim']).copy()

    # Categorical columns: fill with the most frequent value (the mode).
    for column in ('vehicle_class', 'vehicle_size'):
        counts = x[column].value_counts(dropna=True)
        if not counts.empty:  # nothing to impute from if the column is all-NaN
            x[column] = x[column].fillna(counts.index[0])

    # About half of 'vehicle_type' is missing; the only other value is 'A'.
    x['vehicle_type'] = x['vehicle_type'].fillna('not A')

    ## Dates
    # errors='coerce' turns unparseable dates into NaT instead of raising.
    x['effective_to_date'] = pd.to_datetime(x['effective_to_date'], errors='coerce')
    x['effective_to_date_month'] = x['effective_to_date'].dt.month

    ## Round 3
    # TODO: the processing steps of the later rounds are not implemented yet.
    return x