![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png)

# Lab | Data cleaning and wrangling

For this lab, we will be using the same dataset we used in the previous labs. We recommend using the same notebook, since you will be reusing the variables you previously created and used in those labs.

### Instructions

So far we have worked on `EDA`. This lab will focus on data cleaning and wrangling from everything we noticed before.

1. We will start by removing outliers. So far, we have discussed different methods for removing outliers. Choose the one you feel most comfortable with, define a function for it, and apply that function to the dataframe to remove the outliers.
2. Create a copy of the dataframe for the data wrangling.
3. Normalize the continuous variables. You can use any one method you want.
4. Encode the categorical variables
5. The time variable can be useful. Try to transform it into more useful features. Hint: the day, week, and month as integers might be useful.
6. Since the model will only accept numerical data, check that every column is numerical. If some are not, convert them using encoding.

**Hint for Categorical Variables**

- You should deal with the categorical variables as shown below (for ordinal encoding, dummy code has been provided as well):

```python
# One hot to state
# Ordinal to coverage
# Ordinal to employmentstatus
# Ordinal to location code
# One hot to marital status
# One hot to policy type
# One hot to policy
# One hot to renew offer type
# One hot to sales channel
# One hot to vehicle class
# Ordinal to vehicle size

data["coverage"] = data["coverage"].map({"Basic" : 0, "Extended" : 1, "Premium" : 2})
# given that column "coverage" in the dataframe "data" has three categories:
# "Basic", "Extended", and "Premium", and values are to be represented in the same order.
```


***

In [67]:
import pandas as pd
import numpy as np
import snakecase
import stringcase
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Suppress every warning raised from this point on.
warnings.filterwarnings("ignore")

***

In [121]:
# Read the numeric and categorical CSV files, discarding the listed helper
# columns from each, then place the two frames side by side.
numerical = pd.read_csv('numericalsfinal.csv').drop(columns=['Unnamed: 0'])
categorical = pd.read_csv('categoricalsfinal.csv').drop(
    columns=['Unnamed: 0', 'alpha_customer', 'customer']
)
data = pd.concat([numerical, categorical], axis=1)
data

Unnamed: 0,customer__lifetime__value,income,monthly__premium__auto,months__since__last__claim,months__since__policy__inception,number_of__open__complaints,number_of__policies,total__claim__amount,state,response,...,marital_status,policy_type,policy_n,renew_offer_type,sales_channel,vehicle_class,vehicle_size,effective_to_date,year_effective_to_date,month_effective_to_date
0,2763.519279,56274,69,32,5,0,1,384.811147,Washington,No,...,Married,Corporate Auto,3,1,Agent,Two-Door Car,Medsize,2011-02-24,2011,2
1,6979.535903,0,94,13,42,0,8,1131.464935,Arizona,No,...,Single,Personal Auto,3,3,Agent,Four-Door Car,Medsize,2011-01-31,2011,1
2,12887.431650,48767,108,18,38,0,2,566.472247,Nevada,No,...,Married,Personal Auto,3,1,Agent,Two-Door Car,Medsize,2011-02-19,2011,2
3,7645.861827,0,106,18,65,0,7,529.881344,California,No,...,Married,Corporate Auto,2,1,Call Center,SUV,Medsize,2011-01-20,2011,1
4,2813.692575,43836,73,12,44,0,1,138.130879,Washington,No,...,Single,Personal Auto,1,1,Agent,Four-Door Car,Medsize,2011-02-03,2011,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9129,23405.987980,71941,73,18,89,0,2,198.234764,California,No,...,Married,Personal Auto,1,2,Web,Four-Door Car,Medsize,2011-02-10,2011,2
9130,3096.511217,21604,79,14,28,0,1,379.200000,California,Yes,...,Divorced,Corporate Auto,3,1,Branch,Four-Door Car,Medsize,2011-02-12,2011,2
9131,8163.890428,0,85,9,37,3,2,790.784983,California,No,...,Single,Corporate Auto,2,1,Branch,Four-Door Car,Medsize,2011-02-06,2011,2
9132,7524.442436,21941,96,34,3,0,3,691.200000,California,No,...,Married,Personal Auto,2,3,Branch,Four-Door Car,Large,2011-02-03,2011,2


***
#### 3. Normalize the continuous variables. You can use any one method you want.

In [122]:
# Keep only the numeric (and boolean) columns. Uses the public select_dtypes
# API instead of the private DataFrame._get_numeric_data helper.
numerical_df = data.select_dtypes(include=["number", "bool"])

In [123]:
def differentiate_variables(df, max_discrete_unique=20):
    """Split the columns of *df* into continuous and discrete frames.

    A column is treated as discrete when it has at most
    ``max_discrete_unique`` distinct non-null values, and as continuous
    otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be classified.
    max_discrete_unique : int, default 20
        Largest number of distinct values a column may have and still be
        considered discrete.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(continuous_df, discrete_df)``; each keeps the original column
        order and the index of *df*.
    """
    # Classify using the frame that was passed in, not a module-level global,
    # so the function works for any dataframe.
    discrete_mask = (df.nunique() <= max_discrete_unique).to_numpy()

    continuous_df = df.loc[:, ~discrete_mask]
    discrete_df = df.loc[:, discrete_mask]

    return continuous_df, discrete_df
# Split the numeric columns into continuous and discrete frames, then show
# the continuous one.
continuous_df, discrete_df = differentiate_variables(numerical_df)
continuous_df

Unnamed: 0,customer__lifetime__value,income,monthly__premium__auto,months__since__last__claim,months__since__policy__inception,total__claim__amount
0,2763.519279,56274,69,32,5,384.811147
1,6979.535903,0,94,13,42,1131.464935
2,12887.431650,48767,108,18,38,566.472247
3,7645.861827,0,106,18,65,529.881344
4,2813.692575,43836,73,12,44,138.130879
...,...,...,...,...,...,...
9129,23405.987980,71941,73,18,89,198.234764
9130,3096.511217,21604,79,14,28,379.200000
9131,8163.890428,0,85,9,37,790.784983
9132,7524.442436,21941,96,34,3,691.200000


In [None]:
from sklearn.preprocessing import Normalizer

# NOTE(review): Normalizer rescales each row (sample) to unit norm rather than
# scaling each column; confirm this is the intended scaling for this step.
transformer = Normalizer().fit(continuous_df)
# transform() returns a bare ndarray, so re-attach the original column names
# and index; otherwise the later concat produces integer-named columns and
# lookups such as data['customer__lifetime__value'] fail.
continuous_df = pd.DataFrame(
    transformer.transform(continuous_df),
    columns=continuous_df.columns,
    index=continuous_df.index,
)
continuous_df

***
### 4. Encode the categorical variables

In [124]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every column of `categorical`; the encoder's sparse output
# is densified and wrapped in a dataframe with default integer column names.
categorical_encoded = pd.DataFrame(
    OneHotEncoder().fit_transform(categorical).toarray()
)
categorical_encoded

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,103,104,105,106,107,108,109,110,111,112
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9129,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
9130,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
9131,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
9132,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


***
#### 5. The time variable can be useful. Try to transform it into more useful features. Hint: the day, week, and month as integers might be useful.

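This is already done: the loaded dataset already contains the `year_effective_to_date` and `month_effective_to_date` columns derived from `effective_to_date`.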

***
#### 6. Since the model will only accept numerical data, check that every column is numerical. If some are not, convert them using encoding.

In [125]:
# Join the continuous, discrete and one-hot encoded frames column-wise.
data = pd.concat(
    [continuous_df, discrete_df, categorical_encoded],
    axis="columns",
)
data

Unnamed: 0,customer__lifetime__value,income,monthly__premium__auto,months__since__last__claim,months__since__policy__inception,total__claim__amount,number_of__open__complaints,number_of__policies,policy_n,renew_offer_type,...,103,104,105,106,107,108,109,110,111,112
0,2763.519279,56274,69,32,5,384.811147,0,1,3,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,6979.535903,0,94,13,42,1131.464935,0,8,3,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,12887.431650,48767,108,18,38,566.472247,0,2,3,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,7645.861827,0,106,18,65,529.881344,0,7,2,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,2813.692575,43836,73,12,44,138.130879,0,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9129,23405.987980,71941,73,18,89,198.234764,0,2,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
9130,3096.511217,21604,79,14,28,379.200000,0,1,3,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
9131,8163.890428,0,85,9,37,790.784983,3,2,2,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
9132,7524.442436,21941,96,34,3,691.200000,0,3,2,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


#### 1. We will start by removing outliers. So far, we have discussed different methods for removing outliers. Choose the one you feel most comfortable with, define a function for it, and apply that function to the dataframe to remove the outliers.

In [127]:
def remove_outliers(
    data,
    columns=(
        'customer__lifetime__value',
        'monthly__premium__auto',
        'total__claim__amount',
    ),
    whisker=1.5,
):
    """Drop rows whose values fall outside the IQR fences of the given columns.

    For each column in turn, the 25th and 75th percentiles are computed on the
    rows that are still present, and only rows strictly inside
    ``(Q1 - whisker * IQR, Q3 + whisker * IQR)`` are kept. Because the columns
    are processed sequentially, the fences of later columns are computed after
    the outliers of earlier columns have been removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in *columns*.
    columns : iterable of str
        Columns to filter, in the order they are processed. Defaults to the
        three columns the original implementation handled.
    whisker : float, default 1.5
        Multiplier applied to the IQR to obtain the fences.

    Returns
    -------
    pandas.DataFrame
        The filtered rows; the input frame itself is not modified.
    """
    for col in columns:
        q1, q3 = np.percentile(data[col], [25, 75])
        iqr = q3 - q1
        lower_limit = q1 - whisker * iqr
        upper_limit = q3 + whisker * iqr
        # Apply both fences: the lower limit used to be computed but never
        # used, so low outliers were silently kept.
        inside = (data[col] > lower_limit) & (data[col] < upper_limit)
        data = data[inside]
    return data
# Rebind `data` to the filtered frame returned by remove_outliers.
data = remove_outliers(data)

In [143]:
# Show `data` after the outlier-removal step above.
data

Unnamed: 0,customer__lifetime__value,income,monthly__premium__auto,months__since__last__claim,months__since__policy__inception,total__claim__amount,number_of__open__complaints,number_of__policies,policy_n,renew_offer_type,...,103,104,105,106,107,108,109,110,111,112
0,2763.519279,56274,69,32,5,384.811147,0,1,3,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,12887.431650,48767,108,18,38,566.472247,0,2,3,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,7645.861827,0,106,18,65,529.881344,0,7,2,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,2813.692575,43836,73,12,44,138.130879,0,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
5,8256.297800,62902,69,14,94,159.383042,0,2,3,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9128,4100.398533,47761,104,16,58,541.282007,0,1,2,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
9130,3096.511217,21604,79,14,28,379.200000,0,1,3,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
9131,8163.890428,0,85,9,37,790.784983,3,2,2,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
9132,7524.442436,21941,96,34,3,691.200000,0,3,2,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
