In [29]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

In [76]:
# Load the CSV named numerical_df.csv into num_df.
# NOTE(review): this is an absolute path under one user's home directory, so the
# cell only runs on that machine; consider a path relative to a configurable data dir.
num_df = pd.read_csv("/Users/giacomorossini/Desktop/IRONHACK/Laboratory/LAB11.1/lab-cleaning-numerical-data/numerical_df.csv")

In [77]:
# Load the CSV named categorical_df_encod.csv into cat_df.
# NOTE(review): this is an absolute path under one user's home directory, so the
# cell only runs on that machine; consider a path relative to a configurable data dir.
cat_df = pd.read_csv("/Users/giacomorossini/Desktop/IRONHACK/Laboratory/LAB11.2/lab-cleaning-categorical-data/categorical_df_encod.csv")

In [78]:
def transformation(df):
    """Standardize the column labels of ``df`` in place.

    Every label is lowercased and each space in it is replaced by an
    underscore. The frame itself is modified; nothing is returned.
    """
    df.columns = [label.lower().replace(' ', '_') for label in df.columns]
    
# Standardize the column labels of both frames in place
# (lowercase, spaces replaced by underscores); the calls return nothing.
transformation(cat_df)
transformation(num_df)

In [80]:
# Overwrite effective_to_date with its parsed datetime version.
# No explicit format is passed, so pandas infers it from the values.
cat_df['effective_to_date']=pd.to_datetime(cat_df['effective_to_date'])

In [84]:
# Put the columns of both frames side by side; DataFrame.join matches rows on the index.
# NOTE(review): assumes num_df and cat_df list the same customers in the same row order — confirm.
data_tot=num_df.join(cat_df)

In [96]:
def outliers_iqr(data, columns, threshold=1.5):
    """Return a copy of ``data`` without rows that are IQR outliers.

    For each name in ``columns`` the first and third quartiles are taken from
    the unfiltered ``data``; rows whose value lies outside
    ``[Q1 - threshold * IQR, Q3 + threshold * IQR]`` (bounds inclusive) are
    removed. Filters accumulate across columns. ``data`` itself is not
    modified and the surviving rows keep their original index labels.
    """
    filtered = data.copy()
    for name in columns:
        # Quartiles always come from the original frame, not the shrinking one.
        q_low, q_high = (data[name].quantile(q) for q in (0.25, 0.75))
        spread = q_high - q_low
        values = filtered[name]
        inside = (values >= q_low - threshold * spread) & (values <= q_high + threshold * spread)
        filtered = filtered[inside]
    return filtered


# Drop rows that fall outside the 1.5 * IQR fences of any of the four listed columns,
# then renumber the surviving rows from 0 (the old index is discarded).
clean_no_outl = outliers_iqr(data_tot, ['total_claim_amount', 'income','customer_lifetime_value','monthly_premium_auto'], threshold=1.5).reset_index(drop = True)

In [97]:
# 2. Create a copy of the dataframe for the data wrangling,
#    so later changes do not touch the outlier-filtered frame clean_no_outl.
data_clean = clean_no_outl.copy()

In [100]:
def column_types(df):
    """Split the column labels of ``df`` by dtype.

    Returns a pair of pandas Index objects: first the columns selected by the
    'int', 'float' and 'datetime64' dtype selectors, then the columns whose
    dtype is 'object'. Columns of any other dtype appear in neither group.
    """
    numeric_like = ['int', 'float', 'datetime64']
    text_like = ['object']
    return (
        df.select_dtypes(include=numeric_like).columns,
        df.select_dtypes(include=text_like).columns,
    )

# Column labels grouped by dtype: int/float/datetime64 vs. object.
numerical_columns, categorical_columns = column_types(data_clean)

# Sub-frames of data_clean restricted to each group of columns.
numerical_df = data_clean[numerical_columns]
categorical_df = data_clean[categorical_columns]

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount,state_arizona,state_california,...,vehicle_class_four-door_car,vehicle_class_luxury_car,vehicle_class_luxury_suv,vehicle_class_suv,vehicle_class_sports_car,vehicle_class_two-door_car,vehicle_size_large,vehicle_size_medsize,vehicle_size_small,effective_to_date
0,2763.519279,56274,69,32,5,0,1,384.811147,0,0,...,0,0,0,0,0,1,0,1,0,2011-02-24
1,12887.431650,48767,108,18,38,0,2,566.472247,0,0,...,0,0,0,0,0,1,0,1,0,2011-02-19
2,7645.861827,0,106,18,65,0,7,529.881344,0,1,...,0,0,0,1,0,0,0,1,0,2011-01-20
3,2813.692575,43836,73,12,44,0,1,138.130879,0,0,...,1,0,0,0,0,0,0,1,0,2011-02-03
4,8256.297800,62902,69,14,94,0,2,159.383042,0,0,...,0,0,0,0,0,1,0,1,0,2011-01-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7842,4100.398533,47761,104,16,58,0,1,541.282007,0,1,...,1,0,0,0,0,0,1,0,0,2011-01-06
7843,3096.511217,21604,79,14,28,0,1,379.200000,0,1,...,1,0,0,0,0,0,0,1,0,2011-02-12
7844,8163.890428,0,85,9,37,3,2,790.784983,0,1,...,1,0,0,0,0,0,0,1,0,2011-02-06
7845,7524.442436,21941,96,34,3,0,3,691.200000,0,1,...,1,0,0,0,0,0,1,0,0,2011-02-03


In [101]:
# Print the number of distinct values in every column.
# DataFrame.nunique(dropna=False) counts missing values as one single category;
# len(set(series)) can count every NaN separately because NaN never compares
# equal to itself, which inflates the count for columns with missing data.
for column, unique in data_clean.nunique(dropna=False).items():
    print(column,':',unique)

customer_lifetime_value : 6890
income : 4983
monthly_premium_auto : 104
months_since_last_claim : 36
months_since_policy_inception : 100
number_of_open_complaints : 6
number_of_policies : 9
total_claim_amount : 4288
state_arizona : 2
state_california : 2
state_nevada : 2
state_oregon : 2
state_washington : 2
response_no : 2
response_yes : 2
coverage_basic : 2
coverage_extended : 2
coverage_premium : 2
education_bachelor : 2
education_college : 2
education_doctor : 2
education_high_school_or_below : 2
education_master : 2
employmentstatus_disabled : 2
employmentstatus_employed : 2
employmentstatus_medical_leave : 2
employmentstatus_retired : 2
employmentstatus_unemployed : 2
gender_f : 2
gender_m : 2
location_code_rural : 2
location_code_suburban : 2
location_code_urban : 2
marital_status_divorced : 2
marital_status_married : 2
marital_status_single : 2
policy_corporate_l1 : 2
policy_corporate_l2 : 2
policy_corporate_l3 : 2
policy_personal_l1 : 2
policy_personal_l2 : 2
policy_personal_l

In [102]:
def differentiate_features(df, threshold=100):
    """Split ``df`` into continuous and discrete columns by cardinality.

    A column is treated as continuous when it has more than ``threshold``
    distinct values (as counted by ``Series.nunique``), and as discrete otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be classified.
    threshold : int, default 100
        Cut-off on the number of distinct values. The default is an
        arbitrary choice kept from the original analysis.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(continuous, discrete)`` sub-frames of ``df``; column order within
        each follows the order in ``df``.
    """
    continuous_vars, discrete_vars = [], []
    for column in df.columns:
        is_continuous = df[column].nunique() > threshold
        (continuous_vars if is_continuous else discrete_vars).append(column)
    return df[continuous_vars], df[discrete_vars]

# Store the high-cardinality (continuous) columns in continuous_df
# and the remaining (discrete) columns in discrete_df.
continuous_df, discrete_df = differentiate_features(data_clean)

In [109]:
# 3. Normalize the continuous variables. You can use any one method you want.

# Min-max scaling with the scaler's default settings.
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(continuous_df)
# The scaler output here is rebuilt into a DataFrame, restoring the column
# labels AND the row index of continuous_df. The join below matches rows on the
# index, so without index= the rows would be misaligned (or filled with NaN)
# whenever continuous_df does not carry a default 0..n-1 index.
normalized_data = pd.DataFrame(normalized_data, columns=continuous_df.columns, index=continuous_df.index)
final_df = discrete_df.join(normalized_data)
final_df

Unnamed: 0,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,state_arizona,state_california,state_nevada,state_oregon,state_washington,response_no,...,vehicle_class_sports_car,vehicle_class_two-door_car,vehicle_size_large,vehicle_size_medsize,vehicle_size_small,effective_to_date,customer_lifetime_value,income,monthly_premium_auto,total_claim_amount
0,32,5,0,1,0,0,0,0,1,1,...,0,1,0,1,0,2011-02-24,0.059651,0.562847,0.073394,0.400735
1,18,38,0,2,0,0,1,0,0,1,...,0,1,0,1,0,2011-02-19,0.757387,0.487763,0.431193,0.589962
2,18,65,0,7,0,1,0,0,0,1,...,0,0,0,1,0,2011-01-20,0.396140,0.000000,0.412844,0.551847
3,12,44,0,1,0,0,0,0,1,1,...,0,0,0,1,0,2011-02-03,0.063109,0.438443,0.110092,0.143781
4,14,94,0,2,0,0,0,1,0,0,...,0,1,0,1,0,2011-01-25,0.438211,0.629140,0.073394,0.165918
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7842,16,58,0,1,0,1,0,0,0,1,...,0,0,1,0,0,2011-01-06,0.151788,0.477701,0.394495,0.563723
7843,14,28,0,1,0,1,0,0,0,0,...,0,0,0,1,0,2011-02-12,0.082600,0.216081,0.165138,0.394890
7844,9,37,3,2,0,1,0,0,0,1,...,0,0,0,1,0,2011-02-06,0.431842,0.000000,0.220183,0.823617
7845,34,3,0,3,0,1,0,0,0,1,...,0,0,1,0,0,2011-02-03,0.387772,0.219452,0.321101,0.719885


In [None]:
#4. Encode the categorical variables: nothing to do here, the data are already normalized and the categorical variables are already encoded.

In [126]:
#5. The time variable can be useful. Try to transform its data into a useful one. Hint: Day week and month as integers might be useful.

# Derive month, day-of-month and year from effective_to_date,
# then discard the original datetime column.
final_df = (
    final_df
    .assign(
        month=lambda frame: frame['effective_to_date'].dt.month,
        day=lambda frame: frame['effective_to_date'].dt.day,
        year=lambda frame: frame['effective_to_date'].dt.year,
    )
    .drop(columns=['effective_to_date'])
)

In [133]:
# Show the dtype of every column in final_df.
final_df.dtypes

months_since_last_claim            int64
months_since_policy_inception      int64
number_of_open_complaints          int64
number_of_policies                 int64
state_arizona                      int64
                                  ...   
monthly_premium_auto             float64
total_claim_amount               float64
month                              int64
day                                int64
year                               int64
Length: 61, dtype: object

In [153]:
# 6. Since the model will only accept numerical data, check and make sure that every column is numerical, if some are not, change it using encoding.

def check_numeric(dataframe):
    """Return the labels of the columns of ``dataframe`` that are not numeric.

    Columns are selected with ``select_dtypes(exclude='number')``, so every
    integer and float width counts as numeric (not only the 32/64-bit ones
    matched by the 'int' and 'float' selectors). Object, boolean and
    datetime columns are reported as non-numeric.

    The result depends only on the column dtypes, not on the row count: a frame
    with zero rows still reports its non-numeric columns.

    Returns
    -------
    list
        Column labels in frame order; empty when every column is numeric.
    """
    return dataframe.select_dtypes(exclude='number').columns.tolist()
non_numeric = check_numeric(final_df)
# Report either the offending columns or the all-numeric summary, never both.
# The loop is nested under the `if` so that the `else` pairs with the `if`;
# an `else` attached to a `for` runs whenever the loop finishes without `break`,
# which printed the "columns numeric" line even when offenders were listed.
if non_numeric:
    print("Non-numeric columns found:")
    for column in non_numeric:
        print(column)
else:
    print("columns numeric:", len(final_df.columns))

columns numeric: 61
