In [2]:

def df_cleaning(original_df):
    '''
    Clean a raw dataframe and print a short structural summary of the result.

    The steps run in this order:

        1. Drop the 'Unnamed: 0' column when it is present (presumably a
           leftover index column from a CSV export).
        2. Standardize header names: lower-case, spaces replaced by underscores.
        3. Drop the 'vehicle_type' column when it is present, then drop every
           row that still contains a NaN value.
        4. Parse 'effective_to_date' as datetime, store its month number in a
           new 'month' column and drop 'effective_to_date'.
        5. Print the shape of the cleaned dataframe and the names of its
           numerical and categorical (object dtype) columns.

    Note: no quarter filtering is applied here; every month found in the data
    is kept. Filter on the 'month' column afterwards if only some months are
    wanted.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Raw data. It must contain a column that becomes 'effective_to_date'
        after header standardization. The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned dataframe.

    Raises
    ------
    KeyError
        If no 'effective_to_date' column exists after header standardization.
    '''

    # errors='ignore' makes both optional drops safe on frames that do not
    # carry these columns, so the function can be reused on other inputs.
    # 'Unnamed: 0' is matched before the headers are standardized.
    df_no_nan = (
        original_df
        .drop(columns=['Unnamed: 0'], errors='ignore')
        .rename(columns=lambda name: str(name).lower().replace(' ', '_'))
        .drop(columns=['vehicle_type'], errors='ignore')
        .dropna()  # all remaining rows with any NaN are discarded
    )

    # Date-time conversion + creating the 'month' column.
    # assign() builds a new frame instead of writing into the result of
    # dropna(), which avoids pandas' SettingWithCopyWarning.
    months = pd.to_datetime(df_no_nan['effective_to_date']).dt.month
    cleaned_df = df_no_nan.assign(month=months).drop(columns='effective_to_date')

    # Show the dataframe shape.
    print('This is the df shape', cleaned_df.shape)

    print('\n')

    # Which columns are numerical / categorical?
    columns_num = list(cleaned_df.select_dtypes(include='number').columns)
    print('Numerical columns are: ', columns_num)

    print('\n')

    columns_cat = list(cleaned_df.select_dtypes(include='object').columns)
    print('Categorical columns are: ', columns_cat)

    return cleaned_df



    
    
    

In [3]:

# Same thing but with subplots

def eda_02(df):
    '''
    Draw a 2x2 grid of plots describing the 'response' column.

    Panels:
        - top-left: count of each 'response' value;
        - top-right: count of each 'response' value per 'sales_channel';
        - bottom-left: percentage of 'Yes' responses per 'total_claim_amount' bin;
        - bottom-right: percentage of 'Yes' responses per 'income' bin.

    Side effect: the columns 'claim_bins' and 'incomes_bins' are added to
    (or overwritten on) `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'response', 'sales_channel',
        'total_claim_amount' and 'income'.

    Returns
    -------
    None
        The figure is shown with plt.show().
    '''
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))

    sns.histplot(df['response'], stat='count', ax=axes[0, 0])
    axes[0, 0].set_xlabel('Response')
    axes[0, 0].set_ylabel('Frequency')
    axes[0, 0].set_title('Total number of responses')

    # countplot draws raw counts, so the axis is labelled as a count.
    sns.countplot(x='sales_channel', hue='response', data=df, ax=axes[0, 1])
    axes[0, 1].set_xlabel('Sales channel')
    axes[0, 1].set_ylabel('Number of responses')
    axes[0, 1].set_title('Responses by the sales channel')

    # The last edge is open-ended so that amounts above any fixed cap still
    # land in the '600+' bin instead of becoming NaN.
    claim_edges = [0, 150, 300, 450, 600, float('inf')]
    claim_labels = ['0-150', '151-300', '301-450', '451-600', '600+']
    claim_rates = _response_rate_by_bin(df, 'total_claim_amount', 'claim_bins', claim_edges, claim_labels)
    sns.barplot(x='claim_bins', y='perc_rate', data=claim_rates, palette="Blues", ax=axes[1, 0])
    axes[1, 0].set_xlabel('Total claim amount')
    axes[1, 0].set_ylabel('Response rate %')
    axes[1, 0].set_title('Response rate by the total claim amount')

    # The first edge is -1 so that an income of exactly 0 falls in the first bin.
    income_edges = [-1, 15000, 30000, 45000, 60000, float('inf')]
    income_labels = ['0 - 15k', '15k - 30k', '30k - 45k', '45k - 60k', '60k and above']
    income_rates = _response_rate_by_bin(df, 'income', 'incomes_bins', income_edges, income_labels)
    sns.barplot(x='incomes_bins', y='perc_rate', data=income_rates, palette="Blues", ax=axes[1, 1])
    axes[1, 1].set_xlabel('Income bins')
    axes[1, 1].set_ylabel('Response rate %')
    axes[1, 1].set_title('Response rate by income')

    plt.tight_layout()
    plt.show()


def _response_rate_by_bin(df, value_column, bin_column, bin_edges, bin_labels):
    '''Bin `value_column` into `df[bin_column]` and return the % of 'Yes' responses per bin.'''
    # include_lowest=True keeps values equal to the first edge (e.g. a claim
    # amount of exactly 0) instead of turning them into NaN.
    df[bin_column] = pd.cut(
        df[value_column], bins=bin_edges, labels=bin_labels, include_lowest=True
    )
    # observed=False spells out the current default for categorical groupers.
    shares = (
        df.groupby(bin_column, observed=False)['response']
        .value_counts(normalize=True)
        .unstack()
        .fillna(0)  # a bin missing one of the response values has a 0 share of it
    )
    # Without any 'Yes' response in the data the column would not exist at all.
    if 'Yes' not in shares.columns:
        shares['Yes'] = 0.0
    shares['perc_rate'] = shares['Yes'] * 100
    return shares.reset_index()


In [4]:
def plot_distribution(data, columns):
    '''
    Plot the distribution of several columns twice: once with seaborn and
    once with matplotlib.

    Two figures are shown, each a single row with one panel per column:
        1. seaborn density histograms with a KDE curve;
        2. plain matplotlib histograms of raw counts.

    Parameters
    ----------
    data : pandas.DataFrame (or any mapping from column name to values)
        Source of the values to plot.
    columns : sequence of str
        Names of the columns to plot. Nothing is drawn when it is empty.

    Returns
    -------
    None
        Both figures are shown with plt.show().
    '''
    columns = list(columns)
    if not columns:
        # plt.subplots rejects ncols=0, and there is nothing to draw anyway.
        return

    # squeeze=False keeps `axes` two-dimensional even for a single column;
    # otherwise plt.subplots returns a bare Axes that cannot be indexed.
    fig, axes = plt.subplots(nrows=1, ncols=len(columns), figsize=(16, 4), squeeze=False)
    for ax, column in zip(axes[0], columns):
        # histplot(kde=True, stat='density') is the replacement for the
        # deprecated sns.distplot: a density histogram with a KDE overlay.
        sns.histplot(data[column], kde=True, stat='density', ax=ax)
        ax.set_xlabel(column.capitalize().replace('_', ' '))
    plt.tight_layout()
    plt.show()

    # Same layout again, this time with plain matplotlib histograms.
    fig, axes = plt.subplots(nrows=1, ncols=len(columns), figsize=(16, 4), squeeze=False)
    for ax, column in zip(axes[0], columns):
        ax.hist(data[column])
        ax.set_xlabel(column.capitalize().replace('_', ' '))
        ax.set_ylabel('Frequency')
    plt.tight_layout()
    plt.show()
    