In [1]:
import polars as pl
import plotly.express as px
from scipy import stats

In [2]:
schema = {
    "id": pl.UInt32,
    "age": pl.UInt32,
    "job": pl.String,
    "marital": pl.String,
    "education": pl.String,
    "default": pl.String,
    "housing": pl.String,
    "loan": pl.String,
    "contact": pl.String,
    "month": pl.String,
    "day_of_week": pl.String,
    "duration": pl.UInt32,
    "campaign": pl.UInt32,
    "pdays": pl.UInt32,
    "previous": pl.UInt32,
    "poutcome": pl.String,
    "emp.var.rate": pl.Float32,
    "cons.price.idx": pl.Float32,
    "cons.conf.idx": pl.Float32,
    "euribor3m": pl.Float32,
    "nr.employed": pl.Float32, # start with float, cause one value is float
    "y": pl.String,
    "test_control_flag": pl.String
    
}

In [3]:
data = pl.read_csv("data/bank_data_prediction_task_2024.csv", schema=schema, null_values=["NA"])

# change datatypes


In [4]:
#change month to int
data = data.with_columns(pl.col("month")
            .replace({
                "jan": 1,
                "feb": 2,
                "mar": 3,
                "apr": 4,
                "may": 5,
                "jun": 6,
                "jul": 7,
                "aug": 8,
                "sep": 9,
                "oct": 10,
                "nov": 11,
                "dec": 12
            }).cast(pl.UInt32)
        )

In [5]:
#change day of week to int
data = data.with_columns(pl.col("day_of_week")
            .replace({
                "mon": 1,
                "tue": 2,
                "wed": 3,
                "thu": 4,
                "fri": 5
            }).cast(pl.UInt32)
        )

In [6]:
#turn nr.employed to int
data = data.with_columns(pl.col("nr.employed").cast(pl.Int32))

In [7]:
#turn y to 0 and 1
data = data.with_columns(pl.col("y").replace({'no': 0, 'yes': 1}).cast(pl.UInt32))

In [8]:
data = data.with_columns(
    pl.col("test_control_flag")
    .replace({'campaign group': 1, 'control group': 0}).cast(pl.UInt32)
    ).rename({"test_control_flag": "in_campaign_group"}
)

In [9]:
data.describe()

statistic,id,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,in_campaign_group
str,f64,f64,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64
"""count""",41188.0,41188.0,"""41188""","""41188""","""41188""","""41188""","""41188""","""41188""","""16476""",16476.0,16476.0,16476.0,16476.0,41188.0,41188.0,"""41188""",41188.0,40938.0,41188.0,41188.0,41188.0,41188.0,41188.0
"""null_count""",0.0,0.0,"""0""","""0""","""0""","""0""","""0""","""0""","""24712""",24712.0,24712.0,24712.0,24712.0,0.0,0.0,"""0""",0.0,250.0,0.0,0.0,0.0,0.0,0.0
"""mean""",20594.5,40.02406,,,,,,,,6.599539,2.984644,259.660172,2.546856,962.475454,0.172963,,0.081886,93.575951,-40.502602,3.621291,5166.849446,0.125473,0.400019
"""std""",11890.09578,10.42125,,,,,,,,2.041805,1.409035,260.660561,2.699664,186.910907,0.494901,,1.57096,0.57889,4.628198,1.734447,72.328375,0.331259,0.489908
"""min""",1.0,17.0,"""admin.""","""divorced""","""basic.4y""","""no""","""no""","""no""","""cellular""",3.0,1.0,0.0,1.0,0.0,0.0,"""failure""",-3.4,92.200996,-50.799999,0.634,4963.0,0.0,0.0
"""25%""",10298.0,32.0,,,,,,,,5.0,2.0,103.0,1.0,999.0,0.0,,-1.8,93.074997,-42.700001,1.344,5099.0,0.0,0.0
"""50%""",20595.0,38.0,,,,,,,,6.0,3.0,180.0,2.0,999.0,0.0,,1.1,93.749001,-41.799999,4.857,5191.0,0.0,0.0
"""75%""",30891.0,47.0,,,,,,,,8.0,4.0,321.0,3.0,999.0,0.0,,1.4,93.994003,-36.400002,4.961,5228.0,0.0,1.0
"""max""",41188.0,98.0,"""unknown""","""unknown""","""unknown""","""yes""","""yes""","""yes""","""telephone""",12.0,5.0,4918.0,43.0,999.0,7.0,"""success""",1.4,94.766998,-26.9,5.045,5228.0,1.0,1.0


# Missing values

we have two places of missing values:
1) for control group all of contact, month, day_of_week, campaign, duration are empty
2) for consumer price index there are 250 missing values. 

In [10]:
#we can see that the observations are rather sorted, hence we can fill the null values with forward fill
data = data.with_columns(pl.col("cons.price.idx").forward_fill())

## Data exploration

### Expectations:
- H1: data is unbalanced - number of not subscribed observations will be much higher
- H2: Mostly elderly people subscribe to the product
- H3: People with loans (housing or loan) or with payment issues (default) are less likely to subscribe
- H4: Social and economic indicators have low impact on the outcome variable

### Dataset balance

In [11]:
# find balance of the dataset:
counts = data.group_by("y").agg(pl.count("y").alias("count"))
fig = px.bar(counts, y='count', x="y", title="Balance of the dataset").show()

Only 5168 people subscribed to the product, while over 36 000 did not. It is a clear indicator off unbalanced dataset

### Age distribution

the data is heavily unbalanced, indicating that the conversion rate of people who decide to subscribe to product is low.

In [12]:
# plot the age distribution for outcome variable (product subscription) distinguishing between the control and campaign group
fig = px.violin(data, y='age', x='y', color='in_campaign_group', box=True, points='all', title='Age Distribution by y and in_campaign_group').show()





- We can see that all distribusions for age are higlhy right skewed, indicating that most of observations where conducted on age group under 40 years old. 
- The group who decided to subscribe to product has a thincker tail, but it is not an indicator that edlerly people are more likely to subscribe, as the distributions are simmilar to those who did not decide to subscribe
- the median age for not subscribers is 38 years old, while for subscribers it is 37

### Other categorical variables

In [13]:
def plot_categorical(data, column):
    grouped_data = data.group_by(column, "in_campaign_group", "y").agg(
                pl.len().alias("count")
            ).sort(['count'], descending=True)
    fig = px.bar(grouped_data, x=column, y='count', 
             barmode='stack',
             color='in_campaign_group',
             facet_col='y'
            ).show()

In [14]:
plot_categorical(data, "job")
plot_categorical(data, "marital")
plot_categorical(data, "education")

- the majority of jobs are within admin., blue collar and technician
- the majority of people are married
- the majority of people have university degree or a high school education

- the proportions between those in control and campaign group who chose to subscribe to product is similar for all categories
    - with execptions of blue collar and technician, where more people subscribed when contacted
    - number of retired people who subscribed is proportionaly higher than we would expect if the subscribed part followed the same distribution as not subscribed
    - people with lower education when in campaign group are more likely to subscribe (more contacted people than not contacted between high.school to basic)

### Loans

In [15]:
plot_categorical(data, "housing")
plot_categorical(data, "loan")
plot_categorical(data, "default")

- People with housing loans are proportionaly less likely to subscribe
- There is not real difference between subscription and no subscription for people with loans
- Default column bring almost no value as there is only 1 default observation

### Social and economic indicators

calculate point biseral correlation to efffectively measure relation between continous and outcome binary variable

In [20]:
continuous_vars = ['emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
correlations = {}
for var in continuous_vars:
    r_pb, p_value = stats.pointbiserialr(data['y'].to_numpy(), data[var].to_numpy())
    correlations[var] = {'Point-Biserial Correlation': r_pb, 'p-value': p_value}

variables = list(correlations.keys())
correlation_values = [correlations[var]['Point-Biserial Correlation'] for var in variables]
p_values = [correlations[var]['p-value'] for var in variables]
corr_df = pl.DataFrame({
    'Variable': variables,
    'Point-Biserial Correlation': correlation_values,
    'p-value': p_values
})

fig = px.bar(corr_df, x='Variable', y='Point-Biserial Correlation').show()

- negative `employment variation rate` correlation indicates an inverse relationship between `emp.var.rate` and binary outcome variable `y`
    - the magnitude being 0.25 means a moderate correlation, not very strong but significant
- negative `consumer price index` correlation indicates an inverse relationship between `cons.price.idx` and binary outcome variable `y`
    - the magnitude being 0.11 means a low correlation, but a significant one
- positive `consumer confidence index` correlation indicates a positive relationship between `cons.conf.idx` and binary outcome variable `y`
    - low significant
- negative `euribor 3 month rate` correlation indicates an inverse relationship between `euribor3m` and binary outcome variable `y`
    - the magnitude being 0.26 means a moderate correlation, not very strong but significant
- negative `number of employees` correlation indicates an inverse relationship between `nr.employed` and binary outcome variable `y`
    - the magnitude being 0.3 means a moderate correlation, not very strong but significant