In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the four raw input files (parsed with read_csv defaults) into
# df_1 .. df_4, in the order the rest of the notebook expects.
df_1, df_2, df_3, df_4 = [
    pd.read_csv(source_file)
    for source_file in (
        "Client_Profiles.txt",
        "Digital_Footprints_pt1.txt",
        "Digital_Footprints_pt2.txt",
        "Experiment_Roster.txt",
    )
]

In [None]:
# Preview the first five rows of df_1.
df_1.head(n=5)

In [None]:
# (rows, columns) of each loaded frame, in load order.
tuple(frame.shape for frame in (df_1, df_2, df_3, df_4))

In [None]:
# Column labels of each loaded frame, in load order.
tuple(frame.columns for frame in (df_1, df_2, df_3, df_4))

In [None]:
# Column dtypes of df_1 as inferred by read_csv.
df_1.dtypes

In [None]:
# Count of missing values in each df_1 column.
df_1.isna().sum()

In [None]:
# Distinct values found in the gendr column (a missing value shows up as NaN).
df_1["gendr"].unique()

In [None]:
# Keep only rows of df_1 that have no missing value in any column.
df_1 = df_1.dropna(axis="index", how="any")


In [None]:
# Preview the first five rows of df_2.
df_2.head(n=5)

In [None]:
# Column dtypes of df_2 as inferred by read_csv.
df_2.dtypes

In [None]:
# Parse date_time into pandas datetimes so events can later be ordered
# chronologically (the column is used as a sort key further down).
df_2["date_time"] = pd.to_datetime(df_2["date_time"])


In [None]:
# Count of missing values in each df_2 column.
df_2.isna().sum()

In [None]:
# Preview the first five rows of df_3.
df_3.head(n=5)

In [None]:
# Parse date_time into pandas datetimes, mirroring the conversion applied to
# df_2 so both halves share the same dtype before they are concatenated.
df_3["date_time"] = pd.to_datetime(df_3["date_time"])


In [None]:
# Count of missing values in each df_3 column.
df_3.isna().sum()

In [None]:
# Stack the two halves of the digital-footprint log into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it the row
# labels of df_2 and df_3 are both carried over, so the same label appears
# twice and label-based lookups (.loc) can return more than one row.
digital_footprints_df = pd.concat([df_2, df_3], ignore_index=True)

In [None]:
# (rows, columns) of the combined log; rows should equal len(df_2) + len(df_3).
digital_footprints_df.shape

In [None]:
# Preview the first five rows of df_4.
df_4.head(n=5)

In [None]:
# Column dtypes of df_4 as inferred by read_csv.
df_4.dtypes

In [None]:
# Count of missing values in each df_4 column.
df_4.isna().sum()

In [None]:
# Keep only rows of df_4 that have no missing value in any column.
df_4 = df_4.dropna(axis="index", how="any")


In [None]:
# Lower-case the "Variation" column label; later cells filter on "variation".
df_4 = df_4.rename({"Variation": "variation"}, axis="columns")

In [None]:
# Attach the df_4 columns to every logged event by client_id. The join is
# inner (the pd.merge default), so events whose client_id is not present in
# df_4 are dropped.
merged_df = digital_footprints_df.merge(df_4, how='inner', on='client_id')


In [None]:
# (rows, columns) of the merged frame.
merged_df.shape

In [None]:
# Count of missing values in each merged_df column.
merged_df.isna().sum()

In [None]:
# (rows, columns) of the merged frame.
# NOTE(review): merged_df is not modified between the merge and this cell, so
# this repeats the shape check made right after the merge.
merged_df.shape

In [None]:
# Number of logged events per process_step value, most frequent first.
merged_df["process_step"].value_counts()

##### TEST AND CONTROL DF

In [None]:
# Split the merged events by experiment arm.
# Note: despite its "test_" prefix, test_control_V2 holds the Control rows.
test_filter_V2 = merged_df[merged_df["variation"].eq("Test")]
test_control_V2 = merged_df[merged_df["variation"].eq("Control")]

##### FREQUENCY BY STEP FOR TEST GROUP

In [None]:

# Last recorded event of each Test-group client.
# Events are ordered by client and timestamp, then only the final row per
# client is kept. drop_duplicates(keep='last') keeps that row intact, whereas
# GroupBy.last() takes the last non-null value of each column separately and
# can therefore stitch together values from different events.
df_last_step_test_V2 = (
    test_filter_V2
    .sort_values(by=['client_id', 'date_time'])
    .drop_duplicates(subset='client_id', keep='last')
    .reset_index(drop=True)
)

# Number of clients whose last event was on each step, as a two-column table.
# rename_axis / reset_index(name=...) set the column names explicitly instead
# of overwriting whatever labels value_counts().reset_index() happens to
# produce in the installed pandas version.
df_last_step_test_frequency_V2 = (
    df_last_step_test_V2["process_step"]
    .value_counts()
    .rename_axis('step')
    .reset_index(name='frequency')
)

df_last_step_test_frequency_V2.head()

##### FREQUENCY BY STEP FOR CONTROL GROUP

In [None]:
# Last recorded event of each Control-group client.
# Events are ordered by client and timestamp, then only the final row per
# client is kept. drop_duplicates(keep='last') keeps that row intact, whereas
# GroupBy.last() takes the last non-null value of each column separately and
# can therefore stitch together values from different events.
df_last_step_control_V2 = (
    test_control_V2
    .sort_values(by=['client_id', 'date_time'])
    .drop_duplicates(subset='client_id', keep='last')
    .reset_index(drop=True)
)

# Number of clients whose last event was on each step, as a two-column table.
# rename_axis / reset_index(name=...) set the column names explicitly instead
# of overwriting whatever labels value_counts().reset_index() happens to
# produce in the installed pandas version.
df_last_step_control_frequency_V2 = (
    df_last_step_control_V2["process_step"]
    .value_counts()
    .rename_axis('step')
    .reset_index(name='frequency')
)

df_last_step_control_frequency_V2.head()

#### AGE GROUP

In [None]:
# Summary statistics of df_1's numeric columns; the min/max shown here are
# worth checking against the bin edges used for the groupings below.
df_1.describe()

In [None]:
# Bucket clients into labelled age bands.
# Bins are left-closed (right=False) and each edge is the first age of its
# band, so every label matches its range:
#   '<18' -> [0, 18), '18-25' -> [18, 26), '26-35' -> [26, 36),
#   '36-50' -> [36, 51), '51-65' -> [51, 66), '65+' -> [66, inf).
# With the previous right-closed bins an 18-year-old was labelled '<18', and
# any age above 100 fell outside the last bin and silently became NaN.
df_1['age_group'] = pd.cut(
    df_1['clnt_age'],
    bins=[0, 18, 26, 36, 51, 66, float('inf')],
    labels=['<18', '18-25', '26-35', '36-50', '51-65', '65+'],
    right=False,
)

# Number of clients per age band as a two-column table; column names are set
# explicitly so they do not depend on the pandas version's default labels.
age_distribution = (
    df_1['age_group']
    .value_counts()
    .rename_axis('age_group')
    .reset_index(name='frequency')
)

age_distribution.head()

##### TENURE GROUP

In [None]:

# Total tenure expressed in months.
# NOTE(review): this assumes clnt_tenure_mnth holds only the months beyond the
# whole years in clnt_tenure_yr. If that column is already the total tenure in
# months, this sum double-counts — confirm against the data dictionary.
df_1['tenure_months'] = df_1['clnt_tenure_yr'] * 12 + df_1['clnt_tenure_mnth']

# Bucket clients into labelled tenure bands.
# Bins are left-closed (right=False) with an open-ended top bin:
#   '<1 year' -> [0, 12), '1-3 years' -> [12, 36), '3-5 years' -> [36, 60),
#   '5-10 years' -> [60, 120), '10+ years' -> [120, inf).
# With the previous right-closed bins capped at 240, a tenure of 0 months and
# any tenure above 20 years silently became NaN, and exactly 12 months was
# labelled '<1 year'.
df_1['tenure_group'] = pd.cut(
    df_1['tenure_months'],
    bins=[0, 12, 36, 60, 120, float('inf')],
    labels=['<1 year', '1-3 years', '3-5 years', '5-10 years', '10+ years'],
    right=False,
)

# Number of clients per tenure band as a two-column table; column names are
# set explicitly so they do not depend on the pandas version's default labels.
tenure_distribution = (
    df_1['tenure_group']
    .value_counts()
    .rename_axis('tenure_group')
    .reset_index(name='frequency')
)

tenure_distribution.head()



In [None]:
# Cross-tabulation of age group (rows) against tenure group (columns);
# each cell is the number of clients in that combination.
age_tenure_analysis = pd.crosstab(
    index=df_1['age_group'],
    columns=df_1['tenure_group'],
)


In [None]:
# Preview the first five rows of the age/tenure cross-tabulation.
age_tenure_analysis.head(n=5)

In [None]:
# Heatmap of the age-group x tenure-group cross-tabulation, with the count
# written inside every cell.
fig, ax = plt.subplots(figsize=(8, 6))
cax = ax.matshow(age_tenure_analysis, cmap='coolwarm')
fig.colorbar(cax)

# One tick per category on each axis, labelled with the category name.
tenure_labels = age_tenure_analysis.columns
age_labels = age_tenure_analysis.index
ax.set_xticks(range(len(tenure_labels)))
ax.set_yticks(range(len(age_labels)))
ax.set_xticklabels(tenure_labels, rotation=45)
ax.set_yticklabels(age_labels)

ax.set_title('Heatmap: Age Group vs Tenure Group', pad=20)
ax.set_xlabel('Tenure Group')
ax.set_ylabel('Age Group')

# matshow draws row r / column c at (x=c, y=r), hence the swapped order below.
for row_pos, row_counts in enumerate(age_tenure_analysis.values):
    for col_pos, count in enumerate(row_counts):
        cell_text = '' if np.isnan(count) else int(count)
        ax.text(col_pos, row_pos, cell_text, ha='center', va='center', color='black')

plt.tight_layout()
plt.show()