In [1]:
import numpy as np 
import pandas as pd

In [28]:
# Read the three raw CSV tables from the working directory in one pass:
# articles -> df, customers -> df_customers, transactions -> df_transaction_train.
df, df_customers, df_transaction_train = map(
    pd.read_csv,
    ("./articles.csv", "./customers.csv", "./transactions_train.csv"),
)

In [29]:
# Work on a random subset of customers to keep the analysis tractable.
# A seeded generator makes the subset (and every number derived from it)
# reproducible across kernel restarts; the unseeded global np.random state
# produced a different sample on every run.
rng = np.random.default_rng(42)

# Never ask for more customers than exist, otherwise choice(replace=False) raises.
sample_size = min(100000, len(df_customers))
customers = rng.choice(
    df_customers['customer_id'].to_numpy(), sample_size, replace=False
)

# Restrict both tables to the sampled customers.
df_customers = df_customers[df_customers['customer_id'].isin(customers)]
df_transaction_train = df_transaction_train[df_transaction_train['customer_id'].isin(customers)]

In [30]:
# Enrich each transaction row with its article attributes and its customer
# attributes. validate='many_to_one' makes pandas raise if article_id or
# customer_id is not unique on the right-hand side of the respective join.
merged_df = (
    df_transaction_train
    .merge(df, on='article_id', validate='many_to_one')
    .merge(df_customers, on='customer_id', validate='many_to_one')
)

In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) of the price column.
# NOTE(review): this cell's execution count (20) is lower than that of the cells
# above it, so the output shown may come from an earlier kernel state - re-run in order.
df_transaction_train['price'].describe()

count    1.468412e+06
mean     2.778745e-02
std      1.915728e-02
min      1.016949e-04
25%      1.567797e-02
50%      2.540678e-02
75%      3.388136e-02
max      5.067797e-01
Name: price, dtype: float64

In [34]:
import pandas as pd


# Quartiles of the price column of merged_df; they serve as segment cut-offs.
q1, q2, q3 = merged_df['price'].quantile([0.25, 0.50, 0.75])

low_value_threshold = q1
medium_value_threshold = q2
# NOTE(review): q3 is computed but never used by the segmentation below, so
# every price above the median is labelled "High-Value Customer". Confirm
# whether a cut at q3 was intended.
high_value_threshold = q3


def assign_monetary_segment(price):
    """Return the monetary segment label for a single price value.

    Uses the module-level thresholds: a price at or below
    ``low_value_threshold`` is "Low-Value Customer", a price at or below
    ``medium_value_threshold`` is "Medium-Value Customer", and anything else
    (including NaN, for which both comparisons are False) is
    "High-Value Customer".
    """
    if price <= low_value_threshold:
        return "Low-Value Customer"
    elif price <= medium_value_threshold:
        return "Medium-Value Customer"
    else:
        return "High-Value Customer"


# Vectorized equivalent of merged_df['price'].apply(assign_monetary_segment):
# np.select takes the first matching condition per row and falls back to the
# default, which mirrors the if/elif/else chain without a Python call per row.
# The label is assigned per row of merged_df, i.e. per price value, not
# aggregated per customer.
merged_df['Monetary_Segment'] = np.select(
    [
        merged_df['price'] <= low_value_threshold,
        merged_df['price'] <= medium_value_threshold,
    ],
    ["Low-Value Customer", "Medium-Value Customer"],
    default="High-Value Customer",
)

print(merged_df[['customer_id', 'price', 'Monetary_Segment']])

                                               customer_id     price  \
350012   0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...  0.016932   
350014   0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...  0.025407   
350015   0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...  0.067780   
350016   0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...  0.067780   
350020   0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...  0.033881   
...                                                    ...       ...   
2150194  fffc04eab174fa4aff0f96a64bc3cbd69755fa02b6a7f8...  0.042356   
1812389  fffec3dbcc87c78300f37f62cfca4274f1ea6ef59dba10...  0.010153   
1812392  fffec3dbcc87c78300f37f62cfca4274f1ea6ef59dba10...  0.016932   
2322259  fffeec73fb0d1884cd99b8ace0044b94f72d671672cbb6...  0.022864   
2322260  fffeec73fb0d1884cd99b8ace0044b94f72d671672cbb6...  0.022864   

              Monetary_Segment  
350012   Medium-Value Customer  
350014   Medium-Value Customer  
350015     High-Value Customer  
350

In [35]:
import plotly.express as px


# Parse the transaction date; unparseable values become NaT (errors="coerce")
# and therefore get a NaN year, which groupby drops below.
merged_df['t_dat'] = pd.to_datetime(merged_df['t_dat'], errors="coerce")
merged_df['year'] = merged_df['t_dat'].dt.year

# Number of distinct customers per calendar year.
unique_customers_yoy = merged_df.groupby('year')['customer_id'].nunique().reset_index()

# pct_change() returns a FRACTION (0.25 == +25%); the first year has no
# predecessor and is NaN.
unique_customers_yoy['yoy_change'] = unique_customers_yoy['customer_id'].pct_change()

fig1 = px.bar(unique_customers_yoy, x='year', y='customer_id', title='Total Unique Customers (YoY)')
fig1.update_xaxes(title_text='Year')
fig1.update_yaxes(title_text='Total Unique Customers')

fig2 = px.line(unique_customers_yoy, x='year', y='yoy_change', title='YoY Change in Unique Customers')
fig2.update_xaxes(title_text='Year')
# The axis is titled as a percentage, so render the fractional values as
# percentages; the '%' tick format multiplies by 100 and appends the sign.
# The yoy_change column itself stays a fraction for any downstream use.
fig2.update_yaxes(title_text='YoY Change (%)', tickformat='.1%')

fig1.show()
fig2.show()

In [36]:
!pip install plotly
import pandas as pd
import plotly.express as px


# Keep the frame ordered by year, then customer (rebinding instead of
# inplace=True so the cell does not mutate an object other names may share).
merged_df = merged_df.sort_values(by=['year', 'customer_id'])

# Distinct customer ids per year, computed in one grouped pass. groupby sorts
# the year keys and drops rows whose year is NaN (dates that failed to parse),
# which previously produced a bogus final row with zero customers.
customers_by_year = merged_df.groupby('year')['customer_id'].unique()

previous_year_customers = set()

years = []
new_customers_count = []
lost_customers_count = []
total_customers_count = []

for year, customer_ids in customers_by_year.items():
    current_year_customers = set(customer_ids)

    # "New" = active this year but not in the immediately preceding year
    # (a customer returning after a gap year counts as new again).
    new_customers = current_year_customers - previous_year_customers

    # "Lost" = active in the preceding year but not this year.
    lost_customers = previous_year_customers - current_year_customers

    previous_year_customers = current_year_customers

    years.append(year)
    new_customers_count.append(len(new_customers))
    lost_customers_count.append(len(lost_customers))
    total_customers_count.append(len(current_year_customers))


plot_data = pd.DataFrame({'Year': years, 'New Customers': new_customers_count, 'Lost Customers': lost_customers_count, 'Total Customers': total_customers_count})

# barmode='group' draws the three series side by side. The default stacks
# them, which would add New + Lost + Total into one meaningless bar height.
fig = px.bar(
    plot_data,
    x='Year',
    y=['New Customers', 'Lost Customers', 'Total Customers'],
    title='New, Lost, and Total Customers Over the Years',
    labels={'value': 'Count'},
    barmode='group',
)

fig.show()



In [37]:
# Display the per-year table of new, lost and total customer counts.
plot_data

Unnamed: 0,Year,New Customers,Lost Customers,Total Customers
0,2018,42366,0,42366
1,2019,39308,10078,71596
2,2020,19286,27957,62925
