In [220]:
import random as r
import pandas as pd
import numpy as np


n_samples = 10000


chars = '1234567890abcdefghijk'
user_id_len = 15
user_ids = set()
while len(user_ids) < n_samples:
    s = ''.join(r.choices(chars, k=user_id_len))
    user_ids.add(s)


max_order_num = 9
order_nums = []
for _ in range(1, n_samples + 1):
    order_nums.append(r.randint(1, max_order_num))


mean = 1440
std = 200
delivery_times = np.round(np.random.normal(mean, std, n_samples))


lambda_ = 1
shift = 1
total_sums = np.round(np.random.exponential(lambda_, n_samples) + shift, 3)


retention_probabilities = [0.35, 0.25, 0.2, 0.15, 0.05]
retention_numbers = [1, 2, 3, 4, 5]
retentions = np.random.choice(retention_numbers, n_samples, p=retention_probabilities)


data = pd.DataFrame({
    'user_id': list(user_ids),
    'order_num': order_nums,
    'delivery_time': delivery_times,
    'total_sum': total_sums,
    'retention': retentions
    })

data

Unnamed: 0,user_id,order_num,delivery_time,total_sum,retention
0,0856e61e99gggj7,3,1188.0,1.849,3
1,b645fj3fff0d88e,1,1415.0,1.630,1
2,9fe6f11abj99398,7,1798.0,1.933,2
3,df528ce7d9a6kjd,7,1264.0,1.696,1
4,dhg0h72e8jd78c0,2,1605.0,2.671,3
...,...,...,...,...,...
9995,g24e877df96917j,4,1213.0,1.618,1
9996,ebjh4j6cb1i4i5e,2,1559.0,2.105,3
9997,068bhi4a032g4h1,6,1435.0,2.950,1
9998,fggedi260a9f36i,1,1265.0,1.227,2


In [221]:
mean_delivery_time_data = pd.DataFrame({'mean_delivery_time': np.round(data.groupby('order_num')['delivery_time'].mean())})
merged_data = pd.merge(data, mean_delivery_time_data, on='order_num')
merged_data

Unnamed: 0,user_id,order_num,delivery_time,total_sum,retention,mean_delivery_time
0,0856e61e99gggj7,3,1188.0,1.849,3,1441.0
1,b645fj3fff0d88e,1,1415.0,1.630,1,1437.0
2,9fe6f11abj99398,7,1798.0,1.933,2,1436.0
3,df528ce7d9a6kjd,7,1264.0,1.696,1,1436.0
4,dhg0h72e8jd78c0,2,1605.0,2.671,3,1436.0
...,...,...,...,...,...,...
9995,g24e877df96917j,4,1213.0,1.618,1,1436.0
9996,ebjh4j6cb1i4i5e,2,1559.0,2.105,3,1436.0
9997,068bhi4a032g4h1,6,1435.0,2.950,1,1440.0
9998,fggedi260a9f36i,1,1265.0,1.227,2,1437.0


In [222]:
n_in_every_segment = 3
num_segments = (max_order_num + n_in_every_segment - 1) // n_in_every_segment
bins = [i * n_in_every_segment for i in range(num_segments + 1)]
labels = [f'{i * n_in_every_segment + 1}-{(i + 1) * n_in_every_segment}' for i in range(num_segments)]

merged_data['segment'] = pd.cut(merged_data['order_num'], bins=bins, labels=labels)
merged_data

Unnamed: 0,user_id,order_num,delivery_time,total_sum,retention,mean_delivery_time,segment
0,0856e61e99gggj7,3,1188.0,1.849,3,1441.0,1-3
1,b645fj3fff0d88e,1,1415.0,1.630,1,1437.0,1-3
2,9fe6f11abj99398,7,1798.0,1.933,2,1436.0,7-9
3,df528ce7d9a6kjd,7,1264.0,1.696,1,1436.0,7-9
4,dhg0h72e8jd78c0,2,1605.0,2.671,3,1436.0,1-3
...,...,...,...,...,...,...,...
9995,g24e877df96917j,4,1213.0,1.618,1,1436.0,4-6
9996,ebjh4j6cb1i4i5e,2,1559.0,2.105,3,1436.0,1-3
9997,068bhi4a032g4h1,6,1435.0,2.950,1,1440.0,4-6
9998,fggedi260a9f36i,1,1265.0,1.227,2,1437.0,1-3


In [223]:
total_sum_desc = pd.DataFrame(merged_data['total_sum'].describe()).T
total_sum_desc

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_sum,10000.0,1.988147,1.015049,1.0,1.275,1.678,2.35525,10.654


In [228]:
most_frequent_total_sum = merged_data['total_sum'].value_counts()


print(f'Медиана: {total_sum_desc["50%"].values[0]}')
print(f'Мода: {most_frequent_total_sum.index[0]}; встречается {most_frequent_total_sum.iloc[0]} раз')
print(f'Среднее: {total_sum_desc["mean"].values[0]}')
print(f'Дисперсия: {np.var(merged_data["total_sum"])}')
print(f'Стандартное отклонение: {total_sum_desc["std"].values[0]}')

Медиана: 1.678
Мода: 1.002; встречается 17 раз
Среднее: 1.9881471000000002
Дисперсия: 1.0302216816615901
Стандартное отклонение: 1.0150491190740492
