In [1]:
import math

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

### Загрузка и обработка данных

In [2]:
# Load the raw invoice lines. The cells below read the columns
# InvoiceDate, Quantity, Price, Customer ID and Country from this frame.
df=pd.read_csv('Data.csv')

In [3]:
# Parse InvoiceDate strictly as YYYY-MM-DD, then store it back as ISO text
# so that later cells can filter the column with string methods.
invoice_dates = pd.to_datetime(df['InvoiceDate'], format='%Y-%m-%d')
df['InvoiceDate'] = invoice_dates.dt.strftime('%Y-%m-%d')


In [80]:
# Keep only rows with a positive quantity. The explicit copy makes `df` an
# independent frame, so adding a column below does not write into a slice of
# the original (which triggers SettingWithCopyWarning).
df = df[df['Quantity'] > 0].copy()

# Line total = quantity * unit price.
df['Total'] = df['Quantity'] * df['Price']

In [81]:
# Select the rows whose (string) InvoiceDate matches a December-2010 date.
# Missing dates are treated as non-matching (na=False).
december_2010_mask = df['InvoiceDate'].str.contains(r'2010-12-\d\d', regex=True, na=False)
df2 = df[december_2010_mask]

In [82]:
# df2=df[df['InvoiceDate']=='2010-12-23']

In [83]:
# Sum the line totals per (customer, country) pair; as_index=False keeps
# the grouping keys as ordinary columns next to the summed 'Total'.
df_group = df2.groupby(['Customer ID', 'Country'], as_index=False)['Total'].sum()

In [84]:
# Exclude United Kingdom customers from the analysis, then show the result.
is_not_uk = df_group['Country'].ne('United Kingdom')
df_group = df_group.loc[is_not_uk]
df_group

Unnamed: 0,Customer ID,Country,Total
0,12347.0,Iceland,711.79
1,12348.0,Finland,892.80
2,12370.0,Austria,277.20
3,12370.0,Cyprus,1587.07
4,12377.0,Switzerland,1001.52
...,...,...,...
350,14911.0,EIRE,7737.94
352,14932.0,Channel Islands,363.53
409,15332.0,Lithuania,1661.06
680,17097.0,Spain,174.72


### Полное совместное распределение

Интересующее событие — сумма чека клиента за декабрь 2010 года (с шагом 100).
Наблюдаемое событие — страна клиента.

In [85]:
# Bucket every total into a 100-wide bin labelled by the bin's exclusive
# upper edge (e.g. 711.79 -> 800), stored as an integer.
# `assign` returns a new frame instead of writing a column into the filtered
# slice held in `df_group`, which avoids SettingWithCopyWarning and keeps the
# cell safe to re-run.
df_group = df_group.assign(
    Max_Price=((df_group['Total'] // 100 + 1) * 100).astype(int)
)

In [86]:
# Distinct countries and distinct price bins, in order of first appearance.
country, max_price = (
    df_group[column].unique() for column in ('Country', 'Max_Price')
)

In [87]:
# One row per country, plus a constant 'key' column used to build the
# country x price-bin grid by merging on it.
df_country = pd.DataFrame({'country': country}).assign(key=0)

In [88]:
# One row per price bin, plus the same constant 'key' column so it can be
# merged against the country frame.
df_max_price = pd.DataFrame({'max_price': max_price}).assign(key=0)

In [89]:
# Every row of df_country shares key == 0 with every row of df_max_price, so
# merging on 'key' yields the full country x price-bin grid. The helper 'key'
# column is kept in the result so the column layout is unchanged.
df_prob = df_country.merge(df_max_price, on='key', how='outer')

# Initialise as float: the column later holds fractions, and writing floats
# into an integer column is an incompatible-dtype assignment in pandas.
df_prob['probability'] = 0.0

In [90]:
# Number of (customer, country) rows; used as the denominator of the
# joint probabilities.
len_df = df_group.shape[0]

In [91]:
def count_row(country, max_price):
    """Count the rows of the global ``df_group`` matching both values.

    Args:
        country: value compared for equality against ``df_group['Country']``.
        max_price: value compared for equality against ``df_group['Max_Price']``.

    Returns:
        int: number of rows where both comparisons hold.
    """
    matches = df_group['Country'].eq(country) & df_group['Max_Price'].eq(max_price)
    return int(matches.sum())

In [92]:
# Joint probability of every (country, price bin) pair: the number of
# df_group rows with that pair divided by the total number of rows.
#
# This replaces a row-by-row loop that wrote through `iat[i, 3]`, i.e. it used
# the index *label* as a row position and a hard-coded column position. The
# lookup below addresses everything by name and is vectorised.
pair_counts = df_group.groupby(['Country', 'Max_Price']).size()
grid_pairs = pd.MultiIndex.from_frame(df_prob[['country', 'max_price']])

# Pairs on the grid that never occur in df_group get a count of 0.
grid_counts = pair_counts.reindex(grid_pairs, fill_value=0).to_numpy()
df_prob['probability'] = grid_counts / len_df

In [93]:
# Drop the helper columns and list the most probable combinations first.
joint_columns = ['country', 'max_price', 'probability']
df_prob = df_prob.loc[:, joint_columns].sort_values(by='probability', ascending=False)
df_prob

Unnamed: 0,country,max_price,probability
219,France,200,0.070423
156,Germany,200,0.042254
214,France,1100,0.042254
107,Belgium,300,0.028169
261,Spain,200,0.028169
...,...,...,...
143,Australia,2200,0.000000
142,Australia,500,0.000000
141,Australia,1200,0.000000
140,Australia,1400,0.000000


### Частное распределение

#### P(country)

In [94]:
# Marginal P(country): sum the joint probabilities over all price bins.
df_P_c = df_prob.groupby('country', as_index=False)['probability'].sum()
df_P_c

Unnamed: 0,country,probability
0,Australia,0.028169
1,Austria,0.014085
2,Belgium,0.056338
3,Channel Islands,0.014085
4,Cyprus,0.014085
5,Denmark,0.014085
6,EIRE,0.042254
7,Finland,0.014085
8,France,0.211268
9,Germany,0.253521


#### P(max_price)

In [95]:
# Marginal P(max_price): sum the joint probabilities over all countries.
df_P_p = df_prob.groupby('max_price', as_index=False)['probability'].sum()
df_P_p

Unnamed: 0,max_price,probability
0,100,0.042254
1,200,0.183099
2,300,0.140845
3,400,0.098592
4,500,0.056338
5,700,0.056338
6,800,0.056338
7,900,0.056338
8,1100,0.056338
9,1200,0.028169


### Условное распределение

#### P(max_price>500 | country = Germany)

In [96]:
# Joint probability P(max_price > 500, country = Germany).
# NOTE(review): the variable name says "UK", but the condition is Germany.
germany_above_500 = df_prob['country'].eq('Germany') & df_prob['max_price'].gt(500)
P_more500_and_UK = df_prob.loc[germany_above_500, 'probability'].sum()

In [97]:
# Marginal probability P(country = Germany).
# NOTE(review): the variable name says "UK", but the condition is Germany.
is_germany = df_prob['country'].eq('Germany')
P_UK = df_prob.loc[is_germany, 'probability'].sum()

In [98]:
# Conditional probability: P(A | B) = P(A, B) / P(B), here
# P(max_price > 500 | country) = P(max_price > 500, country) / P(country).
P_more500_UK=P_more500_and_UK/P_UK
P_more500_UK

0.5

In [99]:
# Show the joint probability table again.
df_prob

Unnamed: 0,country,max_price,probability
219,France,200,0.070423
156,Germany,200,0.042254
214,France,1100,0.042254
107,Belgium,300,0.028169
261,Spain,200,0.028169
...,...,...,...
143,Australia,2200,0.000000
142,Australia,500,0.000000
141,Australia,1200,0.000000
140,Australia,1400,0.000000


### Предсказание

Ищем совместную вероятность того, что клиент из Германии и сумма его чека лежит в интервале [400, 500), т.е. max_price = 500.

In [100]:
obs_country = 'Germany'
obs_max_price = 500

# Look up the joint probability P(country, max_price) for the observation
# in the joint distribution table.
prob_row = df_prob[(df_prob['max_price'] == obs_max_price) & (df_prob['country'] == obs_country)]

# A combination that is absent from the table was never observed, so its
# probability is 0. (The previous default of 1.0 reported such a combination
# as certain.)
prob = float(prob_row['probability'].iloc[0]) if not prob_row.empty else 0.0

print(f"Вероятность покупки товаров на сумму {obs_max_price} в стране {obs_country} равна {prob}")

Вероятность покупки товаров на сумму 500 в стране Germany равна 0.028169014084507043


#### Предсказание с помощью случайного леса

In [101]:
# NOTE(review): this binds a second name to the same DataFrame rather than
# copying it, so the encoding below is also visible through `df_prob`.
# Switching to `.copy()` is only safe once nothing downstream reads the
# encoded values via `df_prob`.
df_pred = df_prob

# Use one encoder per column. Refitting a single encoder on the second column
# discards the country <-> code mapping, making the codes impossible to decode.
country_encoder = LabelEncoder()
df_pred['country'] = country_encoder.fit_transform(df_pred['country'])

# `label_encoder` ends up fitted on the probability values, as before: each
# distinct probability becomes one integer class label.
label_encoder = LabelEncoder()
df_pred['probability'] = label_encoder.fit_transform(df_pred['probability'])



In [102]:
# Split the data into training and test sets.
# Features and target are both taken from `df_pred`, the frame that holds the
# encoded columns; reading the target from `df_prob` only worked while the two
# names referred to the same object.
X = df_pred.drop(columns='probability')
y = df_pred['probability']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [103]:
# Model: random forest classifier. The seed is fixed so that a re-run
# reproduces the same forest and therefore the same metrics.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions on the held-out set.
y_pred = model.predict(X_test)

# Share of test rows whose predicted class equals the true class. The recorded
# output of this cell shows an accuracy line, but no statement produced it.
print(f"Точность модели: {accuracy_score(y_test, y_pred)}")


Точность модели: 0.8539325842696629


### Оценка

In [105]:
# Per-class precision / recall / F1 on the held-out set, reusing the stored
# predictions instead of calling the model again.
# zero_division=0 reports 0.0 for classes that were never predicted instead of
# emitting an undefined-metric warning for each of them.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.90      0.95      0.92        76
           1       0.50      0.36      0.42        11
           2       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1

    accuracy                           0.85        89
   macro avg       0.35      0.33      0.34        89
weighted avg       0.83      0.85      0.84        89



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
