## Data Preparation

In [1]:
import pandas as pd
import numpy as np

In [2]:
!pip install transformers evaluate accelerate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from evaluate)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

#### Calculating retention

First of all - let's calculate how much time passed since review till next order (successful or not)

In [None]:
# Conerting all dates to datetime
ret_r.reviewed_at = pd.to_datetime(ret_r.reviewed_at, format='ISO8601', errors='coerce')
ret_r.next_order_at = pd.to_datetime(ret_r.next_order_at, format='ISO8601')
ret_r.next_attempt_at = pd.to_datetime(ret_r.next_attempt_at, format='ISO8601')

In [None]:
ret_r

Unnamed: 0,reviewed_at,next_order_at,n_next_orders,next_attempt_at,n_next_attempts,review_id
0,2012-06-14 15:16:50.811775,NaT,,NaT,,GJTC49354221161598
1,2012-06-29 23:53:31.994031,NaT,,NaT,,VMJN41421800557999
2,2012-06-29 23:57:47.972265,NaT,,NaT,,NCEG44064480963891
3,2012-06-30 00:03:07.388384,NaT,,NaT,,PBRX99505364679151
4,2012-07-01 16:34:38.408675,NaT,,NaT,,HSKQ74933993814216
...,...,...,...,...,...,...
432080,2024-03-03 15:18:04.619860,NaT,,NaT,,SEFX10496344799622
432081,2024-03-03 15:20:50.575023,2024-03-20 12:08:41.625721,1.0,2024-03-20 12:08:41.625721,1.0,QTRG74267350206397
432082,2024-03-03 15:22:57.136125,NaT,,NaT,,FTFE04220030445845
432083,2024-03-03 15:45:53.300262,NaT,,NaT,,IBGP45892704571543


In [None]:
%%time
ret_r['order_time_span'] = ret_r.next_order_at - ret_r.reviewed_at
ret_r['attempt_time_span'] = ret_r.next_attempt_at - ret_r.reviewed_at

CPU times: user 28.4 ms, sys: 7.11 ms, total: 35.5 ms
Wall time: 76.4 ms


In [None]:
# We have some empty timedates for reviews due to format issues
ret_r['reviewed_at'].isna().sum()

103

In [None]:
# There are few of them - 0.02% of dataset - we will drop them
103 / 432085 * 100

0.023837902264600714

In [None]:
ret_r = ret_r[ret_r.reviewed_at.notna()].reset_index(drop=True)

In [None]:
len(ret_r)

431982

#### Surface EDA

Let's look at the data to get a general idea of how many users made new orders after posting a review on previous one

In [None]:
ret_r.order_time_span.notna().value_counts()

order_time_span
False    334151
True      97831
Name: count, dtype: int64

In [None]:
ret_r.order_time_span.notna().value_counts(normalize=True)

order_time_span
False    0.77353
True     0.22647
Name: proportion, dtype: float64

In [None]:
ret_r.attempt_time_span.notna().value_counts()

attempt_time_span
False    318661
True     113321
Name: count, dtype: int64

In [None]:
ret_r.attempt_time_span.notna().value_counts(normalize=True)

attempt_time_span
False    0.737672
True     0.262328
Name: proportion, dtype: float64

In [None]:
0.262328 - 0.22647

0.035858

In [None]:
ret_r.order_time_span.describe()

count                          97831
mean     335 days 10:08:42.040463540
std      420 days 23:44:46.713978656
min           0 days 00:00:05.218917
25%       47 days 04:35:13.455746500
50%         196 days 10:33:10.202459
75%      414 days 11:53:47.447480500
max        3578 days 17:14:05.046081
Name: order_time_span, dtype: object

In [None]:
ret_r.attempt_time_span.describe()

count                         113321
mean     317 days 23:16:41.022435760
std      411 days 03:32:19.331871344
min           0 days 00:00:05.218917
25%          38 days 14:08:37.291860
50%         177 days 09:16:00.274629
75%         390 days 11:11:16.768020
max        3606 days 17:28:08.934207
Name: attempt_time_span, dtype: object

In [None]:
(ret_r[ret_r.order_time_span.notna()].order_time_span<pd.Timedelta('390 days')).value_counts(normalize=True)

order_time_span
True     0.734052
False    0.265948
Name: proportion, dtype: float64

In [None]:
pd.Timestamp.now() - pd.Timedelta('390 days')

Timestamp('2023-03-22 21:09:05.403505')

In [None]:
ret_r[ret_r.reviewed_at<pd.to_datetime('2023-03-12 00:00:00')]

Unnamed: 0,reviewed_at,next_order_at,n_next_orders,next_attempt_at,n_next_attempts,review_id,order_time_span,attempt_time_span
0,2012-06-14 15:16:50.811775,NaT,,NaT,,GJTC49354221161598,NaT,NaT
1,2012-06-29 23:53:31.994031,NaT,,NaT,,VMJN41421800557999,NaT,NaT
2,2012-06-29 23:57:47.972265,NaT,,NaT,,NCEG44064480963891,NaT,NaT
3,2012-06-30 00:03:07.388384,NaT,,NaT,,PBRX99505364679151,NaT,NaT
4,2012-07-01 16:34:38.408675,NaT,,NaT,,HSKQ74933993814216,NaT,NaT
...,...,...,...,...,...,...,...,...
431518,2022-11-14 00:00:00.000000,NaT,,NaT,,RVIQ91359111559822,NaT,NaT
431519,2022-05-29 00:00:00.000000,NaT,,NaT,,CQKR40020779443842,NaT,NaT
431522,2022-10-10 00:00:00.000000,NaT,,NaT,,JIIX49987577885782,NaT,NaT
431523,2023-01-13 00:00:00.000000,NaT,,NaT,,PPSY59793113061805,NaT,NaT


In [None]:
ret_r[ret_r.reviewed_at<pd.to_datetime('2023-03-12 00:00:00')].order_time_span.notna().value_counts()

order_time_span
False    202875
True      74335
Name: count, dtype: int64

In [None]:
ret_r[ret_r.reviewed_at<pd.to_datetime('2023-03-12 00:00:00')].order_time_span.notna().value_counts(normalize=True)

order_time_span
False    0.731846
True     0.268154
Name: proportion, dtype: float64

In [None]:
ret_r[ret_r.reviewed_at<pd.to_datetime('2023-03-12 00:00:00')].attempt_time_span.notna().value_counts()

attempt_time_span
False    191485
True      85725
Name: count, dtype: int64

In [None]:
ret_r[ret_r.reviewed_at<pd.to_datetime('2023-03-12 00:00:00')].attempt_time_span.notna().value_counts(normalize=True)

attempt_time_span
False    0.690758
True     0.309242
Name: proportion, dtype: float64

__Aged reviews__ - the ones where 13 months have already passed from review posting.
That is a timeframe after which we can conclude whether there was a repeat purchase or not.

In [None]:
aged_reviews = ret_r[ret_r.reviewed_at<pd.to_datetime('2023-03-12 00:00:00')].copy(deep=True)

In [None]:
aged_reviews.reset_index(drop=True, inplace=True)

In [None]:
aged_reviews

Unnamed: 0,reviewed_at,next_order_at,n_next_orders,next_attempt_at,n_next_attempts,review_id,order_time_span,attempt_time_span
0,2012-06-14 15:16:50.811775,NaT,,NaT,,GJTC49354221161598,NaT,NaT
1,2012-06-29 23:53:31.994031,NaT,,NaT,,VMJN41421800557999,NaT,NaT
2,2012-06-29 23:57:47.972265,NaT,,NaT,,NCEG44064480963891,NaT,NaT
3,2012-06-30 00:03:07.388384,NaT,,NaT,,PBRX99505364679151,NaT,NaT
4,2012-07-01 16:34:38.408675,NaT,,NaT,,HSKQ74933993814216,NaT,NaT
...,...,...,...,...,...,...,...,...
277205,2022-11-14 00:00:00.000000,NaT,,NaT,,RVIQ91359111559822,NaT,NaT
277206,2022-05-29 00:00:00.000000,NaT,,NaT,,CQKR40020779443842,NaT,NaT
277207,2022-10-10 00:00:00.000000,NaT,,NaT,,JIIX49987577885782,NaT,NaT
277208,2023-01-13 00:00:00.000000,NaT,,NaT,,PPSY59793113061805,NaT,NaT


In [None]:
len(aged_reviews)

277210

In [None]:
aged_reviews.loc[aged_reviews.next_order_at.notna(), 'is_retained'] = 0
aged_reviews.loc[aged_reviews.next_order_at.isna(), 'is_retained'] = 1

In [None]:
aged_reviews.is_retained = aged_reviews.is_retained.astype(int)

In [None]:
aged_reviews.is_retained.value_counts()

is_retained
1    202875
0     74335
Name: count, dtype: int64

In [None]:
aged_reviews.is_retained.value_counts(normalize=True)

is_retained
1    0.731846
0    0.268154
Name: proportion, dtype: float64

* Imbalanced
* Can take from it for testing

* Can also take test sample from reviews that didn't "age" but users returned - because here we can be sure in the prediction, because we have a label which we can rely on

In [None]:
ret_r[(ret_r.review_id.isin(aged_reviews.review_id.to_list())==False)&(ret_r.next_order_at.notna())]

Unnamed: 0,reviewed_at,next_order_at,n_next_orders,next_attempt_at,n_next_attempts,review_id,order_time_span,attempt_time_span
274787,2023-03-12 00:21:48.608962,2023-09-22 19:40:14.492454,3.0,2023-09-22 19:40:14.492454,3.0,DVOR49374946873015,194 days 19:18:25.883492,194 days 19:18:25.883492
274788,2023-03-12 00:23:01.962930,2023-07-22 11:32:54.709250,2.0,2023-07-22 11:32:54.709250,2.0,GWCI42252767486898,132 days 11:09:52.746320,132 days 11:09:52.746320
274789,2023-03-12 00:25:25.995472,2023-03-17 15:48:42.763239,1.0,2023-03-17 15:48:42.763239,1.0,DUDH28359778809610,5 days 15:23:16.767767,5 days 15:23:16.767767
274790,2023-03-12 00:55:26.697820,2024-02-10 09:56:00.724876,1.0,2023-09-02 14:53:06.806139,2.0,IRMH70419417005949,335 days 09:00:34.027056,174 days 13:57:40.108319
274791,2023-03-12 00:55:29.784120,2023-04-07 15:12:44.285974,1.0,2023-04-07 15:12:44.285974,1.0,BBZH25491684827443,26 days 14:17:14.501854,26 days 14:17:14.501854
...,...,...,...,...,...,...,...,...
431942,2024-03-03 13:37:14.680667,2024-03-04 20:43:00.383242,1.0,2024-03-04 20:43:00.383242,1.0,JPEZ63016909742499,1 days 07:05:45.702575,1 days 07:05:45.702575
431967,2024-03-03 14:35:01.846451,2024-03-06 13:17:23.419129,2.0,2024-03-06 13:17:23.419129,2.0,ZFNU45008123255687,2 days 22:42:21.572678,2 days 22:42:21.572678
431970,2024-03-03 14:41:40.542628,2024-03-30 15:37:09.329437,1.0,2024-03-04 11:29:48.733035,2.0,XVMW52892907700020,27 days 00:55:28.786809,0 days 20:48:08.190407
431978,2024-03-03 15:20:50.575023,2024-03-20 12:08:41.625721,1.0,2024-03-20 12:08:41.625721,1.0,QTRG74267350206397,16 days 20:47:51.050698,16 days 20:47:51.050698


__Not aged reviews__ - ones where less then 13 months have passed since the posting of review.
Here, if there still is no order, we cannot conclude that repurchase will not be made within the timeframe - thus, we cannot label such datapoints as those who were not retained.
However, if for not aged review the next order is already created, we can conclude that repurchse/retention took place.

In [None]:
notaged_retained = ret_r[(ret_r.review_id.isin(aged_reviews.review_id.to_list())==False)&(ret_r.next_order_at.notna())].reset_index(drop=True)

In [None]:
notaged_retained['is_retained'] = 0

notaged_retained.is_retained = notaged_retained.is_retained.astype(int)

In [None]:
## Saving files
aged_reviews.to_csv('./aged_retention.csv', index=False)
notaged_retained.to_csv('./notaged_retention.csv', index=False)

In [None]:
## Loading files
aged_reviews = pd.read_csv('./aged_retention.csv')
notaged_retained = pd.read_csv('./notaged_retention.csv')

This retention will work as our data labels for classification, where we predict whether the customer will or will not return after the feedback they've given (=experience they've had)

It is much more vital for us to predict (=catch) the users who will not return

Although letting in this group those who actually will return can also taint the model, from the business standpoint it is less crucial - it is much worse if we overlook clients who had some problem due to which they are not willing to buy with us anymore


This is why we're making 1 = won't return, 0 - will return; not return is a target label we're looking for

Also that is why, in terms of metrics, we will look more closely at recall - how well we catch all those who will not return
+ F-ratio

In [None]:
13 * 30

390

### Sampling

#### Let's add tokenized sentences

In [None]:
del old_reviews

In [None]:
old_reviews = pd.read_csv('./prep_melted.csv')

In [None]:
ratings = old_reviews[['REVIEW_ID', 'DETAIL_RATING']]

In [None]:
del old_reviews

In [None]:
tokenized = pd.read_csv('./tokenized_sentences_stem.csv')

In [None]:
tokenized.head()

Unnamed: 0,REVIEW_ID,SENTENCE,SENTENCE_PROC,SENTENCE_STEMMED,0,1,2,3,4,5,...,302,303,304,305,306,307,308,309,310,311
0,ZCCH58446002919703,"окунемся во времена сталинского ампира, посмот...",окунемся времена сталинского ампира посмотрев ...,окунаться сталинский ампир киевския дворец пио...,0.060668,-0.068073,0.016708,-0.107922,0.020048,-0.015384,...,0.024939,0.08216,-0.056301,0.038204,-0.024057,-0.039221,-0.054741,0.068908,0.035215,-0.13479
1,ZCCH58446002919703,Островского \r\n\r\nдворец пионеров - не ампир...,островского дворец пионеров не ампир исправьте,островский дворец пионер не ампир исправлять,0.010991,-0.041991,-0.003111,-0.118602,0.012937,-0.015553,...,-0.014902,0.060634,-0.048655,0.009744,-0.056312,-0.00437,0.015335,0.040947,0.061047,-0.069445
2,YZFA00909822156169,"Каким бы Вы ни были идиотом, гид составит вам ...",каким идиотом гид составит отличную компанию,идиот гид составлять отличный компания,0.004085,0.035409,-0.012278,-0.125222,0.021568,0.038498,...,0.01445,0.088996,-0.058966,0.034324,0.064058,0.004624,0.034552,0.050623,-0.000239,-0.02417
3,YZFA00909822156169,Вы будете вместе с ним путешествовать к заперт...,путешествовать запертым воротам ходить дворика...,путешествовать запирать ворота ходить дворик м...,-0.06295,-0.07916,-0.004885,-0.081317,-0.041204,-0.011341,...,0.0003,0.009263,-0.018783,0.002006,0.0125,0.002784,-0.041195,0.015865,0.062202,-0.021317
4,YZFA00909822156169,Вам даже удастся перекинуться с ним парой фраз...,удастся перекинуться парои фразо погоде,удаваться перекидываться парой фразо погода,0.016131,-0.091829,0.03175,-0.014206,0.024799,-0.014353,...,0.022738,0.014761,-0.002548,-0.016623,-0.004514,0.015309,0.007502,0.041009,-0.00338,-0.078948


In [None]:
tokenized.drop(columns=['SENTENCE', 'SENTENCE_PROC', 'SENTENCE_STEMMED'], inplace=True)

In [None]:
del tokenized

In [None]:
aged_reviews

Unnamed: 0,reviewed_at,next_order_at,n_next_orders,next_attempt_at,n_next_attempts,review_id,order_time_span,attempt_time_span,is_retained
0,2012-06-14 15:16:50.811775,,,,,GJTC49354221161598,,,1
1,2012-06-29 23:53:31.994031,,,,,VMJN41421800557999,,,1
2,2012-06-29 23:57:47.972265,,,,,NCEG44064480963891,,,1
3,2012-06-30 00:03:07.388384,,,,,PBRX99505364679151,,,1
4,2012-07-01 16:34:38.408675,,,,,HSKQ74933993814216,,,1
...,...,...,...,...,...,...,...,...,...
277205,2022-11-14 00:00:00.000000,,,,,RVIQ91359111559822,,,1
277206,2022-05-29 00:00:00.000000,,,,,CQKR40020779443842,,,1
277207,2022-10-10 00:00:00.000000,,,,,JIIX49987577885782,,,1
277208,2023-01-13 00:00:00.000000,,,,,PPSY59793113061805,,,1


In [None]:
anon_rev = pd.read_csv('./anon_reviews_030324.csv')

  anon_rev = pd.read_csv('./anon_reviews_030324.csv')


In [None]:
sample = aged_reviews[['review_id', 'is_retained']].merge(anon_rev[['REVIEW_ID', 'content']], how='left', left_on='review_id', right_on='REVIEW_ID')

In [None]:
sample.head()

Unnamed: 0,review_id,is_retained,REVIEW_ID,content
0,GJTC49354221161598,1,GJTC49354221161598,We experienced this tour with Yulia. She was a...
1,VMJN41421800557999,1,VMJN41421800557999,"Great guide, helped me figure out which items ..."
2,NCEG44064480963891,1,NCEG44064480963891,She knows a lot of different types of bars tha...
3,PBRX99505364679151,1,PBRX99505364679151,The guide was very knowledgeable of Russian an...
4,HSKQ74933993814216,1,HSKQ74933993814216,Thanks for the tour Katya - it was really inte...


In [None]:
sample = sample[sample.content.notna()].reset_index(drop=True)

In [None]:
sample = sample[['review_id', 'content', 'is_retained']]

In [None]:
sample.to_csv('./sample_for_repurchase.csv', index=False)

In [None]:
sample.is_retained.value_counts(normalize=True)

is_retained
1    0.755264
0    0.244736
Name: proportion, dtype: float64

In [None]:
def is_cyr(text):
    res = False
    for t in text:
        if t in "йцукенгшщзхъфывапролджэячсмитьбю":
            res = True
    return res

In [None]:
sample['cyr'] = sample.content.apply(is_cyr)

In [None]:
sample

Unnamed: 0,review_id,content,is_retained,cyr
0,GJTC49354221161598,We experienced this tour with Yulia. She was a...,1,False
1,VMJN41421800557999,"Great guide, helped me figure out which items ...",1,False
2,NCEG44064480963891,She knows a lot of different types of bars tha...,1,False
3,PBRX99505364679151,The guide was very knowledgeable of Russian an...,1,False
4,HSKQ74933993814216,Thanks for the tour Katya - it was really inte...,1,False
...,...,...,...,...
139901,RVIQ91359111559822,Хочу поблагодарить Юлию за экскурсию и красоту...,1,True
139902,CQKR40020779443842,Юлия - прекрасный рассказчик и замечательный о...,1,True
139903,JIIX49987577885782,Это одна из лучших поездок с гидом за последне...,1,True
139904,PPSY59793113061805,"Добрый день, Юлия! Огромное спасибо за экскурс...",1,True


In [None]:
sample.cyr.value_counts()

cyr
True     138102
False      1804
Name: count, dtype: int64

In [None]:
sample = sample[sample.cyr]

In [None]:
sample.drop(columns=['cyr'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample.drop(columns=['cyr'], inplace=True)


In [None]:
sample

Unnamed: 0,review_id,content,is_retained
27,BMKH63293691796166,"Cпасибо за интересный, познавательный и весёлы...",1
32,LMCZ80814792520945,"Дима - исключительно приятный собеседник, инте...",1
33,OXKB28872614231621,Пал жертвой экскурсии по узбекской еде:) Как и...,1
36,OQAZ96519979249576,"Экскурсия понравилась.Мелкий моросящий дождь,с...",0
38,BJCL10258909548348,"Экскурсия понравилась. Прогулка динамичная, де...",0
...,...,...,...
139901,RVIQ91359111559822,Хочу поблагодарить Юлию за экскурсию и красоту...,1
139902,CQKR40020779443842,Юлия - прекрасный рассказчик и замечательный о...,1
139903,JIIX49987577885782,Это одна из лучших поездок с гидом за последне...,1
139904,PPSY59793113061805,"Добрый день, Юлия! Огромное спасибо за экскурс...",1


## Retraining Model

### Import of required libraries; setting up functions

In [3]:
import torch
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("laskovey/review_train3")
model = AutoModel.from_pretrained("laskovey/review_train3")
model.cuda()  # uncomment it if you have a GPU

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.44k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/117M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(83828, 312, padding_idx=0)
    (position_embeddings): Embedding(2048, 312)
    (token_type_embeddings): Embedding(2, 312)
    (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-2): 3 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=312, out_features=312, bias=True)
            (key): Linear(in_features=312, out_features=312, bias=True)
            (value): Linear(in_features=312, out_features=312, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=312, out_features=312, bias=True)
            (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
   

In [4]:
from transformers import TrainerCallback
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import EarlyStoppingCallback

from datasets import Features, Sequence, Value, ClassLabel

from datasets import Dataset

import evaluate

In [5]:
metric = evaluate.load("f1", average='macro')

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [6]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average='macro')

In [7]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Train Test Split

#### Let's now create Train and Test samples from tokenized sentences

In [8]:
sample = pd.read_csv('./sample_for_repurchase.csv')

In [9]:
sample

Unnamed: 0,review_id,content,is_retained
0,GJTC49354221161598,We experienced this tour with Yulia. She was a...,1.0
1,VMJN41421800557999,"Great guide, helped me figure out which items ...",1.0
2,NCEG44064480963891,She knows a lot of different types of bars tha...,1.0
3,PBRX99505364679151,The guide was very knowledgeable of Russian an...,1.0
4,HSKQ74933993814216,Thanks for the tour Katya - it was really inte...,1.0
...,...,...,...
41885,PDYS78311348550442,"В целом все понравилось, общее впечатление от ...",1.0
41886,YDTU85510938815299,"Спасибо большое, всё очень понравилось.",1.0
41887,ULYM98697121336043,Отлично!,0.0
41888,OFEL31300371377337,На экскурсии узнали много новых интересных фак...,1.0


In [15]:
sample.drop(columns=['review_id'], inplace=True)

In [47]:
sample[sample.is_retained.isna()]

Unnamed: 0,content,is_retained
41889,Очень интересная экскурсия.,


In [48]:
sample = sample[sample.is_retained.isna()==False]

In [49]:
test_sample = sample.groupby('is_retained').sample(frac=0.1, random_state=42)

In [50]:
train_sample = sample[sample.index.isin(test_sample.index.to_list())==False]

In [18]:
tr_df = train_sample.groupby('is_retained').sample(frac=0.1, random_state=42).copy(deep=True)
te_df = test_sample.groupby('is_retained').sample(frac=0.1, random_state=42).copy(deep=True)

In [19]:
tr_df.columns = ['text', 'label']
te_df.columns = ['text', 'label']


In [20]:
tr_df.reset_index(inplace=True, drop=True)
te_df.reset_index(inplace=True, drop=True)

In [21]:
tr_df

Unnamed: 0,text,label
0,Даёт возможность сконцентрироваться на своих в...,0.0
1,"Прибыли на площадь Лаек, некого не было! Пытал...",0.0
2,"Отличная экскурсия! Гид, Ольга, большая молоде...",0.0
3,Спасибо за отличную организацию!\r\nВсе четко ...,0.0
4,Спасибо за ответственный и не стандартный подх...,0.0
...,...,...
3765,Гид-Ольга. Очень содержательно и разносторонне...,1.0
3766,Очень классно. В жаркую погоду не рекомендую. ...,1.0
3767,Все очень понравилось,1.0
3768,Спасибо за гида Александра (не хотелось с ним ...,1.0


In [22]:
ftrs= Features({'text': Value(dtype='string'), 'label': ClassLabel(num_classes=tr_df.label.nunique(),
                           names=[i for i in range(tr_df.label.nunique())])})

tr_dataset = Dataset.from_pandas(tr_df, features=ftrs)
te_dataset = Dataset.from_pandas(te_df, features=ftrs)

In [23]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [24]:
tok_tr_dataset = tr_dataset.map(preprocess_function, batched=True)
tok_te_dataset = te_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/3770 [00:00<?, ? examples/s]

Map:   0%|          | 0/418 [00:00<?, ? examples/s]

In [25]:
tok_tr_dataset[3]

{'text': 'Спасибо за отличную организацию!\r\nВсе четко по времени!\r\nГид Лариса стала на время экскурсии главой большой семьи.\r\nПомогла с обменом валюты, всё очень доходчиво объяснила.\r\nВеликолепный обед и дегустация вин.\r\nЕщё раз спасибо!',
 'label': 0,
 'input_ids': [2,
  32426,
  650,
  60462,
  34114,
  5,
  9057,
  33376,
  705,
  4204,
  5,
  37939,
  51719,
  3583,
  548,
  1614,
  39182,
  34905,
  9989,
  11911,
  18,
  34664,
  13678,
  329,
  31587,
  761,
  38732,
  16,
  7757,
  6003,
  24643,
  40700,
  44385,
  18,
  73356,
  1241,
  41779,
  320,
  70136,
  2356,
  61246,
  18,
  18624,
  4495,
  33020,
  5,
  3],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  

In [26]:
MODEL_NAME = 'laskovey/review_train3'

In [27]:
MODEL_NAME

'laskovey/review_train3'

In [28]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2, ignore_mismatched_sizes=True
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at laskovey/review_train3 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([34]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([34, 312]) in the checkpoint and torch.Size([2, 312]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
!huggingface-cli login --token #MY_HuggingFace_TOKEN_HERE

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [30]:
training_args = TrainingArguments(
    output_dir="repurchase_train3",
    overwrite_output_dir=True,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    logging_strategy = "epoch",
    save_total_limit=5,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    num_train_epochs=32,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    #fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_tr_dataset,
    eval_dataset=tok_te_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [31]:
trainer.train()


Epoch,Training Loss,Validation Loss,F1
1,0.5547,0.533145,0.444294
2,0.5344,0.529214,0.444294
3,0.5225,0.530248,0.444294
4,0.5096,0.525615,0.442554
5,0.4869,0.531555,0.449005
6,0.4618,0.534739,0.486899
7,0.4286,0.542537,0.500941
8,0.3974,0.569738,0.533626
9,0.3591,0.577739,0.520092
10,0.3247,0.612014,0.564353


TrainOutput(global_step=3776, training_loss=0.24680110105013442, metrics={'train_runtime': 391.815, 'train_samples_per_second': 307.9, 'train_steps_per_second': 9.637, 'total_flos': 289421953618176.0, 'train_loss': 0.24680110105013442, 'epoch': 32.0})

In [32]:
trainer.evaluate()

{'eval_loss': 0.8978793025016785,
 'eval_f1': 0.5827612776688148,
 'eval_runtime': 0.4074,
 'eval_samples_per_second': 1025.952,
 'eval_steps_per_second': 66.27,
 'epoch': 32.0}

In [51]:
tr_df = train_sample.copy(deep=True)
te_df = test_sample.copy(deep=True)

In [52]:
tr_df.columns = ['text', 'label']
te_df.columns = ['text', 'label']


In [53]:
tr_df.reset_index(inplace=True, drop=True)
te_df.reset_index(inplace=True, drop=True)

In [54]:
ftrs= Features({'text': Value(dtype='string'), 'label': ClassLabel(num_classes=tr_df.label.nunique(),
                           names=[i for i in range(tr_df.label.nunique())])})

tr_dataset = Dataset.from_pandas(tr_df, features=ftrs)
te_dataset = Dataset.from_pandas(te_df, features=ftrs)

In [55]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [56]:
tok_tr_dataset = tr_dataset.map(preprocess_function, batched=True)
tok_te_dataset = te_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/37700 [00:00<?, ? examples/s]

Map:   0%|          | 0/4189 [00:00<?, ? examples/s]

In [57]:
training_args = TrainingArguments(
    output_dir="repurchase_train4",
    overwrite_output_dir=True,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    logging_strategy = "epoch",
    save_total_limit=5,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    num_train_epochs=32,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    #fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_tr_dataset,
    eval_dataset=tok_te_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [58]:
trainer.train()


Epoch,Training Loss,Validation Loss,F1
0,0.5267,0.52864,0.442794
2,0.4617,0.563688,0.480426
4,0.3813,0.6495,0.495269
6,0.3115,0.742931,0.525325
8,0.2592,0.879252,0.522794
10,0.2226,0.973256,0.524057
12,0.1948,1.061799,0.527062
14,0.1752,1.162942,0.518267
16,0.156,1.28267,0.522321
18,0.141,1.369982,0.528016


TrainOutput(global_step=37696, training_loss=0.2151576792647357, metrics={'train_runtime': 3208.6018, 'train_samples_per_second': 375.989, 'train_steps_per_second': 11.748, 'total_flos': 3034415942824752.0, 'train_loss': 0.2151576792647357, 'epoch': 31.986423419601188})

In [59]:
trainer.evaluate()

{'eval_loss': 0.940940260887146,
 'eval_f1': 0.5322102889328091,
 'eval_runtime': 6.1771,
 'eval_samples_per_second': 678.15,
 'eval_steps_per_second': 42.415,
 'epoch': 31.986423419601188}

In [60]:
trainer.train()


Epoch,Training Loss,Validation Loss,F1
0,0.2276,0.950569,0.534451
2,0.1904,1.125418,0.530847
4,0.1623,1.229634,0.518502
6,0.1466,1.296921,0.542082
8,0.1316,1.410991,0.533724
10,0.1219,1.443421,0.519537
12,0.1093,1.588094,0.527193
14,0.1048,1.601314,0.526742
16,0.0973,1.598965,0.51499
18,0.0914,1.723703,0.524338


TrainOutput(global_step=37696, training_loss=0.11380808847262215, metrics={'train_runtime': 3218.1674, 'train_samples_per_second': 374.872, 'train_steps_per_second': 11.713, 'total_flos': 3034415942824752.0, 'train_loss': 0.11380808847262215, 'epoch': 31.986423419601188})

In [82]:
X = test_sample.content.to_list()
y_act = test_sample.is_retained.astype(int).to_list()

In [77]:
test_sample

Unnamed: 0,content,is_retained
32312,"Обзорная экскурсия очень динамичная, содержате...",0.0
19384,"Интересная экскурсия пропитанная историей, кул...",0.0
3349,"Отличная экскурсия, прекрасный экскурсовод Агн...",0.0
31093,"Колизей прекрасен. Экскурсии не было, как и По...",0.0
27545,"Поправилась очень гид! Интересная, эмоциональн...",0.0
...,...,...
18834,За последние годы довелось посетить немало экс...,1.0
35926,Мне понравилась экскурсия. Очень красиво и инт...,1.0
3760,"обычная, стандартная экскурсия.очень короткая",1.0
34158,"Спасибо Виктории, очень интересно и многое узн...",1.0


In [78]:
labels = test_sample.is_retained.astype(int).unique()


In [66]:
from transformers import pipeline
text_classification_pipeline = pipeline("text-classification", model="laskovey/repurchase_train4")



config.json:   0%|          | 0.00/750 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/117M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

In [67]:
y_pred = [result["label"] for result in text_classification_pipeline(X)]

In [73]:
y_pred2 = [int(i[-1]) for i in y_pred]

In [74]:
y_pred2[:10]

[1, 1, 0, 0, 1, 1, 1, 1, 1, 0]

In [68]:
from sklearn.metrics import classification_report

In [79]:
labels

array([0, 1])

In [84]:
print(classification_report(y_pred2, y_act, labels=labels, zero_division=0))


              precision    recall  f1-score   support

           0       0.20      0.29      0.24       673
           1       0.85      0.78      0.82      3516

    accuracy                           0.70      4189
   macro avg       0.53      0.54      0.53      4189
weighted avg       0.75      0.70      0.72      4189



In [86]:
sample.is_retained = sample.is_retained.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample.is_retained = sample.is_retained.astype(int)


In [87]:
sample.is_retained.value_counts()

is_retained
1    32236
0     9653
Name: count, dtype: int64

In [91]:
new_sample = sample.groupby('is_retained').sample(9653, random_state=42)

In [92]:
new_sample = new_sample.sample(frac=1)
new_sample

Unnamed: 0,content,is_retained
15139,Очень довольна поездкой и экскурсоводом Еленой...,0
117,Неповторимо! Необыкновенно и… неожиданно :) По...,1
4589,Все супер!!!!!!! интересно и взрослым и мальчи...,0
27254,Замечательный гид был у нас! Шикарная экскурсия!,1
32931,Мы ездили семьей. Дочка уже взрослая и ей коне...,1
...,...,...
5094,Гид Алексей молодец! Браво ! Экскурсия тоже су...,1
8828,"Очень понравилась экскурсия, было познавательн...",0
5189,"Экскурсия очень интересная, необычный формат, ...",1
8462,"Экскурсия проведена содержательно, профессиона...",1


In [None]:
new_sample

In [93]:
test_sample = new_sample.groupby('is_retained').sample(frac=0.1, random_state=42)

In [95]:
train_sample = new_sample[new_sample.index.isin(test_sample.index.to_list())==False]

In [96]:
tr_df = train_sample.copy(deep=True)
te_df = test_sample.copy(deep=True)

In [97]:
tr_df.columns = ['text', 'label']
te_df.columns = ['text', 'label']


In [98]:
tr_df.reset_index(inplace=True, drop=True)
te_df.reset_index(inplace=True, drop=True)

In [99]:
ftrs= Features({'text': Value(dtype='string'), 'label': ClassLabel(num_classes=tr_df.label.nunique(),
                           names=[i for i in range(tr_df.label.nunique())])})

tr_dataset = Dataset.from_pandas(tr_df, features=ftrs)
te_dataset = Dataset.from_pandas(te_df, features=ftrs)

In [100]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [101]:
tok_tr_dataset = tr_dataset.map(preprocess_function, batched=True)
tok_te_dataset = te_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/17376 [00:00<?, ? examples/s]

Map:   0%|          | 0/1930 [00:00<?, ? examples/s]

In [102]:
training_args = TrainingArguments(
    output_dir="repurchase_train4",
    overwrite_output_dir=True,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    logging_strategy = "epoch",
    save_total_limit=5,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    num_train_epochs=32,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    #fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_tr_dataset,
    eval_dataset=tok_te_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [103]:
trainer.train()


Epoch,Training Loss,Validation Loss,F1
1,0.3402,0.303776,0.876673
2,0.2766,0.305155,0.880307
3,0.2393,0.317074,0.872516
4,0.2092,0.334284,0.870864
5,0.1881,0.358833,0.873871
6,0.1735,0.373285,0.870341
7,0.1557,0.389374,0.863546
8,0.1383,0.423395,0.864117
9,0.1359,0.451773,0.862452
10,0.1278,0.462848,0.860044


TrainOutput(global_step=17376, training_loss=0.12167418551927991, metrics={'train_runtime': 1551.3314, 'train_samples_per_second': 358.422, 'train_steps_per_second': 11.201, 'total_flos': 1335550690962240.0, 'train_loss': 0.12167418551927991, 'epoch': 32.0})