In [3]:
import pandas as pd
import json
import fasttext
import emoji
import re
import collections
from urllib.parse import urlparse

# Loading merged data from channels

In [None]:
df_channels = pd.read_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/data/merged_dataset/df_channels.csv', header=None)

In [3]:
df_channels.columns = df_channels.iloc[0]
df_channels = df_channels[1:]

In [4]:
df_channels = (df_channels
               .drop(columns=['Unnamed: 0'], axis=1)
               .reset_index(drop=True))

In [5]:
len(df_channels)

8108693

In [6]:
df_channels.sample(5)

Unnamed: 0,id,date,views,reactions,to_id,fwd_from,message,type,duration,channel_name,frw_from_title,frw_from_name,msg_entity
6180152,944.0,2018-01-18 06:05:49+00:00,788.0,,PeerChannel(channel_id=1138542535),,"103 –≥–æ–¥–∞ –Ω–∞–∑–∞–¥, 18 —è–Ω–≤–∞—Ä—è 1915 –≥–æ–¥–∞ —Ä–æ–¥–∏–ª—Å—è –≤—ã...",photo,,regnum_na,,,
1059643,102109.0,2022-01-20 06:26:23+00:00,6260.0,MessageReactions(results=[ReactionCount(reacti...,PeerChannel(channel_id=1050820672),,–†–æ—Å—Ç—É—Ä–∏–∑–º –≤–ø–µ—Ä–≤—ã–µ –ª–∏—à–∏–ª –ø—Ä–∞–≤–∞ –∞–∫–∫—Ä–µ–¥–∏—Ç–∞—Ü–∏–∏ –æ—Ä–≥...,text,,tass_agency,,,
6144226,2548.0,2022-06-23 16:24:40+00:00,50676.0,MessageReactions(results=[ReactionCount(reacti...,PeerChannel(channel_id=1644818949),,–°–µ–≥–æ–¥–Ω—è –Ω–æ—á—å—é –≤ –ë–∞—Ä—Å–µ–ª–æ–Ω–µ –±—É–¥—É—Ç –æ—Ç–º–µ—á–∞—Ç—å –ø—Ä–∞–∑–¥...,video,9.0,berdovaalena,,,
1106745,54596.0,2021-05-19 13:32:59+00:00,9731.0,,PeerChannel(channel_id=1050820672),,‚ùó –ë—ã–≤—à–∏–π –≤–∏—Ü–µ-–≥—É–±–µ—Ä–Ω–∞—Ç–æ—Ä –°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥–∞ –û–≥–∞–Ω...,text,,tass_agency,,,
332033,11191.0,2022-06-16 20:32:16+00:00,58287.0,,PeerChannel(channel_id=1171552896),,"–ß–∏—Ç–∞—é –º–∞—Ç–µ—Ä–∏–∞–ª—ã –æ —Ç–æ–º, —á—Ç–æ –±–∞–∑—É –¥–ª—è —ç–Ω–µ—Ä–≥–µ—Ç–∏—á–µ...",text,,russ_orientalist,,,


## We drop na for now (needs some mending)

In [7]:
df_channels = df_channels[~df_channels['message'].isna()]

In [8]:
f"We have {len(df_channels[(df_channels['message']=='nan')&(df_channels['type']=='text')].channel_name.unique())/len(df_channels):.4%} nans"

'We have 0.0000% nans'

### Loading list of unique channel name handles referenced by the group

In [9]:
names = (df_channels['frw_from_name']
              .value_counts()
              .reset_index(name="count")[1:]['index']
              .to_list())
len(names)

1959

In [10]:
len(names)

1959

In [11]:
dict_names = {"titles": names}
json_object = json.dumps(dict_names, indent=4)
with open("/Users/katerynaburovova/PycharmProjects/dehumanization/data/names_channels_list.json", "w") as outfile:
    outfile.write(json_object)

# Exploring the data

In [12]:
class LanguageIdentification:

    def __init__(self):
        pretrained_lang_model = "/Users/katerynaburovova/PycharmProjects/comp_soc_sci_projects/fasttext/lid.176.bin"
        self.model = fasttext.load_model(pretrained_lang_model)

    def predict_lang(self, text, label_only=True):
        predictions = self.model.predict(text, k=1)
        if label_only:
            return predictions[0][0][-2:]
        return predictions

In [13]:
lang_identifier = LanguageIdentification()
df_channels['message'] = df_channels['message'].apply(lambda x: str(x).replace('\n', ' '))
df_channels['lang'] = df_channels['message'].apply(lambda x: lang_identifier.predict_lang(x))



In [14]:
df_channels['lang'].value_counts()

ru    6901187
en      48321
uk      21941
bg       5814
de       5520
       ...   
mf          1
an          1
nb          1
iq          1
ep          1
Name: lang, Length: 138, dtype: int64

# Text preprocesing

In [15]:
df_channels.sample(10)

Unnamed: 0,id,date,views,reactions,to_id,fwd_from,message,type,duration,channel_name,frw_from_title,frw_from_name,msg_entity,lang
4482895,58373.0,2021-05-09 17:48:06+00:00,49581.0,,PeerChannel(channel_id=1315735637),"MessageFwdHeader(date=datetime.datetime(2021, ...",‚úîÔ∏è–ü–æ–ª–∏—Ü–∏—è –£–∫—Ä–∞–∏–Ω—ã –≤–æ–∑–±—É–¥–∏–ª–∞ –∫–∞–∫ –º–∏–Ω–∏–º—É–º —Ç—Ä–∏ —É–≥...,text,,SolovievLive,,,,ru
4331800,57594.0,2018-12-05 06:27:23+00:00,6946.0,,PeerChannel(channel_id=1054549314),,–ö–∞–∫ –ø–µ—Ä–µ–≤–µ–∑—Ç–∏ —Ç—Ä—É–±—É –Ω–∞ —Å–∫—É—Ç–µ—Ä–µ –∏ –∑–∞—Å—Ç–∞–≤–∏—Ç—å –≤—Å–µ...,video,16.0,ntvnews,,,,ru
2934195,24254.0,2021-11-05 16:01:18+00:00,31116.0,,PeerChannel(channel_id=1038402501),,üñº –ò—Å–∫—É—Å—Å—Ç–≤–µ–Ω–Ω—ã–π –∏–Ω—Ç–µ–ª–ª–µ–∫—Ç –æ–±–Ω–∞–∂–∏–ª –ü–∏–∫–∞—Å—Å–æ –õ–æ–Ω...,photo,,kommersant,,,,ru
2405853,87674.0,2022-01-08 17:36:39+00:00,41459.0,,PeerChannel(channel_id=1036362176),,–í –ü–µ–Ω—Ç–∞–≥–æ–Ω–µ –≤ –ø—Ä–µ–¥–¥–≤–µ—Ä–∏–∏ –ø–µ—Ä–µ–≥–æ–≤–æ—Ä–æ–≤ —Å –†–æ—Å—Å–∏–µ–π...,text,,rt_russian,,,,ru
6823529,3543.0,2020-11-03 18:05:38+00:00,11083.0,,PeerChannel(channel_id=1492765963),,#MovsesGhazaryan –¥–ª—è –∫–∞–Ω–∞–ª–∞ üõë–ö–∞—Ä–Ω–∞—É—Ö–æ–≤ *** –ù–∞...,text,,sskarnaukhov,,,,ru
2500248,39447.0,2022-05-10 15:45:01+00:00,220496.0,MessageReactions(results=[ReactionCount(reacti...,PeerChannel(channel_id=1100158992),,"""–°–ª—É–∂—É –†–æ—Å—Å–∏–∏!"" –í –•–µ—Ä—Å–æ–Ω—Å–∫–æ–π –æ–±–ª–∞—Å—Ç–∏ –Ω–∞–≥—Ä–∞–¥–∏–ª...",video,79.0,shot_shot,,,,ru
6359965,33651.0,2022-04-06 12:08:56+00:00,888647.0,,PeerChannel(channel_id=1117628569),,–ï–≥–æ –ª–∏–±–µ—Ä–∞–ª—å–Ω–æ-–¥–µ–º–æ–∫—Ä–∞—Ç–∏—á–µ—Å–∫–∞—è –ø–∞—Ä—Ç–∏—è –ø–æ—è–≤–∏–ª–∞—Å...,video,82.0,breakingmash,,,,ru
332712,10367.0,2022-03-16 17:44:14+00:00,95233.0,MessageReactions(results=[ReactionCount(reacti...,PeerChannel(channel_id=1171552896),,–°–®–ê –ø–µ—Ä–µ–¥–∞—Å—Ç –£–∫—Ä–∞–∏–Ω–µ 9 —Ç—ã—Å. –ø—Ä–æ—Ç–∏–≤–æ—Ç–∞–Ω–∫–æ–≤—ã—Ö –∫–æ...,text,,russ_orientalist,,,,ru
1929906,2859.0,2019-11-15 20:14:22+00:00,49905.0,,PeerChannel(channel_id=1167445017),,–°–¥–µ–ª–∫—É –ø–æ –°-400 —Å –ò–Ω–¥–∏–µ–π –º–Ω–æ–≥–∏–µ —Å–≤—è–∑—ã–≤–∞–ª–∏ —Å —Ç–µ...,photo,,politadequate,,,,ru
7804942,162430.0,2019-07-29 07:18:25+00:00,2.0,,PeerChannel(channel_id=1082084045),"MessageFwdHeader(date=datetime.datetime(2019, ...","–ü–æ –æ—Ü–µ–Ω–∫–∞–º —ç–∫—Å–ø–µ—Ä—Ç–æ–≤, –≤ –ì–µ—Ä–º–∞–Ω–∏–∏ –ø—Ä–∏–º–µ—Ä–Ω–æ 12 —Ç...",photo,,karaulny,,,,ru


In [16]:
test_dataset = ['test sentence 1', 'test sentence 2','test sentence 3']

In [17]:
with open('test_dataset.txt', 'w', encoding='utf8') as f:
    for line in test_dataset:
        f.write(f"{line}\n")

## Separating comments from posts

In [18]:
df_channels[df_channels['channel_name']=='Topaz_Govorit']

Unnamed: 0,id,date,views,reactions,to_id,fwd_from,message,type,duration,channel_name,frw_from_title,frw_from_name,msg_entity,lang
5847662,2762.0,2022-12-02 18:01:03+00:00,8538.0,MessageReactions(results=[ReactionCount(reacti...,PeerChannel(channel_id=1321128351),,"–ü–æ–º–Ω–∏—Ç—Å—è, —á—Ç–æ –∑–∞ –º–µ—Å—è—Ü –¥–æ –Ω–∞—á–∞–ª–∞ –≤–æ–π–Ω—ã —Å –°–æ—Ñ–∏–π...",text,,Topaz_Govorit,,,,ru
5847663,980465.0,2022-12-02 18:01:55+00:00,,,PeerChannel(channel_id=1679205140),,–ù–∞ –±—É–º–∞–≥–µ –Ω–∏–∫–∞–∫–æ–π –≤–æ–π–Ω—ã –Ω–µ—Ç,text,,Topaz_Govorit,,,,ru
5847664,980466.0,2022-12-02 18:02:11+00:00,,MessageReactions(results=[ReactionCount(reacti...,PeerChannel(channel_id=1679205140),,–¢–∞–∫ —ç—Ç–æ –∑–∞–∫–æ–Ω–æ–º–µ—Ä–Ω–æ—Å—Ç—å –í–° –†–§ –¥–µ–º–æ–Ω—Å—Ç—Ä–∏—Ä—É—é—Ç –Ω–µ ...,text,,Topaz_Govorit,,,,ru
5847665,980467.0,2022-12-02 18:02:31+00:00,,,PeerChannel(channel_id=1679205140),,–∏ –ø—Ä–∏ –≤—Å—ë–º –ø—Ä–∏ —ç—Ç–æ–º —É–Ω–∏–∞—Ç—Å–∫–∞—è —Ü–µ—Ä–∫–æ–≤—å –≤—Å—ë –µ—â—ë ...,text,,Topaz_Govorit,,,,ru
5847666,980468.0,2022-12-02 18:02:37+00:00,,,PeerChannel(channel_id=1679205140),,–ò–Ω—Ç–µ—Ä–µ—Å–Ω–æ —á—Ç–æ —Å–∫–∞–∂–µ—Ç –ø–∞—Ç—Ä–∏–∞—Ä—Ö.,text,,Topaz_Govorit,,,,ru
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5860896,10.0,2019-07-29 16:08:49+00:00,1793.0,MessageReactions(results=[ReactionCount(reacti...,PeerChannel(channel_id=1321128351),,–ù–∞–≤–µ—Ä–Ω–æ–µ –∫–∞–∂–¥—ã–π –Ω–∞ —Å–≤–æ—ë–º –ø—É—Ç–∏ –≤—Å—Ç—Ä–µ—á–∞–ª —Ç–∞–∫–æ–≥–æ ...,photo,,Topaz_Govorit,,,,ru
5860898,8.0,2019-07-29 15:31:10+00:00,1727.0,MessageReactions(results=[ReactionCount(reacti...,PeerChannel(channel_id=1321128351),,–ê –µ—â—ë –∏–Ω–æ–≥–¥–∞ –±—É–¥—É—Ç —Å–º–µ—à–Ω—ã–µ –¥–ª—è –º–µ–Ω—è –º–µ–º—ã —Å–æ–±—Å—Ç...,text,,Topaz_Govorit,,,,ru
5860899,7.0,2019-07-29 15:15:42+00:00,1924.0,MessageReactions(results=[ReactionCount(reacti...,PeerChannel(channel_id=1321128351),,"–ï—Å–ª–∏ —Ö–æ—Ç–∏—Ç–µ –∏–∑–º–µ–Ω–∏—Ç—å –º–∏—Ä –≤–æ–∫—Ä—É–≥ —Å–µ–±—è, –Ω–µ –Ω—É–∂–Ω–æ...",text,,Topaz_Govorit,,,,ru
5860900,6.0,2019-07-29 14:20:06+00:00,1935.0,MessageReactions(results=[ReactionCount(reacti...,PeerChannel(channel_id=1321128351),,–ò–Ω—Ç–µ—Ä–µ—Å–Ω—É—é –≤–µ—â—å –∑–∞–º–µ—Ç–∏–ª: –∫–∞–∂–¥—ã–π —Ä–∞–∑ –∫–æ–≥–¥–∞ –≤ –ú–°...,text,,Topaz_Govorit,,,,ru


In [19]:
id_list = df_channels[['channel_name', 'to_id']].drop_duplicates(keep='first').drop_duplicates(subset=['channel_name'], keep='first').to_id.tolist()

In [20]:
df_channels['is_post'] = df_channels.apply(lambda x: True if x.to_id in id_list else False, axis=1)

In [21]:
print(f'{len(df_channels[df_channels["is_post"]==True])/len(df_channels)*100:.1f}% of messages are posts, the rest are comments')

97.4% of messages are posts, the rest are comments


In [22]:
df_comments = df_channels[df_channels['is_post']==False]

In [23]:
df_posts = df_channels[df_channels['is_post']==True]

In [24]:
# df_posts.type = pd.api.types.CategoricalDtype(categories=df_posts.type.unique().tolist(), ordered = False)

In [25]:
# df_posts.type = pd.api.types.CategoricalDtype(categories=df_posts.type.unique().tolist(), ordered = False)


In [26]:
# df_posts.lang = pd.api.types.CategoricalDtype(categories=df_posts.lang.unique().tolist(), ordered = False)

In [27]:
# df_posts.to_csv('only_posts.csv', index = False, header=True)

In [28]:
df_channels = df_channels[df_channels['is_post']==True]

In [29]:
df_channels.channel_name.unique()

array(['mardanaka', 'rian_ru', 'tvrain', 'krispotupchik', 'akimapachev',
       'go338', 'KotNaMirotvorze', 'emphasises', 'russ_orientalist',
       'pravda_shuravi', 'voenacher', 'n_zackhaim', 'Hinshtein',
       'kashinguru', 'lentachold', 'wargonzo', 'foxandraven',
       'madam_secretar', 'russianfuture', 'vchkogpu', 'rusich_army',
       'botcharov', 'mashmoyka', 'vv_volodin', 'Mikle1On',
       'kremlinprachka', 'ctrs2018', 'mediazzzona', 'sotaproject',
       'znachit_net', 'er_molnia', 'razvedkavperedZ', 'rasstrelny',
       'govoritfursov', 'gramotyyaroslava', 'informnapalm',
       'podosokorsky', 'mosnow', 'daokedao', 'chtddd', 'nevzorovtv',
       'SonOfMonarchy', 'lesyaryabtseva', 'fontankaspb', 'tass_agency',
       'ErnestV_2020', 'sorok40russia', 'odinokayakoko', 'pgubarev',
       'swodki', 'meduzalive', 'Gori_spb', 'anna_news', 'readovkanews',
       'umar_kremlev', 'government_rus', 'Alekhin_Telega', 'leylinurimm',
       'pushilindenis', 'strelets_molodec', 'istrkal

## Removing breaks and repr symbols

In [30]:
test_string = df_channels[df_channels['is_post']==True].message[0]
test_string

'–ê —Ç—Ä–∞–Ω—Å–ª—è—Ü–∏—è —Ç–µ–º –≤—Ä–µ–º–µ–Ω–µ–º –∏–¥–µ—Ç. –ü–æ–¥–ø–∏—Å—ã–≤–∞–π—Ç–µ—Å—å  https://www.youtube.com/watch?v=4L7T3u7utSw'

In [31]:
splitters = ['\n', '\t', '\\n','\xa0', '\u200b']

In [32]:
df_channels['message_no_breaks'] = df_channels['message'].str.replace('\n|\t|\\n', ' ', case=False)

  df_channels['message_no_breaks'] = df_channels['message'].str.replace('\n|\t|\\n', ' ', case=False)


## Isolating emojis

In [33]:
def get_emoji_count(text):
    return collections.Counter([match["emoji"] for word in text for match in emoji.emoji_list(word)])

In [34]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)

In [35]:
%%time
df_channels['emojis'] = df_channels['message'].apply(lambda x: get_emoji_count(x))

CPU times: user 20min 26s, sys: 33.2 s, total: 20min 59s
Wall time: 24min 20s


In [36]:
%%time
df_channels['message_no_emoji'] = df_channels['message_no_breaks'].apply(lambda x: remove_emoji(x))

CPU times: user 46.4 s, sys: 28.6 s, total: 1min 15s
Wall time: 2min 48s


In [37]:
df_channels['message_no_emoji'].sample(10).iloc[0]

'–ö–∏—Ç–∞–π –≤–æ–∑–º—É—â–µ–Ω ¬´–±–µ—Å–ø—Ä–µ–¥–µ–ª–æ–º¬ª –∫–æ–º–ø–∞–Ω–∏–∏ –ò–ª–æ–Ω–∞ –ú–∞—Å–∫–∞ SpaceX –≤ –æ—Å–≤–æ–µ–Ω–∏–∏ –∫–æ—Å–º–∏—á–µ—Å–∫–æ–≥–æ –ø—Ä–æ—Å—Ç—Ä–∞–Ω—Å—Ç–≤–∞: –≤ –¥–æ–∫–ª–∞–¥–µ, –ø—Ä–µ–¥—Å—Ç–∞–≤–ª–µ–Ω–Ω–æ–º –ü–µ–∫–∏–Ω–æ–º –ö–æ–º–∏—Ç–µ—Ç—É –û–û–ù –ø–æ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—é –∫–æ—Å–º–∏—á–µ—Å–∫–æ–≥–æ –ø—Ä–æ—Å—Ç—Ä–∞–Ω—Å—Ç–≤–∞ –≤ –º–∏—Ä–Ω—ã—Ö —Ü–µ–ª—è—Ö –≤ –Ω–∞—á–∞–ª–µ —ç—Ç–æ–≥–æ –º–µ—Å—è—Ü–∞, –ö–∏—Ç–∞–π –∑–∞—è–≤–∏–ª, —á—Ç–æ –µ–≥–æ –∫–æ—Å–º–∏—á–µ—Å–∫–∞—è —Å—Ç–∞–Ω—Ü–∏—è –±—ã–ª–∞ –≤—ã–Ω—É–∂–¥–µ–Ω–∞ –¥–≤–∞–∂–¥—ã ‚Äì –≤ –∏—é–ª–µ –∏ –æ–∫—Ç—è–±—Ä–µ 2021 –≥–æ–¥–∞ - –º–µ–Ω—è—Ç—å –æ—Ä–±–∏—Ç–∞–ª—å–Ω—ã–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã  —Å–≤–æ–µ–π –∫–æ—Å–º–∏—á–µ—Å–∫–æ–π —Å—Ç–∞–Ω—Ü–∏–∏, —á—Ç–æ–±—ã –∏–∑–±–µ–∂–∞—Ç—å —Å—Ç–æ–ª–∫–Ω–æ–≤–µ–Ω–∏—è —Å–æ —Å–ø—É—Ç–Ω–∏–∫–∞–º–∏ Starlink.  SpaceX —É–∂–µ –∑–∞–ø—É—Å—Ç–∏–ª–∞ –≤ –∫–æ—Å–º–æ—Å –±–æ–ª–µ–µ 1900 —Å–ø—É—Ç–Ω–∏–∫–æ–≤ –¥–ª—è —Å–æ–∑–¥–∞–Ω–∏—è –æ—Ä–±–∏—Ç–∞–ª—å–Ω–æ–π –≥—Ä—É–ø–ø–∏—Ä–æ–≤–∫–∏ –∫–æ—Å–º–∏—á–µ—Å–∫–æ–π —Ç–µ–ª–µ–∫–æ–º–º—É–Ω–∏–∫–∞—Ü–∏–æ–Ω–Ω–æ–π —Å–µ—Ç–∏

## Isolating urls

In [38]:
def find_urls(text):
    try:
        links =  re.findall(r'(https?://[^\s]+)', text)
        return [urlparse(item).netloc for item in links]
    except ValueError:
        return []

In [39]:
df_channels['url_list'] = df_channels['message_no_emoji'].apply(lambda x: find_urls(x))

In [40]:
df_channels['message_no_urls'] = df_channels['message_no_emoji'].str.replace('http\S+|www.\S+', '', case=False)

  df_channels['message_no_urls'] = df_channels['message_no_emoji'].str.replace('http\S+|www.\S+', '', case=False)


In [41]:
#bs check
(df_channels['message_no_urls']
 .apply(lambda x: find_urls(x))
 .value_counts())

[]    6836544
Name: message_no_urls, dtype: int64

In [42]:
df_channels['message_no_urls'].sample(10).iloc[0]

'–í –∫–∞–∑–∞–Ω—Å–∫–æ–º —Ü–∏—Ä–∫–µ —Å–ª–æ–Ω—ã —É—Å—Ç—Ä–æ–∏–ª–∏ –¥—Ä–∞–∫—É –≤–æ –≤—Ä–µ–º—è –ø—Ä–µ–¥—Å—Ç–∞–≤–ª–µ–Ω–∏—è.  –ù–∞ –∫–∞–¥—Ä–∞—Ö –∏–∑ —Å–æ—Ü—Å–µ—Ç–µ–π –≤–∏–¥–Ω–æ, –∫–∞–∫ –ø–æ—Å–µ—Ç–∏—Ç–µ–ª–∏ —Ä–∞–∑–±–µ–≥–∞—é—Ç—Å—è —Å–æ –∑—Ä–∏—Ç–µ–ª—å—Å–∫–∏—Ö –º–µ—Å—Ç. –ü–æ –ø—Ä–µ–¥–≤–∞—Ä–∏—Ç–µ–ª—å–Ω—ã–º –¥–∞–Ω–Ω—ã–º, –Ω–∏–∫—Ç–æ –Ω–µ –ø–æ—Å—Ç—Ä–∞–¥–∞–ª.  –ù–∞ —Å–∞–π—Ç–µ —Ü–∏—Ä–∫–∞ –æ–ø—É–±–ª–∏–∫–æ–≤–∞–Ω–æ —Å–æ–æ–±—â–µ–Ω–∏–µ, —á—Ç–æ –ø—Ä–µ–¥—Å—Ç–∞–≤–ª–µ–Ω–∏–µ –±—ã–ª–æ –æ—Ç–º–µ–Ω–µ–Ω–æ ¬´–ø–æ —Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏–º –ø—Ä–∏—á–∏–Ω–∞–º¬ª'

## Extract mentions

In [43]:
df_channels['mentions'] = df_channels['message_no_urls'].str.findall(r"@([a-zA-Z0-9_]{1,50})")

In [44]:
df_channels[['mentions','message_no_urls']].sample(10)
# df_channels[['mentions','message_no_urls']].message_no_urls.iloc[1265478]

Unnamed: 0,mentions,message_no_urls
155671,[],–¢–µ—Ä—Ä–æ—Ä–∏—Å—Ç–∏—á–µ—Å–∫–∞—è –∞—Ç–∞–∫–∞ –Ω–∞ –ø–∞–∫–∏—Å—Ç–∞–Ω—Å–∫—É—é –±–∏—Ä–∂—É....
5248504,[],"–£—Ç—Ä–µ–Ω–Ω—è—è –ø–æ–¥–±–æ—Ä–∫–∞ –Ω–æ–≤–æ—Å—Ç–µ–π, —á—Ç–æ–±—ã –Ω–∞—á–∞—Ç—å –¥–µ–Ω—å ..."
3968835,[],¬´–ò –≤–µ—Å—å –º–∏—Ä –≤ —è–¥–µ—Ä–Ω—ã–π –ø–µ–ø–µ–ª¬ª: –ö–∏–º –ß–µ–Ω –´–Ω –æ–±—Å—É–¥...
5100258,[],–ò–∑—å—è—Ç—ã–π —É –∫—Ä–∏–º–∏–Ω–∞–ª–∞ —É–∫—Ä–∞–∏–Ω—Å–∫–æ–≥–æ –≥–æ—Ä–æ–¥–∞ –ü–æ–∫—Ä–æ–≤—Å...
6926988,[],–ü–æ—Å–ª–µ–¥–Ω–∏–µ –¥–∞–Ω–Ω—ã–µ –æ —Ä–∞—Å–ø—Ä–æ—Å—Ç—Ä–∞–Ω–µ–Ω–∏–∏ –∫–æ—Ä–æ–Ω–∞–≤–∏—Ä—É—Å...
1400945,[],–ü–æ –õ–∏–º–∞–Ω—É –Ω–∞–Ω–µ—Å–µ–Ω–æ –Ω–µ –º–µ–Ω–µ–µ –ø—è—Ç–∏ –∞–≤–∏–∞—É–¥–∞—Ä–æ–≤. –†...
2916705,[kommersant],–°–®–ê —É—Å–∫–æ—Ä–∏–ª–∏ —Ä–∞–∑–≤–µ—Ä—Ç—ã–≤–∞–Ω–∏–µ –Ω–∞ –±–∞–∑–∞—Ö –ù–ê–¢–û –≤ –ï–≤—Ä...
6816144,"[sputnikKZ, sputnikKZ]","–£–∫—Ä–∞–∏–Ω—Å–∫–∏–µ —Å—Ü–µ–Ω–∞—Ä–∏—Å—Ç—ã ""—è–∑—ã–∫–æ–≤—ã—Ö —Ä–µ–π–¥–æ–≤"" –≤ –ö–∞–∑–∞..."
6879108,[],–ù–∞–∑–≤–∞–Ω–∞ –¥–∞—Ç–∞ –ø–æ—Ö–æ—Ä–æ–Ω –°–µ—Ä–≥–µ—è –°–æ–ª–æ–≤—å—ë–≤–∞ –ü–æ–≥—Ä–µ–±–µ...
1122307,[],–ó–∞–º–µ—Å—Ç–∏—Ç–µ–ª—å –º–∏–Ω–∏—Å—Ç—Ä–∞ –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã—Ö –¥–µ–ª –†–æ—Å—Å–∏–∏ –°–µ...


In [45]:
# text_tst = df_channels[['mentions','message_no_urls']].message_no_urls.iloc[1265478]
# m_list = df_channels[['mentions','message_no_urls']].mentions.iloc[1265478]

In [46]:
def cut_mentions(list_of_mentions, text):
    for word in list_of_mentions:
        wrd = '@' + word
        text = text.replace(wrd, '')
    return text.strip()

In [47]:
%%time
df_channels['message_no_mentions'] = df_channels.apply(lambda x: cut_mentions(x['mentions'], x['message_no_urls']), axis=1)

CPU times: user 36.6 s, sys: 4min 7s, total: 4min 44s
Wall time: 24min 54s


In [50]:
mentions_col_list = df_channels[df_channels['mentions'].map(lambda d: len(d)) > 0]['mentions'].tolist()
mentions_list = [item for sublist in mentions_col_list for item in sublist]
mentions_unique = list(set(mentions_list))

In [51]:
print(f'We have {len(mentions_unique)} unique channel names among {len(mentions_list)} mentions for dataset')

We have 24789 unique channel names among 1454907 mentions for dataset


In [52]:
dict = {"names": mentions_list}
json_object = json.dumps(dict, indent=4)
with open("/Users/katerynaburovova/PycharmProjects/dehumanization/data/names_from_mentions.json", "w") as outfile:
    outfile.write(json_object)

## Unify and isolate quotes

In [55]:
to_replace = ["¬´", "¬ª", "‚Äú"]

df_channels['message_no_mentions'] = df_channels['message_no_mentions'].str.replace('|'.join(to_replace),'"')

  df_channels['message_no_mentions'] = df_channels['message_no_mentions'].str.replace('|'.join(to_replace),'"')


In [56]:
%%time
df_channels['quotes'] = df_channels['message_no_mentions'].str.findall(r'"(.*?)"')

CPU times: user 9.43 s, sys: 41.8 s, total: 51.2 s
Wall time: 4min 5s


## Isolate hashtags

In [57]:
%%time
# df_channels['hashtags'] = df_channels['message_no_mentions'].str.findall("#[A-Za-z0-9_]+")
df_channels['hashtags'] = df_channels['message_no_mentions'].apply(lambda x: {word.strip(",").strip(".") for word in x.split() if word.startswith("#")})

CPU times: user 46.4 s, sys: 34.9 s, total: 1min 21s
Wall time: 4min 7s


In [59]:
def cut_hasgtags(set_of_hashtags, text):
    for word in list(set_of_hashtags):
        text = text.replace(word, '')
    return text.strip()

In [62]:
# df_channels.to_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/datasets/df_channels_before_sym_cleanup.csv')

In [None]:
df_channels['message_no_hashtags'] = df_channels.apply(lambda x: cut_hasgtags(x['hashtags'], x['message_no_mentions']), axis=1)

In [1]:
import pandas as pd
df_channels = pd.read_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/datasets/df_channels_before_sym_cleanup.csv')

  df_channels = pd.read_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/datasets/df_channels_before_sym_cleanup.csv')


In [3]:
df_channels.sample(5)

Unnamed: 0.1,Unnamed: 0,id,date,views,reactions,to_id,fwd_from,message,type,duration,...,message_no_breaks,emojis,message_no_emoji,url_list,message_no_urls,mentions,message_no_mentions,quotes,hashtags,message_no_hashtags
2848815,3437927,19002.0,2018-12-26 21:03:03+00:00,2299.0,,PeerChannel(channel_id=1071040207),,–ù—É —á—Ç–æ —Å –Ω–∞–º–∏ —Å–¥–µ–ª–∞–ª–∏ –µ—Å–ª–∏ –¥–∞–∂–µ –Ω–∞–∑–≤–∞–Ω–∏–µ —Å—Ç–∞—Ä–æ...,text,,...,–ù—É —á—Ç–æ —Å –Ω–∞–º–∏ —Å–¥–µ–ª–∞–ª–∏ –µ—Å–ª–∏ –¥–∞–∂–µ –Ω–∞–∑–≤–∞–Ω–∏–µ —Å—Ç–∞—Ä–æ...,Counter(),–ù—É —á—Ç–æ —Å –Ω–∞–º–∏ —Å–¥–µ–ª–∞–ª–∏ –µ—Å–ª–∏ –¥–∞–∂–µ –Ω–∞–∑–≤–∞–Ω–∏–µ —Å—Ç–∞—Ä–æ...,[],–ù—É —á—Ç–æ —Å –Ω–∞–º–∏ —Å–¥–µ–ª–∞–ª–∏ –µ—Å–ª–∏ –¥–∞–∂–µ –Ω–∞–∑–≤–∞–Ω–∏–µ —Å—Ç–∞—Ä–æ...,[],–ù—É —á—Ç–æ —Å –Ω–∞–º–∏ —Å–¥–µ–ª–∞–ª–∏ –µ—Å–ª–∏ –¥–∞–∂–µ –Ω–∞–∑–≤–∞–Ω–∏–µ —Å—Ç–∞—Ä–æ...,[],set(),–ù—É —á—Ç–æ —Å –Ω–∞–º–∏ —Å–¥–µ–ª–∞–ª–∏ –µ—Å–ª–∏ –¥–∞–∂–µ –Ω–∞–∑–≤–∞–Ω–∏–µ —Å—Ç–∞—Ä–æ...
6086887,7184869,7339.0,2018-12-20 10:45:45+00:00,14655.0,,PeerChannel(channel_id=1109403194),,"–í–ª–∞–¥–∏–º–∏—Ä –í–ª–∞–¥–∏–º–∏—Ä–æ–≤–∏—á, –º–Ω–µ –∫–∞–∂–µ—Ç—Å—è, –Ω–µ —Ç–∞–∫ –ø–æ–Ω...",text,,...,"–í–ª–∞–¥–∏–º–∏—Ä –í–ª–∞–¥–∏–º–∏—Ä–æ–≤–∏—á, –º–Ω–µ –∫–∞–∂–µ—Ç—Å—è, –Ω–µ —Ç–∞–∫ –ø–æ–Ω...",Counter(),"–í–ª–∞–¥–∏–º–∏—Ä –í–ª–∞–¥–∏–º–∏—Ä–æ–≤–∏—á, –º–Ω–µ –∫–∞–∂–µ—Ç—Å—è, –Ω–µ —Ç–∞–∫ –ø–æ–Ω...",[],"–í–ª–∞–¥–∏–º–∏—Ä –í–ª–∞–¥–∏–º–∏—Ä–æ–≤–∏—á, –º–Ω–µ –∫–∞–∂–µ—Ç—Å—è, –Ω–µ —Ç–∞–∫ –ø–æ–Ω...",[],"–í–ª–∞–¥–∏–º–∏—Ä –í–ª–∞–¥–∏–º–∏—Ä–æ–≤–∏—á, –º–Ω–µ –∫–∞–∂–µ—Ç—Å—è, –Ω–µ —Ç–∞–∫ –ø–æ–Ω...",[],set(),"–í–ª–∞–¥–∏–º–∏—Ä –í–ª–∞–¥–∏–º–∏—Ä–æ–≤–∏—á, –º–Ω–µ –∫–∞–∂–µ—Ç—Å—è, –Ω–µ —Ç–∞–∫ –ø–æ–Ω..."
1036036,1230819,10020.0,2021-05-25 19:33:54+00:00,6257.0,,PeerChannel(channel_id=1433731512),,–°–∫–æ–ª—å–∫–æ –≤—Å—è–∫–æ–π —Ö–µ—Ä–Ω–∏ –ø–æ–Ω–∞–ø–∏—Å–∞–ª–∏,text,,...,–°–∫–æ–ª—å–∫–æ –≤—Å—è–∫–æ–π —Ö–µ—Ä–Ω–∏ –ø–æ–Ω–∞–ø–∏—Å–∞–ª–∏,Counter(),–°–∫–æ–ª—å–∫–æ –≤—Å—è–∫–æ–π —Ö–µ—Ä–Ω–∏ –ø–æ–Ω–∞–ø–∏—Å–∞–ª–∏,[],–°–∫–æ–ª—å–∫–æ –≤—Å—è–∫–æ–π —Ö–µ—Ä–Ω–∏ –ø–æ–Ω–∞–ø–∏—Å–∞–ª–∏,[],–°–∫–æ–ª—å–∫–æ –≤—Å—è–∫–æ–π —Ö–µ—Ä–Ω–∏ –ø–æ–Ω–∞–ø–∏—Å–∞–ª–∏,[],set(),–°–∫–æ–ª—å–∫–æ –≤—Å—è–∫–æ–π —Ö–µ—Ä–Ω–∏ –ø–æ–Ω–∞–ø–∏—Å–∞–ª–∏
640616,745859,560.0,2019-12-11 16:04:06+00:00,14966.0,,PeerChannel(channel_id=1079904535),,–í—ã –Ω–∞–≤–µ—Ä–Ω—è–∫–∞ —Å–ª—ã—à–∞–ª–∏ –ø—Ä–æ –ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä–∫—É –í–µ—Ä—É –ê—Ñ–∞–Ω...,text,,...,–í—ã –Ω–∞–≤–µ—Ä–Ω—è–∫–∞ —Å–ª—ã—à–∞–ª–∏ –ø—Ä–æ –ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä–∫—É –í–µ—Ä—É –ê—Ñ–∞–Ω...,Counter(),–í—ã –Ω–∞–≤–µ—Ä–Ω—è–∫–∞ —Å–ª—ã—à–∞–ª–∏ –ø—Ä–æ –ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä–∫—É –í–µ—Ä—É –ê—Ñ–∞–Ω...,[],–í—ã –Ω–∞–≤–µ—Ä–Ω—è–∫–∞ —Å–ª—ã—à–∞–ª–∏ –ø—Ä–æ –ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä–∫—É –í–µ—Ä—É –ê—Ñ–∞–Ω...,"['veraafanasyeva', 'veraafanasyeva']",–í—ã –Ω–∞–≤–µ—Ä–Ω—è–∫–∞ —Å–ª—ã—à–∞–ª–∏ –ø—Ä–æ –ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä–∫—É –í–µ—Ä—É –ê—Ñ–∞–Ω...,[],set(),–í—ã –Ω–∞–≤–µ—Ä–Ω—è–∫–∞ —Å–ª—ã—à–∞–ª–∏ –ø—Ä–æ –ø—Ä–æ—Ñ–µ—Å—Å–æ—Ä–∫—É –í–µ—Ä—É –ê—Ñ–∞–Ω...
2067175,2538776,492.0,2017-08-15 15:04:38+00:00,4005.0,,PeerChannel(channel_id=1100158992),,üìπ –î–µ—Ä–µ–≤–æ —É–±–∏–ª–æ 11 —á–µ–ª–æ–≤–µ–∫ –Ω–∞ –ø—Ä–∞–∑–¥–Ω–∏–∫–µ –ë–æ–≥–æ–º–∞—Ç...,video,37.0,...,üìπ –î–µ—Ä–µ–≤–æ —É–±–∏–ª–æ 11 —á–µ–ª–æ–≤–µ–∫ –Ω–∞ –ø—Ä–∞–∑–¥–Ω–∏–∫–µ –ë–æ–≥–æ–º–∞—Ç...,Counter({'üìπ': 1}),–î–µ—Ä–µ–≤–æ —É–±–∏–ª–æ 11 —á–µ–ª–æ–≤–µ–∫ –Ω–∞ –ø—Ä–∞–∑–¥–Ω–∏–∫–µ –ë–æ–≥–æ–º–∞—Ç...,[],–î–µ—Ä–µ–≤–æ —É–±–∏–ª–æ 11 —á–µ–ª–æ–≤–µ–∫ –Ω–∞ –ø—Ä–∞–∑–¥–Ω–∏–∫–µ –ë–æ–≥–æ–º–∞—Ç...,[],–î–µ—Ä–µ–≤–æ —É–±–∏–ª–æ 11 —á–µ–ª–æ–≤–µ–∫ –Ω–∞ –ø—Ä–∞–∑–¥–Ω–∏–∫–µ –ë–æ–≥–æ–º–∞—Ç–µ—Ä...,[],set(),–î–µ—Ä–µ–≤–æ —É–±–∏–ª–æ 11 —á–µ–ª–æ–≤–µ–∫ –Ω–∞ –ø—Ä–∞–∑–¥–Ω–∏–∫–µ –ë–æ–≥–æ–º–∞—Ç–µ—Ä...


In [6]:
df_channels = df_channels.drop(columns=['views',
                                        'reactions',
                                        'to_id',
                                        'fwd_from',
                                        'type',
                                        'duration',
                                        'frw_from_title',
                                        'frw_from_name',
                                        'msg_entity',
                                        'is_post',
                                        'message_no_breaks',
                                        'emojis',
                                        'message_no_emoji',
                                        'url_list',
                                        'message_no_urls',
                                        'mentions',
                                        'message_no_mentions',
                                        'quotes',
                                        'hashtags',
                                        'Unnamed: 0'])

In [9]:
%%time
df_channels.columns

CPU times: user 8 ¬µs, sys: 51 ¬µs, total: 59 ¬µs
Wall time: 168 ¬µs


Index(['id', 'date', 'message', 'channel_name', 'lang', 'message_no_hashtags'], dtype='object')

In [8]:
# df_channels.to_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/datasets/df_channels_only_messages.csv')

## Remove everything except letters and punctuation

In [10]:
from string import punctuation

punctuation_minimal = "!(),-.:;?%"

cyrillic_letters = u"–∞–±–≤–≥–¥–µ—ë–∂–∑–∏–π–∫–ª–º–Ω–æ–ø—Ä—Å—Ç—É—Ñ—Ö—Ü—á—à—â—ä—ã—å—ç—é—è–ê–ë–í–ì–î–ï–Å–ñ–ó–ò–ô–ö–õ–ú–ù–û–ü–†–°–¢–£–§–•–¶–ß–®–©–™–´–¨–≠–Æ–Ø"
latin_letters_numbers = "1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ "
allowed_symbols = cyrillic_letters+latin_letters_numbers+punctuation_minimal


In [11]:
allowed_symbols

'–∞–±–≤–≥–¥–µ—ë–∂–∑–∏–π–∫–ª–º–Ω–æ–ø—Ä—Å—Ç—É—Ñ—Ö—Ü—á—à—â—ä—ã—å—ç—é—è–ê–ë–í–ì–î–ï–Å–ñ–ó–ò–ô–ö–õ–ú–ù–û–ü–†–°–¢–£–§–•–¶–ß–®–©–™–´–¨–≠–Æ–Ø1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ !(),-.:;?%'

In [12]:
def clean_text(string, allowed_symbols=allowed_symbols):
    getVals = list(filter(lambda x: x in allowed_symbols, string))
    result = "".join(getVals)
    result = re.sub(' +', ' ', result)

    return result

In [13]:
df_channels['message_no_hashtags'] = df_channels['message_no_hashtags'].astype(str)

In [None]:
%%time
df_channels['message_clean'] = df_channels['message_no_hashtags'].apply(lambda x: clean_text(x))

In [None]:
print(f' We have {len(df_channels[df_channels["message_clean"]==""])} ({100*len(df_channels[df_channels["message_clean"]==""])/len(df_channels):2f}%) empty clean messages')

In [None]:
print(f' We have {len(df_channels[df_channels["message_clean"]!=""])} ({100*len(df_channels[df_channels["message_clean"]!=""])/len(df_channels):2f}%) meaningful clean messages')

In [None]:
only_letters = cyrillic_letters+latin_letters_numbers

In [None]:
%%time
df_channels['message_words_only_lower'] = df_channels['message_clean'].apply(lambda x: clean_text(x, allowed_symbols=only_letters).lower())

In [None]:
df_channels['message_words_only_lower'].sample(10).iloc[0]

### The latest common date

In [None]:
# Load the data into a pandas dataframe
df = pd.read_csv('telegram_posts.csv')

# Convert the date column to a datetime format
df['date'] = pd.to_datetime(df['date'])

# Group the dataframe by the channel column and find the minimum and maximum dates
grouped = df.groupby('channel')['date'].agg(['min', 'max'])

# Find the latest common post date
latest_common_post_date = grouped['max'].min()

print('The latest common post date is:', latest_common_post_date)


# Approaches

## Spacy RU model

In [None]:
import spacy
from spacy.lang.ru.examples import sentences

In [None]:
#python3 -m spacy download ru_core_news_md

In [None]:
# spacy.util.set_data_path("/Users/katerynaburovova/PycharmProjects/dehumanization/lib/python3.10/site-packages/ru_core_news_md")

In [None]:
nlp = spacy.load('ru_core_news_md')

In [81]:
def tokenize_spacy(text):
    doc = nlp(text)
    result = []
    for token in doc:
        result.append([token.text, token.pos_, token.dep_, token.lemma_, token.morph])
    return result

In [82]:
tokenize_spacy(sentences[0])

[['Apple',
  'PROPN',
  'nsubj',
  'apple',
  Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing],
 ['—Ä–∞—Å—Å–º–∞—Ç—Ä–∏–≤–∞–µ—Ç',
  'VERB',
  'ROOT',
  '—Ä–∞—Å—Å–º–∞—Ç—Ä–∏–≤–∞—Ç—å',
  Aspect=Imp|Mood=Ind|Number=Sing|Person=Third|Tense=Pres|VerbForm=Fin|Voice=Act],
 ['–≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å',
  'NOUN',
  'obj',
  '–≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å',
  Animacy=Inan|Case=Acc|Gender=Fem|Number=Sing],
 ['–ø–æ–∫—É–ø–∫–∏',
  'NOUN',
  'nmod',
  '–ø–æ–∫—É–ø–∫–∞',
  Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing],
 ['—Å—Ç–∞—Ä—Ç–∞–ø–∞',
  'NOUN',
  'nmod',
  '—Å—Ç–∞—Ä—Ç–∞–ø',
  Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing],
 ['–∏–∑', 'ADP', 'case', '–∏–∑', ],
 ['–°–æ–µ–¥–∏–Ω—ë–Ω–Ω–æ–≥–æ',
  'ADJ',
  'amod',
  '—Å–æ–µ–¥–∏–Ω—ë–Ω–Ω–æ–≥–æ',
  Case=Gen|Degree=Pos|Gender=Neut|Number=Sing],
 ['–ö–æ—Ä–æ–ª–µ–≤—Å—Ç–≤–∞',
  'PROPN',
  'nmod',
  '–∫–æ—Ä–æ–ª–µ–≤—Å—Ç–≤–æ',
  Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing],
 ['–∑–∞', 'ADP', 'case', '–∑–∞', ],
 ['$', 'SYM', 'nmod', '$', ],
 ['1', 'NUM', 'appos', '1', ],
 ['–º–ª

## RAZDEL tokenization

Handles "... - ... " as one word (even for composite ones)

In [226]:
# from razdel import tokenize
# def tokenize_razdel(text):
#     return(list(tokenize(text)))

## spacy_russian_tokenizer

https://github.com/aatimofeev/spacy_russian_tokenizer

In [342]:
# from spacy.lang.ru import Russian
# from spacy_russian_tokenizer import RussianTokenizer, MERGE_PATTERNS
# text = "–ù–µ –≤–µ—Ç–µ—Ä, –∞ –∫–∞–∫–æ–π-—Ç–æ —É—Ä–∞–≥–∞–Ω!"
# nlp = Russian()
# doc = nlp(text)
# russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
# nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
# doc = nlp(text)
# print([token.text for token in doc])

TypeError: add() takes exactly 2 positional arguments (339 given)

## lang-uk

[–Ω–µ —Å–∞–±–≤–æ—Ä–¥](https://github.com/lang-uk/ner-uk/blob/master/doc/tokenization.md)

In [83]:
# from tokenize_uk import tokenize_words
# tokenize_words(text)


## stanza


In [84]:
# import stanza
# stanza.download('ru')


In [85]:
# nlp_ru = stanza.Pipeline('ru', processors='tokenize')

In [86]:
# def get_stanza_tokens(text):
#     doc = nlp_ru(text)
#     word_tokens = [token.text for sent in doc.sentences for token in sent.tokens]
#     # result = [word for word in word_tokens if word not in stop_words_ru]
#     return word_tokens

## Subword (BPE?) BERT

Subword, but weird NER

In [87]:
# from transformers import AutoTokenizer, AutoModel

In [88]:
# tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
# model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")

In [308]:
# # pip install transformers sentencepiece
# import torch
# from transformers import AutoTokenizer, AutoModel
# tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
# model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
# # model.cuda()  # uncomment it if you have a GPU
#
# def embed_bert_cls(text, model, tokenizer):
#     t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
#     with torch.no_grad():
#         model_output = model(**{k: v.to(model.device) for k, v in t.items()})
#     embeddings = model_output.last_hidden_state[:, 0, :]
#     embeddings = torch.nn.functional.normalize(embeddings)
#     return embeddings[0].cpu().numpy()
#
# print(embed_bert_cls('–ø—Ä–∏–≤–µ—Ç –º–∏—Ä', model, tokenizer).shape)
# # (312,)

In [348]:
# def tokens_bert_cls(text, model, tokenizer):
#     encoding = tokenizer.encode(text)
#     return tokenizer.convert_ids_to_tokens(encoding)
#
# tokens_bert_cls(test_string, model, tokenizer)
# # (312,)

['[CLS]',
 '[UNK]',
 '11',
 '-',
 '–≥–æ',
 '–ø–æ–ª–∫–∞',
 '–≥–æ–Ω',
 '##—è—é—Ç',
 '—Ç–∞–Ω–∫',
 '[UNK]',
 '11',
 '–ø–æ–ª–∫',
 '–ù',
 '##–ú',
 '–î–ù–†',
 '–ø—Ä–æ–¥–æ–ª–∂–∞–µ—Ç',
 '–Ω–∞–Ω–æ—Å–∏—Ç—å',
 '—É–¥–∞—Ä—ã',
 '–ø–æ',
 '–ø—Ä–æ—Ç–∏–≤–Ω–∏–∫—É',
 '–≤',
 '–ü–µ—Ä–≤–æ–º–∞–π',
 '##—Å–∫–æ–º',
 '.',
 '–ù–∞',
 '–≤–∏–¥–µ–æ',
 '—Ç–∞–Ω–∫–æ–≤—ã–π',
 '–±–∞—Ç–∞–ª—å–æ–Ω',
 '–ø–æ–¥',
 '–∫–æ–º–∞–Ω–¥–æ–≤–∞–Ω–∏–µ–º',
 '–°–µ–≤–µ—Ä–∞',
 '–±—å–µ—Ç',
 '–ø–æ',
 '–ø–æ–∑–∏—Ü–∏—è–º',
 '–í–°–£',
 ',',
 '–∑–∞–æ–¥–Ω–æ',
 '–∑–∞—Å—Ç–∞–≤–ª—è—è',
 '–º–µ—Ç–∞',
 '##—Ç—å—Å—è',
 '–∏',
 '—Å–ø–µ—à–Ω–æ',
 '—É–¥–∞–ª—è—Ç—å',
 '##—Å—è',
 '–ø–æ—è–≤–∏',
 '##–≤—à–∏–π—Å—è',
 '–Ω–∞',
 '—Å–≤–æ—é',
 '–±–µ–¥—É',
 '—É–∫—Ä–∞–∏–Ω—Å–∫–∏–π',
 '—Ç–∞–Ω–∫',
 '.',
 '@',
 'war',
 '##gon',
 '##zo',
 '*',
 '–Ω–∞—à',
 '–ø—Ä–æ–µ–∫—Ç',
 '—Å—É—â–µ—Å—Ç–≤—É–µ—Ç',
 '–Ω–∞',
 '—Å—Ä–µ–¥—Å—Ç–≤–∞',
 '–ø–æ–¥–ø–∏—Å—á–∏–∫–æ–≤',
 ',',
 '–∫–∞—Ä—Ç–∞',
 '–¥–ª—è',
 '–ø–æ–º–æ—â–∏',
 '427',
 '##9',
 '380',
 '##6',
 '984',
 '##2',
 '952',
 '##1',
 '[SEP]']

In [349]:
# tokens_bert_cls(text, model, tokenizer)

['[CLS]', '–ù–µ', '–≤–µ—Ç–µ—Ä', ',', '–∞', '–∫–∞–∫–æ–π', '-', '—Ç–æ', '—É—Ä–∞–≥–∞–Ω', '!', '[SEP]']

## Execution time

In [None]:
# df_test_sample = df_channels.sample(100000)

In [353]:
# %%time
# df_test_sample['spacy_tokens'] = df_test_sample['message'].apply(lambda x: tokenize_spacy(x))

CPU times: user 19.5 s, sys: 1.02 s, total: 20.5 s
Wall time: 21.1 s


In [354]:
# %%time
# df_test_sample['bert_tokens'] = df_test_sample['message'].apply(lambda x: tokens_bert_cls(x, model, tokenizer))

CPU times: user 13 s, sys: 331 ms, total: 13.3 s
Wall time: 13.5 s


In [355]:
# %%time
# df_test_sample['lang_uk_tokens'] = df_test_sample['message'].apply(lambda x: tokenize_words(x))

CPU times: user 1.91 s, sys: 75.3 ms, total: 1.99 s
Wall time: 2.02 s


In [356]:
# %%time
# df_test_sample['razdel_tokens'] = df_test_sample['message'].apply(lambda x: tokenize_razdel(x))

CPU times: user 13.8 s, sys: 2.08 s, total: 15.9 s
Wall time: 16.9 s


In [374]:
# %%time
# df_test_sample['stanza_tokens'] = df_test_sample['message'].sample(100).apply(lambda x: get_stanza_tokens(x))

CPU times: user 2.81 s, sys: 234 ms, total: 3.05 s
Wall time: 3.11 s


In [375]:
# %%time
# df_test_sample['stanza_tokens'] = df_test_sample['message'].apply(lambda x: get_stanza_tokens(x))

CPU times: user 40min 18s, sys: 2min 44s, total: 43min 2s
Wall time: 45min


In [384]:
# from random import randrange
#
# df_channels.message.iloc[randrange(len(df_channels))]

'–û—Ç–ø—Ä–∞–≤–∫–∞ –ø–æ —ç–ª–µ–∫—Ç—Ä–æ–Ω–Ω–æ–π –ø–æ—á—Ç–µ 2_5325905093675979275.pdf'

In [401]:
# rnd_text = df_channels.message.iloc[randrange(len(df_channels))]
# print(rnd_text)
#
# tokenize_spacy(rnd_text)

–°–æ–æ–±—â–∞–µ—Ç—Å—è –æ —Ä–∞–Ω–µ–Ω—ã—Ö –≤ —Ä—è–¥–∞—Ö —Ç—É—Ä–µ—Ü–∫–∏—Ö —Å–∏–ª –≤ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–µ –ø–æ–¥—Ä—ã–≤–∞ –º–∏–Ω—ã –Ω–∞ —Ç—Ä–∞—Å—Å–µ –ú4 –≤ –ø—Ä–æ–≤–∏–Ω—Ü–∏–∏ –ò–¥–ª–∏–±. –¢—É—Ä–µ—Ü–∫–∏–µ –≤–µ—Ä—Ç–æ–ª–µ—Ç—ã –Ω–∞–ø—Ä–∞–≤–ª—è—é—Ç—Å—è –∫ –º–µ—Å—Ç—É –≤–∑—Ä—ã–≤–∞ –¥–ª—è —ç–≤–∞–∫—É–∞—Ü–∏–∏ —Ä–∞–Ω–µ–Ω—ã—Ö.


[['–°–æ–æ–±—â–∞–µ—Ç—Å—è', 'VERB', 'ROOT', '—Å–æ–æ–±—â–∞—Ç—å—Å—è'],
 ['–æ', 'ADP', 'case', '–æ'],
 ['—Ä–∞–Ω–µ–Ω—ã—Ö', 'NOUN', 'obl', '—Ä–∞–Ω–µ–Ω—ã–π'],
 ['–≤', 'ADP', 'case', '–≤'],
 ['—Ä—è–¥–∞—Ö', 'NOUN', 'nmod', '—Ä—è–¥'],
 ['—Ç—É—Ä–µ—Ü–∫–∏—Ö', 'ADJ', 'amod', '—Ç—É—Ä–µ—Ü–∫–∏–π'],
 ['—Å–∏–ª', 'NOUN', 'nmod', '—Å–∏–ª–∞'],
 ['–≤', 'ADP', 'case', '–≤'],
 ['—Ä–µ–∑—É–ª—å—Ç–∞—Ç–µ', 'NOUN', 'obl', '—Ä–µ–∑—É–ª—å—Ç–∞—Ç'],
 ['–ø–æ–¥—Ä—ã–≤–∞', 'NOUN', 'nmod', '–ø–æ–¥—Ä—ã–≤'],
 ['–º–∏–Ω—ã', 'NOUN', 'nmod', '–º–∏–Ω–∞'],
 ['–Ω–∞', 'ADP', 'case', '–Ω–∞'],
 ['—Ç—Ä–∞—Å—Å–µ', 'NOUN', 'nmod', '—Ç—Ä–∞—Å—Å–∞'],
 ['–ú4', 'PROPN', 'appos', '–º4'],
 ['–≤', 'ADP', 'case', '–≤'],
 ['–ø—Ä–æ–≤–∏–Ω—Ü–∏–∏', 'NOUN', 'nmod', '–ø—Ä–æ–≤–∏–Ω—Ü–∏—è'],
 ['–ò–¥–ª–∏–±', 'PROPN', 'appos', '–∏–¥–ª–∏–±'],
 ['.', 'PUNCT', 'punct', '.'],
 ['–¢—É—Ä–µ—Ü–∫–∏–µ', 'ADJ', 'amod', '—Ç—É—Ä–µ—Ü–∫–∏–π'],
 ['–≤–µ—Ä—Ç–æ–ª–µ—Ç—ã', 'NOUN', 'nsubj', '–≤–µ—Ä—Ç–æ–ª—ë—Ç'],
 ['–Ω–∞–ø—Ä–∞–≤–ª—è—é—Ç—Å—è', 'VERB', 'ROOT', '–Ω–∞–ø—Ä–∞–≤–ª—è—Ç—å—Å—è'],

# Tokenization and lemmatization

In [89]:
def tokenize_spacy(text):
    doc = nlp(text)
    result = []
    for token in doc:
        result.append([token.text, token.pos_, token.dep_, token.lemma_, token.morph])
    return result

In [114]:
def get_lemmas(text):
    doc = nlp(text)
    result = []
    for token in doc:
        result.append([token.lemma_])
    return result

In [104]:
df_sample = df_channels.sample(1000)

In [1]:
%%time
df_sample['tokens'] = df_sample['message_words_only_lower'].apply(lambda x: tokenize_spacy(x))

NameError: name 'df_sample' is not defined

In [108]:
df_sample['message_words_only_lower'].iloc[536]

'–ø—Ä–µ–∑–∏–¥–µ–Ω—Ç —Ä–æ—Å—Å–∏–∏ –≤–ª–∞–¥–∏–º–∏—Ä –ø—É—Ç–∏–Ω –Ω–∞–≥—Ä–∞–¥–∏–ª –±–æ—Ä–∏—Å–∞ –≥—Ä—ã–∑–ª–æ–≤–∞ –æ—Ä–¥–µ–Ω–æ–º –∑–∞ –∑–∞—Å–ª—É–≥–∏ –ø–µ—Ä–µ–¥ –æ—Ç–µ—á–µ—Å—Ç–≤–æ–º i —Å—Ç–µ–ø–µ–Ω–∏ –æ—Ä–¥–µ–Ω –≤—Ä—É—á–µ–Ω –∑–∞ –≤—ã–¥–∞—é—â–∏–µ—Å—è –∑–∞—Å–ª—É–≥–∏ –≤ —É–∫—Ä–µ–ø–ª–µ–Ω–∏–∏ —Ä–æ—Å—Å–∏–π—Å–∫–æ–π –≥–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ–Ω–Ω–æ—Å—Ç–∏ –∏ —Ä–µ–∞–ª–∏–∑–∞—Ü–∏–∏ –≤–Ω–µ—à–Ω–µ–ø–æ–ª–∏—Ç–∏—á–µ—Å–∫–æ–≥–æ –∫—É—Ä—Å–∞ —Å—Ç—Ä–∞–Ω—ã —Ç–∞–∫–∂–µ –æ—Å–Ω–æ–≤–∞–Ω–∏–µ–º –¥–ª—è –Ω–∞–≥—Ä–∞–¥—ã –ø–æ—Å–ª—É–∂–∏–ª–∏ –µ–≥–æ –∑–Ω–∞—á–∏—Ç–µ–ª—å–Ω—ã–π –≤–∫–ª–∞–¥ –≤ —Ä–∞–∑–≤–∏—Ç–∏–µ –æ–±–æ—Ä–æ–Ω–Ω–æ–ø—Ä–æ–º—ã—à–ª–µ–Ω–Ω–æ–≥–æ –∫–æ–º–ø–ª–µ–∫—Å–∞ —Ä–æ—Å—Å–∏–∏ –∏ –º–Ω–æ–≥–æ–ª–µ—Ç–Ω—è—è –¥–æ–±—Ä–æ—Å–æ–≤–µ—Å—Ç–Ω–∞—è —Ä–∞–±–æ—Ç–∞'

In [109]:
df_sample['tokens'].iloc[536]

[['–ø—Ä–µ–∑–∏–¥–µ–Ω—Ç',
  'NOUN',
  'nsubj',
  '–ø—Ä–µ–∑–∏–¥–µ–Ω—Ç',
  Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing],
 ['—Ä–æ—Å—Å–∏–∏',
  'PROPN',
  'nmod',
  '—Ä–æ—Å—Å–∏—è',
  Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing],
 ['–≤–ª–∞–¥–∏–º–∏—Ä',
  'PROPN',
  'appos',
  '–≤–ª–∞–¥–∏–º–∏—Ä',
  Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing],
 ['–ø—É—Ç–∏–Ω',
  'PROPN',
  'flat:name',
  '–ø—É—Ç–∏–Ω',
  Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing],
 ['–Ω–∞–≥—Ä–∞–¥–∏–ª',
  'VERB',
  'ROOT',
  '–Ω–∞–≥—Ä–∞–¥–∏—Ç—å',
  Aspect=Perf|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act],
 ['–±–æ—Ä–∏—Å–∞',
  'PROPN',
  'obj',
  '–±–æ—Ä–∏—Å',
  Animacy=Anim|Case=Acc|Gender=Masc|Number=Sing],
 ['–≥—Ä—ã–∑–ª–æ–≤–∞',
  'PROPN',
  'flat:name',
  '–≥—Ä—ã–∑–ª–æ–≤',
  Animacy=Anim|Case=Acc|Gender=Masc|Number=Sing],
 ['–æ—Ä–¥–µ–Ω–æ–º',
  'NOUN',
  'xcomp',
  '–æ—Ä–¥–µ–Ω',
  Animacy=Inan|Case=Ins|Gender=Masc|Number=Sing],
 ['–∑–∞', 'ADP', 'case', '–∑–∞', ],
 ['–∑–∞—Å–ª—É–≥–∏',
  'NOUN',
  'obl'

In [115]:
%%time
df_sample['lemmas'] = df_sample['message_words_only_lower'].apply(lambda x: get_lemmas(x))

CPU times: user 16.8 s, sys: 433 ms, total: 17.2 s
Wall time: 19.1 s


In [126]:
df_sample['tokens'].iloc[536]


[['–∫–∞–∫', 'SCONJ', 'mark', '–∫–∞–∫', ],
 ['–ø–æ–∫–∞–∑–∞–ª–æ',
  'VERB',
  'parataxis',
  '–ø–æ–∫–∞–∑–∞—Ç—å',
  Aspect=Perf|Gender=Neut|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act],
 ['–∏—Å—Å–ª–µ–¥–æ–≤–∞–Ω–∏–µ',
  'NOUN',
  'nsubj',
  '–∏—Å—Å–ª–µ–¥–æ–≤–∞–Ω–∏–µ',
  Animacy=Inan|Case=Nom|Gender=Neut|Number=Sing],
 ['statista', 'X', 'appos', 'statista', Foreign=Yes],
 ['global', 'X', 'flat:foreign', 'global', Foreign=Yes],
 ['consumer', 'X', 'flat:foreign', 'consumer', Foreign=Yes],
 ['survey', 'X', 'flat:foreign', 'survey', Foreign=Yes],
 ['–ø–æ—á—Ç–∏', 'ADV', 'advmod', '–ø–æ—á—Ç–∏', Degree=Pos],
 ['–∫–∞–∂–¥—ã–π', 'DET', 'det', '–∫–∞–∂–¥—ã–π', Case=Nom|Gender=Masc|Number=Sing],
 ['–ø—è—Ç—ã–π',
  'ADJ',
  'amod',
  '–ø—è—Ç—ã–π',
  Case=Nom|Degree=Pos|Gender=Masc|Number=Sing],
 ['—Ä–µ—Å–ø–æ–Ω–¥–µ–Ω—Ç',
  'NOUN',
  'nsubj',
  '—Ä–µ—Å–ø–æ–Ω–¥–µ–Ω—Ç',
  Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing],
 ['–∏–∑', 'ADP', 'case', '–∏–∑', ],
 ['–≤–µ–ª–∏–∫–æ–±—Ä–∏—Ç–∞–Ω–∏–∏',
  'PROPN',


In [124]:
df_sample['lemmas'].iloc[45]

[['–∫–∞–∫'],
 ['–ø–æ–∫–∞–∑–∞—Ç—å'],
 ['–∏—Å—Å–ª–µ–¥–æ–≤–∞–Ω–∏–µ'],
 ['statista'],
 ['global'],
 ['consumer'],
 ['survey'],
 ['–ø–æ—á—Ç–∏'],
 ['–∫–∞–∂–¥—ã–π'],
 ['–ø—è—Ç—ã–π'],
 ['—Ä–µ—Å–ø–æ–Ω–¥–µ–Ω—Ç'],
 ['–∏–∑'],
 ['–≤–µ–ª–∏–∫–æ–±—Ä–∏—Ç–∞–Ω–∏—è'],
 ['18'],
 ['–æ–±—ã—á–Ω–æ'],
 ['—Å—Ç—Ä–∞–¥–∞—Ç—å'],
 ['–æ—Ç'],
 ['–ø–æ—Ö–º–µ–ª–∏–µ'],
 ['–Ω–∞'],
 ['—Å–ª–µ–¥—É—é—â–∏–π'],
 ['—É—Ç—Ä–æ'],
 ['–ø–æ—Å–ª–µ'],
 ['–ø—Ä–∏—ë–º'],
 ['–∞–ª–∫–æ–≥–æ–ª—å'],
 ['—Ç–∞–∫–æ–π'],
 ['–æ–±—Ä–∞–∑'],
 ['–±—Ä–∏—Ç–∞–Ω–µ—Ü'],
 ['–æ–∫–∞–∑–∞—Ç—å—Å—è'],
 ['–Ω–∞–∏–±–æ–ª–µ–µ'],
 ['–ø–æ–¥–≤–µ—Ä–∂–µ–Ω–Ω—ã–π'],
 ['—ç—Ç–æ—Ç'],
 ['–Ω–µ–¥—É–≥'],
 ['—Å—Ä–µ–¥–∏'],
 ['–∂–∏—Ç–µ–ª—å'],
 ['–≤–æ—Å–µ–º—å'],
 ['—Å—Ç—Ä–∞–Ω–∞'],
 ['—É—á–∞—Å—Ç–≤–æ–≤–∞—Ç—å'],
 ['–≤'],
 ['–∏—Å—Å–ª–µ–¥–æ–≤–∞–Ω–∏–µ']]

In [125]:
df_sample['message_words_only_lower'].iloc[56]

'–±–ª—É–º–±–µ—Ä–≥ –∞–¥–º–∏–Ω–∏—Å—Ç—Ä–∞—Ü–∏—è –±–∞–π–¥–µ–Ω–∞ –ø–æ–¥—Ç–∞–ª–∫–∏–≤–∞–µ—Ç —Å–æ—é–∑–Ω–∏–∫–æ–≤ –µ—Å –∫ –∑–∞–≤–µ—Ä—à–µ–Ω–∏—é —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∏ —à–∏—Ä–æ–∫–æ–≥–æ –ø–∞–∫–µ—Ç–∞ —Å–∞–Ω–∫—Ü–∏–π –ø—Ä–æ—Ç–∏–≤ —Ä–æ—Å—Å–∏–π—Å–∫–∏—Ö –±–∞–Ω–∫–æ–≤ –∏ —ç–Ω–µ—Ä–≥–µ—Ç–∏—á–µ—Å–∫–∏—Ö –∫–æ–º–ø–∞–Ω–∏–π –∫–æ—Ç–æ—Ä—ã–µ –º–æ–≥—É—Ç –±—ã—Ç—å –≤–≤–µ–¥–µ–Ω—ã —Å–æ–≤–º–µ—Å—Ç–Ω–æ —Å —Å—à–∞ –µ—Å–ª–∏ –∫—Ä–µ–º–ª—å –Ω–∞–ø–∞–¥–µ—Ç –Ω–∞ —É–∫—Ä–∞–∏–Ω—É –Ω–µ–∫–æ—Ç–æ—Ä—ã–µ –ø–æ—Ç–µ–Ω—Ü–∏–∞–ª—å–Ω—ã–µ —à–∞–≥–∏ –∏–∑ —Å–ø–∏—Å–∫–∞ —Ç–∞–∫–∏–µ –∫–∞–∫ –æ—Ç–∫–ª—é—á–µ–Ω–∏–µ —Ä–æ—Å—Å–∏–∏ –æ—Ç –ø–ª–∞—Ç–µ–∂–Ω–æ–π —Å–∏—Å—Ç–µ–º—ã swift —Å—á–∏—Ç–∞—é—Ç—Å—è –∫—Ä–∞–π–Ω–µ –ø—Ä–æ–±–ª–µ–º–∞—Ç–∏—á–Ω—ã–º–∏ —Å–∫–∞–∑–∞–ª–∏ —Å–æ–±–µ—Å–µ–¥–Ω–∏–∫–∏ –∞–≥–µ–Ω—Ç—Å—Ç–≤–∞ —Å—Å—ã–ª–∞—è—Å—å –Ω–∞ –ø–æ—Ç–µ–Ω—Ü–∏–∞–ª—å–Ω–æ–µ –Ω–∞—Ä—É—à–µ–Ω–∏–µ —Ä–∞–±–æ—Ç—ã –º–∏—Ä–æ–≤—ã—Ö —Ä—ã–Ω–∫–æ–≤ —ç–Ω–µ—Ä–≥–æ–Ω–æ—Å–∏—Ç–µ–ª–µ–π –∏ –¥—Ä—É–≥–∏—Ö —Ç–æ–≤–∞—Ä–æ–≤ —Ä–æ—Å—Å–∏–π—Å–∫–æ–≥–æ —ç–∫—Å–ø–æ—Ä—Ç–∞ –µ—â–µ –æ–¥–Ω–∏–º –ø—Ä–µ–¥–º–µ—Ç–æ–º –±–µ—Å–ø–æ–∫–æ–π—Å—Ç–≤–∞ —è–≤–ª

In [127]:
def get_lemma_vec(text):
    doc = nlp(text)
    result = []
    for token in doc:
        result.append([token.lemma])
    return result

In [141]:
tokenize_spacy('—Ö–æ—Ö–ª—è—Ü–∫–æ–≥–æ')


[['—Ö–æ—Ö–ª—è—Ü–∫–æ–≥–æ',
  'ADJ',
  'ROOT',
  '—Ö–æ—Ö–ª—è—Ü–∫–æ–≥–æ',
  Animacy=Anim|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing]]

# OOV handling

https://spacy.io/usage/processing-pipelines#custom-components-user-hooks

should we add a hook that returns zero vectors for OOV terms?

In [135]:
%%time
df_channels['lemmas'] = df_channels['message_words_only_lower'].apply(lambda x: get_lemmas(x))


KeyboardInterrupt



In [136]:
%%time
df_channels['tokens'] = df_channels['message_words_only_lower'].apply(lambda x: tokenize_spacy(x))

KeyboardInterrupt: 

In [137]:
df_channels

Unnamed: 0,id,date,views,reactions,to_id,fwd_from,message,type,duration,channel_name,...,message_no_emoji,url_list,message_no_urls,mentions,message_no_mentions,quotes,hashtags,message_no_hashtags,message_clean,message_words_only_lower
0,12347,2022-12-15 16:32:15+00:00,9914.0,MessageReactions(results=[ReactionCount(reacti...,PeerChannel(channel_id=1183570279),,–ê —Ç—Ä–∞–Ω—Å–ª—è—Ü–∏—è —Ç–µ–º –≤—Ä–µ–º–µ–Ω–µ–º –∏–¥–µ—Ç. –ü–æ–¥–ø–∏—Å—ã–≤–∞–π—Ç–µ—Å—å...,photo,,mardanaka,...,–ê —Ç—Ä–∞–Ω—Å–ª—è—Ü–∏—è —Ç–µ–º –≤—Ä–µ–º–µ–Ω–µ–º –∏–¥–µ—Ç. –ü–æ–¥–ø–∏—Å—ã–≤–∞–π—Ç–µ—Å—å...,[www.youtube.com],–ê —Ç—Ä–∞–Ω—Å–ª—è—Ü–∏—è —Ç–µ–º –≤—Ä–µ–º–µ–Ω–µ–º –∏–¥–µ—Ç. –ü–æ–¥–ø–∏—Å—ã–≤–∞–π—Ç–µ—Å—å,[],–ê —Ç—Ä–∞–Ω—Å–ª—è—Ü–∏—è —Ç–µ–º –≤—Ä–µ–º–µ–Ω–µ–º –∏–¥–µ—Ç. –ü–æ–¥–ø–∏—Å—ã–≤–∞–π—Ç–µ—Å—å,[],{},–ê —Ç—Ä–∞–Ω—Å–ª—è—Ü–∏—è —Ç–µ–º –≤—Ä–µ–º–µ–Ω–µ–º –∏–¥–µ—Ç. –ü–æ–¥–ø–∏—Å—ã–≤–∞–π—Ç–µ—Å—å,–ê —Ç—Ä–∞–Ω—Å–ª—è—Ü–∏—è —Ç–µ–º –≤—Ä–µ–º–µ–Ω–µ–º –∏–¥–µ—Ç. –ü–æ–¥–ø–∏—Å—ã–≤–∞–π—Ç–µ—Å—å,–∞ —Ç—Ä–∞–Ω—Å–ª—è—Ü–∏—è —Ç–µ–º –≤—Ä–µ–º–µ–Ω–µ–º –∏–¥–µ—Ç –ø–æ–¥–ø–∏—Å—ã–≤–∞–π—Ç–µ—Å—å
1,12346,2022-12-15 15:00:03+00:00,29207.0,MessageReactions(results=[ReactionCount(reacti...,PeerChannel(channel_id=1183570279),,"–í –†–æ—Å—Å–∏–∏ –µ—â—ë –µ—Å—Ç—å —Ç–µ, –∫—Ç–æ —Å–ø–æ—Å–æ–±–µ–Ω –≤–æ–∑—Ä–æ–∂–¥–∞—Ç—å ...",photo,,mardanaka,...,"–í –†–æ—Å—Å–∏–∏ –µ—â—ë –µ—Å—Ç—å —Ç–µ, –∫—Ç–æ —Å–ø–æ—Å–æ–±–µ–Ω –≤–æ–∑—Ä–æ–∂–¥–∞—Ç—å ...",[],"–í –†–æ—Å—Å–∏–∏ –µ—â—ë –µ—Å—Ç—å —Ç–µ, –∫—Ç–æ —Å–ø–æ—Å–æ–±–µ–Ω –≤–æ–∑—Ä–æ–∂–¥–∞—Ç—å ...",[],"–í –†–æ—Å—Å–∏–∏ –µ—â—ë –µ—Å—Ç—å —Ç–µ, –∫—Ç–æ —Å–ø–æ—Å–æ–±–µ–Ω –≤–æ–∑—Ä–æ–∂–¥–∞—Ç—å ...",[–ú–≠–§. –≠–∫–æ–Ω–æ–º–∏–∫–∞ –¥–ª—è –ª—é–¥–µ–π],{},"–í –†–æ—Å—Å–∏–∏ –µ—â—ë –µ—Å—Ç—å —Ç–µ, –∫—Ç–æ —Å–ø–æ—Å–æ–±–µ–Ω –≤–æ–∑—Ä–æ–∂–¥–∞—Ç—å ...","–í –†–æ—Å—Å–∏–∏ –µ—â—ë –µ—Å—Ç—å —Ç–µ, –∫—Ç–æ —Å–ø–æ—Å–æ–±–µ–Ω –≤–æ–∑—Ä–æ–∂–¥–∞—Ç—å ...",–≤ —Ä–æ—Å—Å–∏–∏ –µ—â—ë –µ—Å—Ç—å —Ç–µ –∫—Ç–æ —Å–ø–æ—Å–æ–±–µ–Ω –≤–æ–∑—Ä–æ–∂–¥–∞—Ç—å –Ω...
2,12345,2022-12-15 14:21:22+00:00,41058.0,MessageReactions(results=[ReactionCount(reacti...,PeerChannel(channel_id=1183570279),,¬´–ö–∞—Ç–∞—Ä–≥–µ–π—Ç¬ª –Ω–∞–±–∏—Ä–∞–µ—Ç –æ–±–æ—Ä–æ—Ç—ã - –û–±–≤–∏–Ω—è–µ–º–∞—è –≤ –∫–æ...,text,,mardanaka,...,¬´–ö–∞—Ç–∞—Ä–≥–µ–π—Ç¬ª –Ω–∞–±–∏—Ä–∞–µ—Ç –æ–±–æ—Ä–æ—Ç—ã - –û–±–≤–∏–Ω—è–µ–º–∞—è –≤ –∫–æ...,[],¬´–ö–∞—Ç–∞—Ä–≥–µ–π—Ç¬ª –Ω–∞–±–∏—Ä–∞–µ—Ç –æ–±–æ—Ä–æ—Ç—ã - –û–±–≤–∏–Ω—è–µ–º–∞—è –≤ –∫–æ...,[],"""–ö–∞—Ç–∞—Ä–≥–µ–π—Ç"" –Ω–∞–±–∏—Ä–∞–µ—Ç –æ–±–æ—Ä–æ—Ç—ã - –û–±–≤–∏–Ω—è–µ–º–∞—è –≤ –∫–æ...",[–ö–∞—Ç–∞—Ä–≥–µ–π—Ç],{},"""–ö–∞—Ç–∞—Ä–≥–µ–π—Ç"" –Ω–∞–±–∏—Ä–∞–µ—Ç –æ–±–æ—Ä–æ—Ç—ã - –û–±–≤–∏–Ω—è–µ–º–∞—è –≤ –∫–æ...",–ö–∞—Ç–∞—Ä–≥–µ–π—Ç –Ω–∞–±–∏—Ä–∞–µ—Ç –æ–±–æ—Ä–æ—Ç—ã - –û–±–≤–∏–Ω—è–µ–º–∞—è –≤ –∫–æ—Ä—Ä...,–∫–∞—Ç–∞—Ä–≥–µ–π—Ç –Ω–∞–±–∏—Ä–∞–µ—Ç –æ–±–æ—Ä–æ—Ç—ã –æ–±–≤–∏–Ω—è–µ–º–∞—è –≤ –∫–æ—Ä—Ä—É–ø...
3,12344,2022-12-15 13:08:35+00:00,40696.0,MessageReactions(results=[ReactionCount(reacti...,PeerChannel(channel_id=1183570279),"MessageFwdHeader(date=datetime.datetime(2022, ...",üî•–í –≥–æ—Å—Ç—è—Ö —É @Metametrica –ø–æ–±—ã–≤–∞–ª–∏ —Ç–æ–≤–∞—Ä–∏—â–∏ –∏–∑ ...,photo,,mardanaka,...,–í –≥–æ—Å—Ç—è—Ö —É @Metametrica –ø–æ–±—ã–≤–∞–ª–∏ —Ç–æ–≤–∞—Ä–∏—â–∏ –∏–∑ ...,[],–í –≥–æ—Å—Ç—è—Ö —É @Metametrica –ø–æ–±—ã–≤–∞–ª–∏ —Ç–æ–≤–∞—Ä–∏—â–∏ –∏–∑ ...,"[Metametrica, Metametrica]",–í –≥–æ—Å—Ç—è—Ö —É –ø–æ–±—ã–≤–∞–ª–∏ —Ç–æ–≤–∞—Ä–∏—â–∏ –∏–∑ —É–∫—Ä–∞–∏–Ω—Å–∫–æ–≥–æ –ø...,[–ë–æ–≥–¥–∞–Ω–∞ –•–º–µ–ª—å–Ω–∏—Ü–∫–æ–≥–æ],{},–í –≥–æ—Å—Ç—è—Ö —É –ø–æ–±—ã–≤–∞–ª–∏ —Ç–æ–≤–∞—Ä–∏—â–∏ –∏–∑ —É–∫—Ä–∞–∏–Ω—Å–∫–æ–≥–æ –ø...,–í –≥–æ—Å—Ç—è—Ö —É –ø–æ–±—ã–≤–∞–ª–∏ —Ç–æ–≤–∞—Ä–∏—â–∏ –∏–∑ —É–∫—Ä–∞–∏–Ω—Å–∫–æ–≥–æ –ø—Ä...,–≤ –≥–æ—Å—Ç—è—Ö —É –ø–æ–±—ã–≤–∞–ª–∏ —Ç–æ–≤–∞—Ä–∏—â–∏ –∏–∑ —É–∫—Ä–∞–∏–Ω—Å–∫–æ–≥–æ –ø—Ä...
4,12343,2022-12-15 12:31:23+00:00,51690.0,MessageReactions(results=[ReactionCount(reacti...,PeerChannel(channel_id=1183570279),,–ì–ª–∞–≤–∞ –ú–∏–Ω–æ–±–æ—Ä–æ–Ω—ã –ì—Ä—É–∑–∏–∏ –Ω–∞–∑–≤–∞–ª –≥—Ä—É–∑–∏–Ω—Å–∫–∏—Ö –Ω–∞–µ–º...,text,,mardanaka,...,–ì–ª–∞–≤–∞ –ú–∏–Ω–æ–±–æ—Ä–æ–Ω—ã –ì—Ä—É–∑–∏–∏ –Ω–∞–∑–≤–∞–ª –≥—Ä—É–∑–∏–Ω—Å–∫–∏—Ö –Ω–∞–µ–º...,[],–ì–ª–∞–≤–∞ –ú–∏–Ω–æ–±–æ—Ä–æ–Ω—ã –ì—Ä—É–∑–∏–∏ –Ω–∞–∑–≤–∞–ª –≥—Ä—É–∑–∏–Ω—Å–∫–∏—Ö –Ω–∞–µ–º...,[],–ì–ª–∞–≤–∞ –ú–∏–Ω–æ–±–æ—Ä–æ–Ω—ã –ì—Ä—É–∑–∏–∏ –Ω–∞–∑–≤–∞–ª –≥—Ä—É–∑–∏–Ω—Å–∫–∏—Ö –Ω–∞–µ–º...,"[–ï—Å—Ç—å —Å–ª—É—á–∞–∏, –∫–æ–≥–¥–∞ –µ–¥—É—Ç –≤–æ–µ–≤–∞—Ç—å –∑–∞ –≥–æ–Ω–æ—Ä–∞—Ä, –≤...",{},–ì–ª–∞–≤–∞ –ú–∏–Ω–æ–±–æ—Ä–æ–Ω—ã –ì—Ä—É–∑–∏–∏ –Ω–∞–∑–≤–∞–ª –≥—Ä—É–∑–∏–Ω—Å–∫–∏—Ö –Ω–∞–µ–º...,–ì–ª–∞–≤–∞ –ú–∏–Ω–æ–±–æ—Ä–æ–Ω—ã –ì—Ä—É–∑–∏–∏ –Ω–∞–∑–≤–∞–ª –≥—Ä—É–∑–∏–Ω—Å–∫–∏—Ö –Ω–∞–µ–º...,–≥–ª–∞–≤–∞ –º–∏–Ω–æ–±–æ—Ä–æ–Ω—ã –≥—Ä—É–∑–∏–∏ –Ω–∞–∑–≤–∞–ª –≥—Ä—É–∑–∏–Ω—Å–∫–∏—Ö –Ω–∞–µ–º...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1607921,7,2019-09-28 10:13:52+00:00,3190.0,,PeerChannel(channel_id=1253974160),,–ë–æ—Ç –¥–ª—è —Å–≤—è–∑–∏: @obrazbuduschego2_bot –î–æ–Ω–∞—Å—Ç—Ä–æ–µ...,text,,obrazbuduschego2,...,–ë–æ—Ç –¥–ª—è —Å–≤—è–∑–∏: @obrazbuduschego2_bot –î–æ–Ω–∞—Å—Ç—Ä–æ–µ...,[],–ë–æ—Ç –¥–ª—è —Å–≤—è–∑–∏: @obrazbuduschego2_bot –î–æ–Ω–∞—Å—Ç—Ä–æ–µ...,"[obrazbuduschego2_bot, protonmail]","–ë–æ—Ç –¥–ª—è —Å–≤—è–∑–∏: –î–æ–Ω–∞—Å—Ç—Ä–æ–µ–Ω, —Ä–∞–±–æ—Ç–∞–µ—Ç, –µ—Å–ª–∏ –∫—Ç–æ...",[],{},"–ë–æ—Ç –¥–ª—è —Å–≤—è–∑–∏: –î–æ–Ω–∞—Å—Ç—Ä–æ–µ–Ω, —Ä–∞–±–æ—Ç–∞–µ—Ç, –µ—Å–ª–∏ –∫—Ç–æ...","–ë–æ—Ç –¥–ª—è —Å–≤—è–∑–∏: –î–æ–Ω–∞—Å—Ç—Ä–æ–µ–Ω, —Ä–∞–±–æ—Ç–∞–µ—Ç, –µ—Å–ª–∏ –∫—Ç–æ-...",–±–æ—Ç –¥–ª—è —Å–≤—è–∑–∏ –¥–æ–Ω–∞—Å—Ç—Ä–æ–µ–Ω —Ä–∞–±–æ—Ç–∞–µ—Ç –µ—Å–ª–∏ –∫—Ç–æ—Ç–æ –Ω...
1607922,6,2019-09-28 09:35:07+00:00,3239.0,,PeerChannel(channel_id=1253974160),,"–ü–µ—á–∞–ª—å–Ω–∞—è –≤–µ—Å—Ç—å –æ –ú–∞—Ä–∫–µ –ó–∞—Ö–∞—Ä–æ–≤–µ, —á–µ–ª–æ–≤–µ–∫–µ, –∫–æ...",text,,obrazbuduschego2,...,"–ü–µ—á–∞–ª—å–Ω–∞—è –≤–µ—Å—Ç—å –æ –ú–∞—Ä–∫–µ –ó–∞—Ö–∞—Ä–æ–≤–µ, —á–µ–ª–æ–≤–µ–∫–µ, –∫–æ...",[],"–ü–µ—á–∞–ª—å–Ω–∞—è –≤–µ—Å—Ç—å –æ –ú–∞—Ä–∫–µ –ó–∞—Ö–∞—Ä–æ–≤–µ, —á–µ–ª–æ–≤–µ–∫–µ, –∫–æ...",[],"–ü–µ—á–∞–ª—å–Ω–∞—è –≤–µ—Å—Ç—å –æ –ú–∞—Ä–∫–µ –ó–∞—Ö–∞—Ä–æ–≤–µ, —á–µ–ª–æ–≤–µ–∫–µ, –∫–æ...","[–ß–∞–π, –∫–æ—Ñ–µ –∏ –¥—Ä—É–≥–∏–µ –∫–æ–ª–æ–Ω–∏–∞–ª—å–Ω—ã–µ —Ç–æ–≤–∞—Ä—ã]",{},"–ü–µ—á–∞–ª—å–Ω–∞—è –≤–µ—Å—Ç—å –æ –ú–∞—Ä–∫–µ –ó–∞—Ö–∞—Ä–æ–≤–µ, —á–µ–ª–æ–≤–µ–∫–µ, –∫–æ...","–ü–µ—á–∞–ª—å–Ω–∞—è –≤–µ—Å—Ç—å –æ –ú–∞—Ä–∫–µ –ó–∞—Ö–∞—Ä–æ–≤–µ, —á–µ–ª–æ–≤–µ–∫–µ, –∫–æ...",–ø–µ—á–∞–ª—å–Ω–∞—è –≤–µ—Å—Ç—å –æ –º–∞—Ä–∫–µ –∑–∞—Ö–∞—Ä–æ–≤–µ —á–µ–ª–æ–≤–µ–∫–µ –∫–æ—Ç–æ...
1607923,5,2019-09-28 07:34:47+00:00,314.0,,PeerChannel(channel_id=1253974160),"MessageFwdHeader(date=datetime.datetime(2019, ...","–¢—Ä–∞–º–ø –Ω–∞—Å—Ç–æ–ª—å–∫–æ —ç–∫—Å—Ü–µ–Ω—Ç—Ä–∏—á–µ–Ω, —á—Ç–æ, –∫–æ–Ω–µ—á–Ω–æ, –∑–∞...",text,,obrazbuduschego2,...,"–¢—Ä–∞–º–ø –Ω–∞—Å—Ç–æ–ª—å–∫–æ —ç–∫—Å—Ü–µ–Ω—Ç—Ä–∏—á–µ–Ω, —á—Ç–æ, –∫–æ–Ω–µ—á–Ω–æ, –∑–∞...",[],"–¢—Ä–∞–º–ø –Ω–∞—Å—Ç–æ–ª—å–∫–æ —ç–∫—Å—Ü–µ–Ω—Ç—Ä–∏—á–µ–Ω, —á—Ç–æ, –∫–æ–Ω–µ—á–Ω–æ, –∑–∞...",[],"–¢—Ä–∞–º–ø –Ω–∞—Å—Ç–æ–ª—å–∫–æ —ç–∫—Å—Ü–µ–Ω—Ç—Ä–∏—á–µ–Ω, —á—Ç–æ, –∫–æ–Ω–µ—á–Ω–æ, –∑–∞...",[],{},"–¢—Ä–∞–º–ø –Ω–∞—Å—Ç–æ–ª—å–∫–æ —ç–∫—Å—Ü–µ–Ω—Ç—Ä–∏—á–µ–Ω, —á—Ç–æ, –∫–æ–Ω–µ—á–Ω–æ, –∑–∞...","–¢—Ä–∞–º–ø –Ω–∞—Å—Ç–æ–ª—å–∫–æ —ç–∫—Å—Ü–µ–Ω—Ç—Ä–∏—á–µ–Ω, —á—Ç–æ, –∫–æ–Ω–µ—á–Ω–æ, –∑–∞...",—Ç—Ä–∞–º–ø –Ω–∞—Å—Ç–æ–ª—å–∫–æ —ç–∫—Å—Ü–µ–Ω—Ç—Ä–∏—á–µ–Ω —á—Ç–æ –∫–æ–Ω–µ—á–Ω–æ –∑–∞—Å–ª—É...
1607924,4,2019-09-28 07:08:55+00:00,4339.0,,PeerChannel(channel_id=1253974160),,–û—Ç—Å—Ç–∞–≤–∫–∞ –ö—É—Ä—Ç–∞ –í–æ–ª–∫–µ—Ä–∞ —Å –ø–æ—Å—Ç–∞ —Å–ø–µ—Ü–ø—Ä–µ–¥—Å—Ç–∞–≤–∏—Ç–µ...,text,,obrazbuduschego2,...,–û—Ç—Å—Ç–∞–≤–∫–∞ –ö—É—Ä—Ç–∞ –í–æ–ª–∫–µ—Ä–∞ —Å –ø–æ—Å—Ç–∞ —Å–ø–µ—Ü–ø—Ä–µ–¥—Å—Ç–∞–≤–∏—Ç–µ...,[],–û—Ç—Å—Ç–∞–≤–∫–∞ –ö—É—Ä—Ç–∞ –í–æ–ª–∫–µ—Ä–∞ —Å –ø–æ—Å—Ç–∞ —Å–ø–µ—Ü–ø—Ä–µ–¥—Å—Ç–∞–≤–∏—Ç–µ...,[],–û—Ç—Å—Ç–∞–≤–∫–∞ –ö—É—Ä—Ç–∞ –í–æ–ª–∫–µ—Ä–∞ —Å –ø–æ—Å—Ç–∞ —Å–ø–µ—Ü–ø—Ä–µ–¥—Å—Ç–∞–≤–∏—Ç–µ...,"[–¥–æ–º–æ–≥–∞—Ç–µ–ª—å—Å—Ç–≤, –ø–∞—Ä—Ç–∏—è –ö–æ–∑–∞–∫–∞-–õ–∞–≤—Ä–æ–≤–∞, –ø–∞—Ä—Ç–∏—è ...",{},–û—Ç—Å—Ç–∞–≤–∫–∞ –ö—É—Ä—Ç–∞ –í–æ–ª–∫–µ—Ä–∞ —Å –ø–æ—Å—Ç–∞ —Å–ø–µ—Ü–ø—Ä–µ–¥—Å—Ç–∞–≤–∏—Ç–µ...,–û—Ç—Å—Ç–∞–≤–∫–∞ –ö—É—Ä—Ç–∞ –í–æ–ª–∫–µ—Ä–∞ —Å –ø–æ—Å—Ç–∞ —Å–ø–µ—Ü–ø—Ä–µ–¥—Å—Ç–∞–≤–∏—Ç–µ...,–æ—Ç—Å—Ç–∞–≤–∫–∞ –∫—É—Ä—Ç–∞ –≤–æ–ª–∫–µ—Ä–∞ —Å –ø–æ—Å—Ç–∞ —Å–ø–µ—Ü–ø—Ä–µ–¥—Å—Ç–∞–≤–∏—Ç–µ...


In [144]:
df_channels.message_words_only_lower.iloc[33330]

'–∏ –ø–æ—Å–º–æ—Ç—Ä–∏—Ç–µ –µ–ª–µ–Ω—É —Å–µ—Ä–≤–µ—Ç—Ç–∞–∑ –æ–Ω–∞ —Ç–∞–º –º–µ—Å—Ç–∞–º–∏ —Å—Ä—ã–≤–∞–µ—Ç –ø–æ–∫—Ä–æ–≤—ã –ø—Ä–æ –∞–ª—å–±–∞—Ü –±—Ä–∞—É–¥–µ—Ä–∞ –∏ —Ç–∞–∫ –¥–∞–ª–µ–µ –∏ –≤–æ–æ–±—â–µ –æ–Ω–∞ –æ—á–µ–Ω—å —Ö–æ—Ä–æ—à–∞—è'

In [None]:
#pymorphy2
#sparknlp
#–Ω–∞ —á–æ–º—É —Ç—Ä–µ–Ω—É–≤–∞–≤—Å—è spacy - –≤–∑—è—Ç–∏ –∑ —Ü—å–æ–≥–æ –∫–æ—Ä–ø—É—Å—É —Å–ª–æ–≤–Ω–∏–∫
#–≤–∑—è—Ç–∏—Å–ª–æ–≤–Ω–∏–∫
#—Å–ª—É–∂–±–æ–≤—ñ —á–∞—Å—Ç–∏–Ω–∏ –º–æ–≤–∏ –∑–∞–±—Ä–∞—Ç–∏

In [1]:
import pandas as pd

In [None]:
%%time
df = pd.read_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/data/merged_dataset/df_channels.csv')

In [3]:
df.columns

Index(['Unnamed: 0', 'id', 'date', 'message', 'channel_name', 'lang',
       'message_no_hashtags', 'date_formatted', 'message_clean',
       'message_final'],
      dtype='object')