### **Libraries**

In [None]:
import pandas as pd
import numpy as np
from tpulse import TinkoffPulse
from time import sleep
from pytz import timezone
from httpx import HTTPStatusError
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
import os
import re

import warnings
warnings.filterwarnings('ignore')

### **Parsing Data by tickers**

In [None]:
# Module-level Tinkoff Pulse client; parsing_tpulse() issues all of its requests through it.
pulse = TinkoffPulse()

def _extract_post(post, KEYS):
    """Flatten one post into a record: the requested KEYS plus text and reaction counters."""
    data = {key: post[key] for key in KEYS}
    data['text'] = post['content']['text']
    data['reactions_counters'] = post['reactions']['counters']
    return data


def _save_posts_csv(raw_data, ticker):
    """Write the collected records to ``df_<ticker>_data.csv``; do nothing when there are none."""
    if not raw_data:
        # An empty frame has no "inserted" column, so the date conversion would raise.
        print('No posts collected, nothing to save.')
        return

    print('Saving data in csv...')
    result_df = pd.DataFrame(raw_data)
    if 'inserted' in result_df.columns:
        # Keep only the first 10 characters of the timestamp (presumably YYYY-MM-DD).
        result_df['inserted'] = pd.to_datetime(result_df["inserted"].str[:10])
    result_df.to_csv(f'df_{ticker}_data.csv', encoding='utf-8-sig', index=False)
    print('Saving finished.')


def parsing_tpulse(ticker, N, KEYS):
    """Download posts for ``ticker`` through the module-level ``pulse`` client.

    Starting from cursor 999999999, the function performs at most ``N`` requests,
    following ``response["nextCursor"]`` after each one. Collected records are
    written to ``df_<ticker>_data.csv`` and also returned.

    Parameters
    ----------
    ticker : str
        Ticker passed to ``pulse.get_posts_by_ticker``; also used in the CSV name.
    N : int
        Maximum number of requests to issue.
    KEYS : iterable of str
        Post fields copied verbatim into every record.

    Returns
    -------
    list of dict
        One record per post with the requested keys plus ``text`` and
        ``reactions_counters``.

    Notes
    -----
    Requests failing with ``HTTPStatusError`` are skipped (the same cursor is
    retried on the next iteration) and counted; the total is reported at the end.
    When a response carries no ``nextCursor`` its posts are still collected, the
    loop stops, and the data gathered so far is saved.
    """
    cursor = 999999999
    raw_data = []
    failed_requests = 0

    for _ in range(N):
        try:
            response = pulse.get_posts_by_ticker(ticker, cursor)
        except HTTPStatusError:
            # Best effort: keep going, but remember that something went wrong.
            failed_requests += 1
            sleep(0.25)
            continue

        # Only a missing/unsubscriptable cursor marks the end; anything else propagates.
        try:
            cursor = response["nextCursor"]
            is_last_page = False
        except (KeyError, TypeError):
            is_last_page = True

        try:
            posts = response["items"]
        except (KeyError, TypeError):
            if not is_last_page:
                raise
            posts = []

        # Collect the posts before deciding to stop so the final page is not lost.
        for post in posts:
            raw_data.append(_extract_post(post, KEYS))

        if is_last_page:
            print('Error: last_cursor')
            break

        sleep(0.25)

    if failed_requests:
        print(f'{failed_requests} request(s) failed with an HTTP error and were skipped.')
    print('Parsing finished.')

    _save_posts_csv(raw_data, ticker)

    return raw_data


Поочередно парсим данные по тикерам

In [None]:
# Post fields that parsing_tpulse copies verbatim into every record.
KEYS = ["inserted", "likesCount", "commentsCount"]

# Issue at most 5000 requests for SBER posts.
result = parsing_tpulse("SBER", 5000, KEYS)

# Build the frame, keep the first 10 characters of "inserted" (presumably the
# YYYY-MM-DD part) as a datetime, and store it under the lowercase file name.
base_df = pd.DataFrame(result).assign(
    inserted=lambda frame: pd.to_datetime(frame["inserted"].str[:10])
)
base_df.to_csv('df_sber_data.csv', encoding='utf-8-sig', index=False)

Parsing finished.


### **Text Preprocessing with RegEx and spaCy**

In [None]:
import spacy
import en_core_web_trf
import spacy_transformers
import re

# Load the spaCy pipeline "ru_core_news_lg" once at module level;
# split_into_sentences_by_meaning() reuses this shared `nlp` object on every call.
nlp = spacy.load("ru_core_news_lg")

def split_into_sentences_by_meaning(text):
    """Split ``text`` into sentences with the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    text : str
        Text to run through ``nlp``.

    Returns
    -------
    list of str
        The ``.text`` of every span yielded by ``doc.sents``, in document order.
    """
    return [span.text for span in nlp(text).sents]

In [None]:
# Compiled once at import time instead of on every call.
# Each range is an explicit emoji-related block. The former catch-all range
# U+24C2..U+1F251 has been split up because it also matched CJK, Hangul,
# fullwidth forms and every other character in between.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # geometric shapes extended
    "\U0001F800-\U0001F8FF"  # supplemental arrows-C
    "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    "\U0001FA00-\U0001FA6F"  # chess symbols
    "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics, flags, enclosed ideographs
    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000025A0-\U000025FF"  # geometric shapes
    "\U0000FE00-\U0000FE0F"  # variation selectors left behind by removed emoji
    "\U000024C2"             # circled latin capital letter M
    "\U00002934\U00002935"   # curved arrows
    "\U00003030\U0000303D\U00003297\U00003299"  # wavy dash, part alternation mark, circled ideographs
    "]+",
    flags=re.UNICODE)


def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbols stripped out.

    Runs of characters from the emoji blocks listed in ``_EMOJI_PATTERN`` are
    replaced with the empty string; all other characters (Latin, Cyrillic,
    CJK, punctuation, whitespace) are left untouched.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The text without the matched characters.
    """
    return _EMOJI_PATTERN.sub('', text)

In [None]:
def find_companies(text):
    """Collect the contents of every ``{...}`` group in ``text``.

    A group is an opening brace, one or more characters other than ``}``, and a
    closing brace; empty ``{}`` pairs are ignored. The braces presumably mark
    company/ticker mentions in Pulse posts (not verified here).

    Parameters
    ----------
    text : str
        Text to scan.

    Returns
    -------
    list of str
        The text between the braces for each match, in order of appearance.
    """
    brace_group = re.compile(r'\{([^}]+)\}')
    return [match.group(1) for match in brace_group.finditer(text)]

In [None]:
def find_related_chunk(text, words):
    """Keep only the sentences of ``text`` that mention one of ``words``.

    The text is split with ``split_into_sentences_by_meaning``; a sentence is
    kept when any keyword occurs in it as a case-insensitive substring.

    Parameters
    ----------
    text : str
        Text to filter.
    words : iterable of str
        Keywords to look for.

    Returns
    -------
    str
        All matching sentences, in their original order, joined with ``'. '``;
        an empty string when no sentence matches.
    """
    needles = [keyword.lower() for keyword in words]

    def mentions_keyword(sentence):
        lowered = sentence.lower()
        return any(needle in lowered for needle in needles)

    matching = [
        sentence
        for sentence in split_into_sentences_by_meaning(text)
        if mentions_keyword(sentence)
    ]
    return '. '.join(matching)

In [None]:
def preprocess_pulse_comments(row):
    """Clean one post and, for multi-mention posts, keep only the relevant sentences.

    The text is stripped of emoji and lowercased. If it contains at most one
    ``{...}`` mention it is returned as is. Otherwise newlines are replaced with
    ``'. '`` and only the sentences containing a keyword for the current ticker
    are kept.

    The function reads two module-level names: ``ticker`` (the ticker currently
    being processed) and ``ticker_keywords`` (ticker -> keyword list). Both must
    be defined before this is applied to a frame.

    Parameters
    ----------
    row : mapping
        A row exposing a ``'text'`` entry (e.g. a DataFrame row from
        ``df.apply(..., axis=1)``).

    Returns
    -------
    str
        The preprocessed text; ``''`` when the row has no usable text.
    """
    raw_text = row['text']
    # An empty text cell is read back from CSV as NaN (a float), which would
    # make the regex substitution raise TypeError.
    if not isinstance(raw_text, str):
        return ''

    current_text = remove_emojis(raw_text).lower()

    companies_mentioned = find_companies(current_text)
    if len(companies_mentioned) <= 1:
        return current_text

    # Several mentions: narrow the post down to sentences about the current ticker.
    words = ticker_keywords[ticker]
    current_text = current_text.replace('\n', '. ')
    return find_related_chunk(current_text, words)

In [None]:
# Keyword lists per lowercase ticker (Latin and Cyrillic spelling variants).
# preprocess_pulse_comments() looks up ticker_keywords[ticker] and passes the list to
# find_related_chunk(), which matches each entry as a case-insensitive substring.
# NOTE(review): because matching is by substring, short entries such as 'gaz', 'баш',
# 'пик' or 'индекс' also match inside unrelated words — confirm this is intended.
# NOTE(review): some Cyrillic entries (e.g. in the "sber" list) may contain look-alike
# Latin letters — verify the spelling of each entry.
ticker_keywords = {
    "tcs": ['tinkoff', 'tcs', 'tcsg', 'тинькофф', 'ткс', 'тинек', 'тинёк', 'тиньков', 'тинька'],
    "sber": ['sber', 'sbrf', 'sberbank', 'сбер', 'сбербанк', 'cбера', 'сберу', 'sberp', 'сбербанкa', 'сбербанку', 'сбербанком'],
    "gazp": ['gazp', 'газпром', 'газпрома', 'газпрому', 'gazprom', 'газпромом', 'gzpr', 'газп', 'gaz'],
    "bane": ['bashneft', 'башнефть', 'bane', 'баш', 'башнефти', 'башнефтью'],
    "kmaz": ['kamaz', 'камаз', 'kmaz', 'кмаз', 'камазом', 'камаза', 'камазу'],
    "mvid": ['mvideo', 'мвидео', 'м-видео', 'mvid', 'мвид', 'эмвидео', 'м видео'],
    "pikk": ['pik', 'пик', 'pikk', 'пикк'],
    "rtkm": ['rostelecom', 'ростелеком', 'rtkm', 'рткм', 'ростел', 'ростелекома', 'ростелекому', 'ростелекомом'],
    "sgzh": ['sgzh', 'сгж', 'сегежа', 'segezha', 'сегежи', 'сегежей', 'сегежу'],
    "yndx": ['yandex', 'яндекс', 'yndx', 'индекс', 'yndex', 'яшка', 'яндекса', 'яндексу', 'яндексом']
    }

In [None]:
# Walk the raw dumps and preprocess the selected ones (currently only SBER).
for dataset_name in os.listdir('raw_data'):

    # Guard clause: ignore every file that is not on the allow-list.
    if dataset_name not in ['df_SBER_data.csv']:
        continue

    df = pd.read_csv(rf'raw_data/{dataset_name}')

    # File names look like df_<TICKER>_data.csv; preprocess_pulse_comments reads
    # this module-level `ticker`, so it has to be set before the apply below.
    ticker = dataset_name.split("_")[1].lower()

    df['text_preprocessed'] = df.apply(preprocess_pulse_comments, axis=1)

    # The output directory is expected to exist already.
    df.to_csv(f'preprocessed_data_full/df_{ticker}_full.csv', index=False)

    print(f'Done: {ticker}')

    

Done: sber
