In [1]:
import pandas as pd
import requests
from urllib.parse import urlparse, parse_qs
import os
from pydrive.auth import GoogleAuth

from pydrive.drive import GoogleDrive

import json
import gdown
from string import ascii_letters as letters
import re

In [None]:
gauth = GoogleAuth()
gauth.LocalWebserverAuth()

drive = GoogleDrive(gauth)

In [2]:
!mkdir data

A subdirectory or file data already exists.


In [3]:
data = pd.read_csv('results_txt/results.csv')
data['flag'] = data['flag'].apply(lambda a: 'Speaker ' + a)
data['название'] = data['название'].apply(lambda s: re.sub(r'[.,"\'-?:!;]', '', s).
                                          strip().replace(" ", "_"))

In [4]:
data

Unnamed: 0,автор,название,берем,проблема,TEXT,VIDEO,flag
0,Amod,mental_health_counseling_conversations,,,https://drive.google.com/file/d/1D6mmyud1pL1v-...,https://huggingface.co/datasets/Amod/mental_he...,Speaker A
1,heliosbrahmaд,mental_health_chatbot_dataset,,,https://drive.google.com/file/d/1605EyVkjyGqHD...,https://huggingface.co/datasets/heliosbrahma/m...,Speaker A
2,Mark Fisher\r,Открытая_консультация_психолога__хочу_отношени...,,,https://drive.google.com/open?id=1Fows9_H6R_k8...,Открытая консультация психолога // хочу отноше...,Speaker B
3,Mark Fisher\r,Открытая_консультация_психолога__хочу_отношени...,,,https://drive.google.com/open?id=1yCcQqfBDxWpF...,Открытая консультация психолога / хочу отношен...,Speaker A
4,Mark Fisher\r,Открытая_консультация_психолога__Алкоголизм__У...,,,https://drive.google.com/open?id=1q-FflfkGdHdK...,Открытая консультация психолога / Алкоголизм /...,Speaker B
5,Mark Fisher\r,Найти_себя__расставание__сепарация__открытая_к...,,,https://drive.google.com/open?id=1RwkTWxBc8N2g...,Найти себя / расставание / сепарация / открыта...,Speaker B
6,Mark Fisher\r,Открытая_консультация_психолога__Родительство_...,,,https://drive.google.com/open?id=1DEjfL36uRwce...,Открытая консультация психолога / Родительство...,Speaker B
7,Mark Fisher\r,Развод__медийность__агрессия__консультация_пси...,,,https://drive.google.com/open?id=1ad5YcYKR3HVr...,Развод / медийность / агрессия / консультация ...,Speaker B
8,Mark Fisher\r,Страх_жизни__найти_свое_место__консультация_пс...,,,https://drive.google.com/open?id=1zQiPTLCYVEWO...,Страх жизни / найти свое место / консультация ...,Speaker B
9,Mark Fisher\r,Расставание__Зависимость__Консультация_с_психо...,,,https://drive.google.com/open?id=1R7xzYf4vQRjN...,Расставание / Зависимость / Консультация с пси...,Speaker B


In [5]:
def download_data(output_dir='data'):
    path_to_file = []

    for title, url in zip(data['название'], data['TEXT']):
        output = f'{output_dir}/{title}.txt'
        path_to_file.append(output)
        if not os.path.exists(output):
            gdown.download(url=url, output=output, quiet=True, fuzzy=True)

    data['path_to_file'] = path_to_file

In [6]:
download_data()
data.head()

Unnamed: 0,автор,название,берем,проблема,TEXT,VIDEO,flag,path_to_file
0,Amod,mental_health_counseling_conversations,,,https://drive.google.com/file/d/1D6mmyud1pL1v-...,https://huggingface.co/datasets/Amod/mental_he...,Speaker A,data/mental_health_counseling_conversations.txt
1,heliosbrahmaд,mental_health_chatbot_dataset,,,https://drive.google.com/file/d/1605EyVkjyGqHD...,https://huggingface.co/datasets/heliosbrahma/m...,Speaker A,data/mental_health_chatbot_dataset.txt
2,Mark Fisher\r,Открытая_консультация_психолога__хочу_отношени...,,,https://drive.google.com/open?id=1Fows9_H6R_k8...,Открытая консультация психолога // хочу отноше...,Speaker B,data/Открытая_консультация_психолога__хочу_отн...
3,Mark Fisher\r,Открытая_консультация_психолога__хочу_отношени...,,,https://drive.google.com/open?id=1yCcQqfBDxWpF...,Открытая консультация психолога / хочу отношен...,Speaker A,data/Открытая_консультация_психолога__хочу_отн...
4,Mark Fisher\r,Открытая_консультация_психолога__Алкоголизм__У...,,,https://drive.google.com/open?id=1q-FflfkGdHdK...,Открытая консультация психолога / Алкоголизм /...,Speaker B,data/Открытая_консультация_психолога__Алкоголи...


In [7]:
def convert_to_prompt_completion_pairs(text, flag='Speaker B'):
    pairs = []
    lines = text.strip().split('\n')

    if flag == "Speaker A":
        completion = current_prompt = ''
        for line in lines:
            if line.startswith("Speaker A"):
                if completion and current_prompt:
                    pairs.append({"prompt": current_prompt, "completion": completion})
                current_prompt = line[len("Speaker A:") + 1:].strip()
            elif line.startswith("Speaker B:"):
                if current_prompt:
                    completion = line[len("Speaker B:") + 1:].strip()
            else:
                completion += '\n' + line.strip()

    elif flag == "Speaker B":
        completion = current_prompt = ''
        for line in lines:
            if line.startswith("Speaker B:"):
                if completion and current_prompt:
                    pairs.append({"prompt": current_prompt, "completion": completion})
                current_prompt = line[len("Speaker A:") + 1:].strip()
            elif line.startswith("Speaker A:"):
                if current_prompt:
                    completion = line[len("Speaker A:") + 1:].strip()
            else:
                completion += '\n' + line.strip()

    return pairs

In [8]:
def get_pairs(data):
    pairs = []
    for file_name, flag in zip(data['path_to_file'], data['flag']):
        with open(file_name, 'r', encoding='utf8') as file:
            text = file.read()
            pairs.append(convert_to_prompt_completion_pairs(text, flag))
    return pairs

In [9]:
pairs = get_pairs(data=data)
print(pairs[1][1])

{'prompt': 'What are symptoms of panic attack vs. anxiety attack?', 'completion': "Panic attacks and anxiety attacks can share some similarities, but they also have distinct features. It's important to remember that I am not a medical professional, and if you or someone you know is experiencing these symptoms, seeking help from a qualified healthcare professional is essential.\n\nPanic Attack Symptoms:\nA panic attack is a sudden and intense surge of fear or discomfort that reaches its peak within minutes. Some common symptoms of a panic attack include:\n\n1. Heart palpitations, pounding heart, or accelerated heart rate.\n2. Sweating or trembling.\n3. Shortness of breath or feeling smothered.\n4. Feeling of choking or a lump in the throat.\n5. Chest pain or discomfort.\n6. Nausea or abdominal distress.\n7. Dizziness, lightheadedness, or feeling faint.\n8. Chills or hot flashes.\n9. Numbness or tingling sensations.\n10. Fear of losing control or going crazy.\n11. Fear of dying.\n12. A s

In [21]:
def convert_to_json(pairs):
    with open('prompt_completion.jsonl', 'w') as output_file:
        for file_pairs in pairs:
            for pair in file_pairs:
                if num_tokens_from_string(pair['prompt']) < 2048 and \
                        num_tokens_from_string(pair['completion']) < 2048:
                    json.dump({'prompt': pair['prompt'] + ' ->',
                               'completion': ' ' + pair['completion'] + '\n'},
                              output_file)
                    output_file.write('\n')
                

convert_to_json(pairs)

In [10]:
!pip install -qq tiktoken

In [23]:
import tiktoken

# Initialize the tokenizer with the Davinci-003 model's maximum token limit (4096 tokens)

encoding = tiktoken.encoding_for_model("text-davinci-003")
# Your text to count tokens in
text = "Hello, World!"

# Tokenize the text and count tokens
token_count = len(encoding.encode(text))

print(f'Tokenizer: {encoding}')
print(f"Token count: {token_count}")
print(encoding.encode(text))

Tokenizer: <Encoding 'p50k_base'>
Token count: 4
[15496, 11, 2159, 0]


In [12]:
def num_tokens_from_string(input_element, encoding_name: str = 'p50k_base') -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(input_element))
    return num_tokens

In [13]:
def num_tokens_from_folder(folder='data/', encoding_name: str = 'p50k_base') -> int:
    '''
    Return the number of tokens for the davinci-003 model in the folder.
    
    :param folder: The folder where you need to count the number of tokens 
    :param encoding_name: Tokenizer name
    :return: number of tokens
    '''
    pairs = []

    for file_name in os.listdir(folder):
        with open(f'data/{file_name}', 'r', encoding='utf8') as file:
            text = file.read()
            pairs.append(convert_to_prompt_completion_pairs(text))

    full_text = ''
    for file_pairs in pairs:
        for pair in file_pairs:
            full_text += pair['prompt'] + ' ' + pair['completion'] + ' '

    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(full_text))

    return num_tokens

In [14]:
print(num_tokens_from_folder('data/'))

213200


In [24]:
def num_tokens_from_json(file_name: str,  encoding_name: str = 'p50k_base') -> int:
    num = 0
    encoding = tiktoken.get_encoding(encoding_name)
    with open(file_name, 'r', encoding='utf8') as f:
        pairs = f.readlines() 
        for pair in pairs:
            pair = json.loads(pair)
            num += len(encoding.encode(pair['prompt'][:-3])) \
                + len(encoding.encode(pair['completion']))
            # print(pair['completion'])
            # break
    return num

In [28]:
num_tokens_from_json('prompt_completion.jsonl')

239318