<a href="https://colab.research.google.com/github/kiryaa865/uni-thesis4/blob/main/processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Встановлення бібліотеки для доступу до гугл-таблиці

In [None]:
!pip install google-api-python-client gspread oauth2client

# Код для обрахунку точності категорій на основі кольорів, позначених експертами в таблиці

In [10]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from googleapiclient.discovery import build
from collections import defaultdict

def get_highlight_colors_by_header(spreadsheet_id, sheet_name, headers):
    # Define the scope and credentials
    scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
    creds = ServiceAccountCredentials.from_json_keyfile_name('/content/gold-fiber-345715-6ef065b6c65d.json', scope)
    client = gspread.authorize(creds)

    # Open the spreadsheet and sheet
    sheet = client.open_by_key(spreadsheet_id).worksheet(sheet_name)

    # Authorize and build the Google Sheets API service
    service = build('sheets', 'v4', credentials=creds)

    # Get the spreadsheet metadata to find the sheet ID
    sheet_metadata = service.spreadsheets().get(spreadsheetId=spreadsheet_id).execute()
    sheet_id = None
    for s in sheet_metadata['sheets']:
        if s['properties']['title'] == sheet_name:
            sheet_id = s['properties']['sheetId']
            break

    if sheet_id is None:
        raise ValueError(f"Sheet {sheet_name} not found in the spreadsheet.")

    # Get the cell formatting information
    sheet_data = service.spreadsheets().get(
        spreadsheetId=spreadsheet_id,
        ranges=[sheet_name],
        fields="sheets.data.rowData.values.userEnteredFormat.backgroundColor,sheets.data.rowData.values.effectiveValue"
    ).execute()

    rows = sheet_data['sheets'][0]['data'][0]['rowData']

    # Initialize the dictionary to store color counts for each header
    color_dict = {header: defaultdict(int) for header in headers}

    # Find all header positions
    header_positions = {header: [] for header in headers}
    header_row = rows[0]['values']
    for idx, cell in enumerate(header_row):
        if 'effectiveValue' in cell:
            header = cell['effectiveValue'].get('stringValue', '')
            if header in headers:
                header_positions[header].append(idx)

    # Iterate over the rows to count colors
    for row in rows[1:]:
        if 'values' in row:
            for header, positions in header_positions.items():
                for pos in positions:
                    if pos < len(row['values']):
                        cell = row['values'][pos]
                        if 'userEnteredFormat' in cell and 'backgroundColor' in cell['userEnteredFormat']:
                            background_color = cell['userEnteredFormat']['backgroundColor']
                            color = (
                                background_color.get('red', 0),
                                background_color.get('green', 0),
                                background_color.get('blue', 0)
                            )
                            color_dict[header][color] += 1

    return color_dict

def calculate_percentage(count1, count2):
    if count2 == 0:
        return 0
    return (count1 / count2) * 100

def get_color_counts(color_dict, color_tuple):
    return color_dict.get(color_tuple, 0)

# Colors defined in the provided data
green = (0, 1, 0)
red = (1, 0, 0)
magenta = (1, 0, 1)
pinkish = (0.91764706, 0.81960785, 0.8627451)
cyan = (0, 1, 1)
orange = (1, 0.6, 0)
yellow = (1, 1, 0)
black = (0, 0, 0)
zelenyy = (0.21960784, 0.4627451, 0.11372549)
dark_purple = (0.45490196, 0.105882354, 0.2784314)

# Example usage
spreadsheet_id = '1NIKSFu2LNSWD2IReRMI7e9XezKGoejiUdvjHSqQEn70'
sheet_name = 'combined2_csv'
headers = ['Token', 'GPT-3.5', 'Stanza', 'SpaCy', 'Pymorphy3', 'Flair', 'RoBERTa']
highlight_colors = get_highlight_colors_by_header(spreadsheet_id, sheet_name, headers)

# Print the results
for header, color_counts in highlight_colors.items():
    count_green = get_color_counts(color_counts, green)
    count_red = get_color_counts(color_counts, red)
    count_magenta = get_color_counts(color_counts, magenta)
    count_pinkish = get_color_counts(color_counts, pinkish)
    count_cyan = get_color_counts(color_counts, cyan)
    count_orange = get_color_counts(color_counts, orange)
    count_yellow = get_color_counts(color_counts, yellow)
    count_black = get_color_counts(color_counts, black)
    count_zelenyy = get_color_counts(color_counts, zelenyy)
    count_dark_purple = get_color_counts(color_counts, dark_purple)

    green_red_percentage = calculate_percentage(count_green, count_green + count_red)
    magenta_pinkish_percentage = calculate_percentage(count_magenta, count_magenta + count_pinkish)
    cyan_orange_percentage = calculate_percentage(count_cyan, count_cyan + count_orange)
    yellow_black_percentage = calculate_percentage(count_yellow, count_yellow + count_black)
    zelenyy_dark_purple_percentage = calculate_percentage(count_zelenyy, count_zelenyy + count_dark_purple)






    print(f"Header: {header}")
    print(f"  Омонімія: {green_red_percentage:.2f}%")
    print(f"  Англ-укр: {magenta_pinkish_percentage:.2f}%")
    print(f"  Одруківки, капіталізація: {cyan_orange_percentage:.2f}%")
    print(f"  Англ-англ: {yellow_black_percentage:.2f}%")
    print(f"  Нецензурна лексика, укр сленг: {zelenyy_dark_purple_percentage:.2f}%")


Header: Token
  Омонімія: 0.00%
  Англ-укр: 0.00%
  Одруківки, капіталізація: 0.00%
  Англ-англ: 100.00%
  Нецензурна лексика, укр сленг: 100.00%
Header: GPT-3.5
  Омонімія: 67.14%
  Англ-укр: 84.49%
  Одруківки, капіталізація: 89.01%
  Англ-англ: 97.62%
  Нецензурна лексика, укр сленг: 93.33%
Header: Stanza
  Омонімія: 49.28%
  Англ-укр: 19.05%
  Одруківки, капіталізація: 58.15%
  Англ-англ: 0.48%
  Нецензурна лексика, укр сленг: 52.50%
Header: SpaCy
  Омонімія: 55.07%
  Англ-укр: 18.36%
  Одруківки, капіталізація: 48.31%
  Англ-англ: 0.72%
  Нецензурна лексика, укр сленг: 49.17%
Header: Pymorphy3
  Омонімія: 48.53%
  Англ-укр: 19.63%
  Одруківки, капіталізація: 44.79%
  Англ-англ: 0.24%
  Нецензурна лексика, укр сленг: 40.00%
Header: Flair
  Омонімія: 57.97%
  Англ-укр: 19.71%
  Одруківки, капіталізація: 64.51%
  Англ-англ: 0.24%
  Нецензурна лексика, укр сленг: 44.17%
Header: RoBERTa
  Омонімія: 69.57%
  Англ-укр: 13.87%
  Одруківки, капіталізація: 67.89%
  Англ-англ: 0.97%
  Неценз

# Код для обрахунку збігів між моделями при тегуванні літературних текстів

In [11]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
from collections import defaultdict

# Pymorphy3 tag mapping
pymorphy3_mapping = {
    'ADJF': 'ADJ',
    'ADVB': 'ADV',
    'PREP': 'ADP',
    'CONJ': 'CCONJ'
}

def read_sheet_data(spreadsheet_id, sheet_name, creds_path):
    # Define the scope and credentials
    scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
    creds = ServiceAccountCredentials.from_json_keyfile_name(creds_path, scope)
    client = gspread.authorize(creds)

    # Open the spreadsheet and sheet
    sheet = client.open_by_key(spreadsheet_id).worksheet(sheet_name)

    # Read the data into a pandas DataFrame
    data = sheet.get_all_values()
    df = pd.DataFrame(data[1:], columns=data[0])

    return df

def segment_dataframe(df, headers):
    # Create a list to hold DataFrame segments
    segments = []

    # Iterate over columns in steps of the header length + 1 (to account for empty columns)
    header_length = len(headers)
    for start_col in range(0, df.shape[1], header_length + 1):
        end_col = start_col + header_length
        if end_col > df.shape[1]:
            break
        sub_df = df.iloc[:, start_col:end_col]
        segments.append(sub_df)

    return segments

def apply_pymorphy3_mapping(pos_tag, model_name):
    if model_name == 'Pymorphy3' and pos_tag in pymorphy3_mapping:
        return pymorphy3_mapping[pos_tag]
    return pos_tag

def calculate_concurrence(df_segments, headers):
    # Initialize a dictionary to store concurrence counts
    concurrence_counts = defaultdict(int)
    total_counts = defaultdict(int)

    for sub_df in df_segments:
        for index, row in sub_df.iterrows():
            token = str(row[headers[0]]).strip()
            if token == '':
                continue

            pos_tags = [apply_pymorphy3_mapping(str(row[header]).strip(), header) for header in headers[1:]]

            # Exclude rows with 'X' or 'None'
            if 'X' in pos_tags or 'None' in pos_tags:
                continue

            # Debugging output for pos_tags
            print(f"Token: {token}, POS Tags: {pos_tags}")

            for j in range(len(pos_tags)):
                for k in range(j + 1, len(pos_tags)):
                    total_counts[(headers[j+1], headers[k+1])] += 1
                    total_counts[(headers[k+1], headers[j+1])] += 1
                    if pos_tags[j] == pos_tags[k]:
                        concurrence_counts[(headers[j+1], headers[k+1])] += 1
                        concurrence_counts[(headers[k+1], headers[j+1])] += 1
                        # Detailed Debugging output
                        print(f"Concurrence: {headers[j+1]} - {headers[k+1]}: {pos_tags[j]} == {pos_tags[k]}")

    # Debugging output
    print("Concurrence Counts:")
    for (model1, model2), count in concurrence_counts.items():
        print(f"{model1} - {model2}: {count}")

    print("\nTotal Counts:")
    for (model1, model2), count in total_counts.items():
        print(f"{model1} - {model2}: {count}")

    # Calculate concurrence percentages
    concurrence_percentages = {}
    for (model1, model2), count in concurrence_counts.items():
        if total_counts[(model1, model2)] > 0:
            concurrence_percentages[(model1, model2)] = (count / total_counts[(model1, model2)]) * 100

    return concurrence_percentages

def format_concurrence_table(concurrence_percentages, headers):
    # Create a DataFrame to display concurrence percentages
    table = pd.DataFrame(index=headers[1:], columns=headers[1:])
    for (model1, model2), percentage in concurrence_percentages.items():
        table.at[model1, model2] = percentage
    return table

# Example usage
spreadsheet_id = '1sHdyscsk_KKLbJhU0UC1GZqbI9CESsXnydz1kpI6m3g'
sheet_name = 'regular_texts'
creds_path = '/content/gold-fiber-345715-6ef065b6c65d.json'
headers = ['Token', 'GPT-3.5', 'Stanza', 'SpaCy', 'Pymorphy3', 'Flair', 'RoBERTa']

df = read_sheet_data(spreadsheet_id, sheet_name, creds_path)
df_segments = segment_dataframe(df, headers)
concurrence_percentages = calculate_concurrence(df_segments, headers)
concurrence_table = format_concurrence_table(concurrence_percentages, headers)

print("Concurrence Table (Percentages):")
print(concurrence_table)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Concurrence: Flair - RoBERTa: NOUN == NOUN
Token: або, POS Tags: ['CCONJ', 'CCONJ', 'CCONJ', 'CCONJ', 'CCONJ', 'CCONJ']
Concurrence: GPT-3.5 - Stanza: CCONJ == CCONJ
Concurrence: GPT-3.5 - SpaCy: CCONJ == CCONJ
Concurrence: GPT-3.5 - Pymorphy3: CCONJ == CCONJ
Concurrence: GPT-3.5 - Flair: CCONJ == CCONJ
Concurrence: GPT-3.5 - RoBERTa: CCONJ == CCONJ
Concurrence: Stanza - SpaCy: CCONJ == CCONJ
Concurrence: Stanza - Pymorphy3: CCONJ == CCONJ
Concurrence: Stanza - Flair: CCONJ == CCONJ
Concurrence: Stanza - RoBERTa: CCONJ == CCONJ
Concurrence: SpaCy - Pymorphy3: CCONJ == CCONJ
Concurrence: SpaCy - Flair: CCONJ == CCONJ
Concurrence: SpaCy - RoBERTa: CCONJ == CCONJ
Concurrence: Pymorphy3 - Flair: CCONJ == CCONJ
Concurrence: Pymorphy3 - RoBERTa: CCONJ == CCONJ
Concurrence: Flair - RoBERTa: CCONJ == CCONJ
Token: третьою, POS Tags: ['ADJ', 'ADJ', 'ADJ', 'ADJ', 'ADJ', 'ADJ']
Concurrence: GPT-3.5 - Stanza: ADJ == ADJ
Concurrence: G

# Код для обрахунку загальної кількості слів та пунктуації в зібраних корпусах

In [12]:
import string

def count_words_and_punctuation(file_path):
    # Initialize counters
    word_count = 0
    punctuation_count = 0

    # Define punctuation characters
    punctuation_chars = string.punctuation

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Split the line into words
            words = line.split()
            word_count += len(words)

            # Count punctuation in the line
            for char in line:
                if char in punctuation_chars:
                    punctuation_count += 1

    return word_count, punctuation_count

# Example usage
file_path = '/content/gathered_corpora-1.txt'  # Replace with your file path
words, punctuation = count_words_and_punctuation(file_path)
print(f'Word Count: {words}')
print(f'Punctuation Count: {punctuation}')


Word Count: 4941
Punctuation Count: 382


# Код для конвертації датасетів

In [15]:
import json

def conllu_to_json(input_file, output_file, fraction=0.1):
    sentences = []
    sentence = []
    sentence_count = 0

    with open(input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()
        total_sentences = sum(1 for line in lines if line.strip() == '')
        max_sentences = int(total_sentences * fraction)

        for line in lines:
            line = line.strip()
            if line.startswith('#'):
                continue
            if line == '':
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                    sentence_count += 1
                if sentence_count >= max_sentences:
                    break
                continue

            fields = line.split('\t')
            if len(fields) < 4:
                continue
            word = fields[1]
            pos = fields[3]
            sentence.append({'word': word, 'pos': pos})

    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(sentences, file, ensure_ascii=False, indent=2)


def process_json(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as file:
        sentences = json.load(file)

    processed_data = []

    for sentence in sentences:
        words = ' '.join([token['word'] for token in sentence])
        tags = ' '.join([token['pos'] for token in sentence])

        processed_data.append({
            "messages": [
                {"role": "system", "content": "You are Postagger, an expert bot designed to perform Part-of-Speech (POS) tagging accurately for any language, including but not limited to Ukrainian and English. You will provide precise POS tags for all input text, and attempt to tag all words correctly. Keep an eye out for misspellings and attempt to tag them appropriately based on context. Always respond only with the tagged text and nothing else. The tagset you will use includes: NOUN, PUNCT, VERB, ADJ, ADP, ADV, PRON, CCONJ, DET, PART, PROPN, SCONJ, NUM, AUX, INTJ, SYM. The response format is token: pos/n."},
                {"role": "user", "content": words},
                {"role": "assistant", "content": tags}
            ]
        })

    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(processed_data, file, ensure_ascii=False, indent=2)


def convert_json_to_jsonl(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    with open(output_file, 'w', encoding='utf-8') as file:
        for entry in data:
            file.write(json.dumps(entry, ensure_ascii=False) + '\n')


# Приклад використання
conllu_input = '/content/uk_iu-ud-train.conllu'
json_output = 'output.json'
jsonl_output = 'train_dataset.jsonl'

conllu_to_json(conllu_input, json_output, fraction=0.1)
process_json(json_output, json_output)
convert_json_to_jsonl(json_output, jsonl_output)
