<a href="https://colab.research.google.com/github/iyves/ru_col_suggest/blob/master/generate_wrong_colloc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The purpose of this notebook is to generate erroneous Russian academic collocations. The setup is as follows:
1. From the cybercat database, extract the 100 most common bi-grams and tri-grams that match a specified PoS filter:
 - V+N
 - N+N
 - Adj+N
 - V+V
 - V+Inf			
 - V+Prep+N
 - N+Prep+N
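2. For each match, replace each content lemma in turn (both lemmas of a bi-gram; the first and last lemma of a tri-gram, whose middle word is kept) with its synonyms, queried from the [CrossLexica dictionary](https://www.xl.gelbukh.com/). Each substitution yields one candidate erroneous collocation.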

### SQL statement for bigrams (tokens):
```
-- Most frequent token bigrams whose morphology tags match the PoS filters.
-- The morph-tag patterns presumably map to: VerbForm -> verb, Animacy -> noun,
-- Degree -> adjective (TODO confirm against the cybercat tag set).
SELECT * FROM
(SELECT a.raw_frequency,
    uni1.unigram as "n1", uni2.unigram as "n2",
    uni1.morph as "n1_morph", uni2.morph as "n2_morph"
FROM
    -- Only the 10000 most frequent bigrams are joined and filtered.
    (SELECT bi.wordform_1, bi.wordform_2, bi.raw_frequency
    FROM cybercat.2grams as bi
    ORDER BY bi.raw_frequency DESC
    LIMIT 10000) as a
LEFT JOIN cybercat.unigrams uni1 ON uni1.id_unigram = a.wordform_1
LEFT JOIN cybercat.unigrams uni2 ON uni2.id_unigram = a.wordform_2) as matches

WHERE
    -- The two PoS alternatives are grouped in one pair of parentheses:
    -- AND binds tighter than OR, so without the grouping the NOT IN
    -- filters below would only constrain the second alternative.
    (
        (matches.n1_morph LIKE "%VerbForm%" AND
            (matches.n2_morph LIKE "%VerbForm%" OR
             matches.n2_morph LIKE "%Animacy%"))
        OR
        (matches.n2_morph LIKE "%Animacy%" AND
            (matches.n1_morph LIKE "%Animacy%" OR
             matches.n1_morph LIKE "%Degree%"))
    )
    -- Drop short tokens that look like initials/abbreviations.
    AND matches.n1 NOT IN ("А", "В", "Д", "Л", "И", "УК", "Н", "М", "КС", "Ф", "Оп")
    AND matches.n2 NOT IN ("А", "В", "Д", "Л", "И", "УК", "Н", "М", "КС", "Ф", "Оп")
-- The order of the derived table is not guaranteed to survive the joins,
-- so sort explicitly before keeping the top 300 rows.
ORDER BY matches.raw_frequency DESC
LIMIT 300
```

### SQL statement for bigrams (lemmas):
```
-- Most frequent bigrams, resolved to lemmas and PoS tags, that match
-- V+V, V+N, N+N or Adj+N.
SELECT raw_frequency, pos1, pos2, l1, l2
FROM (
    SELECT tokens.raw_frequency,
        lemma1.lemma as "l1", lemma2.lemma as "l2",
        p1.pos as "pos1", p2.pos as "pos2"
    FROM
        -- n1/n2 carry the lemma ids of the two word forms.
        (SELECT a.raw_frequency,
            uni1.lemma as "n1", uni2.lemma as "n2"
        FROM
            -- Only the 10000 most frequent bigrams are joined and filtered.
            (SELECT bi.wordform_1, bi.wordform_2, bi.raw_frequency
            FROM cybercat.2grams as bi
            ORDER BY bi.raw_frequency DESC
            LIMIT 10000
            ) as a

        LEFT JOIN cybercat.unigrams uni1 ON uni1.id_unigram = a.wordform_1
        LEFT JOIN cybercat.unigrams uni2 ON uni2.id_unigram = a.wordform_2
        ) as tokens

    LEFT JOIN cybercat.lemmas lemma1 ON tokens.n1 = lemma1.id_lemmas
    LEFT JOIN cybercat.lemmas lemma2 ON tokens.n2 = lemma2.id_lemmas
    LEFT JOIN cybercat.pos p1 ON p1.id_pos = lemma1.id_pos
    LEFT JOIN cybercat.pos p2 ON p2.id_pos = lemma2.id_pos
) as matches

WHERE (matches.pos1 = 'VERB' AND matches.pos2 IN ('VERB', 'NOUN')) OR
    (matches.pos1 IN ('NOUN', 'ADJ') AND matches.pos2 = 'NOUN')
-- The order of the derived table is not guaranteed to survive the joins,
-- so sort explicitly before keeping the top 300 rows.
ORDER BY matches.raw_frequency DESC
LIMIT 300
```

### SQL statement for trigrams (tokens):
```
-- Most frequent token trigrams of the shape (verb|noun) + untagged word + noun.
-- A trigram is stored as a bigram id (first two tokens) plus a third token id.
-- The morph-tag patterns presumably map to: VerbForm -> verb, Animacy -> noun
-- (TODO confirm against the cybercat tag set).
SELECT * FROM
(SELECT b.raw_frequency,
    uni1.unigram as "n1", uni2.unigram as "n2", uni.unigram as "n3",
    uni1.morph as "n1_morph", uni2.morph as "n2_morph", uni.morph as "n3_morph"
FROM
    -- Only the 20000 most frequent trigrams are joined and filtered.
    (SELECT tri.bigram, tri.token, tri.raw_frequency
    FROM cybercat.3grams as tri
    ORDER BY tri.raw_frequency DESC
    LIMIT 20000) as b
LEFT JOIN cybercat.2grams bi on bi.id_bigram = b.bigram
LEFT JOIN cybercat.unigrams uni on uni.id_unigram = b.token
LEFT JOIN cybercat.unigrams uni1 on uni1.id_unigram = bi.wordform_1
LEFT JOIN cybercat.unigrams uni2 on uni2.id_unigram = bi.wordform_2) as matches

WHERE
    (matches.n3_morph LIKE "%Animacy%" AND
     -- The middle token must carry no morphological features ("_") and must
     -- not be one of the listed placeholders / punctuation marks; what is
     -- left is presumably the prepositions (TODO confirm).
     (matches.n2_morph = "_" AND
         (matches.n2 NOT IN ('NUM', '', ',', '…', ':', '(', ')', '"', '©'))) AND
        (matches.n1_morph LIKE "%VerbForm%" OR
         matches.n1_morph LIKE "%Animacy%"))
-- The order of the derived table is not guaranteed to survive the joins,
-- so sort explicitly before keeping the top 300 rows.
ORDER BY matches.raw_frequency DESC
LIMIT 300
```

### SQL statement for trigrams (lemmas):
```
-- Most frequent trigrams, resolved to lemmas and PoS tags, that match
-- V+Prep+N or N+Prep+N.
-- A trigram is stored as a bigram id (first two tokens) plus a third token id.
SELECT raw_frequency, pos1, pos2, pos3, l1, l2, l3
FROM (
    SELECT tokens.raw_frequency,
        lemma1.lemma as "l1", lemma2.lemma as "l2", lemma3.lemma as "l3",
        p1.pos as "pos1", p2.pos as "pos2", p3.pos as "pos3"
    FROM (
        -- n1/n2/n3 carry the lemma ids of the three word forms.
        SELECT b.raw_frequency,
            uni1.lemma as "n1", uni2.lemma as "n2", uni.lemma as "n3"
        FROM
            -- Only the 20000 most frequent trigrams are joined and filtered.
            (SELECT tri.bigram, tri.token, tri.raw_frequency
            FROM cybercat.3grams as tri
            ORDER BY tri.raw_frequency DESC
            LIMIT 20000
            ) as b

        LEFT JOIN cybercat.2grams bi on bi.id_bigram = b.bigram
        LEFT JOIN cybercat.unigrams uni on uni.id_unigram = b.token
        LEFT JOIN cybercat.unigrams uni1 on uni1.id_unigram = bi.wordform_1
        LEFT JOIN cybercat.unigrams uni2 on uni2.id_unigram = bi.wordform_2
    ) as tokens

    LEFT JOIN cybercat.lemmas lemma1 ON tokens.n1 = lemma1.id_lemmas
    LEFT JOIN cybercat.lemmas lemma2 ON tokens.n2 = lemma2.id_lemmas
    LEFT JOIN cybercat.lemmas lemma3 ON tokens.n3 = lemma3.id_lemmas
    LEFT JOIN cybercat.pos p1 ON p1.id_pos = lemma1.id_pos
    LEFT JOIN cybercat.pos p2 ON p2.id_pos = lemma2.id_pos
    LEFT JOIN cybercat.pos p3 ON p3.id_pos = lemma3.id_pos
) as matches

WHERE
    matches.pos1 IN ('NOUN', 'VERB') AND
    matches.pos2 = 'ADP' AND matches.pos3 = 'NOUN'
-- The order of the derived table is not guaranteed to survive the joins,
-- so sort explicitly before keeping the top 300 rows.
ORDER BY matches.raw_frequency DESC
LIMIT 300
```

### Generate incorrect collocations through the [CrossLexica](https://www.xl.gelbukh.com/) database
**Note:** The code below is the same code as `src/generate_wrong_collocations.py`


In [None]:
import codecs
import configparser
import csv
import os
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
from selenium import webdriver
from seleniumrequests import Firefox


# Set up the configuration.
# `__file__` only exists when this code runs as a script
# (src/generate_wrong_collocations.py); inside a notebook cell it raises
# NameError, so fall back to the current working directory there.
try:
    path_current_directory = os.path.dirname(__file__)
except NameError:
    path_current_directory = os.getcwd()

path_config_file = os.path.join(path_current_directory, '../',
                                'config.ini')
if not os.path.isfile(path_config_file):
    # The notebook sits one level above the script, so also accept a
    # config.ini located directly in the current directory.
    _local_config_file = os.path.join(path_current_directory, 'config.ini')
    if os.path.isfile(_local_config_file):
        path_config_file = _local_config_file

config = configparser.ConfigParser()
# ConfigParser.read() silently skips unreadable files and returns the list of
# files it parsed; fail with a clear message instead of a later
# ``KeyError: 'PATHS'``.
if not config.read(path_config_file):
    raise FileNotFoundError(
        'Could not read the configuration file: ' + path_config_file)
data_dir = config['PATHS']['data_dir']

# Headless Firefox session; `seleniumrequests.Firefox` adds the `.request()`
# method used by `get_synonyms`.
options = webdriver.FirefoxOptions()
options.add_argument('start-maximized')
options.add_argument('--headless')
# NOTE(review): the geckodriver location is machine-specific (Windows path),
# and newer Selenium releases replace `executable_path` with a Service
# object -- confirm against the pinned Selenium version.
browser = Firefox(options=options, executable_path='C:/Program Files/geckodriver')


def get_synonyms(query: str):
    """Look up the synonyms of ``query`` in the CrossLexica online dictionary.

    The word is POSTed to https://www.xl.gelbukh.com/ through the module-level
    ``browser`` and the returned HTML is scanned ``<li>`` by ``<li>``. The
    item whose anchor is named ``F_SYN`` opens the synonym section; the next
    item whose anchor carries a ``name`` attribute closes it.

    :param query: the word to look up.
    :return: alphabetically sorted list of the distinct synonyms (plain
        ``str``), excluding ``query`` itself; empty when none are found.
    """
    response = browser.request('POST', 'https://www.xl.gelbukh.com/', data={"query": query})
    soup = BeautifulSoup(response.text, 'html.parser')

    synonyms = set()
    in_synonym_section = False
    for li in soup.find_all('li'):
        anchor = li.find('a')
        if anchor is None:
            # Items without a link neither delimit a section nor hold a synonym.
            continue

        if anchor.has_attr('name'):
            # A named anchor is a section header: it either opens the synonym
            # section or ends whichever section was being read.
            in_synonym_section = anchor['name'] == "F_SYN"
            continue

        # `li.string` is only set when the item consists of a single text
        # node (reached through its anchor), so items with mixed content are
        # skipped.
        if in_synonym_section and li.string:
            synonym = anchor.string
            if synonym != query:
                # Store a plain str: a NavigableString keeps a reference to
                # the whole parse tree, which would stay alive for as long as
                # the caller caches the result.
                synonyms.add(str(synonym))

    # Sorted so that repeated runs emit the synonyms in the same order.
    return sorted(synonyms)

def _write_wrong_collocations(ngrams, out_path, size, swap_positions, cache):
    """Write every single-lemma synonym substitution of ``ngrams`` as CSV.

    The output columns are ``raw_frequency``, ``pos1..posN``, ``l1..lN`` (the
    original lemmas) and ``syn1..synN`` (the lemmas with exactly one of them
    replaced by a synonym), where ``N`` is ``size``.

    :param ngrams: DataFrame with the columns ``raw_frequency``,
        ``pos1..posN`` and ``l1..lN``.
    :param out_path: path of the CSV file to create (overwritten if present).
    :param size: number of lemmas per n-gram (``N``).
    :param swap_positions: zero-based positions of the lemmas to substitute.
    :param cache: dict mapping a lemma to its synonym list; lemmas missing
        from it are looked up with ``get_synonyms`` and added.
    """
    numbers = range(1, size + 1)
    pos_cols = ['pos{}'.format(i) for i in numbers]
    lemma_cols = ['l{}'.format(i) for i in numbers]
    syn_cols = ['syn{}'.format(i) for i in numbers]

    # newline='' lets the csv module control line endings, as its
    # documentation requires.
    with open(str(out_path), 'w', encoding='utf-8', newline='') as out_file:
        # csv.writer quotes fields containing commas, quotes or newlines,
        # which a plain ",".join() would silently corrupt.
        writer = csv.writer(out_file, lineterminator='\n')
        writer.writerow(['raw_frequency'] + pos_cols + lemma_cols + syn_cols)

        for _, row in ngrams.iterrows():
            lemmas = [row[col] for col in lemma_cols]

            # Resolve all synonyms of the row before writing any of its lines.
            for position in swap_positions:
                lemma = lemmas[position]
                if lemma not in cache:
                    cache[lemma] = get_synonyms(lemma)

            prefix = ([str(row['raw_frequency'])]
                      + [row[col] for col in pos_cols]
                      + lemmas)
            for position in swap_positions:
                for synonym in cache[lemmas[position]]:
                    variant = list(lemmas)
                    variant[position] = synonym
                    writer.writerow(prefix + variant)


# Lemma -> synonyms cache shared by both runs, so that each lemma is queried
# from CrossLexica at most once.
synonyms = {}

# Bigrams: substitute the first and the second lemma in turn.
bigrams = pd.read_csv(str(Path(data_dir, 'bigram_lemma_raw.csv')), encoding='utf8')
_write_wrong_collocations(bigrams, Path(data_dir, 'wrong_bigrams.txt'),
                          size=2, swap_positions=(0, 1), cache=synonyms)

# Trigrams: substitute the first and the third lemma; the middle lemma is
# kept as it is.
trigrams = pd.read_csv(str(Path(data_dir, 'trigram_lemma_raw.csv')), encoding='utf8')
_write_wrong_collocations(trigrams, Path(data_dir, 'wrong_trigrams.txt'),
                          size=3, swap_positions=(0, 2), cache=synonyms)

# NOTE(review): the Firefox session created during setup is never closed;
# consider quitting it once both files are written.
