Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support Kurdish Wiktionary (ku) #19

Merged
merged 29 commits into from Dec 1, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
511800b
Initial commit for Kurdish Wiktionary support
pamputt Aug 11, 2021
5cec873
SUMMARY in Kurdish language
pamputt Aug 12, 2021
5271e05
Fix comments
pamputt Aug 12, 2021
a9a5134
Continue Kurdish Wiktionary support
pamputt Aug 20, 2021
26dbcf1
Working prototype
pamputt Oct 14, 2021
2706cd9
Remove unneeded extra blank lines
pamputt Oct 16, 2021
7171b27
Use learning place instead of place of residence if available
pamputt Oct 16, 2021
32bd1a6
Initial commit for Kurdish Wiktionary support
pamputt Aug 11, 2021
bfef038
SUMMARY in Kurdish language
pamputt Aug 12, 2021
701b21e
Fix comments
pamputt Aug 12, 2021
65cf074
Continue Kurdish Wiktionary support
pamputt Aug 20, 2021
537746d
Working prototype
pamputt Oct 14, 2021
9c08ec3
Remove unneeded extra blank lines
pamputt Oct 16, 2021
973159e
Use learning place instead of place of residence if available
pamputt Oct 16, 2021
35b4e3d
Add KuWiktionary to the other other Wiki projects
pamputt Oct 16, 2021
cdeb254
SUMMARY in Kurdish language
pamputt Aug 12, 2021
6fd2c90
Fix comments
pamputt Aug 12, 2021
8d1d9df
Add KuWiktionary with all other Wikimedia projects
pamputt Oct 16, 2021
8576683
Working prototype
pamputt Oct 14, 2021
3894b2e
Remove unneeded extra blank lines
pamputt Oct 16, 2021
3cc80ad
Use place of learning instead of place of residence if available
pamputt Oct 16, 2021
54549c8
Merge branch 'kuwiktionary' of https://github.com/lingua-libre/Lingua…
pamputt Oct 16, 2021
b46ab30
Support of Kurdish Wiktionary
pamputt Oct 16, 2021
9f32087
Handle Java TimeoutException
pamputt Oct 17, 2021
9dc51c7
Do not display the country if this is a Kurdish language section
pamputt Oct 17, 2021
413277d
Remove one extra line before the pronunciation section
pamputt Oct 17, 2021
1575170
Merge branch 'master' into kuwiktionary
pamputt Oct 17, 2021
e3c1759
Add comments to explain why two location maps
pamputt Oct 17, 2021
4dfa041
Merge branch 'kuwiktionary' of https://github.com/lingua-libre/Lingua…
pamputt Oct 17, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Expand Up @@ -7,6 +7,7 @@ Lingua Libre Bot is able to contribute on the following Wikimedia projects:
* Wikidata
* Wikidata Lexemes
* French Wiktionary
* Kurdish Wiktionary
* Occitan Wiktionary
* Shawiya Wiktionary

Expand Down
2 changes: 2 additions & 0 deletions llbot.py
Expand Up @@ -13,6 +13,7 @@

from wikis.wikidata.wikidata import Wikidata
from wikis.wiktionaries.frwiktionary import FrWiktionary
from wikis.wiktionaries.kuwiktionary import KuWiktionary
from wikis.wiktionaries.ocwiktionary import OcWiktionary
from wikis.wiktionaries.shywiktionary import ShyWiktionary
from wikis.wikidata.lexemes import Lexemes
Expand All @@ -32,6 +33,7 @@ def main():
"wikidatawiki": Wikidata(user, password),
"lexemes": Lexemes(user, password),
"frwiktionary": FrWiktionary(user, password),
"kuwiktionary": KuWiktionary(user, password),
"ocwiktionary": OcWiktionary(user, password),
"shywiktionary": ShyWiktionary(user, password),
}
Expand Down
6 changes: 3 additions & 3 deletions sparql.py
Expand Up @@ -96,7 +96,7 @@ def request(self, query):
error = error[pos1:pos2].strip()
print(f"MalformedQueryException: {error}")
return ""

''' TimeoutException
java.util.concurrent.TimeoutException
at java.util.concurrent.FutureTask.get(FutureTask.java:205)
Expand All @@ -111,8 +111,8 @@ def request(self, query):
pos2 = response.text.find("\n",pos1)
error = error[pos1:pos2].strip()
print(f"TimeoutException: {error}")
return ""
return ""

return json.loads(response.text)["results"]["bindings"]

def format_value(self, sparql_result, key):
Expand Down
283 changes: 283 additions & 0 deletions wikis/wiktionaries/kuwiktionary.py
@@ -0,0 +1,283 @@
#!/usr/bin/python3.8
# -*- coding: utf-8 -*-
# Author: Pamputt
# Date: 28 September 2021
# License: GNU GPL v2+

# NOTE:
# python3 llbot.py --wiki kuwiktionary --dryrun simple --langwm ku --item Q379244
# Page for testing the addition of a « pron » section: porbirr (Q372968)
# Page that already contains a « pron » section: gûz (Q379244)

import re
import wikitextparser as wtp

from sparql import Sparql
from wikis.wiktionary import Wiktionary

# SPARQL endpoint handed to Sparql() in KuWiktionary.prepare
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
# Summary string passed to the parent Wiktionary constructor
SUMMARY = "Dengê bilêvkirinê ji Lingua Libre lê hat zêdekirin"

# Wikitext inserted when an entry has no pronunciation section yet.
# Do not remove the $1, it is used to force the section to have a content;
# append_file strips the "$1\n" marker again once the audio line is in place.
EMPTY_PRONUNCIATION_SECTION = "=== Bilêvkirin ===\n$1"
# Audio template line; append_file substitutes
# $1 -> file name, $2 -> language code, $3 -> location label.
PRONUNCIATION_LINE = "\n* {{deng|$2|$1|Deng|dever=$3}}\n"

# Maps every item having a P305 statement to its code
# (prepare stores the result as the Qid -> BCP 47 language code map)
LANGUAGE_QUERY = "SELECT ?item ?code WHERE { ?item wdt:P305 ?code. }"
# Fetches the label of each requested location and of its P17 value (?country),
# in Kurdish with English as fallback. prepare replaces $1 with the location
# Qids joined by " wd:".
LOCATION_QUERY = """
SELECT ?location ?locationLabel ?countryLabel
WHERE {
?location wdt:P17 ?country.
SERVICE wikibase:label { bd:serviceParam wikibase:language "ku,en" . }
VALUES ?location { wd:$1 }
}
"""


class KuWiktionary(Wiktionary):

def __init__(self, user, password):
    """
    Initialise the Kurdish Wiktionary bot.

    Forwards the credentials to the parent class together with the
    "ku" code and the edit summary defined in SUMMARY.

    Parameters
    ----------
    user
        Username used to log in to the wiki.
    password
        Password of that account.
    """
    super().__init__(
        user,
        password,
        "ku",
        SUMMARY,
    )

"""
Public methods
"""

# Prepare the records to be added on the Kurdish Wiktionary:
# - Fetch the needed language code map (Qid -> BCP 47, used by kuwiktionary)
# - Get the labels of the speaker's location in Kurdish
def prepare(self, records):
    """
    Build the lookup tables needed before the records are processed.

    Three attributes are (re)initialised on the instance:

    - ``language_code_map``: language Qid -> code returned by
      LANGUAGE_QUERY (the BCP 47 code used by kuwiktionary)
    - ``location_map``: location Qid -> location label only
      (used for the Kurdish language)
    - ``location_map_with_country``: location Qid -> country label,
      followed by the location label in parentheses when the two
      labels differ (used for every other language)

    Parameters
    ----------
    records
        Iterable of record dicts. Only record["language"]["learning"]
        and record["speaker"]["residence"] are read here.

    Returns
    -------
    The records that were passed in, unchanged.
    """
    sparql = Sparql(SPARQL_ENDPOINT)

    # Get BCP 47 language code map
    self.language_code_map = {}
    for row in sparql.request(LANGUAGE_QUERY):
        qid = sparql.format_value(row, "item")
        self.language_code_map[qid] = sparql.format_value(row, "code")

    # Extract all different locations (place of learning and residence)
    locations = set()
    for record in records:
        candidates = (
            record["language"]["learning"],
            record["speaker"]["residence"],
        )
        locations.update(
            place for place in candidates if place is not None
        )

    # Prepare two location maps
    # One that contains both the city and the country
    # (for all languages but Kurdish)
    # One that contains only the city (only for the Kurdish language)
    self.location_map = {}
    self.location_map_with_country = {}

    # Nothing to look up: skip the request instead of sending a query
    # whose VALUES clause contains no real Qid
    if not locations:
        return records

    location_rows = sparql.request(
        LOCATION_QUERY.replace("$1", " wd:".join(locations))
    )
    for row in location_rows:
        qid = sparql.format_value(row, "location")
        country = sparql.format_value(row, "countryLabel")
        location = sparql.format_value(row, "locationLabel")

        self.location_map[qid] = location
        if country != location:
            self.location_map_with_country[qid] = (
                country + " (" + location + ")"
            )
        else:
            self.location_map_with_country[qid] = country

    return records

# Try to use the given record on the Kurdish Wiktionary
def execute(self, record):
    """
    Try to add the given record to its entry on the Kurdish Wiktionary.

    Parameters
    ----------
    record
        Record dict. The keys read here are "id", "transcription",
        "file", record["language"]["qid"],
        record["language"]["learning"] and
        record["speaker"]["residence"].

    Returns
    -------
    False when the entry has no wikicode, already contains the file,
    or has no section for the record's language; otherwise the value
    returned by ``do_edit`` (after a retry, the value of that retry).

    Raises
    ------
    Exception
        Any exception raised by ``do_edit`` whose message does not
        contain "editconflict" is propagated unchanged.
    """
    transcription = record["transcription"]
    language_qid = record["language"]["qid"]

    # Fetch the content of the page having the transcription for title
    is_already_present, wikicode, basetimestamp = self.get_entry(
        transcription, record["file"]
    )

    # Whether there is no entry for this record on kuwiktionary
    if not wikicode:
        return False

    # Whether the record is already inside the entry
    if is_already_present:
        print(record["id"] + "//" + transcription + ": already on kuwiktionary")
        return False

    # Try to extract the section of the language of the record
    language_section = self.get_language_section(wikicode, language_qid)

    # Whether there is no section for the current language
    if language_section is None:
        print(record["id"] + "//" + transcription + ": language section not found")
        return False

    # Try to extract the pronunciation subsection,
    # create it if it doesn't exist
    pronunciation_section = self.get_pronunciation_section(language_section)
    if pronunciation_section is None:
        pronunciation_section = self.create_pronunciation_section(language_section)

    # Choose the location to be displayed with the following order
    # 1) place of learning
    # 2) place of residence
    location = record["language"]["learning"] or record["speaker"]["residence"]

    # Add the pronunciation file to the pronunciation subsection
    self.append_file(
        pronunciation_section,
        record["file"],
        language_qid,
        location,
    )

    # Save the result
    try:
        result = self.do_edit(transcription, wikicode, basetimestamp)
    except Exception as e:
        # If we got an editconflict, restart from the beginning and hand
        # back the outcome of that retry. Without the return, control
        # would reach the code below with `result` never assigned.
        if "editconflict" in str(e):
            return self.execute(record)
        raise

    if result:
        print(
            record["id"] + "//" + transcription
            + ": added to kuwiktionary - https://ku.wiktionary.org/wiki/"
            + transcription
        )

    return result

"""
Private methods
"""

# Try to extract the language section
def get_language_section(self, wikicode, language_qid):
    """
    Find the section of the entry dedicated to the record's language.

    A section matches when its title, with spaces removed and compared
    case-insensitively, equals ``{{ziman|<code>}}`` where ``<code>`` is
    the entry of ``self.language_code_map`` for ``language_qid``.

    Parameters
    ----------
    wikicode
        Parsed page exposing a ``sections`` sequence whose items have a
        ``title`` attribute (None for untitled sections).
    language_qid
        Qid of the record's language.

    Returns
    -------
    The first matching section, or None when the language has no known
    code or the page has no section for it.
    """
    # Check if the record's language has a BCP 47 code, stop here if not
    lang = self.language_code_map.get(language_qid)
    if lang is None:
        return None

    # The title is lowercased before the comparison, so the code must be
    # lowercased too; otherwise codes such as "pt-BR" could never match
    expected_title = "{{ziman|" + lang.lower() + "}}"

    # Travel across each sections titles to find the one we want
    for section in wikicode.sections:
        title = section.title
        if title is None:
            continue

        if title.replace(" ", "").lower() == expected_title:
            return section

    # If we arrive here, it means that there is no section for
    # the record's language
    return None

# Try to extract the pronunciation subsection
def get_pronunciation_section(self, wikicode):
    """
    Return the pronunciation subsection of the given wikicode.

    The first section whose title, once spaces are removed and the text
    is lowercased, equals "bilêvkirin" is returned. Untitled sections
    are ignored. None is returned when no section matches.
    """
    matching = (
        candidate
        for candidate in wikicode.sections
        if candidate.title is not None
        and candidate.title.replace(" ", "").lower() == "bilêvkirin"
    )
    return next(matching, None)

# Create a pronunciation subsection
def create_pronunciation_section(self, wikicode):
# Insert the EMPTY_PRONUNCIATION_SECTION text into the given language
# section and return the freshly created pronunciation section, as found
# by get_pronunciation_section (None if it still cannot be found).
#
# wikicode is the language section returned by get_language_section;
# it must expose `sections`, each item having `title` and `contents`.
#
# The pronunciation section is the first one of the language section
# It comes just after "=={{ziman|qqq}}=="
lang_section = wikicode.sections[0]
for section in wikicode.sections:
if section.title is None:
continue

# Search for the language section
if re.search(r'\{\{ziman\|[a-z]+\}\}', section.title.replace(" ", "")):
break

# NOTE(review): indentation is not visible in this copy of the file, so it
# is unclear whether the assignment below runs on every non-matching
# iteration or once after the loop — confirm against the repository.
lang_section = section

# Add a new line before the pronunciation section only
# if there is no other section
# (i.e. parsing the contents of wikicode.sections[1] yields fewer than
# two sections)
section_content = wtp.parse(wikicode.sections[1].contents)
new_section = EMPTY_PRONUNCIATION_SECTION
if len(section_content.sections) < 2:
new_section = new_section.replace("=== Bilêvkirin","\n=== Bilêvkirin")

# Append an empty pronunciation section just after the language section:
# safe_append_text inserts it before the first "===" found in the
# contents, or at the very end when there is none
pattern = r"==="
lang_section.contents = self.safe_append_text(
lang_section.contents, new_section, pattern
)

return self.get_pronunciation_section(wikicode)

# Add the audio template to the pronunciation section
def append_file(self, wikicode, filename, language_qid, location_qid):
# Insert a PRONUNCIATION_LINE for `filename` into the pronunciation
# section `wikicode`, editing wikicode.sections[1].contents in place.
# Nothing is returned.
#
# wikicode     -- pronunciation section (exposes `sections`)
# filename     -- replaces $1 in PRONUNCIATION_LINE
# language_qid -- its entry in self.language_code_map replaces $2
#                 (a KeyError is raised if the Qid is not in the map)
# location_qid -- looked up in the location maps to replace $3;
#                 an unknown Qid yields an empty location
section_content = wtp.parse(wikicode.sections[1].contents)

# Kurdish recordings show the location label only, every other language
# shows the label that also carries the country
location = ""
if (language_qid == "Q36368" and # Kurdish language on Wikidata
location_qid in self.location_map):
location = self.location_map[location_qid]

if (language_qid != "Q36368" and
location_qid in self.location_map_with_country):
location = self.location_map_with_country[location_qid]

# The replacements are applied in the order $1, $2, $3
pronunciation_line = PRONUNCIATION_LINE.replace("$1", filename).replace("$2", self.language_code_map[
language_qid]).replace("$3", location)
# Add new lines if there are sections after
if len(section_content.sections) > 1:
pronunciation_line += "\n\n"

# Insert the line before the first "===" of the leading part of the
# section, or at its end when it contains no "==="
pattern = r"==="
section_content.sections[0].contents = self.safe_append_text(
section_content.sections[0].contents,
pronunciation_line,
pattern
)

wikicode.sections[1].contents = str(section_content)

# Remove the ugly hack: the "$1" placeholder that
# EMPTY_PRONUNCIATION_SECTION carries to force the section to have content
wikicode.sections[1].contents = wikicode.sections[1].contents.replace(
"$1\n", ""
)

# Remove unneeded blank lines
# NOTE(review): this deletes every pair of consecutive newlines outright
# rather than collapsing it to a single newline — confirm that two lines
# separated by exactly one blank line are not meant to stay on separate
# lines.
wikicode.sections[1].contents = wikicode.sections[1].contents.replace(
"\n\n", ""
)

# Append a string to a wikitext string, just after the language section
# (before any section)
def safe_append_text(self, content, text, pattern):
    """
    Insert ``text`` into ``content`` just before the first match of
    ``pattern``.

    ``content`` is converted to ``str`` first. When the regular
    expression ``pattern`` does not match anywhere, ``text`` is added
    at the end instead. The resulting string is returned.
    """
    haystack = str(content)

    first_match = re.search(pattern, haystack)
    cut = first_match.start() if first_match else len(haystack)

    head, tail = haystack[:cut], haystack[cut:]
    return head + text + tail