In [1]:
!pip install wikipedia
import wikipedia

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11680 sha256=76899f472f269fb9ace54733821b9276160eb41bfd4dda53f8b060815dff49b1
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [2]:
wikipedia.set_lang("en")
article = wikipedia.page("Artificial intelligence")
text = article.content

In [3]:
# Clean article, I keep some special characters, as they might be useful for DeepL translations,
# and will be removed when tokenizing the input
import re
text = re.sub(r"=.*=", "", text)
text = text.replace('\n', '')
text = text.replace('\\', '')
text = re.sub(r"(\.)([A-Z])", r"\1 \2", text)

In [4]:
text

'Artificial intelligence (AI) is intelligence—perceiving, synthesizing, and inferring information—demonstrated by machines, as opposed to intelligence displayed by humans or by other animals. Example tasks in which this is done include speech recognition, computer vision, translation between (natural) languages, as well as other mappings of inputs. AI applications include advanced web search engines (e.g., Google Search), recommendation systems (used by YouTube, Amazon, and Netflix), understanding human speech (such as Siri and Alexa), self-driving cars (e.g., Waymo), generative or creative tools (ChatGPT and AI art), automated decision-making, and competing at the highest level in strategic game systems (such as chess and Go). As machines become increasingly capable, tasks considered to require "intelligence" are often removed from the definition of AI, a phenomenon known as the AI effect. For instance, optical character recognition is frequently excluded from things considered to be 

In [None]:
# nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [9]:
# use method from nltk to split text into sentences
lines = sent_tokenize(text)

In [10]:
lines

['Artificial intelligence (AI) is intelligence—perceiving, synthesizing, and inferring information—demonstrated by machines, as opposed to intelligence displayed by humans or by other animals.',
 'Example tasks in which this is done include speech recognition, computer vision, translation between (natural) languages, as well as other mappings of inputs.',
 'AI applications include advanced web search engines (e.g., Google Search), recommendation systems (used by YouTube, Amazon, and Netflix), understanding human speech (such as Siri and Alexa), self-driving cars (e.g., Waymo), generative or creative tools (ChatGPT and AI art), automated decision-making, and competing at the highest level in strategic game systems (such as chess and Go).',
 'As machines become increasingly capable, tasks considered to require "intelligence" are often removed from the definition of AI, a phenomenon known as the AI effect.',
 'For instance, optical character recognition is frequently excluded from things 

In [11]:
KEY = "1f3e5a91-c43f-c224-5960-52e4f7763cc9:fx"

In [12]:
!pip install deepl
import deepl 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting deepl
  Downloading deepl-1.14.0-py3-none-any.whl (39 kB)
Installing collected packages: deepl
Successfully installed deepl-1.14.0


In [13]:
# translate article to polish using DeepL
translator = deepl.Translator(KEY) 
translations = []
for line in lines:
  result = translator.translate_text(line, source_lang="EN", target_lang="PL") 
  translated_text = result.text
  translations.append(translated_text)

In [30]:
# create DataFrame
import pandas as pd
df = pd.DataFrame({"english": lines,
                   "polish": translations})
df.head()

Unnamed: 0,english,polish
0,Artificial intelligence (AI) is intelligence—p...,Sztuczna inteligencja (AI) to inteligencja - p...
1,Example tasks in which this is done include sp...,"Przykładowe zadania, w których jest to wykonyw..."
2,AI applications include advanced web search en...,Zastosowania sztucznej inteligencji obejmują z...
3,"As machines become increasingly capable, tasks...",W miarę jak maszyny stają się coraz bardziej w...
4,"For instance, optical character recognition is...",Na przykład optyczne rozpoznawanie znaków jest...


In [31]:
# while looking at the data, I noticed rows 350, 352 and 353 do not make sense in terms of translation
df.loc[[350, 352, 353], :]

Unnamed: 0,english,polish
350,"Thomason, Richmond.","Thomason, Richmond."
352,"In Zalta, Edward N.","W Zalta, Edward N."
353,(ed.).,(red.).


In [32]:
df.drop([350, 352, 353], axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)
df.tail(10)

Unnamed: 0,english,polish
344,Several works use AI to force us to confront t...,Kilka dzieł wykorzystuje sztuczną inteligencję...
345,"This appears in Karel Čapek's R. U. R., the fi...","Pojawia się to w R. U. R. Karela Čapka, filmac..."
346,"Artificial Intelligence and Ex Machina, as wel...","Sztuczna inteligencja i Ex Machina, a także po..."
347,Dick considers the idea that our understanding...,"Dick rozważa pomysł, że nasze rozumienie ludzk..."
348,AI safety – Research area on making AI safe an...,Bezpieczeństwo AI - obszar badań nad uczynieni...
349,Internet Encyclopedia of Philosophy.,Internetowa encyklopedia filozofii.
350,"""Logic and Artificial Intelligence"".","""Logika i sztuczna inteligencja""."
351,Stanford Encyclopedia of Philosophy.,Stanford Encyclopedia of Philosophy.
352,Artificial Intelligence.,Sztuczna inteligencja.
353,"BBC Radio 4 discussion with John Agar, Alison ...","Dyskusja w BBC Radio 4 z udziałem Johna Agara,..."


In [33]:
# save to csv
df.to_csv("wikipedia_translation.csv")