In [2]:
import requests, re, nltk, stopwordsiso, sklearn
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# English
# UD corpus
# Read the English corpus as one string.
# The encoding is given explicitly so decoding does not depend on the platform
# default (assumes the corpus file is UTF-8 — confirm). Newlines become spaces
# so the last word of one line is not fused with the first word of the next.
with open('en_partut-ud-train.txt', encoding='utf-8') as eng_f:
    eng_text = eng_f.read().replace('\n', ' ')
    
# Preprocessing
def preprocess_english_text(text):
    """Turn raw English text into a list of stemmed, stopword-free tokens.

    Steps, in order: replace every character outside ``a-zA-Z`` with a space,
    lowercase, tokenize with ``nltk.word_tokenize``, drop NLTK's English
    stopwords, and stem each remaining token with the Porter stemmer.

    Args:
        text: Raw text as a single string.

    Returns:
        list: The stemmed tokens, in their original order.
    """
    # Remove non-alphabetic characters
    letters_only = re.sub("[^a-zA-Z]", " ", text)

    # Convert all characters to lowercase, then tokenize
    tokens = nltk.word_tokenize(letters_only.lower())

    # A set gives constant-time membership tests instead of scanning the list
    # once per token; the local name also avoids shadowing the imported
    # `stopwords` module.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Stemming; nltk.stem.WordNetLemmatizer could be used here instead.
    stemmer = nltk.stem.PorterStemmer()
    return [stemmer.stem(token) for token in tokens]

In [None]:
# Russian
# UD corpus
# Read the Russian corpus as one string.
# The encoding is given explicitly so Cyrillic decoding does not depend on the
# platform default (assumes the corpus file is UTF-8 — confirm). Newlines
# become spaces so words on adjacent lines are not fused together.
with open('ru_pud-ud-test.txt', encoding='utf-8') as rus_f:
    rus_text = rus_f.read().replace('\n', ' ')
    
# Preprocessing
def preprocess_russian_text(text):
    """Turn raw Russian text into a list of stemmed, stopword-free tokens.

    Steps, in order: replace every non-Cyrillic character with a space,
    lowercase, tokenize with ``nltk.word_tokenize``, drop NLTK's Russian
    stopwords, and stem each remaining token with the Russian Snowball stemmer.

    Args:
        text: Raw text as a single string.

    Returns:
        list: The stemmed tokens, in their original order.
    """
    # Keep Cyrillic letters only. The previous pattern "[^a-zA-Z]" kept Latin
    # letters and therefore erased every Russian word.
    cyrillic_only = re.sub(r"[^а-яёА-ЯЁ]", " ", text)

    # Convert all characters to lowercase, then tokenize
    tokens = nltk.word_tokenize(cyrillic_only.lower())

    # Remove stopwords (set lookup instead of a per-token list scan)
    stop_words = set(nltk.corpus.stopwords.words('russian'))
    tokens = [token for token in tokens if token not in stop_words]

    # PorterStemmer implements English suffix rules; Snowball has a Russian one.
    stemmer = nltk.stem.SnowballStemmer('russian')
    return [stemmer.stem(token) for token in tokens]

In [None]:
# Arabic
# UD corpus
# Read the Arabic corpus as one string.
# The encoding is given explicitly so Arabic decoding does not depend on the
# platform default (assumes the corpus file is UTF-8 — confirm). Newlines
# become spaces so words on adjacent lines are not fused together.
with open('ar_pud-ud-test.txt', encoding='utf-8') as ar_f:
    ar_text = ar_f.read().replace('\n', ' ')
    
# Preprocessing
def preprocess_arabic_text(text):
    """Turn raw Arabic text into a list of stemmed, stopword-free tokens.

    Steps, in order: delete diacritics and tatweel, replace every character
    that is not an Arabic letter with a space, tokenize with
    ``nltk.word_tokenize``, drop NLTK's Arabic stopwords, and stem each
    remaining token with the Arabic Snowball stemmer.

    Args:
        text: Raw text as a single string.

    Returns:
        list: The stemmed tokens, in their original order.
    """
    # Delete (not space-replace) tatweel U+0640 and the diacritics
    # U+064B-U+0652 so that a decorated word stays in one piece.
    text = re.sub(r"[\u0640\u064B-\u0652]", "", text)

    # Keep Arabic letters only (U+0621-U+063A and U+0641-U+064A). The previous
    # pattern "[^a-zA-Z]" kept Latin letters and erased every Arabic word.
    # No lowercasing step: the Arabic script has no letter case.
    text = re.sub(r"[^\u0621-\u063A\u0641-\u064A]", " ", text)

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Remove stopwords (set lookup instead of a per-token list scan)
    stop_words = set(nltk.corpus.stopwords.words('arabic'))
    tokens = [token for token in tokens if token not in stop_words]

    # PorterStemmer implements English suffix rules; Snowball has an Arabic one.
    stemmer = nltk.stem.SnowballStemmer('arabic')
    return [stemmer.stem(token) for token in tokens]

In [None]:
# Mandarin Chinese
# UD corpus
# Read the Chinese corpus as one string.
# The encoding is given explicitly so decoding does not depend on the platform
# default (assumes the corpus file is UTF-8 — confirm). Newlines are removed
# without inserting a space, as before.
with open('zh_pud-ud-test.txt', encoding='utf-8') as zh_f:
    zh_text = zh_f.read().replace('\n', '')
    
# Preprocessing
def preprocess_chinese_text(text):
    """Segment Chinese text with jieba and return the words joined by ``|``.

    Args:
        text: Raw text as a single string.

    Returns:
        str: The segments produced by ``jieba.cut`` (accurate mode, HMM
        enabled), joined with ``"|"``. Unlike the other ``preprocess_*``
        functions this returns one string, not a list, and applies no
        character filtering or stopword removal.
    """
    # jieba is not imported in the notebook's import cell, so it is imported
    # here to keep the function from failing with a NameError.
    import jieba

    # Segment the `text` argument. The previous code ignored its parameter and
    # always read the global `zh_text`.
    return "|".join(jieba.cut(text, cut_all=False, HMM=True))

In [None]:
# Count bigrams in the segmented Chinese text.
# NOTE(review): this cell originally read a `tokens` variable that was never
# defined at notebook scope. The Chinese output is assumed here only because
# this cell directly follows the Chinese preprocessing cell — confirm.
vectorizer = CountVectorizer(ngram_range=(2, 2))

tokens = preprocess_chinese_text(zh_text)

# fit_transform expects an iterable of documents, so the single "|"-joined
# string is wrapped in a list.
# NOTE(review): CountVectorizer's default token_pattern keeps only tokens of
# two or more word characters, so single-character words are dropped.
X = vectorizer.fit_transform([tokens])

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the replacement and returns a NumPy array of feature names.
feature_names = vectorizer.get_feature_names_out()


In [None]:
# German
# UD corpus
# Read the German corpus as one string.
# The encoding is given explicitly so umlauts and ß decode the same on every
# platform (assumes the corpus file is UTF-8 — confirm). Newlines become
# spaces so words on adjacent lines are not fused together.
with open('de_pud-ud-test.txt', encoding='utf-8') as de_f:
    de_text = de_f.read().replace('\n', ' ')
    
# Preprocessing
def preprocess_german_text(text):
    """Turn raw German text into a list of stemmed, stopword-free tokens.

    Steps, in order: replace every character that is not a German letter
    (``a-z``, ``A-Z``, umlauts, ``ß``) with a space, lowercase, tokenize with
    ``nltk.word_tokenize``, drop NLTK's German stopwords, and stem each
    remaining token with the German Snowball stemmer.

    Args:
        text: Raw text as a single string.

    Returns:
        list: The stemmed tokens, in their original order.
    """
    # Keep umlauts and ß as letters. The previous pattern "[^a-zA-Z]" replaced
    # them with spaces and split words such as "Straße" in two.
    letters_only = re.sub(r"[^a-zA-ZäöüÄÖÜß]", " ", text)

    # Convert all characters to lowercase, then tokenize
    tokens = nltk.word_tokenize(letters_only.lower())

    # Remove stopwords (set lookup instead of a per-token list scan)
    stop_words = set(nltk.corpus.stopwords.words('german'))
    tokens = [token for token in tokens if token not in stop_words]

    # PorterStemmer implements English suffix rules; Snowball has a German one.
    stemmer = nltk.stem.SnowballStemmer('german')
    return [stemmer.stem(token) for token in tokens]

In [None]:
# Japanese
# UD corpus
# Read the Japanese corpus as one string.
# The encoding is given explicitly so decoding does not depend on the platform
# default (assumes the corpus file is UTF-8 — confirm). Newlines are removed
# without inserting a space, as before.
with open('ja_pud-ud-test.txt', encoding='utf-8') as ja_f:
    ja_text = ja_f.read().replace('\n', '')
    
# Preprocessing
def preprocess_japanese_text(text):
    """Turn raw Japanese text into a list of stopword-free tokens.

    Steps, in order: replace every character that is not hiragana, katakana
    or kanji with a space, tokenize with ``nltk.word_tokenize``, and drop the
    Japanese stopwords provided by ``stopwordsiso``.

    NOTE(review): Japanese is written without spaces and no Japanese word
    segmenter is imported in this notebook, so the tokens are the runs of
    Japanese characters between punctuation or whitespace, not individual
    words. A morphological analyzer is needed for real segmentation.

    Args:
        text: Raw text as a single string.

    Returns:
        list: The remaining tokens, in their original order.
    """
    # Keep hiragana (U+3041-U+3096), katakana (U+30A1-U+30FA plus the long
    # vowel mark U+30FC), the iteration mark U+3005 and kanji
    # (U+4E00-U+9FFF). The previous pattern "[^a-zA-Z]" kept Latin letters and
    # erased every Japanese character.
    # No lowercasing step: these scripts have no letter case.
    text = re.sub(r"[^\u3041-\u3096\u30A1-\u30FA\u30FC\u3005\u4E00-\u9FFF]", " ", text)

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # NLTK's stopwords corpus has no 'japanese' list, so the previous lookup
    # raised; stopwordsiso (already imported by this notebook) provides 'ja'.
    stop_words = set(stopwordsiso.stopwords('ja'))

    # No stemming: the Porter stemmer used before implements English suffix
    # rules and leaves Japanese tokens unchanged.
    return [token for token in tokens if token not in stop_words]

In [None]:
# Hindi
# UD corpus
# Read the Hindi corpus as one string.
# The encoding is given explicitly so Devanagari decoding does not depend on
# the platform default (assumes the corpus file is UTF-8 — confirm). Newlines
# become spaces so words on adjacent lines are not fused together.
with open('hi_pud-ud-test.txt', encoding='utf-8') as hi_f:
    hi_text = hi_f.read().replace('\n', ' ')
    
# Preprocessing
def preprocess_hindi_text(text):
    """Turn raw Hindi text into a list of stopword-free tokens.

    Steps, in order: replace every character that is not a Devanagari letter
    or sign with a space, tokenize with ``nltk.word_tokenize``, and drop the
    Hindi stopwords provided by ``stopwordsiso``.

    Args:
        text: Raw text as a single string.

    Returns:
        list: The remaining tokens, in their original order.
    """
    # Keep Devanagari letters and vowel signs (U+0900-U+0963, U+0971-U+097F).
    # The danda marks, Devanagari digits and abbreviation sign
    # (U+0964-U+0970) are treated as separators. The previous pattern
    # "[^a-zA-Z]" kept Latin letters and erased every Hindi word.
    # No lowercasing step: Devanagari has no letter case.
    text = re.sub(r"[^\u0900-\u0963\u0971-\u097F]", " ", text)

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwordsiso.stopwords('hi'))

    # No stemming: the Porter stemmer used before implements English suffix
    # rules and leaves Devanagari tokens unchanged.
    return [token for token in tokens if token not in stop_words]