In [1]:
# Import libraries
import os
import re
import nltk
import pandas as pd

**NOTE:** If this is your first time using the `nltk` library, uncomment the following cell to download the necessary modules. For this notebook, you will need to download the *stopwords* folder.

In [2]:
# nltk.download()

Link to [dataset](https://www.kaggle.com/zarajamshaid/language-identification-datasst) on Kaggle.

Link to [original dataset](https://zenodo.org/record/841984) from Zenodo.

The dataset contains 1,000 rows for 235 languages (235,000 data points in total).

In [3]:
df = pd.read_csv('./wili-2018/labels.csv', sep=';')
df.head()

Unnamed: 0,Label,English,Wiki Code,ISO 369-3,German,Language family,Writing system,Remarks,Synonyms
0,ace,Achinese,ace,ace,Achinesisch,Austronesian,,,
1,afr,Afrikaans,af,afr,Afrikaans,Indo-European,,,
2,als,Alemannic German,als,gsw,Alemannisch,Indo-European,,(ursprünglich nur Elsässisch),
3,amh,Amharic,am,amh,Amharisch,Afro-Asiatic,,,
4,ang,Old English,ang,ang,Altenglisch,Indo-European,,(ca. 450-1100),Angelsächsisch


# Read Data From Text File


In [4]:
def read_data(filename):
    data = []
    with open(f'./wili-2018/{filename}', 'r', encoding="utf8") as f:
        for line in f:
            data.append(line)
    return data

## Paragraphs
We will save the training and testing paragraphs to their own list

In [5]:
# Training set
raw_training_sentences = read_data('x_train.txt')

# Testing set
raw_testing_sentences = read_data('x_test.txt')

In [6]:
raw_training_sentences[:2]

['Klement Gottwaldi surnukeha palsameeriti ning paigutati mausoleumi. Surnukeha oli aga liiga hilja ja oskamatult palsameeritud ning hakkas ilmutama lagunemise tundemärke. 1962. aastal viidi ta surnukeha mausoleumist ära ja kremeeriti. Zlíni linn kandis aastatel 1949–1989 nime Gottwaldov. Ukrainas Harkivi oblastis kandis Zmiivi linn aastatel 1976–1990 nime Gotvald.\n',
 'Sebes, Joseph; Pereira Thomas (1961) (på eng). The Jesuits and the Sino-Russian treaty of Nerchinsk (1689): the diary of Thomas Pereira. Bibliotheca Instituti historici S. I., 99-0105377-3 ; 18. Rome. Libris 677492\n']

In [7]:
raw_testing_sentences[:2]

["Ne l fin de l seclo XIX l Japon era inda çconhecido i sótico pa l mundo oucidental. Cula antroduçon de la stética japonesa, particularmente na Sposiçon Ounibersal de 1900, an Paris, l Oucidente adquiriu un apetite ansaciable pul Japon i Heiarn se tornou mundialmente coincido pula perfundidade, ouriginalidade i sinceridade de ls sous cuntos. An sous radadeiros anhos, alguns críticos, cumo George Orwell, acusórun Heiarn de trasferir sou nacionalismo i fazer l Japon parecer mais sótico, mas, cumo l'home qu'oufereciu al Oucidente alguns de sous purmeiros lampeijos de l Japon pré-andustrial i de l Período Meiji, sou trabalho inda ye balioso até hoije.\n",
 'Schiedam is gelegen tussen Rotterdam en Vlaardingen, oorspronkelijk aan de Schie en later ook aan de Nieuwe Maas. Per 30 april 2017 had de gemeente 77.833 inwoners (bron: CBS). De stad is vooral bekend om haar jenever, de historische binnenstad met grachten, en de hoogste windmolens ter wereld.\n']

## Language Labels
We will save the training and testing labels to their own list

In [8]:
# Training set
raw_training_language = read_data('y_train.txt')

# Testing set
raw_testing_language = read_data('y_test.txt')

In [9]:
raw_training_language[:5]

['est\n', 'swe\n', 'mai\n', 'oci\n', 'tha\n']

In [10]:
raw_testing_language[:5]

['mwl\n', 'nld\n', 'ava\n', 'tcy\n', 'bjn\n']

## Remove New Line Character (`/n`)
All the lists end with `/n` so we will write a function to remove it 

In [11]:
def remove_newline(text):
    return [txt[:-1] for txt in text]

In [12]:
# Training data
training_sentences = remove_newline(raw_training_sentences)
training_language  = remove_newline(raw_training_language)

# Testing data
testing_sentences  = remove_newline(raw_testing_sentences)
testing_language   = remove_newline(raw_testing_language)

In [13]:
print(f"""
Training set (Sentence): {len(training_sentences)}
Training set (Language): {len(training_language)}

Testing set (Sentence): {len(testing_sentences)}
Testing set (Language): {len(testing_language)}
""")


Training set (Sentence): 117500
Training set (Language): 117500

Testing set (Sentence): 117500
Testing set (Language): 117500



In [14]:
# Training set
df_train = pd.DataFrame(data={'sentence': training_sentences, 'language_code':training_language})

# Testing set
df_test = pd.DataFrame(data={'sentence': testing_sentences, 'language_code':testing_language})

In [15]:
df_train

Unnamed: 0,sentence,language_code
0,Klement Gottwaldi surnukeha palsameeriti ning ...,est
1,"Sebes, Joseph; Pereira Thomas (1961) (på eng)....",swe
2,भारतीय स्वातन्त्र्य आन्दोलन राष्ट्रीय एवम क्षे...,mai
3,"Après lo cort periòde d'establiment a Basilèa,...",oci
4,ถนนเจริญกรุง (อักษรโรมัน: Thanon Charoen Krung...,tha
...,...,...
117495,Nekoliko prašćića je rođeno na farmi Arableovi...,bos
117496,Tahiti of Otaheite is 'n eilandj in 't zuje va...,lim
117497,同年，太后崩。絳侯周勃、陳平諸臣共謀誅呂。朱虛侯章已殺呂產，文帝使人持節勞章。朱虛侯欲奪節信...,lzh
117498,I det mindste opnåede Venedig den 18. april 14...,dan


In [16]:
df_test

Unnamed: 0,sentence,language_code
0,Ne l fin de l seclo XIX l Japon era inda çconh...,mwl
1,Schiedam is gelegen tussen Rotterdam en Vlaard...,nld
2,"ГIурусаз батальонал, гьоркьор гIарадабиги лъун...",ava
3,ರಾಜ್ಯಶಾಸ್ತ್ರದ ಪಿತಾಮಹೆ ಅರಿಸ್ಟಾಟಲ್. ರಾಜ್ಯಶಾಸ್ತ್ರ...,tcy
4,Halukum adalah kelenjar tiroid nang menonjol d...,bjn
...,...,...
117495,"Wakati wa mimba,homa ya Q ni vigumu kutibu kwa...",swa
117496,گیلون یک ته تاریخی منطقه‌ سفیدرود دلتای طرف ای...,glk
117497,តាម​រយៈ​ការ​ចិញ្ចឹម​មនោសញ្ចេតនា​ជាតិនិយម​បែប​ន...,khm
117498,روس اک وفاق اے تے 1 مارچ 2008ء توں اسدیاں 83 و...,pnb


# Replace Language Code
We want the full name of the language instead of the language code. We can find the full name from the `df` DataFrame.

We can combine the two DataFrames and get the relevant columns

In [17]:
df.head()

Unnamed: 0,Label,English,Wiki Code,ISO 369-3,German,Language family,Writing system,Remarks,Synonyms
0,ace,Achinese,ace,ace,Achinesisch,Austronesian,,,
1,afr,Afrikaans,af,afr,Afrikaans,Indo-European,,,
2,als,Alemannic German,als,gsw,Alemannisch,Indo-European,,(ursprünglich nur Elsässisch),
3,amh,Amharic,am,amh,Amharisch,Afro-Asiatic,,,
4,ang,Old English,ang,ang,Altenglisch,Indo-European,,(ca. 450-1100),Angelsächsisch


In [18]:
df_train.head()

Unnamed: 0,sentence,language_code
0,Klement Gottwaldi surnukeha palsameeriti ning ...,est
1,"Sebes, Joseph; Pereira Thomas (1961) (på eng)....",swe
2,भारतीय स्वातन्त्र्य आन्दोलन राष्ट्रीय एवम क्षे...,mai
3,"Après lo cort periòde d'establiment a Basilèa,...",oci
4,ถนนเจริญกรุง (อักษรโรมัน: Thanon Charoen Krung...,tha


In [19]:
train_df = pd.merge(left=df_train, right=df, how='inner', 
                    left_on='language_code', right_on='Label')[['sentence', 'English']].copy()

test_df = pd.merge(left=df_test, right=df, how='inner', 
                   left_on='language_code', right_on='Label')[['sentence', 'English']].copy()

In [20]:
train_df.head()

Unnamed: 0,sentence,English
0,Klement Gottwaldi surnukeha palsameeriti ning ...,Estonian
1,"Dorota Rabczewska, artistinimega Doda (sündinu...",Estonian
2,"Merilai, A., Maria-Kristiina Lotman (toim) (20...",Estonian
3,Teda on süüdistatud holokausti eitamises - 199...,Estonian
4,Neile on omistatud Vana-Kreeka laadide nimetus...,Estonian


In [21]:
test_df.head()

Unnamed: 0,sentence,English
0,Ne l fin de l seclo XIX l Japon era inda çconh...,Mirandese
1,Modelo:Anfo/Asteroide 1997 AF22 (asteroide 294...,Mirandese
2,"Ne l Oucidente, alguns trechos de l libro apar...",Mirandese
3,"Na berdade, esse prémio nun ten ligaçon cun Al...",Mirandese
4,"L'anterro fui rializado an 30 d'outubre, adond...",Mirandese


# Combine All Data
We're going to combine all the data into a single DataFrame. We will split it again into a train and test set once we reduce the number of languages we use for this project. Currently there are 235 languages, but we will only use a subset of these languages.

In [22]:
full_df = pd.concat(objs=[train_df, test_df], axis=0, ignore_index=True)

full_df.rename(columns={'sentence':'text', 'English':'language'}, inplace=True)

In [23]:
full_df.head()

Unnamed: 0,text,language
0,Klement Gottwaldi surnukeha palsameeriti ning ...,Estonian
1,"Dorota Rabczewska, artistinimega Doda (sündinu...",Estonian
2,"Merilai, A., Maria-Kristiina Lotman (toim) (20...",Estonian
3,Teda on süüdistatud holokausti eitamises - 199...,Estonian
4,Neile on omistatud Vana-Kreeka laadide nimetus...,Estonian


In [24]:
full_df.shape

(234000, 2)

## Notes on the dataset
- `nltk.word_tokenize` currently only supports English. This is not a very important issue since most of the langauges have spaces, so they can still be tokenized.
- Languages that do not use spaces to separate words (such as Chinese) are greatly affected by this.

For this project, I will only keep the languages that `nltk` currently supports. An idea for a future project would be to include other languages in the dataset with packages that specialize in those languages.

### List of languages in the dataset

In [25]:
languages = full_df['language'].apply(lambda x: str.lower(x)).unique().tolist()

languages

['estonian',
 'swedish',
 'maithili',
 'occitan',
 'thai',
 'oromo',
 'limburgan',
 'gujarati',
 'western panjabi',
 'zeeuws',
 'karachay-balkar',
 'haitian creole',
 'picard',
 'tamil',
 'vietnamese',
 'panjabi',
 'silesian',
 'central kurdish',
 'friulian',
 'wu chinese',
 'egyptian arabic',
 'tongan',
 'basque',
 'banyumasan',
 'gilaki',
 'dutch',
 'tibetan',
 'japanese',
 'aragonese',
 'sardinian',
 'extremaduran',
 'sinhala',
 'kurdish',
 'chechen',
 'turkmen',
 'pangasinan',
 'turkish',
 'alemannic german',
 'komi-permyak',
 'latin',
 'urdu',
 'tatar',
 'buryat',
 'indonesian',
 'kirghiz',
 'cantonese',
 'danish',
 'portuguese',
 'french',
 'oriya',
 'bokmål',
 'lojban',
 'konkani',
 'amharic',
 'central khmer',
 'serbo-croatian',
 'slovene',
 'bosnian',
 'tetum',
 'standard chinese',
 'korean',
 'yakut',
 'aromanian',
 'asturian',
 'wolof',
 'bulgarian',
 'scottish gaelic',
 'malay',
 'crimean tatar',
 'luganda',
 'sundanese',
 'breton',
 'mongolian',
 'nepali (macrolanguage)',


### List of languages supported by `nltk`

**NOTE:** In order for this to work on your device, it is necessary to find the location where the stopwords are saved in your system and replace the path in quotes with the correct filepath from your system.

In [26]:
# # Windows
supported_languages = os.listdir("C:\\users\\johng\\appdata\\roaming\\nltk_data\\corpora\\stopwords")

# Mac OS
# supported_languages = os.listdir('/Users/johngonzalez/nltk_data/corpora/stopwords')

supported_languages

['arabic',
 'azerbaijani',
 'bengali',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'README',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'turkish']

### Intersection between the two lists

In [27]:
# Find the intersection between lists
final_languages = list(set(languages) & set(supported_languages))
final_languages = [lang.capitalize() for lang in final_languages]

final_languages

['Slovene',
 'Danish',
 'Russian',
 'Dutch',
 'Spanish',
 'Portuguese',
 'Kazakh',
 'Bengali',
 'Turkish',
 'Azerbaijani',
 'Arabic',
 'Romanian',
 'French',
 'Tajik',
 'Finnish',
 'Swedish',
 'Indonesian',
 'Hungarian',
 'English',
 'German',
 'Italian']

In [28]:
print(f'We will be using {len(final_languages)} languages from the {len(languages)} languages in the dataset')

We will be using 21 languages from the 234 languages in the dataset


### Keep the rows with the supported languages

In [29]:
df = full_df[full_df.language.isin(final_languages)]
df = df.reset_index(drop=True)

# Note
I initially completed the project using the 21 languages, however, it included too many languages for the scope of this project. The more languages that were introduced, the less certain the algorithm was of its prediction. 

For this reason, I have decided to reduce the number of languages to those which I am most familiar with. In the future, I would like to create separate models for each family of languages such as Indo-European languages, Romance languages, Slavic languages, etc.

In [30]:
language_subset = ['English', 'Spanish', 'Italian', 'French', 'Portuguese', 
                   'German', 'Indonesian', 'Dutch', 'Swedish', 'Romanian']

df = df[df.language.isin(language_subset)]
df = df.reset_index(drop=True)

df.head()

Unnamed: 0,text,language
0,"Sebes, Joseph; Pereira Thomas (1961) (på eng)....",Swedish
1,Theokritos herdediktning har tjänat som förebi...,Swedish
2,Gatunätet i Lunds stadskärna är på många ställ...,Swedish
3,Detta fortskred under första halvan av seklet....,Swedish
4,"Jesper Roine, född 1971, är en svensk national...",Swedish


In [31]:
df.language.value_counts()

Dutch         1000
Swedish       1000
Spanish       1000
German        1000
Portuguese    1000
French        1000
Romanian      1000
Indonesian    1000
Italian       1000
English       1000
Name: language, dtype: int64

In [32]:
print(f'There are a total of {sum(df.language.value_counts())} data points left.')

There are a total of 10000 data points left.


# Data Cleaning
- Remove numbers from the text
- Remove special characters from the text
- Remove any extra whitespaces from the text

In [33]:
def preprocess(text):
    text = ''.join([char for char in text if not char.isdigit()])
    text = re.sub("[\[\[\]\]\(\)?¿—\"\"«»,.;–:!]", "", text)  # Remove special characters
    text = text.replace('\u200b', '')
    text = re.sub("-", " ", text)  # Replace '-' with a space
    text = re.sub("'", " ", text)  # Replace " ' " with a space
    text = " ".join(text.split())  # Remove any extra spaces
    text = text.lower()
    
    return text

In [34]:
df['cleaned_text'] = df['text'].apply(lambda x: preprocess(x))

df.head()

Unnamed: 0,text,language,cleaned_text
0,"Sebes, Joseph; Pereira Thomas (1961) (på eng)....",Swedish,sebes joseph pereira thomas på eng the jesuits...
1,Theokritos herdediktning har tjänat som förebi...,Swedish,theokritos herdediktning har tjänat som förebi...
2,Gatunätet i Lunds stadskärna är på många ställ...,Swedish,gatunätet i lunds stadskärna är på många ställ...
3,Detta fortskred under första halvan av seklet....,Swedish,detta fortskred under första halvan av seklet ...
4,"Jesper Roine, född 1971, är en svensk national...",Swedish,jesper roine född är en svensk nationalekonom ...


# Save the DataFrame
Save the DataFrame as a CSV file and a pickle object

In [35]:
# CSV
df.to_csv("./saved-items/cleaned_df.csv", index=False)

# Pickle object
df.to_pickle("./saved-items/cleaned_df.pkl")