First, let's create a function that takes the path of a text file as a parameter and returns a dictionary of letter frequencies

In [353]:
import re
import codecs

def letters(text_file, encoding='utf-8'):
    """Count how often each letter (and the plain space) occurs in a text file.

    The text is lowercased and newline characters are dropped. Every
    character that is neither a letter nor a plain space is discarded
    before counting.

    Args:
        text_file: Path of the text file to analyse.
        encoding: Encoding used to decode the file. Defaults to UTF-8 so the
            result does not depend on the platform's locale settings.

    Returns:
        A dict mapping each kept character to its number of occurrences,
        ordered from the least frequent to the most frequent character.
        Characters with equal counts keep their order of first appearance.
    """
    from collections import Counter

    with open(text_file, 'r', encoding=encoding) as file:
        # remove newline characters and lowercase all the letters
        data = file.read().replace('\n', '').lower()

    # str.isalpha() is Unicode-aware: letters outside Latin-1 (for example
    # the Romanian ă, ș, ț) are kept, while symbols such as × and ÷ are not.
    counts = Counter(ch for ch in data if ch == ' ' or ch.isalpha())

    # sort ascending from least frequent letter to most frequent
    return dict(sorted(counts.items(), key=lambda item: item[1]))

# Example run: letter frequencies of the English article.
print(letters('French Revolution - Wikipedia.txt'))

{'â': 1, 'ö': 1, 'à': 1, 'ë': 1, 'å': 1, 'ê': 2, 'ü': 3, 'ô': 4, 'æ': 5, 'è': 21, 'ç': 26, 'z': 143, 'é': 150, 'q': 169, 'x': 362, 'j': 381, 'k': 752, 'v': 1981, 'b': 2158, 'y': 2165, 'w': 2182, 'g': 2514, 'm': 3331, 'f': 3554, 'p': 3602, 'u': 4025, 'c': 5266, 'd': 5346, 'l': 6279, 'h': 6553, 's': 8271, 'r': 9922, 'a': 10366, 'i': 10612, 'n': 10698, 'o': 11104, 't': 12315, 'e': 17346, ' ': 24850}


Create a function that counts all the bigrams in the given text

In [365]:
import re

def bigrams(text_file, encoding='utf-8'):
    """Count the two-letter sequences that occur inside the words of a text file.

    The text is lowercased and split on whitespace. Inside each chunk every
    non-letter character is removed, and the remaining letters form one word.
    Bigrams are taken within a word only, so they never span a space or a
    line break.

    Args:
        text_file: Path of the text file to analyse.
        encoding: Encoding used to decode the file. Defaults to UTF-8 so the
            result does not depend on the platform's locale settings.

    Returns:
        A dict mapping each bigram to its number of occurrences, ordered
        from the least frequent to the most frequent bigram. Bigrams with
        equal counts keep their order of first appearance.
    """
    from collections import Counter

    with open(text_file, 'r', encoding=encoding) as file:
        text = file.read().lower()

    counts = Counter()
    # split() breaks on any whitespace (spaces, newlines, tabs), so the last
    # word of one line is never glued to the first word of the next line.
    for chunk in text.split():
        # leave letters only; str.isalpha() also accepts non-Latin-1 letters
        word = ''.join(ch for ch in chunk if ch.isalpha())
        # stop one position early so every window is exactly two letters long
        counts.update(word[i:i + 2] for i in range(len(word) - 1))

    # sort ascending from least frequent bigram to most frequent
    return dict(sorted(counts.items(), key=lambda item: item[1]))

#print (bigrams('French Revolution - Wikipedia.txt'))

Finally, create a function that calculates trigrams

In [366]:
import re
def trigrams(text_file, encoding='utf-8'):
    """Count the three-letter sequences that occur inside the words of a text file.

    The text is lowercased and split on whitespace. Inside each chunk every
    non-letter character is removed, and the remaining letters form one word.
    Trigrams are taken within a word only, so they never span a space or a
    line break.

    Args:
        text_file: Path of the text file to analyse.
        encoding: Encoding used to decode the file. Defaults to UTF-8 so the
            result does not depend on the platform's locale settings.

    Returns:
        A dict mapping each trigram to its number of occurrences, ordered
        from the least frequent to the most frequent trigram. Trigrams with
        equal counts keep their order of first appearance.
    """
    from collections import Counter

    with open(text_file, 'r', encoding=encoding) as file:
        text = file.read().lower()

    counts = Counter()
    # split() breaks on any whitespace (spaces, newlines, tabs), so the last
    # word of one line is never glued to the first word of the next line.
    for chunk in text.split():
        # leave letters only; str.isalpha() also accepts non-Latin-1 letters
        word = ''.join(ch for ch in chunk if ch.isalpha())
        # stop two positions early so every window is exactly three letters
        # long; words shorter than three letters contribute nothing
        counts.update(word[i:i + 3] for i in range(len(word) - 2))

    # sort ascending from least frequent trigram to most frequent
    return dict(sorted(counts.items(), key=lambda item: item[1]))

#print (trigrams('French Revolution - Wikipedia.txt'))

Now, let's get the most "popular" items for each language

In [358]:
# Reference corpora, one saved article per language. The order matters:
# get_lang_popular_items reads them as Spanish, German, English, Romanian.
wiki = [
    'Revolución francesa - Wikipedia.txt',
    'Französische Revolution – Wikipedia.txt',
    'French Revolution - Wikipedia.txt',
    'Revoluția franceză - Wikipedia.txt',
]
def get_size(funct, wiki):
    """Apply a counting function to every file of a corpus list.

    Args:
        funct: Callable that receives one entry of ``wiki`` and returns its
            result (here: letters, bigrams or trigrams).
        wiki: Iterable of file names to process.

    Returns:
        A list of ``(file_name, funct(file_name))`` tuples in the order of
        ``wiki``.
    """
    return [(file_name, funct(file_name)) for file_name in wiki]
# sanity check with the letters function
print(get_size(letters, wiki))

[('Revolución francesa - Wikipedia.txt', {'ë': 1, 'ü': 1, 'è': 7, 'ç': 8, 'ñ': 31, 'ú': 40, 'á': 76, 'k': 87, 'x': 90, 'z': 97, 'w': 108, 'é': 111, 'j': 174, 'í': 240, 'q': 251, 'h': 314, 'y': 364, 'f': 414, 'ó': 441, 'v': 491, 'g': 502, 'b': 721, 'm': 974, 'p': 1092, 'u': 1513, 't': 1733, 'c': 1968, 'd': 1978, 'l': 2627, 'r': 2725, 'n': 2731, 's': 2759, 'i': 2830, 'o': 3291, 'a': 4617, 'e': 5071, ' ': 6809}), ('Französische Revolution – Wikipedia.txt', {'ê': 1, 'â': 1, 'ë': 2, 'ô': 2, 'è': 15, 'ç': 29, 'q': 31, 'é': 81, 'x': 94, 'y': 101, 'ß': 160, 'j': 250, 'ö': 506, 'ä': 583, 'ü': 592, 'p': 1124, 'w': 1309, 'z': 1391, 'v': 1556, 'k': 1625, 'b': 1940, 'f': 2013, 'm': 2422, 'c': 2881, 'g': 3455, 'o': 3509, 'l': 4069, 'h': 4099, 'u': 4575, 'd': 5342, 'a': 5842, 't': 6417, 's': 6546, 'r': 8348, 'i': 8806, 'n': 10482, ' ': 15251, 'e': 16629}), ('French Revolution - Wikipedia.txt', {'â': 1, 'ö': 1, 'à': 1, 'ë': 1, 'å': 1, 'ê': 2, 'ü': 3, 'ô': 4, 'æ': 5, 'è': 21, 'ç': 26, 'z': 143, 'é': 15

Get the most popular letters/bigrams/trigrams (the code keeps the top 11 entries; in the letter counts one of them is the space character)

In [359]:
def get_popular(all_list, top_n=11):
    """Keep only the most frequent items of every corpus.

    Args:
        all_list: Iterable of ``(name, counts)`` pairs, where ``counts`` is a
            dict mapping an item (letter, bigram, trigram) to its count.
        top_n: How many of the most frequent items to keep per corpus.
            Defaults to 11; ``None`` keeps every item.

    Returns:
        A dict mapping each name to a list of ``(item, count)`` tuples sorted
        from the most to the least frequent item. Items with equal counts
        keep the order they have in ``counts``.
    """
    all_languages = {}
    for name, counts in all_list:
        # the sort is stable, so ties keep the dict's own ordering
        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        all_languages[name] = ranked[:top_n]
    return all_languages

In [360]:
# Show the most frequent letters per corpus; swap in one of the commented
# lines below to look at bigrams or trigrams instead.
print(get_popular(get_size(letters, wiki)))
# print(get_popular(get_size(bigrams, wiki)))
# print(get_popular(get_size(trigrams, wiki)))

{'Revolución francesa - Wikipedia.txt': [(' ', 6809), ('e', 5071), ('a', 4617), ('o', 3291), ('i', 2830), ('s', 2759), ('n', 2731), ('r', 2725), ('l', 2627), ('d', 1978), ('c', 1968)], 'Französische Revolution – Wikipedia.txt': [('e', 16629), (' ', 15251), ('n', 10482), ('i', 8806), ('r', 8348), ('s', 6546), ('t', 6417), ('a', 5842), ('d', 5342), ('u', 4575), ('h', 4099)], 'French Revolution - Wikipedia.txt': [(' ', 24850), ('e', 17346), ('t', 12315), ('o', 11104), ('n', 10698), ('i', 10612), ('a', 10366), ('r', 9922), ('s', 8271), ('h', 6553), ('l', 6279)], 'Revoluția franceză - Wikipedia.txt': [(' ', 21179), ('i', 15139), ('e', 14679), ('a', 13461), ('r', 10685), ('n', 8171), ('t', 7599), ('u', 7564), ('l', 6655), ('o', 6208), ('c', 5467)]}


In [361]:
# return the most popular items of each language as a dictionary keyed by language
def get_lang_popular_items(funct, wiki):
    """Return the most frequent items of each reference language.

    Args:
        funct: Counting function applied to every file (letters, bigrams or
            trigrams).
        wiki: List of corpus file names. The files are read as Spanish,
            German, English and Romanian, in this order.

    Returns:
        A dict with the keys ``'spanish'``, ``'german'``, ``'english'`` and
        ``'romanian'``. Each value is a tuple of that corpus' most frequent
        items, most frequent first. A language with no corresponding file
        gets an empty tuple; files beyond the fourth are ignored.
    """
    # in this order
    language_names = ('spanish', 'german', 'english', 'romanian')
    d = get_popular(get_size(funct, wiki))

    # Take the ranking of each file as a unit instead of flattening all
    # rankings and re-slicing by a fixed width: a corpus with fewer items
    # than expected can then no longer shift the items of the next language.
    per_file = [tuple(item for item, _count in ranked) for ranked in d.values()]

    languages = {name: () for name in language_names}
    languages.update(zip(language_names, per_file))
    return languages

# Most frequent bigrams of every reference language.
print(get_lang_popular_items(bigrams, wiki))

{'spanish': ('de', 'es', 'la', 're', 'en', 'ci', 'er', 'ra', 'os', 'on', 'an'), 'german': ('en', 'er', 'ch', 'de', 'te', 'ie', 'ei', 'un', 're', 'ge', 'in'), 'english': ('th', 'he', 're', 'on', 'an', 'in', 'ti', 'er', 'en', 'ed', 'io'), 'romanian': ('re', 'ri', 'ar', 'de', 'in', 'er', 'ra', 'le', 'te', 'or', 'at')}


Letters:
    As we can see the most popular letters for 
        Spanish: e, a, o, i, s, n, r, l, d
        Romanian:i, e, a, r, n, t, u, l, o
        German:  e, n, i, r, s, t, a, d, u
        English: e, t, o, n, i, a, r, s, h
We can notice that for Latin languages vowels are generally more popular

Bigrams:
        As we can see the most popular bigrams for 
        Spanish: de, es, la, re, en, ci, er, ra
        Romanian:re, ri, ar, de, in, er, ra, le
        German:  en, er, ch, de, te, ie, ei, un
        English: th, he, re, on, an, in, ti, er
Again, consonants prevail in the Germanic languages. This is also true for trigrams.

Now let's take a text and try to identify its language based on the most popular letters/bigrams/trigrams present

In [362]:
# get the most popular letters/bigrams/trigrams in a text
def text_get_popular_items(funct, word_doc):
    """Return the most frequent items of the given text file(s).

    Args:
        funct: Counting function applied to every file (letters, bigrams or
            trigrams).
        word_doc: List of text file names to analyse.

    Returns:
        A tuple with the most frequent items of each file, most frequent
        first, concatenated in the order of ``word_doc``. The tuple is empty
        when no items were found.
    """
    d = get_popular(get_size(funct, word_doc))
    # Build the tuple directly instead of unpacking zip(*...), which raises
    # ValueError when there is nothing to unpack (e.g. an empty file).
    return tuple(item for ranked in d.values() for item, _count in ranked)

In [363]:
# Example: the most frequent bigrams of a sample text file.
word_doc = ['alice_eng.txt']
alice = text_get_popular_items(bigrams, word_doc)
print(alice)

('he', 'th', 'in', 'er', 'an', 'ou', 'it', 'nd', 'at', 're', 'on')


In order to understand which language the text is written in, we will compare the lists of the most frequent terms. The assumption that we will make is that the order of the items in the list is not important (bag-of-words, BOW, approach)

In [364]:
# Compare the text's most frequent bigrams with those of every reference
# language; the order of the bigrams is ignored, only the overlap counts.
popular_bigrams = get_lang_popular_items(bigrams, wiki)
langs = list(popular_bigrams)

# one overlap size per language, in the same order as langs
score = [len(set(alice) & set(popular_bigrams[lang])) for lang in langs]

# index() returns the first maximum, so the earliest language wins a tie
detected_language = langs[score.index(max(score))]

print("The text is probably in:", detected_language)

The text is probably in: english
