In [1]:
import numpy as np
import json
from sklearn import preprocessing

In [2]:
# Training inputs, given as paths relative to the working directory.
# read_data / read_label parse each non-blank line of these files as a JSON object.
TRAIN_DATA_RAW = "train_X_languages_homework.json.txt"
TRAIN_LABEL_RAW = "train_y_languages_homework.json.txt"

In [3]:
def read_data(path):
    """Load the sentences stored in a JSON-lines file.

    Every non-blank line of the file is parsed as a JSON object and the value
    stored under its ``'text'`` key is collected.

    Args:
        path: Path of the JSON-lines file to read.

    Returns:
        A list holding one ``'text'`` value per non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line is not valid JSON.
        KeyError: If a parsed object has no ``'text'`` key.
    """
    sentences = list()
    # Decode explicitly as UTF-8: the sentences mix many scripts, and relying
    # on the platform default encoding can corrupt the text or raise
    # UnicodeDecodeError on systems whose default is not UTF-8.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                sentences.append(json.loads(line)['text'])
    return sentences

In [4]:
def read_label(path):
    """Load the labels stored in a JSON-lines file.

    Every non-blank line of the file is parsed as a JSON object and the value
    stored under its ``'classification'`` key is collected.

    Args:
        path: Path of the JSON-lines file to read.

    Returns:
        A list holding one ``'classification'`` value per non-blank line, in
        file order.

    Raises:
        ValueError: If a non-blank line is not valid JSON.
        KeyError: If a parsed object has no ``'classification'`` key.
    """
    classifications = list()
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, so the file is read the same way on every system.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                classifications.append(json.loads(line)['classification'])
    return classifications

In [5]:
# Training sentences: one entry per non-blank line of the input file.
dataset = read_data(path = TRAIN_DATA_RAW)

In [6]:
# Language labels. extract_n_gram zips them with `dataset`, so both lists
# must stay aligned by position.
labelset = read_label(path = TRAIN_LABEL_RAW)

In [7]:
# Eyeball the first 30 raw sentences.
dataset[0:30]

['Lacus class fames',
 'στην Πλατεία του',
 'מפני שטעו לחשוב',
 '• Florida •',
 '. Siehe auch : Liste der Landschaften in Nordrhein-Westfalen',
 'Сицзян ) расположен',
 '. Deshalb wird',
 'Uí Fiachrach · Uí Briúin · Uí Néill · Síl',
 'v srednjeveški latinščini',
 'menjadi embrio .',
 'prevalentemente su testo',
 'Anversa 1920 ·',
 ' Albania  •  Andora  •  Armenia  •  Austria  •  Azerbejdżan  •  Belgia  •  Bośnia i Hercegowina  •  Bułgaria  •  Chorwacja  •  Cypr  •  Czarnogóra  •  Czechy  •  Dania  •  Estonia  •  Finlandia  •  Francja  •  Grecja  •  Gruzja  •  Hiszpania  •  Holandia  •  Irlandia  •  Islandia  •  Liechtenstein  •  Litwa  •  Luksemburg  •  Łotwa  •  Macedonia  •  Malta  •  Mołdawia  •  Monako  •  Niemcy  •  Norwegia  •  Polska  •  Portugalia  •  Rosja  •  Rumunia  •  San Marino  •  Serbia  •  Słowacja  •  Słowenia  •  Szwajcaria  •  Szwecja  •  Turcja  •  Ukraina  •  Węgry  •  Wielka Brytania  •  Włochy ',
 'acima , os',
 'Krst pri Savici',
 'čeprav so mu domači in strici

In [8]:
# First 30 labels, at the same positions as the sentences previewed above.
labelset[0:30]

['lorem',
 'el',
 'he',
 'da',
 'de',
 'ru',
 'de',
 'hu',
 'sl',
 'id',
 'it',
 'it',
 'pl',
 'pt',
 'sl',
 'sl',
 'no',
 'ru',
 'eu',
 'ca',
 'id',
 'es',
 'be',
 'gl',
 'sh',
 'tr',
 'lt',
 'ro',
 'az',
 'ko']

In [9]:
def count_language(labelset):
    """Tally how often each label occurs.

    Args:
        labelset: Iterable of hashable labels.

    Returns:
        A dict mapping every distinct label to its number of occurrences.
    """
    tally = {}
    for lang in labelset:
        # Missing labels start from zero, so one statement covers both cases.
        tally[lang] = tally.get(lang, 0) + 1
    return tally
    
    

In [10]:
# Number of training sentences per language label.
label_count = count_language(labelset = labelset)

In [11]:
# Number of distinct language labels in the training set.
len(label_count)

56

In [12]:
# Dump the full label -> count mapping to inspect how the labels are distributed.
print(label_count)

{'lorem': 3366, 'el': 2765, 'he': 2306, 'da': 2982, 'de': 4697, 'ru': 4045, 'hu': 2734, 'sl': 1569, 'id': 1676, 'it': 4836, 'pl': 2053, 'pt': 2712, 'no': 2384, 'eu': 1774, 'ca': 3013, 'es': 5198, 'be': 2015, 'gl': 2141, 'sh': 2652, 'tr': 2510, 'lt': 1570, 'ro': 3014, 'az': 1920, 'ko': 1234, 'fr': 3979, 'en': 4531, 'vi': 3261, 'hr': 1867, 'ar': 4143, 'bg': 2774, 'hi': 2146, 'ja': 1728, 'nn': 1393, 'sk': 1079, 'uz': 889, 'ka': 1167, 'fa': 1984, 'cs': 2332, 'zh': 1690, 'sv': 2600, 'nl': 2465, 'kk': 1120, 'eo': 2010, 'ur': 913, 'et': 923, 'uk': 3430, 'hy': 2263, 'ceb': 307, 'fi': 1598, 'th': 1578, 'sr': 2027, 'la': 1255, 'vo': 412, 'ms': 1238, 'ce': 576, 'war': 547}


In [13]:
def count_char_utf(dataset):
    """Tally how often each character occurs across all sentences.

    Args:
        dataset: Iterable of strings.

    Returns:
        A dict mapping every distinct character to its number of occurrences.
    """
    tally = {}
    for text in dataset:
        for symbol in text:
            # Unseen characters start from zero.
            tally[symbol] = tally.get(symbol, 0) + 1
    return tally
            

In [14]:
# Frequency of every distinct character across all training sentences.
char_count = count_char_utf(dataset = dataset)

In [15]:
# Number of distinct characters seen in the training sentences.
len(char_count)

5697

In [16]:
def extract_n_gram(dataset, labelset, language, n = 2, k = 200):
    """Return the k most frequent character n-grams of one language.

    Only sentences whose label equals ``language`` are considered. Every
    window of ``n`` consecutive characters in those sentences is counted.

    Args:
        dataset: Sequence of sentences (strings).
        labelset: Labels paired with ``dataset`` by position.
        language: Label selecting the sentences to count.
        n: Length of the character n-grams.
        k: Maximum number of n-grams to return.

    Returns:
        A pair ``(n_grams, occurrences)`` of equally long lists, ordered by
        descending count. Fewer than ``k`` entries are returned when fewer
        distinct n-grams exist.
    """
    # Tally every n-character window of the selected sentences.
    frequency = {}
    for text, tag in zip(dataset, labelset):
        if tag != language:
            continue
        for start in range(len(text) - n + 1):
            window = text[start:start + n]
            frequency[window] = frequency.get(window, 0) + 1

    # Stable sort by descending count; ties keep the dictionary's iteration order.
    ranked = sorted(frequency.items(), key = lambda pair: pair[1], reverse = True)

    # Clamp to zero so a negative k yields empty lists rather than a tail slice.
    limit = max(min(k, len(ranked)), 0)
    leaders = ranked[:limit]

    n_gram_top_k = [gram for gram, _ in leaders]
    n_gram_top_k_occurrence = [times for _, times in leaders]
    return n_gram_top_k, n_gram_top_k_occurrence


In [17]:
# Spot check: the 500 most frequent character 2-grams of the sentences labelled 'el'.
# NOTE(review): neither result is read by any later visible cell, and
# "occurence" in the second name is a misspelling of "occurrence".
n_gram_el, n_gram_el_occurence = extract_n_gram(dataset = dataset, labelset = labelset, language = 'el', n = 2, k = 500)

In [18]:
def extract_n_gram_all(dataset, labelset, languages, n = 2, k = 200):
    """Build one combined n-gram vocabulary across several languages.

    The top-k n-grams of every language are obtained from ``extract_n_gram``
    and merged into a single list without duplicates.

    The merge keeps the order in which n-grams are first encountered instead
    of passing them through ``set``. Set iteration order for strings varies
    between interpreter runs (hash randomization), which would change the
    meaning of each position in this list from one kernel session to the next.

    Args:
        dataset: Sequence of sentences (strings).
        labelset: Labels paired with ``dataset`` by position.
        languages: Iterable of labels whose n-grams should be included.
        n: Length of the character n-grams.
        k: Maximum number of n-grams taken from each language.

    Returns:
        A list of distinct n-grams, ordered by first appearance while walking
        ``languages`` in the given order.
    """
    n_gram_list = list()
    seen = set()
    for language in languages:
        # Only the n-grams are needed here; their counts are discarded.
        n_gram_top_k, _ = extract_n_gram(
            dataset = dataset, labelset = labelset, language = language, n = n, k = k)
        for n_gram in n_gram_top_k:
            if n_gram not in seen:
                seen.add(n_gram)
                n_gram_list.append(n_gram)

    return n_gram_list
        
        

In [19]:
# Number of languages that will contribute n-grams to the shared vocabulary.
len(label_count)

56

In [20]:
# Shared 2-gram vocabulary: up to 100 top 2-grams per language, merged without duplicates.
two_gram_list = extract_n_gram_all(
    dataset = dataset, labelset = labelset, languages = list(label_count.keys()), n = 2, k = 100)

In [21]:
# Size of the merged 2-gram vocabulary.
len(two_gram_list)

1660

In [22]:
def n_gram_representation(sentence, n, n_gram_list):
    """Encode a sentence as a binary presence vector over ``n_gram_list``.

    Args:
        sentence: The string to encode.
        n: Length of the character n-grams.
        n_gram_list: Sequence of n-grams; position ``i`` of the result
            corresponds to ``n_gram_list[i]``.

    Returns:
        A 1-D float numpy array of length ``len(n_gram_list)`` holding 1.0
        where the n-gram occurs at least once in ``sentence`` and 0.0
        elsewhere.
    """
    # Distinct n-character windows of the sentence, for constant-time lookup.
    present = {sentence[start:start + n] for start in range(len(sentence) - n + 1)}

    representation = np.zeros(len(n_gram_list))
    for position, gram in enumerate(n_gram_list):
        if gram in present:
            representation[position] = 1.0
    return representation

    

In [23]:
# Smoke test of the encoder on a short sentence.
# NOTE(review): `a` is rebound by a later scratch cell (`a = np.array([[]])`).
a  = n_gram_representation(sentence = "how ?", n = 2, n_gram_list = two_gram_list)

In [24]:
def feature_index_dict(n_gram_list):
    """Map every n-gram to its position in ``n_gram_list``.

    Args:
        n_gram_list: Sequence of hashable n-grams.

    Returns:
        A dict from n-gram to index. If an n-gram appears more than once, the
        index of its last occurrence is kept.
    """
    return {gram: position for position, gram in enumerate(n_gram_list)}

In [25]:
# n-gram -> column index lookup for the shared 2-gram vocabulary.
# NOTE(review): in the visible cells this dict is only passed to len() below.
two_gram_index_dict = feature_index_dict(n_gram_list  = two_gram_list)

In [26]:
# Sanity check: the lookup should have one entry per vocabulary n-gram.
len(two_gram_index_dict)

1660

In [27]:
def prepare_n_gram_dataset(dataset, n, n_gram_list):
    """Encode every sentence as a row of a dense binary feature matrix.

    Args:
        dataset: Sequence of sentences (strings).
        n: Length of the character n-grams.
        n_gram_list: Sequence of n-grams defining the columns.

    Returns:
        A float numpy array of shape ``(len(dataset), len(n_gram_list))``
        whose row ``i`` is ``n_gram_representation(dataset[i], n, n_gram_list)``.
    """
    row_count = len(dataset)
    # Allocate the whole matrix up front and fill it row by row.
    n_gram_dataset = np.zeros((row_count, len(n_gram_list)))
    for row, sentence in enumerate(dataset):
        n_gram_dataset[row] = n_gram_representation(sentence, n, n_gram_list)
    return n_gram_dataset

In [28]:
# NOTE(review): the definition below is wrapped in a string literal, so this cell
# only evaluates to that string and defines nothing. The disabled variant looks
# each sentence n-gram up in an index dict rather than taking an n-gram list.
'''
def prepare_n_gram_dataset(dataset, n, n_gram_index_dict):
    n_gram_dataset = np.zeros((len(dataset), len(n_gram_index_dict)))
    size_dataset = len(dataset)
    for i in range(size_dataset):
        #print(i)
        sentence = dataset[i]
        length = len(sentence)
        for j in range(length-n+1):
            #print(sentence[j:j+n])
            if sentence[j:j+n] in n_gram_index_dict:
                
                n_gram_dataset[i,n_gram_index_dict[sentence[j:j+n]]] = 1.0
        
    return n_gram_dataset
'''

'\ndef prepare_n_gram_dataset(dataset, n, n_gram_index_dict):\n    n_gram_dataset = np.zeros((len(dataset), len(n_gram_index_dict)))\n    size_dataset = len(dataset)\n    for i in range(size_dataset):\n        #print(i)\n        sentence = dataset[i]\n        length = len(sentence)\n        for j in range(length-n+1):\n            #print(sentence[j:j+n])\n            if sentence[j:j+n] in n_gram_index_dict:\n                \n                n_gram_dataset[i,n_gram_index_dict[sentence[j:j+n]]] = 1.0\n        \n    return n_gram_dataset\n'

In [29]:
# Dense binary feature matrix: one row per training sentence, one column per vocabulary 2-gram.
dataset_train = prepare_n_gram_dataset(dataset = dataset, n = 2, n_gram_list = two_gram_list)

In [30]:
# Encoder that turns the string labels into one-hot rows.
lb = preprocessing.LabelBinarizer()

In [31]:
# Learn the set of classes from the training labels.
lb.fit(labelset)

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [32]:
# Classes learned by the binarizer; the recorded output lists them in alphabetical order.
lb.classes_

array(['ar', 'az', 'be', 'bg', 'ca', 'ce', 'ceb', 'cs', 'da', 'de', 'el',
       'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'he', 'hi',
       'hr', 'hu', 'hy', 'id', 'it', 'ja', 'ka', 'kk', 'ko', 'la', 'lorem',
       'lt', 'ms', 'nl', 'nn', 'no', 'pl', 'pt', 'ro', 'ru', 'sh', 'sk',
       'sl', 'sr', 'sv', 'th', 'tr', 'uk', 'ur', 'uz', 'vi', 'vo', 'war',
       'zh'],
      dtype='<U5')

In [33]:
# One-hot encode every training label; column positions follow lb.classes_.
labelset_onehot = lb.transform(labelset)

In [39]:
# Row 1 encodes the label 'el': its single 1 sits at index 10, where 'el' appears in lb.classes_.
labelset_onehot[1]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:

# NOTE(review): unexecuted scratch cell (its execution count is None). The first
# two statements repeat what the cells In [30] and In [31] already do.
lb = preprocessing.LabelBinarizer()
lb.fit(labelset)

lb.classes_

# NOTE(review): 1 and 6 are integers, while the fitted classes shown earlier are
# language-code strings, so neither value is a known class. Confirm the intent.
lb.transform([1, 6])

In [158]:
# NOTE(review): `dataset_train_1` is not assigned in any visible cell (only
# `dataset_train` is, in In [29]); unless it is defined elsewhere, this raises
# NameError on a fresh kernel.
np.sum(dataset_train_1[200])

14.0

In [159]:
# NOTE(review): `dataset_train_2` is not assigned in any visible cell; unless it
# is defined elsewhere, this raises NameError on a fresh kernel.
np.sum(dataset_train_2[200])

14.0

In [57]:
# Scratch: a list of equal-length 1-D arrays becomes a single 2-D array (see the output below).
c = np.array([np.array([1,2]), np.array([3,4])])

In [58]:
# Show the stacked array.
c

array([[1, 2],
       [3, 4]])

In [59]:
# Scratch: replace a whole row by assigning a 1-D array to it.
c[1] = np.array([5,6])

In [60]:
# Confirm that row 1 now holds the assigned values.
c

array([[1, 2],
       [5, 6]])

In [39]:
# Scratch: start from an empty nested-list array.
# NOTE(review): this rebinds `a`, which In [23] assigned to an n-gram vector.
a = np.array([[]])

In [40]:
# np.append is called without `axis`, so the inputs are flattened and the
# result is 1-D (see the output of the next cell), not a new row.
a = np.append(a, np.array([[1,2,3]]))

In [41]:
# Show the flattened result.
a

array([ 1.,  2.,  3.])

In [43]:
# Appending again extends the flat array instead of adding a second row.
# The result is only displayed, not assigned back to `a`.
np.append(a,np.array([[1,2,3]]))

array([ 1.,  2.,  3.,  1.,  2.,  3.])

In [25]:
# Memory footprint in bytes of a dense 100000 x 2000 array of the default float
# dtype; the recorded output is 1.6e9 bytes, i.e. 8 bytes per element.
np.ones((100000, 2000)).nbytes

1600000000