# Gaussian Mixture Model (EM Algorithm) for Language Classification

### Imports

In [95]:
from sklearn.mixture import GaussianMixture as GM
from sklearn import metrics
import matplotlib.pyplot as plt

In [2]:
import os
import numpy as np
import pandas as pd

In [3]:
# Root folder that holds one sub-folder per language; must end with a path
# separator because the language folder names are appended to it directly.
# NOTE(review): machine-specific absolute Windows path - edit before running elsewhere.
path = 'C:\\Users\\Bhuvan Kumar\\Desktop\\Pattern Recognition\\CSV Files\\'

In [101]:
# Class index -> display label. The keys double as the label order of the
# confusion matrices computed further down.
# NOTE(review): 'Marati' is presumably a typo for 'Marathi' (the section heading
# uses 'Marathi'); left unchanged because it is runtime data.
lang_dict = {0:'Assam', 1:'Bengali', 2:'English', 3:'Gujarati', 4:'Hindi', 
             5:'Kannada', 6:'Malayalam', 7:'Marati', 8:'Odissa', 9:'Punjabi', 10:'Tamil', 11:'Telugu'}

In [10]:
# Language folder names under the data root, sorted so that list positions
# are stable from run to run.
languages = sorted(os.listdir(path))
print(languages)
# Full path of every language folder (the root already ends with a separator).
languages_path = [path + folder for folder in languages]

['asm', 'ben', 'eng', 'guj', 'hin', 'kan', 'mal', 'mar', 'odi', 'pun', 'tam', 'tel']


### Data Processing (Reading data language-wise)

In [30]:
def read_data(path):
    """Load all CSV files of one language folder, grouped by corpus split.

    ``path`` must contain the three sub-folders ``PB_train``, ``PB_test``
    and ``YT_test``. Every file inside them is parsed as a header-less,
    comma-separated, UTF-16 encoded CSV and converted to a NumPy array.

    Paths are built with ``os.path.join`` so the function works on any
    operating system, and files are read in sorted name order so the result
    is reproducible (``os.listdir`` gives no ordering guarantee).

    Parameters
    ----------
    path : str
        Folder of a single language.

    Returns
    -------
    tuple
        ``(pb_train, pb_test, yt_test)``; each element is a list with one
        array per file found in the corresponding sub-folder.

    Raises
    ------
    OSError
        If one of the three sub-folders is missing or unreadable
        (raised by ``os.listdir``).
    """
    data = []
    for split in ('PB_train', 'PB_test', 'YT_test'):
        folder = os.path.join(path, split)
        arrays = []
        for filename in sorted(os.listdir(folder)):
            df = pd.read_csv(os.path.join(folder, filename), delimiter=',',
                             header=None, encoding='utf-16')
            arrays.append(np.array(df))
        data.append(arrays)

    return data[0], data[1], data[2]

In [32]:
def concat(data, n_features=39):
    """Stack a sequence of 2-D arrays row-wise into one matrix.

    Parameters
    ----------
    data : iterable of array-like
        Items of shape ``(n_rows_i, n_features)``.
    n_features : int, optional
        Expected number of columns (default 39, the value that was
        previously hard-coded).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(sum(n_rows_i), n_features)``. An empty ``data``
        yields an empty ``(0, n_features)`` array.

    Raises
    ------
    ValueError
        If an item does not have ``n_features`` columns
        (raised by ``np.concatenate``).
    """
    # Leading with an empty float (0, n_features) block keeps the original
    # behaviour: the empty-input result, the column-count check and the
    # promotion of the output to at least float64.
    blocks = [np.empty([0, n_features])]
    blocks.extend(np.asarray(item) for item in data)
    # One concatenate call instead of one per item: the accumulated matrix
    # is no longer re-copied on every iteration.
    return np.concatenate(blocks, axis=0)

### Getting data for all classes

In [79]:
# Class 0 ('asm'): load the three splits, then stack the training files into
# one matrix. The two test sets stay as per-file lists.
asm_train, asm_pbtest, asm_yttest = read_data(languages_path[0])
asm_train = concat(asm_train)

In [111]:
# Class 1 ('ben'): load the three splits, then stack the training files into
# one matrix. The two test sets stay as per-file lists.
ben_train, ben_pbtest, ben_yttest = read_data(languages_path[1])
ben_train = concat(ben_train)

In [114]:
# Class 2 ('eng'): load the three splits, then stack the training files into
# one matrix. The two test sets stay as per-file lists.
eng_train, eng_pbtest, eng_yttest = read_data(languages_path[2])
eng_train = concat(eng_train)

In [115]:
# Class 3 ('guj'): load the three splits, then stack the training files into
# one matrix. The two test sets stay as per-file lists.
guj_train, guj_pbtest, guj_yttest = read_data(languages_path[3])
guj_train = concat(guj_train)

In [116]:
# Class 4 ('hin'): load the three splits, then stack the training files into
# one matrix. The two test sets stay as per-file lists.
hin_train, hin_pbtest, hin_yttest = read_data(languages_path[4])
hin_train = concat(hin_train)

In [117]:
# Class 5 ('kan'): load the three splits, then stack the training files into
# one matrix. The two test sets stay as per-file lists.
kan_train, kan_pbtest, kan_yttest = read_data(languages_path[5])
kan_train = concat(kan_train)

In [118]:
# Class 6 ('mal'): load the three splits, then stack the training files into
# one matrix. The two test sets stay as per-file lists.
mal_train, mal_pbtest, mal_yttest = read_data(languages_path[6])
mal_train = concat(mal_train)

In [119]:
# Class 7 ('mar'): load the three splits, then stack the training files into
# one matrix. The two test sets stay as per-file lists.
mar_train, mar_pbtest, mar_yttest = read_data(languages_path[7])
mar_train = concat(mar_train)

In [120]:
# Class 8 ('odi'): load the three splits, then stack the training files into
# one matrix. The two test sets stay as per-file lists.
odi_train, odi_pbtest, odi_yttest = read_data(languages_path[8])
odi_train = concat(odi_train)

In [121]:
# Class 9 ('pun'): load the three splits, then stack the training files into
# one matrix. The two test sets stay as per-file lists.
pun_train, pun_pbtest, pun_yttest = read_data(languages_path[9])
pun_train = concat(pun_train)

In [122]:
# Class 10 ('tam'): load the three splits, then stack the training files into
# one matrix. The two test sets stay as per-file lists.
tam_train, tam_pbtest, tam_yttest = read_data(languages_path[10])
tam_train = concat(tam_train)

In [123]:
# Class 11 ('tel'): load the three splits, then stack the training files into
# one matrix. The two test sets stay as per-file lists.
tel_train, tel_pbtest, tel_yttest = read_data(languages_path[11])
tel_train = concat(tel_train)

### Building GMM Classifier

#### Class1: Assam

In [61]:
# Class 0 model: one full-covariance Gaussian fitted on the stacked training
# rows. With a single component the mixture weight is always 1.
asm = GM(n_components=1, init_params='kmeans', covariance_type='full')
asm.fit(asm_train)
print('WEIGHTS:', asm.weights_)
#print('MEAN:', asm.means_)
#print('COVARIANCE:', asm.covariances_)

WEIGHTS: [1.]


#### Class2: Bengali

In [62]:
# Class 1 model: one full-covariance Gaussian fitted on the stacked training
# rows. With a single component the mixture weight is always 1.
ben = GM(n_components=1, init_params='kmeans', covariance_type='full')
ben.fit(ben_train)
print('WEIGHTS:', ben.weights_)

WEIGHTS: [1.]


#### Class3: English

In [63]:
# Class 2 model: one full-covariance Gaussian fitted on the stacked training
# rows. With a single component the mixture weight is always 1.
eng = GM(n_components=1, init_params='kmeans', covariance_type='full')
eng.fit(eng_train)
print('WEIGHTS:', eng.weights_)

WEIGHTS: [1.]


#### Class4: Gujarati

In [64]:
# Class 3 model: one full-covariance Gaussian fitted on the stacked training
# rows. With a single component the mixture weight is always 1.
guj = GM(n_components=1, init_params='kmeans', covariance_type='full')
guj.fit(guj_train)
print('WEIGHTS:', guj.weights_)

WEIGHTS: [1.]


#### Class5: Hindi

In [65]:
# Class 4 model: one full-covariance Gaussian fitted on the stacked training
# rows. With a single component the mixture weight is always 1.
hin = GM(n_components=1, init_params='kmeans', covariance_type='full')
hin.fit(hin_train)
print('WEIGHTS:', hin.weights_)

WEIGHTS: [1.]


#### Class6: Kannada

In [66]:
# Class 5 model: one full-covariance Gaussian fitted on the stacked training
# rows. With a single component the mixture weight is always 1.
kan = GM(n_components=1, init_params='kmeans', covariance_type='full')
kan.fit(kan_train)
print('WEIGHTS:', kan.weights_)

WEIGHTS: [1.]


#### Class7: Malayalam

In [67]:
# Class 6 model: one full-covariance Gaussian fitted on the stacked training
# rows. With a single component the mixture weight is always 1.
mal = GM(n_components=1, init_params='kmeans', covariance_type='full')
mal.fit(mal_train)
print('WEIGHTS:', mal.weights_)

WEIGHTS: [1.]


#### Class8: Marathi

In [68]:
# Class 7 model: one full-covariance Gaussian fitted on the stacked training
# rows. With a single component the mixture weight is always 1.
mar = GM(n_components=1, init_params='kmeans', covariance_type='full')
mar.fit(mar_train)
print('WEIGHTS:', mar.weights_)

WEIGHTS: [1.]


#### Class9: Odissa

In [69]:
# Class 8 model: one full-covariance Gaussian fitted on the stacked training
# rows. With a single component the mixture weight is always 1.
odi = GM(n_components=1, init_params='kmeans', covariance_type='full')
odi.fit(odi_train)
print('WEIGHTS:', odi.weights_)

WEIGHTS: [1.]


#### Class10: Punjabi

In [70]:
# Class 9 model: one full-covariance Gaussian fitted on the stacked training
# rows. With a single component the mixture weight is always 1.
pun = GM(n_components=1, init_params='kmeans', covariance_type='full')
pun.fit(pun_train)
print('WEIGHTS:', pun.weights_)

WEIGHTS: [1.]


#### Class11: Tamil

In [71]:
# Class 10 model: one full-covariance Gaussian fitted on the stacked training
# rows. With a single component the mixture weight is always 1.
tam = GM(n_components=1, init_params='kmeans', covariance_type='full')
tam.fit(tam_train)
print('WEIGHTS:', tam.weights_)

WEIGHTS: [1.]


#### Class12: Telugu

In [72]:
# Class 11 model: one full-covariance Gaussian fitted on the stacked training
# rows. With a single component the mixture weight is always 1.
tel = GM(n_components=1, init_params='kmeans', covariance_type='full')
tel.fit(tel_train)
print('WEIGHTS:', tel.weights_)

WEIGHTS: [1.]


### Testing

In [78]:
def detect(val):
    """Return the language label for class index ``val`` (0 to 11).

    Raises ``KeyError`` when ``val`` is not one of the known indices.
    """
    labels = ['Assam', 'Bengali', 'English', 'Gujarati', 'Hindi', 'Kannada',
              'Malayalam', 'Marati', 'Odissa', 'Punjabi', 'Tamil', 'Telugu']
    # A dict (not plain list indexing) so that unknown or negative indices
    # still fail with KeyError.
    index_to_label = dict(enumerate(labels))
    return index_to_label[val]

In [76]:
def get_scores(lang, test):
    """Return the mean of ``lang.score_samples(test)``.

    ``lang`` is any object exposing ``score_samples`` (here a fitted
    Gaussian mixture, whose per-sample scores are log-likelihoods);
    ``test`` is passed to it unchanged.
    """
    per_sample = lang.score_samples(test)
    return np.mean(per_sample)

In [135]:
def get_gt(c, n):
    """Build a ground-truth list holding the class label ``c`` ``n`` times.

    A non-positive ``n`` gives an empty list.
    """
    return [c] * n

In [108]:
def system_predict(dataset):
    """Predict a class index for every item of ``dataset``.

    Each item is scored with ``get_scores`` against the twelve fitted
    language models, and the index of the highest score is taken as the
    prediction.

    Returns a list with one ``np.argmax`` result per item, in input order.
    """
    # Tuple position is the class index, so the order must stay
    # asm, ben, ..., tel to match lang_dict.
    models = (asm, ben, eng, guj, hin, kan, mal, mar, odi, pun, tam, tel)
    return [
        np.argmax([get_scores(model, sample) for model in models])
        for sample in dataset
    ]

#### PB Test Data

In [124]:
# Predicted class index for every PB test file, kept per true language so the
# ground-truth labels can be generated with matching lengths.
predicted_asm = system_predict(asm_pbtest)
predicted_ben = system_predict(ben_pbtest)
predicted_eng = system_predict(eng_pbtest)
predicted_guj = system_predict(guj_pbtest)
predicted_hin = system_predict(hin_pbtest)
predicted_kan = system_predict(kan_pbtest)
predicted_mal = system_predict(mal_pbtest)
predicted_mar = system_predict(mar_pbtest)
predicted_odi = system_predict(odi_pbtest)
predicted_pun = system_predict(pun_pbtest)
predicted_tam = system_predict(tam_pbtest)
predicted_tel = system_predict(tel_pbtest)

In [136]:
# Ground truth for the PB test set: every file of a language gets that
# language's class index, repeated once per prediction.
gt_asm = get_gt(0, len(predicted_asm))
gt_ben = get_gt(1, len(predicted_ben))
gt_eng = get_gt(2, len(predicted_eng))
gt_guj = get_gt(3, len(predicted_guj))
gt_hin = get_gt(4, len(predicted_hin))
gt_kan = get_gt(5, len(predicted_kan))
gt_mal = get_gt(6, len(predicted_mal))
gt_mar = get_gt(7, len(predicted_mar))
gt_odi = get_gt(8, len(predicted_odi))
gt_pun = get_gt(9, len(predicted_pun))
gt_tam = get_gt(10, len(predicted_tam))
gt_tel = get_gt(11, len(predicted_tel))

In [137]:
# Flatten the per-language lists into one prediction list and one label list.
# Both use the same language order so the entries stay aligned.
predicted = (
    predicted_asm + predicted_ben + predicted_eng + predicted_guj
    + predicted_hin + predicted_kan + predicted_mal + predicted_mar
    + predicted_odi + predicted_pun + predicted_tam + predicted_tel
)
gt = (
    gt_asm + gt_ben + gt_eng + gt_guj
    + gt_hin + gt_kan + gt_mal + gt_mar
    + gt_odi + gt_pun + gt_tam + gt_tel
)

In [141]:
# Rows are the true languages and columns the predicted ones, both ordered by
# class index.
confusion_matrix = metrics.confusion_matrix(
    gt,
    predicted,
    labels=list(lang_dict),
)
print(confusion_matrix)

[[245   3  15   0   7   1   3   7  69   0   6   3]
 [ 16 143   0   1   2   0   7   0   8   0   0   2]
 [ 11   0  98   0   3   0   3   0   0   0   0   1]
 [  0   0   0 152   0   0   0   7   1   2   3  14]
 [  9   6  12  20 106   1   0  11   0   3   2   9]
 [  8   6  11  13   2 128   1   0   8   0   5  15]
 [ 16   3   3   4   4   1 137   0  20   0   3   5]
 [ 24   0   1   0   0   0   5  68   5   0   4  10]
 [  4   2   2   3   5   0   2   1 179   0   0   1]
 [  0   0   0  48  15   0   0   0   0  60   0   1]
 [  2   5   0   0   1   1   1   0   2   0  94  19]
 [  3   1  11  31   5   1   4   4   2   1   7 124]]


#### YT Test Data

In [142]:
# Predicted class index for every YT test file, kept per true language.
# This reuses (overwrites) the variable names of the PB evaluation.
predicted_asm = system_predict(asm_yttest)
predicted_ben = system_predict(ben_yttest)
predicted_eng = system_predict(eng_yttest)
predicted_guj = system_predict(guj_yttest)
predicted_hin = system_predict(hin_yttest)
predicted_kan = system_predict(kan_yttest)
predicted_mal = system_predict(mal_yttest)
predicted_mar = system_predict(mar_yttest)
predicted_odi = system_predict(odi_yttest)
predicted_pun = system_predict(pun_yttest)
predicted_tam = system_predict(tam_yttest)
predicted_tel = system_predict(tel_yttest)

In [143]:
# Ground truth for the YT test set: every file of a language gets that
# language's class index, repeated once per prediction.
gt_asm = get_gt(0, len(predicted_asm))
gt_ben = get_gt(1, len(predicted_ben))
gt_eng = get_gt(2, len(predicted_eng))
gt_guj = get_gt(3, len(predicted_guj))
gt_hin = get_gt(4, len(predicted_hin))
gt_kan = get_gt(5, len(predicted_kan))
gt_mal = get_gt(6, len(predicted_mal))
gt_mar = get_gt(7, len(predicted_mar))
gt_odi = get_gt(8, len(predicted_odi))
gt_pun = get_gt(9, len(predicted_pun))
gt_tam = get_gt(10, len(predicted_tam))
gt_tel = get_gt(11, len(predicted_tel))

In [144]:
# Flatten the per-language lists into one prediction list and one label list.
# Both use the same language order so the entries stay aligned.
predicted = (
    predicted_asm + predicted_ben + predicted_eng + predicted_guj
    + predicted_hin + predicted_kan + predicted_mal + predicted_mar
    + predicted_odi + predicted_pun + predicted_tam + predicted_tel
)
gt = (
    gt_asm + gt_ben + gt_eng + gt_guj
    + gt_hin + gt_kan + gt_mal + gt_mar
    + gt_odi + gt_pun + gt_tam + gt_tel
)

In [145]:
# Rows are the true languages and columns the predicted ones, both ordered by
# class index.
confusion_matrix = metrics.confusion_matrix(
    gt,
    predicted,
    labels=list(lang_dict),
)
print(confusion_matrix)

[[  9  27  19   0   0   1  36   0  79   0   5   4]
 [  0  14  42   0   2   1  47  15  36   0   0  23]
 [  5  43  12   0   0   3  38   0  19   0   0   6]
 [  0   1  15   0   6   3 113   7   5   0   9  22]
 [  6  13   0   1   2  10  81   0  42   0   0  26]
 [ 12  32  27   0   4   4  34  18  43   1   0   6]
 [  0  55   4  10   6   3  19  24  24   3  20  12]
 [  0  22   0   0   0   0  54   6  31   0   0   6]
 [  0  28  20   0   0   0  16  43  59   0   0  14]
 [  0   1   0   0   0   0  27  14  75   0   0   4]
 [  0  16   4  20   0   0  21   0  33  17   5   1]
 [  1  36   2  18   4   2  42  22  30   0  10  11]]
