# Naive Bayes Classifier

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import nltk
import csv
import random

In [2]:
class WordClassifier:

    def get_features(self):
        names = self._load_names()
        transliterables = names['transliterable']
        nontransliterables = names['nontransliterable']
        featureset = list()

        for trans in transliterables:
            features = self._name_features(trans)
            featureset.append((features, 'transliterable'))

        for nontrans in nontransliterables:
            features = self._name_features(nontrans)
            featureset.append((features, 'nontransliterable'))

        return featureset

    def train_and_test(self, training_percent=0.90):
        featureset = self.get_features()
        print('[1] Саралов ...')
        random.shuffle(featureset)

        name_count = len(featureset)
        cut_point = int(name_count * training_percent)

        train_set = featureset[:cut_point]
        test_set = featureset[cut_point:]

        train_set = featureset
        test_set = featureset

        print('[2] Тайёрлов ...')
        self.train(train_set)

        print('[3] Якуний синов ...')
        return self.test(test_set)

    def classify(self, name):
        feats = self._name_features(name)
        return self.classifier.classify(feats)

    def train(self, train_set):
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)
        return self.classifier

    def test(self, test_set):
        return nltk.classify.accuracy(self.classifier, test_set)

    def _load_names(self):
        names = dict()
        names['transliterable'] = list()
        names['nontransliterable'] = list()
        with open('wordlist.csv') as csvfile:
            csvreader = csv.reader(csvfile)
            for row in csvreader:
                names[row[0]].append(row[1])

        return names

    def _get_prob_distr(self, name_tuple):
        trans_prob = (name_tuple[1] * 1.0) / (name_tuple[1] + name_tuple[2])

        if trans_prob == 1.0:
            trans_prob = 0.99
        elif trans_prob == 0.0:
            trans_prob = 0.01
        else:
            pass

        nontrans_prob = 1.0 - nontrans_prob
        return (trans_prob, nontrans_prob)

    def get_most_informative_features(self, n=5):
        return self.classifier.most_informative_features(n)

    def _name_features(self, name):
        name = name.lower()
        return {
            'last_letter': name[-1],
            'last_two': name[-2:],
            'last_three': name[-3:],
            'last_four': (lambda: name[-4:] if len(name) >= 4 else name)(),
            'first_two': name[:2],
            'first_three': name[:3],
            'first_four': (lambda: name[:-4] if len(name) >= 4 else name)(),
            'last_is_vowel': (name[-1] in 'aeiou')
        }

In [3]:
wc = WordClassifier()
accuracy = wc.train_and_test()
print('Аниқлик даражаси: ', round(accuracy, 2))

_ = {
    'transliterable': 'ўгирилиши керак',
    'nontransliterable': 'ўгириш керак эмас'
}

[1] Саралов ...
[2] Тайёрлов ...
[3] Якуний синов ...
Аниқлик даражаси:  0.98


In [4]:
word = 'daryo.uz'
decision = _[wc.classify(word)]

print('"{word}" сўзини кириллчага {decision}'.format(word=word, decision=decision))

"daryo.uz" сўзини кириллчага ўгириш керак эмас


In [5]:
sentence = '''
Google yuqori tariflar bois Xitoydagi ishlab chiqarishlarini qisqartirmoqda
'''

In [6]:
for word in sentence.split():
    decision = _[wc.classify(word.lower())]
    print('"{word}" сўзини кириллчага {decision}'.format(word=word, decision=decision))

"Google" сўзини кириллчага ўгириш керак эмас
"yuqori" сўзини кириллчага ўгирилиши керак
"tariflar" сўзини кириллчага ўгирилиши керак
"bois" сўзини кириллчага ўгириш керак эмас
"Xitoydagi" сўзини кириллчага ўгирилиши керак
"ishlab" сўзини кириллчага ўгирилиши керак
"chiqarishlarini" сўзини кириллчага ўгирилиши керак
"qisqartirmoqda" сўзини кириллчага ўгирилиши керак
