# Naive Bayes Classifier

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import nltk
import csv
import random

In [2]:
class WordClassifier:

    def get_features(self):
        words = self._load_words()
        transliterables = words['transliterable']
        nontransliterables = words['nontransliterable']
        featureset = list()

        for trans in transliterables:
            features = self._word_features(trans)
            featureset.append((features, 'transliterable'))

        for nontrans in nontransliterables:
            features = self._word_features(nontrans)
            featureset.append((features, 'nontransliterable'))

        return featureset

    def train_and_test(self, training_percent=0.90):
        featureset = self.get_features()
        print('[1] Саралов ...')
        random.shuffle(featureset)

        word_count = len(featureset)
        cut_point = int(word_count * training_percent)

        train_set = featureset[:cut_point]
        test_set = featureset[cut_point:]

        train_set = featureset
        test_set = featureset

        print('[2] Тайёрлов ...')
        self.train(train_set)

        print('[3] Якуний синов ...')
        return self.test(test_set)

    def classify(self, word):
        feats = self._word_features(word)
        return self.classifier.classify(feats)

    def train(self, train_set):
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)
        return self.classifier

    def test(self, test_set):
        return nltk.classify.accuracy(self.classifier, test_set)

    def _load_words(self):
        words = dict()
        words['transliterable'] = list()
        words['nontransliterable'] = list()
        with open('wordlist.csv') as csvfile:
            csvreader = csv.reader(csvfile)
            for row in csvreader:
                words[row[0]].append(row[1])

        return words

    def _get_prob_distr(self, word_tuple):
        trans_prob = (word_tuple[1] * 1.0) / (word_tuple[1] + word_tuple[2])

        if trans_prob == 1.0:
            trans_prob = 0.99
        elif trans_prob == 0.0:
            trans_prob = 0.01
        else:
            pass

        nontrans_prob = 1.0 - nontrans_prob
        return (trans_prob, nontrans_prob)

    def get_most_informative_features(self, n=5):
        return self.classifier.most_informative_features(n)

    def _word_features(self, word):
        word = word.lower()
        return {
            'last_letter': word[-1],
            'last_two': word[-2:],
            'last_three': word[-3:],
            'last_four': (lambda: word[-4:] if len(word) >= 4 else word)(),
            'first_two': word[:2],
            'first_three': word[:3],
            'first_four': (lambda: word[:-4] if len(word) >= 4 else word)(),
            'last_is_vowel': (word[-1] in 'aeiou')
        }

In [3]:
wc = WordClassifier()
accuracy = wc.train_and_test()
print('Аниқлик даражаси: ', round(accuracy, 4))

_ = {
    'transliterable': 'ўгирилиши шарт',
    'nontransliterable': 'ўгирилиши шартмас'
}

[1] Саралов ...
[2] Тайёрлов ...
[3] Якуний синов ...
Аниқлик даражаси:  0.9825


In [4]:
word = 'daryo.uz'
decision = _[wc.classify(word)]

print('"{word}" сўзи кириллчага {decision}'.format(word=word, decision=decision))

"daryo.uz" сўзи кириллчага ўгирилиши шартмас


In [5]:
sentence = '''
Google yuqori tariflar bois Xitoydagi ishlab chiqarishlarini qisqartirmoqda. 
'''

In [6]:
sentence = sentence.replace(',', ' ').replace('.', ' ')
for word in sentence.split():
    word = word.lower()
    decision = _[wc.classify(word)]
    print('"{word}" сўзини кириллчага {decision}'.format(word=word, decision=decision))

"google" сўзини кириллчага ўгирилиши шартмас
"yuqori" сўзини кириллчага ўгирилиши шарт
"tariflar" сўзини кириллчага ўгирилиши шарт
"bois" сўзини кириллчага ўгирилиши шартмас
"xitoydagi" сўзини кириллчага ўгирилиши шарт
"ishlab" сўзини кириллчага ўгирилиши шарт
"chiqarishlarini" сўзини кириллчага ўгирилиши шарт
"qisqartirmoqda" сўзини кириллчага ўгирилиши шарт
