In [23]:
import re
import math
from hazm import *
import numpy as np
import pandas as pd
from sklearn import svm
from string import punctuation
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

In [2]:
normalizer = Normalizer()
lemmatizer = Lemmatizer()

In [3]:
def load_datatset():
    dataset = pd.read_csv('dataset/ProjectData.csv')
    dataset = dataset[['comment','label']]
    dataset = dataset[dataset['label'] != -2]
    dataset = dataset.dropna()
    return dataset

In [4]:
dataset = load_datatset()
dataset['label'] += 2
dataset

Unnamed: 0,comment,label
0,گردن بند خوبو قشنگیه خوبم جلوه میکنه و خودشو ن...,3.0
1,به نظر من اصلا خوب نبود! به جاش با روغن زیتون ...,1.0
2,من خریدم مبلم رو بی ریخت کرد و زود پاره شد,1.0
3,حتما پیشنهاد میکنم,3.0
4,در کل عالی,3.0
...,...,...
62933,کواد کوپتر پرواز دادنش خیلی لذت بخش هست به شر...,3.0
62934,سلاممن ازش خیلس راضی هستم شارژ 10000 واقعی خیل...,3.0
62935,این فیلتر رو تا بحال دو بار نصب و تعویض کردم. ...,3.0
62936,ضد آفتاب مناسبی برای پوست چرب با رنگ‌خوب، البت...,3.0


In [5]:
counts = dataset['label'].value_counts()
counts

3.0    40149
1.0    17044
2.0     5627
Name: label, dtype: int64

In [6]:
def tokenize_text(text):
    text = normalizer.normalize(text)
    text = text.replace('.', ' ')
    text = re.sub('\s+', ' ', text).strip()
    text = text.replace('\u200c', ' ').replace('\n', '').replace('\r', '').replace('ي', 'ی').replace('ك', 'ک')
    tokens = word_tokenize(text)
    return tokens

In [7]:
def create_word_set(comments):
    word_set = set()
    for comment in comments:
        for token in comment:
            word_set.add(token)
    return word_set

In [8]:
def preprocessing(comment):
    stop_words = ['و', 'در', 'به', 'از', 'که', 'این', 'را', 'با', 'است', 'برای', 'آن', 'یک', 'خود', 'تا', 'کرد', 'بر', 'هم', 'نیز', 'گفت', 'می\u200cشود', 'وی', 'شد', 'دارد', 'ما', 'اما', 'یا', 'شده', 'باید', 'هر', 'آنها', 'بود', 'او', 'دیگر', 'دو', 'مورد', 'می\u200cکند', 'شود', 'کند', 'وجود', 'بین', 'پیش', 'شده_است', 'پس', 'نظر', 'اگر', 'همه', 'یکی', 'حال', 'هستند', 'من', 'کنند', 'نیست', 'باشد', 'چه', 'بی', 'می', 'بخش', 'می\u200cکنند', 'همین', 'افزود', 'هایی', 'دارند', 'راه', 'همچنین', 'روی', 'داد', 'سه', 'داشت', 'چند', 'سوی', 'تنها', 'هیچ', 'میان', 'اینکه', 'شدن', 'بعد', 'جدید', 'ولی', 'حتی', 'کردن', 'برخی', 'کردند', 'می\u200cدهد', 'اول', 'نه', 'کرده_است', 'نسبت', 'بیش', 'شما', 'چنین', 'طور', 'افراد', 'تمام', 'درباره', 'بار', 'بسیاری', 'می\u200cتواند', 'کرده', 'چون', 'ندارد', 'دوم', 'بزرگ', 'طی', 'حدود', 'همان', 'بدون', 'البته', 'آنان', 'می\u200cگوید', 'دیگری', 'خواهد_شد', 'کنیم', 'قابل', 'یعنی', 'رشد', 'می\u200cتوان', 'وارد', 'کل', 'ویژه', 'قبل', 'براساس', 'نیاز', 'گذاری', 'هنوز', 'لازم', 'سازی', 'بوده_است', 'چرا', 'می\u200cشوند', 'وقتی', 'گرفت', 'کم', 'جای', 'حالی', 'تغییر', 'پیدا', 'اکنون', 'تحت', 'باعث', 'مدت', 'فقط', 'تعداد', 'آیا', 'بیان', 'رو', 'شدند', 'عدم', 'کرده_اند', 'بودن', 'نوع', 'بلکه', 'جاری', 'دهد', 'برابر', 'مهم', 'بوده', 'اخیر', 'مربوط', 'امر', 'زیر', 'گیری', 'شاید', 'خصوص', 'آقای', 'اثر', 'کننده', 'بودند', 'فکر', 'کنار', 'اولین', 'سوم', 'سایر', 'کنید', 'ضمن', 'مانند', 'باز', 'می\u200cگیرد', 'ممکن', 'حل', 'دارای', 'پی', 'مثل', 'می\u200cرسد', 'اجرا', 'دور', 'منظور', 'کسی', 'موجب', 'طول', 'امکان', 'آنچه', 'تعیین', 'گفته', 'شوند', 'جمع', 'علاوه', 'گونه', 'تاکنون', 'رسید', 'ساله', 'گرفته', 'شده_اند', 'علت', 'چهار', 'داشته_باشد', 'خواهد_بود', 'طرف', 'تهیه', 'تبدیل', 'مناسب', 'زیرا', 'مشخص', 'می\u200cتوانند', 'نزدیک', 'جریان', 'روند', 'بنابراین', 'می\u200cدهند', 'یافت', 'نخستین', 'بالا', 'پنج', 'ریزی', 'چیزی', 'نخست', 'بیشتری', 'ترتیب', 'شده_بود', 'خاص', 'شروع', 'فرد', 'کامل', 'غیر', 'می\u200cرود', 'دهند', 'آخرین', 'دادن', 'جدی', 'بهترین', 'شامل', 'گیرد', 'بخشی', 'باشند', 'تمامی', 'بهتر', 'داده_است', 'حد', 'نبود', 'کسانی', 'می\u200cکرد', 'داریم', 'علیه', 'می\u200cباشد', 'دانست', 'ناشی', 'داشتند', 'دهه', 'می\u200cشد', 'ایشان', 'آنجا', 'گرفته_است', 'دچار', 'می\u200cآید', 'لحاظ', 'آنکه', 'داده', 'بعضی', 'هستیم', 'اند', 'برداری', 'نباید', 'می\u200cکنیم', 'نشست', 'سهم', 'همیشه', 'آمد', 'اش', 'وگو', 'می\u200cکنم', 'حداقل', 'طبق', 'جا', 'خواهد_کرد', 'نوعی', 'چگونه', 'رفت', 'هنگام', 'فوق', 'روش', 'ندارند', 'سعی', 'بندی', 'شمار', 'کلی', 'کافی', 'مواجه', 'همچنان', 'سمت', 'کوچک', 'داشته_است', 'چیز', 'پشت', 'آورد', 'حالا', 'روبه', 'سال\u200cهای', 'دادند', 'می\u200cکردند', 'عهده', 'نیمه', 'جایی', 'دیگران', 'سی', 'بروز', 'یکدیگر', 'آمده_است', 'جز', 'کنم', 'سپس', 'کنندگان', 'خودش', 'همواره', 'یافته', 'شان', 'صرف', 'نمی\u200cشود', 'رسیدن', 'چهارم', 'یابد', 'متر', 'ساز', 'داشته', 'کرده_بود', 'باره', 'نحوه', 'کردم', 'تو', 'شخصی', 'داشته_باشند', 'محسوب', 'پخش', 'کمی', 'متفاوت', 'سراسر', 'کاملا', 'داشتن', 'نظیر', 'آمده', 'گروهی', 'فردی', 'ع', 'همچون', 'خطر', 'خویش', 'کدام', 'دسته', 'سبب', 'عین', 'آوری', 'متاسفانه', 'بیرون', 'دار', 'ابتدا', 'شش', 'افرادی', 'می\u200cگویند', 'سالهای', 'درون', 'نیستند', 'یافته_است', 'پر', 'خاطرنشان', 'گاه', 'جمعی', 'اغلب', 'دوباره', 'می\u200cیابد', 'لذا', 'زاده', 'گردد', 'اینجا']
    REPLACE_NO_SPACE = re.compile("[.`;:!\'?,\"()\[\]،؛ًٌٍَُِّ]")
    REPLACE_NUMBER_ENGLISH = re.compile("[0-9A_Za-z۰-۹]")
    comment = [REPLACE_NO_SPACE.sub("", line) for line in comment]
    comment = [REPLACE_NUMBER_ENGLISH.sub("", line) for line in comment]
    comment = ''.join(c for c in comment if not c.isdigit())
    comment = ''.join(c for c in comment if c not in punctuation)
    comment = normalizer.normalize(comment)
    tokens = word_tokenize(comment)
    cleared_text = []
    for word in tokens:
        word = normalizer.normalize(word)
        word = lemmatizer.lemmatize(word)
        if word not in stop_words and len(word) > 1:
            cleared_text.append(word)
    return cleared_text

In [9]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [10]:
tokens = []
for i in range(len(X)):
    tokens.append(preprocessing((X[i])))

In [11]:
def DF(tokens):
    word_counts = Counter(word for feature in tokens for word in set(feature))
    total_comments = len(tokens)
    upper_threshold = total_comments * 0.8
    lower_threshold = total_comments * 0.001
    pruned_tokens_features = []
    for feature in tokens:
        pruned_feature = [word for word in feature if word_counts[word] < upper_threshold and word_counts[word] >= lower_threshold]
        pruned_tokens_features.append(pruned_feature)
    return pruned_tokens_features

In [12]:
tokens = DF(tokens)

In [13]:
def TF_IDF(tokens):
    tf = []
    for doc in tokens:
        doc_tf = {}
        for word in doc:
            doc_tf[word] = doc.count(word) / len(doc)
        tf.append(doc_tf)
    idf = {}
    for doc in tokens:
        for word in set(doc):
            if word in idf:
                idf[word] += 1
            else:
                idf[word] = 1
    num_docs = len(tokens)
    for word in idf:
        idf[word] = math.log((1 + num_docs) / (1 + idf[word])) + 1
    tfidf = []
    for doc in tf:
        doc_tfidf = {}
        for word in doc:
            doc_tfidf[word] = doc[word] * idf[word]
        tfidf.append(doc_tfidf)
    
    # Normalize the TF-IDF score for each word in each document
    for i in range(len(tfidf)):
        tfidf_values = list(tfidf[i].values())
        norm = math.sqrt(sum(x**2 for x in tfidf_values))
        for word in tfidf[i]:
            tfidf[i][word] /= norm  
    vocab = sorted(set(word for doc in tokens for word in doc))
    matrix = [[doc.get(word, 0) for word in vocab] for doc in tfidf]
    matrix = np.array(matrix)
    return matrix

In [14]:
matrix = TF_IDF(tokens)
matrix.shape

(62820, 1760)

In [15]:
X_new = SelectKBest(chi2, k=300).fit_transform(matrix, y)

In [55]:
# Logistic Regression Functions
def sigmoid(z):
    h = 1 / (1 + np.exp(-z))
    return h

def logistic_prediction(X, theta, threshold=0.5, norm=False):
    z = np.dot(X, theta.T)
    h = sigmoid(z)
    if not norm:
        return h
    else:
        if threshold == 0.5:
            yh = np.array([1 if label >= threshold else 0 for label in h]).reshape(-1, 1)
        else:
            yh = np.array([1 if label >= threshold else 0 for label in h]).reshape(-1, 1)
        return yh

def logistic_gradient(X, y, alpha, n_iter):
    m_sample, n_feature = X.shape
    theta = np.random.rand(n_feature).reshape(-1, n_feature)
    iter_cost = []
    for i in range(n_iter):
        pred = logistic_prediction(X, theta)
        change = []
        for j in range(n_feature):
            change.append((np.dot(X[:, j], (pred - y))) / m_sample)
            theta[0][j] = theta[0][j] - alpha * change[j]
        cost = abs(sum(change))
        iter_cost.append(cost)
    return theta[0], np.array(iter_cost)

def calc_cross_entropy(X, y, theta):
    m_sample = X.shape[0]
    ones = np.ones(m_sample)
    h = logistic_prediction(X, theta)
    ce = -(np.dot(y.T, np.log(h)) + np.dot((ones - y).T, np.log(ones - h))) / m_sample
    return ce[0]

In [57]:
# Our Logistic Regression

# Preparing Data
X_1v2 = X_new[np.where(y != 3)]
X_1v3 = X_new[np.where(y != 2)]
X_2v3 = X_new[np.where(y != 1)]

y_1v2 = y[y != 3]
y_1v3 = y[y != 2]
y_2v3 = y[y != 1]

y_1v2 = np.array([0 if y == 1 else 1 for y in y_1v2]).reshape(-1, 1)
y_1v3 = np.array([0 if y == 1 else 1 for y in y_1v3]).reshape(-1, 1)
y_2v3 = np.array([0 if y == 2 else 1 for y in y_2v3]).reshape(-1, 1)

X_train_1v2, X_test_1v2, y_train_1v2, y_test_1v2 = train_test_split(X_1v2, y_1v2, test_size=0.2, random_state=42)
X_train_1v3, X_test_1v3, y_train_1v3, y_test_1v3 = train_test_split(X_1v3, y_1v3, test_size=0.2, random_state=42)
X_train_2v3, X_test_2v3, y_train_2v3, y_test_2v3 = train_test_split(X_2v3, y_2v3, test_size=0.2, random_state=42)

# Train Phase Of 1 VS 2
theta_1v2, cost_1v2 = logistic_gradient(X_train_1v2, y_train_1v2, 0.1, 10000)

# Test Phase Of 1 VS 2
yh_train_1v2 = logistic_prediction(X_train_1v2, theta_1v2, norm=True)
yh_test_1v2 = logistic_prediction(X_test_1v2, theta_1v2, norm=True)

train_ce_1v2 = calc_cross_entropy(X_train_1v2, y_train_1v2, theta_1v2)
test_ce_1v2 = calc_cross_entropy(X_test_1v2, y_test_1v2, theta_1v2)

train_acc_1v2 = accuracy_score(y_train_1v2, yh_train_1v2)
test_acc_1v2 = accuracy_score(y_test_1v2, yh_test_1v2)

# Results For 1 VS 2
print('1 VS 2:')
print(f'Train Cross Entropy: {train_ce_1v2.round(2)} | Test Cross Entropy: {test_ce_1v2.round(2)}')
print(f'Train Accuracy: {round(train_acc_1v2 * 100, 2)} | Test Accuracy: {round(test_acc_1v2 * 100, 2)}\n')

# Train Phase Of 1 VS 3
theta_1v3, cost_1v3 = logistic_gradient(X_train_1v3, y_train_1v3, 0.1, 10000)

# Test Phase Of 1 VS 3
yh_train_1v3 = logistic_prediction(X_train_1v3, theta_1v3, norm=True)
yh_test_1v3 = logistic_prediction(X_test_1v3, theta_1v3, norm=True)

train_ce_1v3 = calc_cross_entropy(X_train_1v3, y_train_1v3, theta_1v3)
test_ce_1v3 = calc_cross_entropy(X_test_1v3, y_test_1v3, theta_1v3)

train_acc_1v3 = accuracy_score(y_train_1v3, yh_train_1v3)
test_acc_1v3 = accuracy_score(y_test_1v3, yh_test_1v3)

# Results For 1 VS 3
print('1 VS 3:')
print(f'Train Cross Entropy: {train_ce_1v3.round(2)} | Test Cross Entropy: {test_ce_1v3.round(2)}')
print(f'Train Accuracy: {round(train_acc_1v3 * 100, 2)} | Test Accuracy: {round(test_acc_1v3 * 100, 2)}\n')

# Train Phase Of 2 VS 3
theta_2v3, cost_2v3 = logistic_gradient(X_train_2v3, y_train_2v3, 0.1, 10000)

# Test Phase Of 2 VS 3
yh_train_2v3 = logistic_prediction(X_train_2v3, theta_2v3, norm=True)
yh_test_2v3 = logistic_prediction(X_test_2v3, theta_2v3, norm=True)

train_ce_2v3 = calc_cross_entropy(X_train_2v3, y_train_2v3, theta_2v3)
test_ce_2v3 = calc_cross_entropy(X_test_2v3, y_test_2v3, theta_2v3)

train_acc_2v3 = accuracy_score(y_train_2v3, yh_train_2v3)
test_acc_2v3 = accuracy_score(y_test_2v3, yh_test_2v3)

# Results For 2 VS 3
print('2 VS 3:')
print(f'Train Cross Entropy: {train_ce_2v3.round(2)} | Test Cross Entropy: {test_ce_2v3.round(2)}')
print(f'Train Accuracy: {round(train_acc_2v3 * 100, 2)} | Test Accuracy: {round(test_acc_2v3 * 100, 2)}\n')

1 VS 2:
Train Cross Entropy: 0.49 | Test Cross Entropy: 0.51
Train Accuracy: 75.83 | Test Accuracy: 74.42

1 VS 3:
Train Cross Entropy: 0.41 | Test Cross Entropy: 0.42
Train Accuracy: 81.44 | Test Accuracy: 80.3

2 VS 3:
Train Cross Entropy: 0.34 | Test Cross Entropy: 0.34
Train Accuracy: 87.73 | Test Accuracy: 88.05



In [59]:
# Train Confusion Matrix
print(f'1V2 Train Confusion Matrix:\n {confusion_matrix(y_train_1v2, yh_train_1v2)}\n')
print(f'1V3 Train Confusion Matrix:\n {confusion_matrix(y_train_1v3, yh_train_1v3)}\n')
print(f'2V3 Train Confusion Matrix:\n {confusion_matrix(y_train_2v3, yh_train_2v3)}\n')

# Test Confusion Matrix
print(f'1V2 Test Confusion Matrix:\n {confusion_matrix(y_test_1v2, yh_test_1v2)}\n')
print(f'1V3 Test Confusion Matrix:\n {confusion_matrix(y_test_1v3, yh_test_1v3)}\n')
print(f'2V3 Test Confusion Matrix:\n {confusion_matrix(y_test_2v3, yh_test_2v3)}\n')

1V2 Train Confusion Matrix:
 [[12649  1029]
 [ 3355  1103]]

1V3 Train Confusion Matrix:
 [[ 6240  7316]
 [ 1177 31021]]

2V3 Train Confusion Matrix:
 [[   66  4461]
 [   33 32060]]

1V2 Test Confusion Matrix:
 [[3097  269]
 [ 891  278]]

1V3 Test Confusion Matrix:
 [[1531 1957]
 [ 297 7654]]

2V3 Test Confusion Matrix:
 [[  20 1080]
 [  14 8042]]



In [60]:
# Softmax Functions
def one_hot_encode(y):
    m_sample = len(y)
    classes = np.unique(y)
    c_class = len(classes)
    one_hot = np.zeros((m_sample, c_class), int)
    for i in range(m_sample):
        for j, c in enumerate(classes):
            if y[i] == c:
                one_hot[i][j] = 1
    return one_hot

def softmax_prediction(X, theta, norm=False):
    z = np.dot(X, theta)
    h = np.exp(z - np.max(z, axis=1, keepdims=True))
    yh = h / np.sum(h, axis=1, keepdims=True)
    if not norm:
        return yh
    else:
        return np.argmax(yh, axis=1, keepdims=True) + 1

def softmax_gradient(X, y, alpha, n_iter):
    m_sample, n_feature = X.shape
    c_class = len(np.unique(y))
    theta = np.random.rand(n_feature, c_class)
    one_hot = one_hot_encode(np.array(y))
    iter_cost = []
    for i in range(n_iter):
        pred = softmax_prediction(X, theta)
        change = np.dot(X.T, (pred - one_hot))
        theta = theta - alpha * np.array(change)
        cost = 0
        for c in range(c_class):
            cost += np.sum(one_hot[:, c] * np.log10(pred[:, c]))
        cost = -cost / m_sample
        iter_cost.append(cost)
    return theta, iter_cost[-1].round(2)

In [62]:
# Our Softmax
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

class_theta, cross_entropy = softmax_gradient(X_train, y_train, 0.01, 100)

yh_train = softmax_prediction(X_train, class_theta, True)
yh_test = softmax_prediction(X_test, class_theta, True)

acc_train = accuracy_score(np.array(y_train).reshape(-1, 1), yh_train)
acc_test = accuracy_score(np.array(y_test).reshape(-1, 1), yh_test)

print('Softmax Regression Result:')
print(f'Cross Entropy: {cross_entropy}')
print(f'Train Accuracy: {round(acc_train * 100, 2)} | Test Accuracy: {round(acc_test * 100, 2)}\n')

Softmax Regression Result:
Cross Entropy: 0.23
Train Accuracy: 78.99 | Test Accuracy: 78.41



In [64]:
print(f'Train Confusion Matrix:\n {confusion_matrix(y_train, yh_train)}\n')
print(f'Test Confusion Matrix:\n {confusion_matrix(y_test, yh_test)}\n')

Train Confusion Matrix:
 [[10715   226  2754]
 [ 1648   392  2442]
 [ 3208   280 28591]]

Test Confusion Matrix:
 [[2637   52  660]
 [ 423   83  639]
 [ 848   90 7132]]



In [72]:
# Scikit-Learn Logistic Regression
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=250)
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f'Train Accuracy Using Library: {round(train_accuracy * 100, 2)}')
print(f'Test Accuracy Using Library: {round(test_accuracy * 100, 2)}')
print()
print(f'Train Confusion Matrix:\n{confusion_matrix(y_train, y_train_pred)}\n')
print(f'Test Confusion Matrix:\n{confusion_matrix(y_test, y_test_pred)}\n')

Train Accuracy Using Library: 79.83
Test Accuracy Using Library: 79.46

Train Confusion Matrix:
[[ 9959    92  3644]
 [ 1288   219  2975]
 [ 2017   121 29941]]

Test Confusion Matrix:
[[2437   21  891]
 [ 337   45  763]
 [ 532   37 7501]]



In [66]:
# Naive Bayes Functions
def class_probability(label):
    m_sample = len(label)
    _, count = np.unique(label, return_counts=True)
    class_prob = count / m_sample
    return class_prob

def index_probability(data, label):
    unique, count = np.unique(label, return_counts=True)
    c_class = len(unique)
    m_sample = len(label)
    n_feature = 28 * 28
    index_prob = np.zeros((n_feature, c_class, 2))
    for m in range(m_sample):
        for i in range(300):
            if data[m][i] == 0:
                index_prob[i][label[m]][0] += 1
            elif data[m][i] != 0:
                index_prob[i][label[m]][1] += 1
    for n in range(n_feature):
        for c in range(c_class):
            for v in range(2):
                index_prob[n][c][v] = index_prob[n][c][v] / count[c]
    return index_prob

def naive_bayes_prediction(data, c_prob, i_prob):
    c_class = len(c_prob)
    m_sample = len(data)
    data_pred = np.zeros((m_sample, 1), int)
    for i, d in enumerate(data):
        pred_arr = c_prob.copy()
        for j in range(300):
            for m in range(c_class):
                if d[j] == 0:
                    pred_arr[m] *= i_prob[j][m][0]
                elif d[j] != 0:
                    pred_arr[m] *= i_prob[j][m][1]
        data_pred[i][0] = np.argmax(pred_arr)
    return data_pred

In [68]:
# Our Naive Bayes
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)
y_train = np.array(y_train.astype(int) - 1)
y_test = np.array(y_test.astype(int) - 1)

class_prob = class_probability(y_train)
index_prob = index_probability(X_train.tolist(), y_train)

data_pred_train = naive_bayes_prediction(X_train, class_prob, index_prob)
data_pred_test = naive_bayes_prediction(X_test, class_prob, index_prob)

naive_bayes_acc_train = accuracy_score(y_train.reshape(-1, 1), data_pred_train.reshape(-1, 1))
naive_bayes_acc_test = accuracy_score(y_test.reshape(-1, 1), data_pred_test.reshape(-1, 1))

print('Train Accuracy: ' + str(round(naive_bayes_acc_train * 100, 2)))
print('Test Accuracy: ' + str(round(naive_bayes_acc_test * 100, 2)))

Train Accuracy: 77.79
Test Accuracy: 77.46


In [69]:
print(f'Train Confusion Matrix:\n {confusion_matrix(y_train, data_pred_train)}\n')
print(f'Test Confusion Matrix:\n {confusion_matrix(y_test, data_pred_test)}\n')

Train Confusion Matrix:
 [[ 9410   437  3848]
 [ 1214   613  2655]
 [ 2004  1006 29069]]

Test Confusion Matrix:
 [[2303  101  945]
 [ 298  156  691]
 [ 513  284 7273]]



In [78]:
# Scikit-Learn Naive Bayes
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

clf = MultinomialNB()
clf.fit(X_train, y_train)

train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)

print(f'Train Accuracy {round(train_accuracy * 100, 2)}')
print(f'Test Accuracy {round(test_accuracy * 100, 2)}')
print()
print(f'Train Confusion Matrix:\n {confusion_matrix(y_train, clf.predict(X_train))}\n')
print(f'Test Confusion Matrix:\n {confusion_matrix(y_test, clf.predict(X_test))}\n')

Train Accuracy 75.4
Test Accuracy 75.45

Train Confusion Matrix:
 [[ 6477     2  7216]
 [  551    13  3918]
 [  668     7 31404]]

Test Confusion Matrix:
 [[1574    0 1775]
 [ 135    2 1008]
 [ 163    3 7904]]



In [17]:
# SVM Functions
def svm (X,Y,C):
    learning_rate = 0.01
    _samples,_features = X.shape
    w = np.zeros((1, _features))
    b = 0
    for i in range(600):
        gradw = 0
        gradb = 0
        for idx, x_i in enumerate(X):
            if Y[idx] * (np.dot(w, x_i.T) + b)> 1:
                gradw += 0
                gradb += 0
            else:
                gradw += C * Y[idx] * x_i
                gradb += C * Y[idx]  
        w = w - learning_rate * w + learning_rate * gradw
        b = b + learning_rate * gradb
    return w,b

def predict(X,w,b):
    prediction = np.dot(X, w[0]) + b 
    return np.sign(prediction)

In [20]:
# Our SVM
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

#Preparing Data
X_1v2 = X_train[np.where(y_train != 3)]
X_1v3 = X_train[np.where(y_train != 2)]
X_2v3 = X_train[np.where(y_train != 1)]

y_1v2 = y_train[y_train != 3]
y_1v3 = y_train[y_train != 2]
y_2v3 = y_train[y_train != 1]

y_1v2 = np.array([-1 if y == 1 else 1 for y in y_1v2]).reshape(-1, 1)
y_1v3 = np.array([-1 if y == 1 else 1 for y in y_1v3]).reshape(-1, 1)
y_2v3 = np.array([-1 if y == 2 else 1 for y in y_2v3]).reshape(-1, 1)

w1V2, b1v2 = svm(X_1v2, y_1v2, 100)
w1V3, b1v3 = svm(X_1v3, y_1v3, 100)
w2V3, b2v3 = svm(X_2v3, y_2v3, 100)

y_pre1v2 = predict(X_test, w1V2, b1v2)
y_pre1v3 = predict(X_test, w1V3, b1v3)
y_pre2v3 = predict(X_test, w2V3, b2v3)

y_pre1v2 = [1 if y == -1 else 2 for y in y_pre1v2]
y_pre1v3 = [1 if y == -1 else 3 for y in y_pre1v3]
y_pre2v3 = [2 if y == -1 else 3 for y in y_pre2v3]

# Maximum Majority 
y_pre = []
for y in range(len(y_pre1v2)):
    temp = []
    temp.append(y_pre1v2[y])
    temp.append(y_pre1v3[y])
    temp.append(y_pre2v3[y])
    label_1 = temp.count(1)
    label_2 = temp.count(2)
    label_3 = temp.count(3)
    if label_1 == max(label_1, label_2, label_3):
        y_pre.append(1)
        continue
    if label_2 == max(label_1, label_2, label_3):
        y_pre.append(2) 
        continue
    if label_3 == max(label_1, label_2, label_3):
        y_pre.append(3) 
        continue
        
accuracy = accuracy_score(y_test, y_pre)  

print("Accuracy: ", round(accuracy * 100, 2))

Accuracy:  64.68


In [21]:
print(confusion_matrix(y_test, y_pre))

[[  29  154 3166]
 [   1   91 1053]
 [   0   63 8007]]


In [24]:
# Scikit-Learn SVM
clf = svm.SVC(kernel='linear', decision_function_shape='ovr')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", round(accuracy * 100, 2))
print(confusion_matrix(y_test, y_pre))

Accuracy:  79.15
[[  29  154 3166]
 [   1   91 1053]
 [   0   63 8007]]
