In [1]:
import sys
from PyQt5 import QtWidgets as qtw
from PyQt5 import QtCore as qtc
from PyQt5 import QtGui as qtg

from PyQt5.QtWidgets import (QTableWidget, QTableWidgetItem, QTableView)
from PyQt5.QtCore import QAbstractTableModel, Qt, QModelIndex

import seaborn as sns
from matplotlib.backends.backend_qt5agg import FigureCanvas
from matplotlib.figure import Figure

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn import tree
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

from classifier_tab import ClassifierTab
from feature_tab import FeatureExtractionTab
from test_section import TestSection

naive_bayes_params = {
    "fit_prior": True,
    "alpha": 1,
}
naive_bayes_type = MultinomialNB

train_labels = []
train_msg = []

test_label = []
test_msg = []

feature_params = {}

count_vect = {}
train_counts = {}
training_words = {}

classifier = {}

class MainWindow(qtw.QWidget):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        selected_classifier = qtc.pyqtSignal()
        knn_params = qtc.pyqtSignal(dict)

        sns.set_theme(style="darkgrid")

        self.training_upload_section()
        # TestSection(self)
        self.setUpMainTab()
        self.widget_layout()

        self.show()

    def training_upload_section(self):
        self.textedit = qtw.QTextEdit()

        self.info_text = qtw.QLabel("<h3>Training Section<h3>")
        self.info_text.setStyleSheet("""
            font-size: 24px;
        """)

        self.upload_button = qtw.QPushButton('Upload Training Data Set')
        # self.upload_button.setFixedSize(100, 40)
        self.upload_button.setStyleSheet("QPushButton"
                             "{"
                             "background-color : #2ABf9E; padding: 10px; color: white;"
                             "}"
                             "QPushButton::pressed"
                             "{"
                             "background-color : grey;"
                             "}"
                             )

        self.upload_button.clicked.connect(self.open)

        self.create_table()


    def create_table(self):
        self.traintableView = QTableWidget()
        self.traintableView.setFixedHeight(150)
        self.traintableView.setColumnCount(2)
        self.traintableView.setRowCount(100)
        self.traintableView.setHorizontalHeaderLabels(["Label", "Text"])
        self.traintableView.horizontalHeader().setStretchLastSection(True)


    def widget_layout(self):
        layout = qtw.QHBoxLayout()
        layout.setSpacing(50)

        self.left_layout = qtw.QVBoxLayout()
        # self.left_layout.setFixedWidth(550)
        right_layout = qtw.QVBoxLayout()
        # right_layout.setFixedWidth(550)

        self.left_layout.addWidget(self.info_text)
        self.left_layout.addWidget(self.traintableView)
        self.left_layout.addWidget(self.upload_button)

        self.left_layout.addWidget(self.tabwidget)

        # right_layout.addWidget(TestSection(self))

        left_widget = qtw.QWidget()
        left_widget.setLayout(self.left_layout)

        # right_widget = qtw.QWidget()
        # right_widget.setLayout(right_layout)

        self.test_section = TestSection(self)

        layout.addWidget(left_widget)
        layout.addWidget(self.test_section)

        # main_widget = qtw.QWidget()
        # main_widget.setLayout(layout)

        self.setLayout(layout)
        # self.setCentralWidget(main_widget)


    def setUpMainTab(self):
        self.tabwidget = qtw.QTabWidget()
        self.featureTab = FeatureExtractionTab(self)
        self.featureTab.submitted.connect(self.handle_feature_params)

        self.classifierTab = ClassifierTab(self)
        self.classifierTab.naive_bayes_submitted.connect(self.handle_naive_bayes_submit)
        self.classifierTab.knn_submitted.connect(self.handle_knn_submit)
        self.classifierTab.dt_submitted.connect(self.handle_dt_submit)
        self.classifierTab.lvc_submitted.connect(self.handle_lvc_submit)
        self.classifierTab.svc_submitted.connect(self.handle_svc_submit)

        self.tabwidget.addTab(self.featureTab, "Feature extraction")
        self.tabwidget.addTab(self.classifierTab, "Classifiers")

        self.tabwidget.tabBar().setTabTextColor(0, Qt.black)
        self.tabwidget.tabBar().setTabTextColor(1, Qt.black)
        
        
    def open(self):
        filename, vv = qtw.QFileDialog.getOpenFileName(filter="*.csv *.txt")
        self.hasHeading = True
        self.delimeter = ','
        labelColumn = 0
        textColumn = 1
        def checkFileHeadings():
            items = ("Yes", "No")

            item, ok = qtw.QInputDialog.getItem(self, "Does your file have headings?", 
             "Does your file have headings?", items, 0, False)

            if ok and item:
                print(item)
                if item == 'Yes':
                    print('got here')
                    self.hasHeading = True
                else:
                    print('got here 2')
                    self.hasHeading = False
                
                checkDelimiter()
                                
            
        def checkDelimiter():
            items = ("Comma Seperated", "Tab Seperated", "Semicolon Seperated", "Colon Seperated")

            item, ok = qtw.QInputDialog.getItem(self, "Type of delimiter", 
             "Type of delimiter", items, 0, False)

            if ok and item:
                # print(item)
                if item == 'Comma Seperated':
                    self.delimeter = ','
                elif item == 'Tab Seperated':
                    self.delimeter = 't'
                elif item == 'Semicolon Seperated':
                    self.delimeter = ';'
                elif item == 'Colon Seperated':
                    self.delimeter = ':'
                
                getLabelColumn()

        def getLabelColumn():
            num,ok = qtw.QInputDialog.getInt(self,"What column is the label","What column is the label - 0 indexed")

            if ok:
                # print(num)
                self.labelColumn = num
                getTextColumn()
                 

        def getTextColumn():
            num,ok = qtw.QInputDialog.getInt(self,"What column is the text","What column is the text - 0 indexed")

            if ok:
                print(num)
                if self.labelColumn == num:
                    qtw.QMessageBox.about(self, "Error", "Cannot be the same value as label column")
                else:
                    self.textColumn = num

        if filename:
            file = open(filename, 'r')
            checkFileHeadings()
            print(self.hasHeading)
            print(self.delimeter)
            print(self.labelColumn)
            print(self.textColumn)
            if self.hasHeading:
                first = file.read(1)

            lines = file.readlines()
            # we want to split the data to 70% training and 30% test
            len_of_train_data = round(len(lines) * 0.7)
            i = 0
            train_labels.clear()
            train_msg.clear()
            test_label.clear()
            test_msg.clear()
            for line in lines:
                if self.delimeter == 't':
                    splitLine = line.split('\t')
                else:
                    splitLine = line.split(self.delimeter)
                item_label = splitLine[self.labelColumn]
                item_msg = splitLine[self.textColumn]
                if (i < len_of_train_data):
                    train_labels.append(item_label)
                    train_msg.append(item_msg)
                else:
                    test_label.append(item_label)
                    test_msg.append(item_msg)
                if i < 100:
                    self.traintableView.setItem(i, 0, QTableWidgetItem(item_label))
                    self.traintableView.setItem(i, 1, QTableWidgetItem(item_msg))
                i += 1
                # if i > 4:
                #     break

            # if len(train_msg):
                


    def train_dataset(self):
        # print(feature_params)
        count_vect['value'] = CountVectorizer(**feature_params)
        # print(count_vect)
        train_counts['value'] = count_vect['value'].fit_transform(train_msg)
        training_words['value'] = count_vect['value'].get_feature_names()
        print("train_counts", train_counts)
        # print(training_words)


        # self.h_layout = qtw.QHBoxLayout()
        # self.h_layout.setSpacing(4)
        # self.h_layout.setContentsMargins(4, 4, 4, 4)
        # feature_widget = qtw.QWidget()
        # feature_widget.setLayout(self.h_layout)

        # gh = qtw.Qtwev

        # for word in training_words:
        #     self.add_tag_to_bar(word)
        
        # self.left_layout.addWidget(feature_widget)

    def add_tag_to_bar(self, text):
        label = qtw.QLabel(text)
        label.setStyleSheet('border:1px solid rgb(192, 192, 192); border-radius: 4px;')
        # label.setFixedHeight(16)
        label.setFixedSize(50, 20)
        self.h_layout.addWidget(label)
        label.setSizePolicy(qtw.QSizePolicy.Maximum, qtw.QSizePolicy.Preferred)
        # self.h_layout.addWidget(tag)


    @qtc.pyqtSlot(str, str, bool, bool, str, str, str, str, str)
    def handle_feature_params(
        self,
        strip_accents,
        analyzer,
        lowercase,
        binary,
        # stop_words,
        token_pattern,
        ngram_range,
        max_df,
        min_df,
        max_features
        ):
        if (strip_accents):
            feature_params['strip_accents'] = strip_accents
        if (analyzer):
            feature_params['analyzer'] = analyzer
        feature_params['lowercase'] = lowercase
        feature_params['binary'] = binary
        # if stop_words:
        #     feature_params['stop_words'] = stop_words
        if token_pattern:
            feature_params['token_pattern'] = token_pattern
        if ngram_range:
            (min, max) = ngram_range.split(',')
            feature_params['ngram_range'] = (int(min), int(max))
        if max_df:
            feature_params['max_df'] = max_df
        if min_df:
            feature_params['min_df'] = min_df
        if max_features:
            feature_params['max_features'] = max_features

        if (len(train_labels)):
            self.train_dataset()
            qtw.QMessageBox.about(self, "Success", "Your settings have been saved!")
        else:
            qtw.QMessageBox.about(self, "Error", "Please upload training data set")


    @qtc.pyqtSlot(str, bool, str)
    def handle_naive_bayes_submit(
        self,
        type,
        fit_prior,
        alpha,
        ):
        if (len(train_labels)):
            if 'value' not in count_vect:
                self.train_dataset()

            naive_bayes_params['fit_prior'] = fit_prior
            if alpha:
                naive_bayes_params['alpha'] = float(alpha)

            if (type == 'Complement'):
                classifier['value'] = ComplementNB(**naive_bayes_params)
            else:
                classifier['value'] = MultinomialNB(**naive_bayes_params)

            classifier['value'].fit(train_counts['value'], train_labels)
            self.update_test_section_data()
        else:
            qtw.QMessageBox.about(self, "Error", "Please upload training data set")
        
        
    @qtc.pyqtSlot(str, str, str, str, str)
    def handle_knn_submit(
        self,
        weights,
        algorithm,
        n_neighbors,
        leaf_size,
        p,
        ):
        if (len(train_labels)):
            if 'value' not in count_vect:
                self.train_dataset()

            knn_params = {}
            if n_neighbors:
                knn_params['n_neighbors'] = int(n_neighbors)
            knn_params['weights'] = weights
            knn_params['algorithm'] = algorithm
            if leaf_size:
                knn_params['leaf_size'] = int(leaf_size)

            print("knn_params -----", knn_params)
            if p:
                knn_params['p'] = int(p)

            classifier['value'] = KNeighborsClassifier(**knn_params)
            classifier['value'].fit(train_counts['value'], train_labels)
            self.update_test_section_data()
        else:
            qtw.QMessageBox.about(self, "Error", "Please upload training data set")
           
    @qtc.pyqtSlot(str, str, str, str, str, str, str, str)
    def handle_dt_submit(
        self,
        criterion,
        splitter,
        max_depth,
        min_samples_split,
        min_samples_leaf,
        max_features,
        random_state,
        max_leaf_nodes,
        ):
        if (len(train_labels)):
            if 'value' not in count_vect:
                self.train_dataset()
            dt_params = {}
            dt_params['criterion'] = criterion
            dt_params['splitter'] = splitter
            if max_depth:
                dt_params['max_depth'] = int(max_depth)
            if min_samples_split:
                dt_params['min_samples_split'] = float(min_samples_split)
            if min_samples_leaf:
                dt_params['min_samples_leaf'] = float(min_samples_leaf)
            if max_features:
                dt_params['max_features'] = float(max_features)
            if random_state:
                dt_params['random_state'] = int(random_state)
            if max_leaf_nodes:
                dt_params['max_leaf_nodes'] = int(max_leaf_nodes)

            classifier['value'] = tree.DecisionTreeClassifier(**dt_params)
            classifier['value'].fit(train_counts['value'], train_labels)
            print('classifier------', classifier['value'])
            self.update_test_section_data()
        
        else:
            qtw.QMessageBox.about(self, "Error", "Please upload training data set")
            
    
    @qtc.pyqtSlot(str, str, str, str, bool, str)       
    def handle_lvc_submit(
        self,
        regularization,
        tolerance,
        penalty,
        loss,
        dual,
        multi_class
    ):
        if (len(train_labels)):
            if 'value' not in count_vect:
                self.train_dataset()

            lvc_params = {}
            lvc_params['penalty'] = penalty
            lvc_params['loss'] = loss
            lvc_params['dual'] = dual
            lvc_params['multi_class'] = multi_class
            if regularization:
                lvc_params['C'] = float(regularization)
            if tolerance:
                lvc_params['tol'] = float(tolerance)

            classifier['value'] = svm.LinearSVC(**lvc_params)
            classifier['value'].fit(train_counts['value'], train_labels)
            print('classifier------', classifier['value'])
            self.update_test_section_data()
        
        else:
            qtw.QMessageBox.about(self, "Error", "Please upload training data set")
            
            
    @qtc.pyqtSlot(str, str, str, str, str, str, bool, bool, str)       
    def handle_svc_submit(
        self,
        regularization,
        tolerance,
        kernel,
        degree,
        gamma,
        coeff,
        shrinking,
        probability,
        decision_function_shape
    ):
        if (len(train_labels)):
            if 'value' not in count_vect:
                self.train_dataset()

            svc_params = {}
            svc_params['gamma'] = gamma
            svc_params['shrinking'] = shrinking
            svc_params['probability'] = probability
            svc_params['kernel'] = kernel
            svc_params['decision_function_shape'] = decision_function_shape
            if regularization:
                svc_params['C'] = float(regularization)
            if tolerance:
                svc_params['tol'] = float(tolerance)
            if degree:
                svc_params['degree'] = float(degree)
            if coeff:
                svc_params['coef0'] = float(coeff)
                
                
            print(svc_params)

            classifier['value'] = svm.SVC(**svc_params)
            classifier['value'].fit(train_counts['value'], train_labels)
            print('classifier------', classifier['value'])
            self.update_test_section_data()
        
        else:
            qtw.QMessageBox.about(self, "Error", "Please upload training data set")
            
        

    def update_test_section_data(self):
        self.test_section.update_variables(
            test_label,
            test_msg,
            count_vect,
            train_counts,
            training_words,
            classifier
        )

if __name__ == '__main__':
    app = qtw.QApplication(sys.argv)
    w = MainWindow()
    w.setFixedWidth(1200)
    w.setFixedHeight(900)
    # w.setGeometry(0, 0, 1200, 400)
    app.exec_()

1
No
got here 2
2
False
t
0
2

word
train_counts {'value': <2800x7389 sparse matrix of type '<class 'numpy.int64'>'
	with 101843 stored elements in Compressed Sparse Row format>}
<PyQt5.QtWidgets.QWidget object at 0x7f966bc8c3a0>
1




classifier------ LinearSVC()

 LinearSVC()
{'1': {'precision': 0.38524590163934425, 'recall': 0.5949367088607594, 'f1-score': 0.46766169154228854, 'support': 237}, '2': {'precision': 0.2066115702479339, 'recall': 0.176056338028169, 'f1-score': 0.19011406844106465, 'support': 142}, '3': {'precision': 0.7896213183730715, 'recall': 0.6857490864799025, 'f1-score': 0.7340286831812255, 'support': 821}, 'accuracy': 0.6075, 'macro avg': {'precision': 0.4604929300867832, 'recall': 0.4855807111229436, 'f1-score': 0.4639348143881929, 'support': 1200}, 'weighted avg': {'precision': 0.6407676867066858, 'recall': 0.6075, 'f1-score': 0.6170579729216165, 'support': 1200}}
-------      labels   category
0         1  Predicted
1         3  Predicted
2         1  Predicted
3         3  Predicted
4         2  Predicted
...     ...        ...
2395      1     Actual
2396      3     Actual
2397      3     Actual
2398      1     Actual
2399      3     Actual

[2400 rows x 2 columns]
classifier------ DecisionT

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



 MultinomialNB(alpha=3.0)
['3']
Yes
got here
3
True
,
1
3

word
train_counts {'value': <2800x6318 sparse matrix of type '<class 'numpy.int64'>'
	with 28121 stored elements in Compressed Sparse Row format>}
classifier------ LinearSVC()

 LinearSVC()
{'"anger"': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 8}, '"boredom"': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 7}, '"empty"': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 19}, '"enthusiasm"': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 18}, '"fun"': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 30}, '"happiness"': {'precision': 0.12, 'recall': 0.06, 'f1-score': 0.08, 'support': 50}, '"hate"': {'precision': 0.05405405405405406, 'recall': 0.034482758620689655, 'f1-score': 0.042105263157894736, 'support': 58}, '"love"': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 46}, '"neutral"': {'precision': 0.2356687898089172, 'recall': 0.32456140

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



 LinearSVC()
['"neutral"']

 LinearSVC()
['"hate"']
