In [3]:
import pandas as pd
import tensorflow as tf
import torch
import numpy as np
import time
import datetime
import random
import math
import seaborn as sns
import matplotlib.pyplot as plt

import transformers
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import sklearn
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [4]:
class TriTraining:
    def __init__(self, classifier):
        if sklearn.base.is_classifier(classifier):
            self.classifiers = [sklearn.base.clone(classifier) for i in range(3)]
        else:
            self.classifiers = [sklearn.base.clone(classifier[i]) for i in range(3)]
            
    def fit(self, L_X, L_y, U_X):
            
        for i in range(3):
            sample = sklearn.utils.resample(L_data, L_label)  # BootstrapSample(L)
            self.classifiers[i].fit(*sample)  # Learn(Si)   
        e_prime = [0.5]*3
        l_prime = [0]*3
        e = [0]*3
        update = [False]*3
        Li_X, Li_y = [[]]*3, [[]]*3#to save proxy labeled data
        improve = True
        self.iter = 0
        
        while improve:
            self.iter += 1#count iterations 
            
            for i in range(3):    
                j, k = np.delete(np.array([0,1,2]),i)
                update[i] = False
                e[i] = self.measure_error(L_X, L_y, j, k)
                if e[i] < e_prime[i]:
                    U_y_j = self.classifiers[j].predict(U_data)
                    U_y_k = self.classifiers[k].predict(U_data)
                    Li_X[i] = U_X[U_y_j == U_y_k]#when two models agree on the label, save it
                    Li_y[i] = U_y_j[U_y_j == U_y_k]
                    if l_prime[i] == 0:#no updated before
                        l_prime[i]  = int(e[i]/(e_prime[i] - e[i]) + 1)
                    if l_prime[i] <len(Li_y[i]):
                        if e[i]*len(Li_y[i])<e_prime[i] * l_prime[i]:
                            update[i] = True
                        elif l_prime[i] > e[i]/(e_prime[i] - e[i]):
                            L_index = np.random.choice(len(Li_y[i]), int(e_prime[i] * l_prime[i]/e[i] -1))#subsample from proxy labeled data
                            Li_X[i], Li_y[i] = Li_X[i][L_index], Li_y[i][L_index]
                            update[i] = True
             
            for i in range(3):
                if update[i]:
                    self.classifiers[i].fit(np.append(L_X,Li_X[i],axis=0), np.append(L_y, Li_y[i], axis=0))#train the classifier on integrated dataset
                    e_prime[i] = e[i]
                    l_prime[i] = len(Li_y[i])
    
            if update == [False]*3:
                improve = False#if no classifier was updated, no improvement


    def predict(self, X):
        pred = np.asarray([self.classifiers[i].predict(X) for i in range(3)])
        pred[0][pred[1]==pred[2]] = pred[1][pred[1]==pred[2]]
        return pred[0]
        
    def score(self, X, y):
        return sklearn.metrics.accuracy_score(y, self.predict(X))
        
    def measure_error(self, X, y, j, k):
        j_pred = self.classifiers[j].predict(X)
        k_pred = self.classifiers[k].predict(X)
        wrong_index =np.logical_and(j_pred != y, k_pred==j_pred)#model_j and model_k make the same wrong prediction
        #wrong_index =np.logical_and(j_pred != y_test, k_pred!=y_test)
        return sum(wrong_index)/sum(j_pred == k_pred)

In [18]:
class TriTrainingwDisagreement():

    def __init__(self, classifier):
        """
        args:
            classifier - classifier, with .fit, .predict API (refer to classifiers of sklearn)
        """
        # Initialize
        if sklearn.base.is_classifier(classifier):
            self.clf = [sklearn.base.clone(classifier) for i in range(3)]
        else:
            self.clf = [sklearn.base.clone(classifier[i]) for i in range(3)]

    def measure_error(self, j, k):
        """
        args:
                j - int, classifier index
                k - int, classifier index
        return:
                float, classification_error
        """
        y_predict_j = self.clf[j].predict(self.X_label)
        y_predict_k = self.clf[k].predict(self.X_label)
        return (1 - np.sum((y_predict_j == y_predict_k) & (y_predict_j == self.y_label)) / np.sum(y_predict_j == y_predict_k))

    def fit(self, X_label, y_label, X_unlabel):
        """
        args:
                X_label - labeled train feature vector (ndarray of size, # of samples * # of features), features are numeric numbers
                y_label - labeled train label vector (ndarray of size, # of samples), labels are numeric numbers
                X_unlabel - test feature vector (ndarray of size, # of samples * # of features), features are numeric numbers
        """        

        self.X_label = X_label
        self.y_label = y_label

        classification_error_current = [0.5, 0.5, 0.5]
        classification_error = [0.5, 0.5, 0.5]
        pseudo_label_size_current = [0, 0, 0]
        pseudo_label_size = [0, 0, 0]
        # pseudo_label_index used to compare and check if tri-training can be stopped, when two iterations have the same label_index, means tri-training can be stopped
        X_pseudo_label_index = [[], [], []]
        X_pseudo_label_index_current = [[], [], []]

        feature_size = self.X_label.shape[1]

        # Train each classifier with bootstrampped subset
        for i in range(3):
            X_resample, y_resample = sklearn.utils.resample(self.X_label, self.y_label)  # BootstrapSample(L)
            self.clf[i].fit(X_resample, y_resample)  # Learn(Si)

        iteration = 0
        while (True):

            update = [False, False, False]

            iteration = iteration + 1
            for i in range(3):
                X_pseudo_label_index_current[i] = X_pseudo_label_index[i]

            # Step3.1 Set Li = empty set, Li denotes the new pseudo label set determined by tri-training iteration for classifier i
            # X_pseudo_label_index, contains the data record index (in the full unlabelled set) of the new pseudo label set determined by tri-training iteration for classifier i
            # X_pseudo_label, contains the features for new pseudo label set determined by tri-training iteration for classifier i
            # y_pseudo_label, contains the labels (not ground truth label, but pseudo label calculated by tri-training iteration) for new pseudo label set determined by tri-training iteration for classifier i
            X_pseudo_label_index = [[], [], []]
            X_pseudo_label = [[], [], []]
            y_pseudo_label = [[], [], []]

            # Step 3.2 Loop through all the data record in unlabelled set
            for i in range(3):
                j, k = np.delete(np.array([0, 1, 2]), i)
                classification_error[i] = self.measure_error(j, k)
                if classification_error[i] < classification_error_current[i]:
                    # Step 3.2 If classifier j,k aggrees with the label for one data record, and not agree with classifier i, in unlabelled set,
                    # then add the data record into Li                    
                    y_predict_j = self.clf[j].predict(X_unlabel)
                    y_predict_k = self.clf[k].predict(X_unlabel)
                    y_predict_i = self.clf[i].predict(X_unlabel)
                    y_pseudo_label[i] = y_predict_j[np.logical_and(y_predict_j==y_predict_k,y_predict_j!=y_predict_i)]
                    X_pseudo_label_index[i] = np.where(np.logical_and(y_predict_j==y_predict_k,y_predict_j!=y_predict_i))
                    
                    pseudo_label_size[i] = len(X_pseudo_label_index[i])
                    #print("classification_error: {}, classification_error_current: {}, pseudo_label_size: {}, pseudo_label_size_current: {}".format(classification_error[i], classification_error_current[i], pseudo_label_size[i],pseudo_label_size_current[i]))

                    if pseudo_label_size_current[i] == 0:
                        pseudo_label_size_current[i] = math.floor(classification_error[i] / (classification_error_current[i] - classification_error[i]) + 1)
                    if pseudo_label_size_current[i] < pseudo_label_size[i]:
                        if ((classification_error[i] * pseudo_label_size[i]) < (classification_error_current[i] * pseudo_label_size_current[i])):
                            update[i] = True
                        elif pseudo_label_size_current[i] > (classification_error[i] / (classification_error_current[i] - classification_error[i])):
                            resample_size = math.ceil(classification_error_current[i] * pseudo_label_size_current[i] / classification_error[i] - 1)
                            X_pseudo_label_index[i], y_pseudo_label[i] = sklearn.utils.resample(X_pseudo_label_index[i],y_pseudo_label[i],replace=False,n_samples=resample_size)
                            pseudo_label_size[i] = len(X_pseudo_label_index[i])
                            update[i] = True

            # Step 3.3 Train all the three classifiers with Li + original labelled data set
            for i in range(3):
                if update[i] == True:
                    #print("number of pseudo labels added for classifier {} is: {}".format(i,len(X_pseudo_label_index[i])))
                    X_pseudo_label[i] = np.array(X_unlabel[X_pseudo_label_index[i]])
                    self.clf[i].fit(np.concatenate((X_pseudo_label[i], self.X_label), axis=0),np.concatenate((np.array(y_pseudo_label[i]), self.y_label), axis=0))
                    classification_error_current[i] = classification_error[i]
                    pseudo_label_size_current[i] = pseudo_label_size[i]

            # Stop tri-training process, if the pseudo label data set added in current tri-training iteration
            # is the same for last tri-training iteration for all classifiers
            if (np.array_equal(X_pseudo_label_index[0], X_pseudo_label_index_current[0]) & np.array_equal(X_pseudo_label_index[1], X_pseudo_label_index_current[1]) 
                    & np.array_equal(X_pseudo_label_index[2], X_pseudo_label_index_current[2])):
                break

    def predict(self, X_test):
        """
        args:
                X_test - test feature vector (ndarray of size, # of samples * # of features), features are numeric numbers
        return:
                array of size (# of test samples), with values as predicted label 1 or 0
        """
        I = self.clf[0].predict(X_test)
        J = self.clf[1].predict(X_test)
        K = self.clf[2].predict(X_test)
        I[J == K] = J[J == K]
        return I

    def score(self, X_test, y_test):
        """
        args:
                X_test - test feature vector (ndarray of size, # of samples * # of features), features are numeric numbers
                y_test - test label vector (ndarray of size, # of samples), labels are numeric numbers
        return:
                float, accuracy_score of predicted value by the tri-training (with disagreement) classifier against groud truth
        """
        
        return sklearn.metrics.accuracy_score(y_test, self.predict(X_test))

In [5]:
classifier = {}

from sklearn import tree

classifier['DecisionTree'] = tree.DecisionTreeClassifier()

In [6]:
def data_process(data, label, rate, test_rate=0.2):
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(data, label, test_size = test_rate, random_state=0)

    rng = np.random.RandomState(0)#to make same index every time
    labeled_index = rng.rand(len(y_train)) < rate#in training set, choose 20% as labeled data
    unlabeled_index = np.logical_not(labeled_index)
    L_data = X_train[labeled_index]#data of L
    L_label = y_train[labeled_index]#lable of L
    U_data = X_train[unlabeled_index]#data of U
    return L_data, L_label, U_data, X_test, y_test

# Haber Sınıflandırma

In [8]:
dataset = {}

data = pd.read_csv('C:/Users/Hilal KAYA/Downloads/turkish_text_data/turkish_text_data.csv', encoding='utf-8')
data['labels'] = pd.factorize(data.category)[0]

training = data.groupby('category').apply(lambda x : x.sample(frac = 0.5))

training_texts = training.text.values
training_labels = training.labels.values

In [9]:
training.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,category,text,labels
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dunya,1125,dunya,azerbaycan ve türkiye ortak ordu kursun azerb...,1
dunya,1396,dunya,rusya da şok suikast ! suriye ye askeri malze...,1
dunya,1268,dunya,mahalle arası çatışmalarda 3 ölü beyrut taki ...,1
dunya,1061,dunya,iskoçlar bağımsızlık istemiyor ingiltere ile ...,1
dunya,1224,dunya,suriye nin kimyasal silahlarıyla ilgili sert ...,1


In [10]:
training.shape

(2450, 3)

In [11]:
training.groupby("labels").count()

Unnamed: 0_level_0,category,text
labels,Unnamed: 1_level_1,Unnamed: 2_level_1
0,350,350
1,350,350
2,350,350
3,350,350
4,350,350
5,350,350
6,350,350


In [12]:
training.describe()

Unnamed: 0,labels
count,2450.0
mean,3.0
std,2.000408
min,0.0
25%,1.0
50%,3.0
75%,5.0
max,6.0


In [13]:
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-uncased', do_lower_case=True)
sentences = data.text.values
max_len = 250

In [14]:

input_ids = []
attention_masks = []

for text in training_texts:
    encoded_dict = tokenizer.encode_plus(
                        text,                     
                        add_special_tokens = True,
                        max_length = 250,      
                        pad_to_max_length = True,
                        truncation=True,
                        return_attention_mask = True, 
                        return_tensors = 'pt',
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(training_labels)

dataset['bert'] = {'X': input_ids, 'y':labels}



In [75]:
results = None

for d in dataset:
    for c in classifier:       
        print('dataset:', d, dataset[d]['X'].shape)
        print('classifier:', c)
        print('label_rate:', r)
        error = np.zeros([4,20])
        for i in range(3):
            L_data, L_label, U_data, X_test, y_test = data_process(dataset[d]['X'], dataset[d]['y'], 0.1)
            m1 = TriTraining(classifier[c])
            m1.fit(L_data, L_label, U_data)
            error[0, i] = 1-m1.score(X_test, y_test)  
            e = np.mean(error, axis = 1)
            print('TriTraining test error', e[0])
            print('TriTraining test score', 1 - e[0])
            
        test_info = {'dataset': d+str(dataset[d]['X'].shape), 'classifier': c, 'label_rate': r}
        errors = {'TriTraining': e[0]}
        if results is None:
            results = pd.DataFrame([{**test_info, **errors}])
        else:
            results.loc[len(results.index)] = {**test_info, **errors}

dataset: bert torch.Size([2398, 250])
classifier: DecisionTree
label_rate: 0.1
TriTraining test error 0.02572916666666667
TriTraining test score 0.9742708333333333
TriTraining test error 0.05062500000000001
TriTraining test score 0.949375
TriTraining test error 0.0765625
TriTraining test score 0.9234375


In [76]:
results = None

for d in dataset:
    for c in classifier:   
        print('dataset:', d, dataset[d]['X'].shape)
        print('classifier:', c)
        print('label_rate:', r)
        error = np.zeros([4,20])
        for i in range(3):
            L_data, L_label, U_data, X_test, y_test = data_process(dataset[d]['X'], dataset[d]['y'], 0.1)
            m1 = TriTrainingwDisagreement(classifier[c])
            m1.fit(L_data, L_label, U_data)
            error[0, i] = 1-m1.score(X_test, y_test)    
            e = np.mean(error, axis = 1)
            print('TriTraining Disagree test error', e[0])
            print('TriTraining Disagree test score', 1 - e[0])
            
        test_info = {'dataset': d+str(dataset[d]['X'].shape), 'classifier': c, 'label_rate': r}
        errors = {'TriTraining Disagree': e[0]}
        if results is None:
            results = pd.DataFrame([{**test_info, **errors}])
        else:
            results.loc[len(results.index)] = {**test_info, **errors}

dataset: bert torch.Size([2398, 250])
classifier: DecisionTree
label_rate: 0.1
TriTraining Disagree test error 0.022604166666666665
TriTraining Disagree test score 0.9773958333333334
TriTraining Disagree test error 0.04802083333333333
TriTraining Disagree test score 0.9519791666666667
TriTraining Disagree test error 0.07364583333333333
TriTraining Disagree test score 0.9263541666666667


# Duygudurum Analizi

In [116]:
data=pd.read_csv("C:/Users/Hilal KAYA/Downloads/sentiment/train.csv",index_col=[0],encoding="windows-1252")
data.head()

Unnamed: 0,comment,Label
0,biri bana bu filmde benim anlamadigim bisey ol...,0
1,ya çocuklar ilk filmin sonunda büyüdüler ya bu...,1
2,film biraz daha uzun sürse harbi kiyameti göre...,0
3,pek orjinal bi cinayet yok ama orjinal oyuncul...,0
4,film tek kelimeyle muhtesemdi heleki sonundaki...,1


In [121]:
training = data.groupby('Label').apply(lambda x : x.sample(frac = 0.3))
training_texts = training.comment.values
training_labels = training.Label.values
print(training_labels)

[0 0 0 ... 1 1 1]


In [81]:
training.shape

(2398, 2)

In [82]:

input_ids = []
attention_masks = []

for text in training_texts:
    encoded_dict = tokenizer.encode_plus(
                        text,                     
                        add_special_tokens = True,
                        max_length = 250,      
                        pad_to_max_length = True,
                        truncation=True,
                        return_attention_mask = True, 
                        return_tensors = 'pt',
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(training_labels)

dataset['bert'] = {'X': input_ids, 'y':labels}



In [83]:
print(dataset)

{'bert': {'X': tensor([[    2, 22467,  4767,  ...,     0,     0,     0],
        [    2, 10367,  2013,  ...,     0,     0,     0],
        [    2, 10520,  3371,  ...,     0,     0,     0],
        ...,
        [    2,  8708, 19137,  ...,     0,     0,     0],
        [    2,  2083,  3776,  ...,     0,     0,     0],
        [    2, 26382,  6191,  ...,     0,     0,     0]]), 'y': tensor([0, 0, 0,  ..., 1, 1, 1])}}


In [84]:
results = None

for d in dataset:
    for c in classifier:       
        print('dataset:', d, dataset[d]['X'].shape)
        print('classifier:', c)
        print('label_rate:', r)
        error = np.zeros([4,20])
        for i in range(3):
            L_data, L_label, U_data, X_test, y_test = data_process(dataset[d]['X'], dataset[d]['y'], 0.1)
            m1 = TriTraining(classifier[c])
            m1.fit(L_data, L_label, U_data)
            error[0, i] = 1-m1.score(X_test, y_test)  
            e = np.mean(error, axis = 1)
            print('TriTraining test error', e[0])
            print('TriTraining test score', 1 - e[0])
            
        test_info = {'dataset': d+str(dataset[d]['X'].shape), 'classifier': c, 'label_rate': r}
        errors = {'TriTraining': e[0]}
        if results is None:
            results = pd.DataFrame([{**test_info, **errors}])
        else:
            results.loc[len(results.index)] = {**test_info, **errors}

dataset: bert torch.Size([2398, 250])
classifier: DecisionTree
label_rate: 0.1
TriTraining test error 0.025312499999999998
TriTraining test score 0.9746875
TriTraining test error 0.0503125
TriTraining test score 0.9496875
TriTraining test error 0.07510416666666667
TriTraining test score 0.9248958333333334


In [85]:
results = None

for d in dataset:
    for c in classifier:   
        print('dataset:', d, dataset[d]['X'].shape)
        print('classifier:', c)
        print('label_rate:', r)
        error = np.zeros([4,20])
        for i in range(3):
            L_data, L_label, U_data, X_test, y_test = data_process(dataset[d]['X'], dataset[d]['y'], 0.1)
            m1 = TriTrainingwDisagreement(classifier[c])
            m1.fit(L_data, L_label, U_data)
            error[0, i] = 1-m1.score(X_test, y_test)    
            e = np.mean(error, axis = 1)
            print('TriTraining Disagree test error', e[0])
            print('TriTraining Disagree test score', 1 - e[0])
            
        test_info = {'dataset': d+str(dataset[d]['X'].shape), 'classifier': c, 'label_rate': r}
        errors = {'TriTraining Disagree': e[0]}
        if results is None:
            results = pd.DataFrame([{**test_info, **errors}])
        else:
            results.loc[len(results.index)] = {**test_info, **errors}

dataset: bert torch.Size([2398, 250])
classifier: DecisionTree
label_rate: 0.1
TriTraining Disagree test error 0.02364583333333333
TriTraining Disagree test score 0.9763541666666666
TriTraining Disagree test error 0.04895833333333333
TriTraining Disagree test score 0.9510416666666667
TriTraining Disagree test error 0.07447916666666668
TriTraining Disagree test score 0.9255208333333333


# Toxic Comment

In [136]:
data=pd.read_csv("C:/Users/Hilal KAYA/Downloads/toxicComment/toxicComment.csv",index_col=[0],encoding="utf-8")
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0001d958c54c6e35,"Sen, efendim, benim kahramanımsın. Hangi sayfa...",0,0,0,0,0,0
1,0002bcb3da6cb337,KADIN ÇALIŞMAYA ÇALIŞMADAN ÖNCE COCKSUCKER,1,0,0,0,0,0
2,00070ef96486d6f9,"Oh, ve yukarıdaki kız benimle tartışmalara baş...",0,0,0,0,0,0
3,000113f07ec002fd,"Hey adamım, gerçekten savaşı düzenlemeye çalış...",0,0,0,0,0,0
4,00173958f46763a2,TFD\n\nSanýrým sadece düţündük. Sanırım birbir...,0,0,0,0,0,0


In [137]:
train_cat = pd.concat([
    data[['comment_text', 'toxic']], 
]).sample(n=3000).reset_index(drop=True)

train_data = train_cat
train_data.shape

(3000, 2)

In [138]:
training_texts = train_data.comment_text.values
training_labels = train_data.toxic.values
print(training_labels)

[0 0 0 ... 0 1 0]


In [139]:
input_ids = []
attention_masks = []

for text in training_texts:
    encoded_dict = tokenizer.encode_plus(
                        text,                     
                        add_special_tokens = True,
                        max_length = 250,      
                        pad_to_max_length = True,
                        truncation=True,
                        return_attention_mask = True, 
                        return_tensors = 'pt',
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(training_labels)

dataset['bert'] = {'X': input_ids, 'y':labels}



In [140]:
print(dataset)

{'bert': {'X': tensor([[    2, 18467,  2536,  ...,     0,     0,     0],
        [    2,    30,  3716,  ...,     0,     0,     0],
        [    2,  2011,  7959,  ...,     0,     0,     0],
        ...,
        [    2,  2737, 31874,  ...,     0,     0,     0],
        [    2,     6,    30,  ...,     0,     0,     0],
        [    2,     6,    30,  ...,     0,     0,     0]]), 'y': tensor([0, 0, 0,  ..., 0, 1, 0])}}


In [141]:
results = None

for d in dataset:
    for c in classifier:       
        print('dataset:', d, dataset[d]['X'].shape)
        print('classifier:', c)
        print('label_rate:', r)
        error = np.zeros([4,20])
        for i in range(3):
            L_data, L_label, U_data, X_test, y_test = data_process(dataset[d]['X'], dataset[d]['y'], 0.1)
            m1 = TriTraining(classifier[c])
            m1.fit(L_data, L_label, U_data)
            error[0, i] = 1-m1.score(X_test, y_test)  
            e = np.mean(error, axis = 1)
            print('TriTraining test error', e[0])
            print('TriTraining test score', 1 - e[0])
            
        test_info = {'dataset': d+str(dataset[d]['X'].shape), 'classifier': c, 'label_rate': r}
        errors = {'TriTraining': e[0]}
        if results is None:
            results = pd.DataFrame([{**test_info, **errors}])
        else:
            results.loc[len(results.index)] = {**test_info, **errors}

dataset: bert torch.Size([3000, 250])
classifier: DecisionTree
label_rate: 0.1
TriTraining test error 0.0052499999999999995
TriTraining test score 0.99475
TriTraining test error 0.012166666666666664
TriTraining test score 0.9878333333333333
TriTraining test error 0.01783333333333333
TriTraining test score 0.9821666666666666


In [142]:
results = None

for d in dataset:
    for c in classifier:   
        print('dataset:', d, dataset[d]['X'].shape)
        print('classifier:', c)
        print('label_rate:', r)
        error = np.zeros([4,20])
        for i in range(3):
            L_data, L_label, U_data, X_test, y_test = data_process(dataset[d]['X'], dataset[d]['y'], 0.1)
            m1 = TriTrainingwDisagreement(classifier[c])
            m1.fit(L_data, L_label, U_data)
            error[0, i] = 1-m1.score(X_test, y_test)    
            e = np.mean(error, axis = 1)
            print('TriTraining Disagree test error', e[0])
            print('TriTraining Disagree test score', 1 - e[0])
            
        test_info = {'dataset': d+str(dataset[d]['X'].shape), 'classifier': c, 'label_rate': r}
        errors = {'TriTraining Disagree': e[0]}
        if results is None:
            results = pd.DataFrame([{**test_info, **errors}])
        else:
            results.loc[len(results.index)] = {**test_info, **errors}

dataset: bert torch.Size([3000, 250])
classifier: DecisionTree
label_rate: 0.1
TriTraining Disagree test error 0.008000000000000002
TriTraining Disagree test score 0.992
TriTraining Disagree test error 0.014083333333333337
TriTraining Disagree test score 0.9859166666666667
TriTraining Disagree test error 0.0195
TriTraining Disagree test score 0.9805
