# This notebook generates all the QWK scores for set 1 to 8 using the features csv files found in 'features' directory with the include of domain adapation

All implementation are obtained from maes.ipynb

Domain adaptation methods used below: 
- SourceOnly 
- TargetOnly
- EasyAdapt 

In [51]:
import numpy as np
import pandas as pd
import random
from collections import defaultdict

from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm #SVR is in SVM
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split, KFold

# NB, SVM and BLRR model qwk scores generation 

Put here for referencing purpose

In [52]:
def qwk_nb(x_train, x_test, y_train, y_test):
    # Preprocess
    x_trainNB = x_train
    y_trainNB = y_train
    x_testNB = x_test
    y_testNB = y_test

    # Fit the model
    model_nb = naive_bayes.MultinomialNB()
    model_nb.fit(x_trainNB, y_trainNB.ravel())
    
    # Get predicted scores
    y_predNB = model_nb.predict(x_testNB)
    
    # Get QWK score
    score = cohen_kappa_score(y_test, y_predNB, weights="quadratic")
    return score, y_predNB

In [53]:
def qwk_svm(x_train, x_test, y_train, y_test):
    # Preprocess
    sc_Xsvm = StandardScaler()
    sc_ysvm = StandardScaler()
    x_trainSVM = sc_Xsvm.fit_transform(x_train)
    y_trainSVM = sc_ysvm.fit_transform(y_train)
    x_testSVM = sc_Xsvm.transform(x_test)
    y_testSVM = sc_ysvm.transform(y_test)
    
    # Fit the model
    from sklearn.svm import SVR
    model_svm = SVR(kernel='rbf', gamma='auto', verbose=True)
    model_svm.fit(x_trainSVM, y_trainSVM.ravel())
    
    # Get predicted scores
    y_predSVM = model_svm.predict(x_testSVM)
    y_predSVM = sc_ysvm.inverse_transform(y_predSVM).round()

    # Get QWK score
    score = cohen_kappa_score(y_test, y_predSVM, weights="quadratic")
    return score, y_predSVM

In [54]:
def qwk_blrr(x_train, x_test, y_train, y_test):
    # Preprocess
    sc_Xblrr = StandardScaler()
    sc_yblrr = StandardScaler()
    x_trainBLRR = sc_Xblrr.fit_transform(x_train)
    y_trainBLRR = sc_yblrr.fit_transform(y_train)
    x_testBLRR = sc_Xblrr.transform(x_test)
    y_testBLRR = sc_yblrr.transform(y_test)
    
    # Fit the model
    from sklearn import linear_model
    model_blrr = linear_model.BayesianRidge()
    model_blrr.fit(x_trainBLRR, y_trainBLRR.ravel())
    
    # Get predicted scores
    y_predBLRR = model_blrr.predict(x_testBLRR)
    y_predBLRR = sc_yblrr.inverse_transform(y_predBLRR).round()

    # Get QWK score
    score = cohen_kappa_score(y_test, y_predBLRR, weights="quadratic")
    return score, y_predBLRR

# SourceOnly

Disclaimer: not too sure of the algorithm used by the SourceOnly to obtain the QWK score, the approach below tries to use all 3 methods to obtain the scores, but have some difference with the Phandi's scores

Since target set is not used, so 4-fold cross validation (10, 25, 50, 100) can't be used

Probably needs some refinement

In [55]:
def source_only(source_set, target_set):
    # target_set is omitted as only source is used
    source_file = 'features/features_set' + str(source_set) + '.csv'
    source_dataset = pd.read_csv(source_file)
    
    # Reshape data and model (source)
    X_train = source_dataset.iloc[:,:12].values.astype(float)
    y_train = source_dataset.iloc[:,14].values.astype(float)
    y_train = np.array(y_train).reshape(-1,1)
 
    # Getting the QWK scores for all methods
    nb_score, _ = qwk_nb(X_train, X_train, y_train, y_train)
    svm_score, _ = qwk_svm(X_train, X_train, y_train, y_train)
    blrr_score, _ = qwk_blrr(X_train, X_train, y_train, y_train)
    
    return source_set, target_set, nb_score, svm_score, blrr_score

In [64]:
scores = []
for source_no in range(1, 9, 2):
    scores.append(source_only(source_no, source_no+1))

[LibSVM][LibSVM][LibSVM][LibSVM]

In [86]:
scores_df = pd.DataFrame(scores, columns=['Source', 'Target', 'BLRR', 'SVM', 'NB'])
scores_df

Unnamed: 0,Source,Target,BLRR,SVM,NB
0,1,2,0.644863,0.846994,0.81029
1,3,4,0.646221,0.713046,0.645305
2,5,6,0.659724,0.987627,0.984906
3,7,8,0.688648,0.991209,0.989674


### Averages and Median of SourceOnly with 11 rounds

The average and median of SourceOnly are the same because SourceOnly uses only the source for both training and testing data, so all executions will produce the same results

# TargetOnly

In [66]:
"""
Function to generate sub samples of 10, 25, 50, 100
"""
def sub_sample(data_set, size, selected_set):
    net_size = size - len(selected_set)
    for i in range(net_size):
        found = False
        while not found:
            index = random.choice(data_set)
            if index not in selected_set:
                selected_set.append(index)
                found = True
    return selected_set

In [103]:
"""
Function to obtain the qwk scores of 5 folds for target only based on the given algo (the model to be used)
"""
def target_only(source_set, target_set, algo_function):
    # source_set is omitted as only target is used
    target_file = 'features/features_set' + str(target_set) + '.csv'
    target_dataset = pd.read_csv(target_file)
    
    # Reshape data and model (target)
    X = target_dataset.iloc[:,:12].values.astype(float)
    y = target_dataset.iloc[:,14].values.astype(float)
    y = np.array(y).reshape(-1,1)
    
    # Apply 5-fold 
    cv = KFold(n_splits=5)
    sizes = [10, 25, 50, 100]
    all_scores = []
    
    # Split into train and test for the 5-fold
    for train_index, test_index in cv.split(X):
        
        # Extract one fold of testing data
        X_test, y_test = X[test_index], y[test_index]

        # Sub-sample four folds of training data
        sub_sample_index = []
        scores = []
        for size in sizes:
            sub_sample_index = sub_sample(train_index, size, sub_sample_index)
            X_train, y_train = X[sub_sample_index], y[sub_sample_index]
            
            # Getting the QWK scores for the given algo function
            score, _ = algo_function(X_train, X_test, y_train, y_test)
            scores.append(score)
            
        all_scores.append(scores)
    
    # Averaging the qwk scores for the 5 folds
    averages = np.array(all_scores).mean(axis=0)
    
    return source_set, target_set, averages[0], averages[1], averages[2], averages[3]
#     return averages[0], averages[1], averages[2], averages[3]

### TargetOnly for BLRR, SVM and NB methods

Not too sure which approach was used in the Phandi's paper hence decided to try all approaches. 

Can further refine if needed.

In [68]:
blrr_scores = []
svm_scores = []
nb_scores = []
for source_no in range(1, 9, 2):
    blrr_scores.append(target_only(source_no, source_no+1, qwk_blrr))
    svm_scores.append(target_only(source_no, source_no+1, qwk_svm))
    nb_scores.append(target_only(source_no, source_no+1, qwk_nb))

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [69]:
blrr_scores_df = pd.DataFrame(blrr_scores, columns=['Source', 'Target', 'n=10', 'n=25', 'n=50', 'n=100'])
blrr_scores_df

Unnamed: 0,Source,Target,n=10,n=25,n=50,n=100
0,1,2,0.464187,0.552943,0.606813,0.604634
1,3,4,0.927789,0.979212,0.983586,0.98636
2,5,6,0.904089,0.96926,0.975456,0.976739
3,7,8,0.7579,0.872741,0.905837,0.902182


In [29]:
svm_scores_df = pd.DataFrame(svm_scores, columns=['Source', 'Target', 'n=10', 'n=25', 'n=50', 'n=100'])
svm_scores_df

Unnamed: 0,Source,Target,n=10,n=25,n=50,n=100
0,1,2,0.233563,0.432365,0.493954,0.559918
1,3,4,0.587968,0.784571,0.862818,0.927994
2,5,6,0.433688,0.767061,0.873009,0.914255
3,7,8,0.58059,0.796762,0.869759,0.892172


In [30]:
nb_scores_df = pd.DataFrame(nb_scores, columns=['Source', 'Target', 'n=10', 'n=25', 'n=50', 'n=100'])
nb_scores_df

Unnamed: 0,Source,Target,n=10,n=25,n=50,n=100
0,1,2,0.208029,0.415491,0.453854,0.462514
1,3,4,0.407002,0.443619,0.545688,0.60382
2,5,6,0.160373,0.296643,0.33481,0.421657
3,7,8,0.209542,0.169272,0.161353,0.277645


### Averages and Median of TargetOnly with 11 rounds

In [125]:
all_df = []
for i in range(11):    
    blrr_scores = []
    for source_no in range(1, 9, 2):
        blrr_scores.append(target_only(source_no, source_no+1, qwk_blrr))
    df = pd.DataFrame(blrr_scores, columns=['Source', 'Target', 'n=10', 'n=25', 'n=50', 'n=100'])
    all_df.append(df)

In [126]:
pd.concat(all_df).groupby(['Source', 'Target']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,n=10,n=25,n=50,n=100
Source,Target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2,0.477333,0.579146,0.594794,0.60404
3,4,0.935049,0.977415,0.982983,0.98528
5,6,0.929928,0.964332,0.971753,0.976172
7,8,0.760971,0.877536,0.893637,0.897103


In [127]:
pd.concat(all_df).groupby(['Source', 'Target']).median()

Unnamed: 0_level_0,Unnamed: 1_level_0,n=10,n=25,n=50,n=100
Source,Target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2,0.470192,0.589259,0.589875,0.603268
3,4,0.949488,0.979273,0.983263,0.985347
5,6,0.937017,0.964417,0.972798,0.976172
7,8,0.792484,0.876679,0.894267,0.89879


# EasyAdapt

Most source code is obtained from easyadapt_sample.ipynb (located in root folder)

Classification model here used is SVM, without the StandardScalar

For adding noise to dataset, source code obtain from easyadapt_sample.ipynb located in root folder

In [31]:
def add_noise(dataset):
    mu = 5
    sigma = 0.5      # for EasyAdapt, p is set to 0.5, refer to Phandi's paper
    nrow = dataset.shape[0]
    ncol = dataset.shape[1]
    for column in range(ncol):
        c_noise = dataset.iloc[:, column] + np.random.normal(mu, sigma, nrow) 
        dataset.iloc[:, column] = c_noise
    return dataset

In [32]:
def qwk_svm_easyadapt(x_train, x_test, y_train, y_test):
    """
    Commented out the preprocess of StandardScaler because it yields negative scores, 
    so decided to use the score given instead
    """
    # Preprocess
    # sc_Xsvm = StandardScaler()
    # sc_ysvm = StandardScaler()
    # x_trainSVM = sc_Xsvm.fit_transform(x_train)
    # y_trainSVM = sc_ysvm.fit_transform(y_train)
    # x_testSVM = sc_Xsvm.transform(x_test)
    # y_testSVM = sc_ysvm.transform(y_test)

    # Fit the model
    from sklearn.svm import SVR
    model_svm = SVR()
    model_svm.fit(x_train, y_train.ravel())

    # Get predicted scores
    y_predSVM = model_svm.predict(x_test)
    y_predSVM = y_predSVM.round()

    # Get QWK score
    score = cohen_kappa_score(y_test, y_predSVM, weights="quadratic")

    return score

### Preparing training and testing data 

In [33]:
def prepare_training(X_src_train, X_tgt_train, y_src_train, y_tgt_train):
    X1 =  pd.concat([X_src_train.add_prefix('g_'), 
                     X_src_train.add_prefix('s_')], 
                     axis = 1)
    
    X2 =  pd.concat([X_tgt_train.add_prefix('g_'), 
                     X_tgt_train.add_prefix('t_')], 
                     axis = 1)
    
    X_easyadapt_train = pd.concat([X1, X2], axis=0, ignore_index=True).fillna(0)
    y_easyadapt_train = pd.concat([y_src_train, y_tgt_train], axis=0, ignore_index=True)
    
    return X_easyadapt_train, y_easyadapt_train

In [34]:
def prepare_testing(x_easyadapt_train, x_tgt_test, y_tgt_test):
    X3 = pd.DataFrame(columns = x_easyadapt_train.columns)
    X4 = pd.concat([x_tgt_test.add_prefix('g_'), 
                    x_tgt_test.add_prefix('t_')], 
                    axis = 1)
    
    X_easyadapt_test = pd.concat([X3, X4], axis=0, ignore_index=True).fillna(0)
    y_easyadapt_test = y_tgt_test
    
    return X_easyadapt_test, y_easyadapt_test

### EasyAdapt 5-fold for sub-sample of 10,25,50,100

In [35]:
def easyadapt(source_set, target_set):
    source_file = 'features/features_set' + str(source_set) + '.csv'
    source_dataset = pd.read_csv(source_file)

    target_file = 'features/features_set' + str(target_set) + '.csv'
    target_dataset = pd.read_csv(target_file)
    
    # add noise to target data
    X_tgt = target_dataset.iloc[:,:12]
    X_tgt = add_noise(X_tgt)
    y_tgt = target_dataset.iloc[:,14]
    
    # Apply 5-fold 
    cv = KFold(n_splits=5)
    sizes = [10, 25, 50, 100]
    all_scores = []
    
    for train_index, test_index in cv.split(X_tgt):
        
        # Extract one fold of target-testing data
        X_tgt_test, y_tgt_test = X_tgt.iloc[test_index], y_tgt.iloc[test_index]
        
        # Sub-sample four folds of target-training data
        sub_sample_index = []
        scores = []
        
        for size in sizes:
            
            # Obtain target-training data
            sub_sample_index = sub_sample(train_index, size, sub_sample_index)
            X_tgt_train, y_tgt_train = X_tgt.iloc[sub_sample_index], y_tgt.iloc[sub_sample_index]
            
            # Obtain source-training data
            X_src_train = source_dataset.iloc[:,:12]
            y_src_train = source_dataset.iloc[:,14]
        
            # Prepare training data by combining source-training and target-training data
            X_src_train = source_dataset.iloc[:,:12]
            y_src_train = source_dataset.iloc[:,14]
            X_easyadapt_train, y_easyadapt_train = prepare_training(X_src_train, X_tgt_train, y_src_train, y_tgt_train)
            
            # Prepare testing data from target-testing data
            X_easyadapt_test, y_easyadapt_test = prepare_testing(X_easyadapt_train, X_tgt_test, y_tgt_test)
            
            # Reshape data and model
            X_easyadapt_train = X_easyadapt_train.values.astype(float)
            y_easyadapt_train = y_easyadapt_train.values.astype(float)
            y_easyadapt_train = y_easyadapt_train.reshape(-1,1)

            X_easyadapt_test = X_easyadapt_test.values.astype(float)
            y_easyadapt_test = y_easyadapt_test.values.astype(float)
            y_easyadapt_test = y_easyadapt_test.reshape(-1,1)
            
            # Obtain QWK score
            score = qwk_svm_easyadapt(X_easyadapt_train, X_easyadapt_test, y_easyadapt_train, y_easyadapt_test)
            scores.append(score)
            
        all_scores.append(scores)
            
    # Averaging the qwk scores for the 5 folds
    averages = np.array(all_scores).mean(axis=0)
    
    return source_set, target_set, averages[0], averages[1], averages[2], averages[3]

In [136]:
easyadapt_scores = []
for source_no in range(1, 9, 2):
    easyadapt_scores.append(easyadapt(source_no, source_no+1))

In [137]:
easyadapt_scores_df = pd.DataFrame(easyadapt_scores, columns=['Source', 'Target', 'n=10', 'n=25', 'n=50', 'n=100'])
easyadapt_scores_df

Unnamed: 0,Source,Target,n=10,n=25,n=50,n=100
0,1,2,0.376151,0.415037,0.480359,0.476847
1,3,4,0.001353,0.00147,0.00501,0.037694
2,5,6,0.298715,0.354942,0.430873,0.586012
3,7,8,0.036074,0.062309,0.101659,0.183932


### Averages and Median of EasyAdapt with 11 rounds

In [142]:
all_df = []
for i in range(11):
    easyadapt_scores = []
    for source_no in range(1, 9, 2):
        easyadapt_scores.append(easyadapt(source_no, source_no+1))
    df = pd.DataFrame(easyadapt_scores, columns=['Source', 'Target', 'n=10', 'n=25', 'n=50', 'n=100'])
    all_df.append(df)

In [143]:
pd.concat(all_df).groupby(['Source', 'Target']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,n=10,n=25,n=50,n=100
Source,Target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2,0.264888,0.415119,0.478854,0.486279
3,4,0.003076,0.005819,0.01195,0.040827
5,6,0.301382,0.353393,0.439346,0.580472
7,8,0.042739,0.060824,0.097947,0.178202


In [144]:
pd.concat(all_df).groupby(['Source', 'Target']).median()

Unnamed: 0_level_0,Unnamed: 1_level_0,n=10,n=25,n=50,n=100
Source,Target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2,0.262844,0.39623,0.48643,0.485995
3,4,0.002552,0.005683,0.012173,0.041824
5,6,0.301333,0.352299,0.436198,0.580516
7,8,0.043421,0.058019,0.094139,0.177103


# Concat

In [132]:
def add_noise_concat(dataset):
    mu = 5
    sigma = 1.0      # for Concat, p is set to 1, refer to Phandi's paper
    nrow = dataset.shape[0]
    ncol = dataset.shape[1]
    for column in range(ncol):
        c_noise = dataset.iloc[:, column] + np.random.normal(mu, sigma, nrow) 
        dataset.iloc[:, column] = c_noise
    return dataset

### Concat 5-fold for sub-sample of 10,25,50,100

In [139]:
def concat(source_set, target_set):
    source_file = 'features/features_set' + str(source_set) + '.csv'
    source_dataset = pd.read_csv(source_file)

    target_file = 'features/features_set' + str(target_set) + '.csv'
    target_dataset = pd.read_csv(target_file)
    
    # add noise to target data
    X_tgt = target_dataset.iloc[:,:12]
    X_tgt = add_noise_concat(X_tgt)     # adopted from EasyAdapt, changed here
    y_tgt = target_dataset.iloc[:,14]
    
    # Apply 5-fold 
    cv = KFold(n_splits=5)
    sizes = [10, 25, 50, 100]
    all_scores = []
    
    for train_index, test_index in cv.split(X_tgt):
        
        # Extract one fold of target-testing data
        X_tgt_test, y_tgt_test = X_tgt.iloc[test_index], y_tgt.iloc[test_index]
        
        # Sub-sample four folds of target-training data
        sub_sample_index = []
        scores = []
        
        for size in sizes:
            
            # Obtain target-training data
            sub_sample_index = sub_sample(train_index, size, sub_sample_index)
            X_tgt_train, y_tgt_train = X_tgt.iloc[sub_sample_index], y_tgt.iloc[sub_sample_index]
            
            # Obtain source-training data
            X_src_train = source_dataset.iloc[:,:12]
            y_src_train = source_dataset.iloc[:,14]
        
            # Prepare training data by combining source-training and target-training data
            X_src_train = source_dataset.iloc[:,:12]
            y_src_train = source_dataset.iloc[:,14]
            X_concat_train, y_concat_train = prepare_training(X_src_train, X_tgt_train, y_src_train, y_tgt_train)
            
            # Prepare testing data from target-testing data
            X_concat_test, y_concat_test = prepare_testing(X_concat_train, X_tgt_test, y_tgt_test)
            
            # Reshape data and model
            X_concat_train = X_concat_train.values.astype(float)
            y_concat_train = y_concat_train.values.astype(float)
            y_concat_train = y_concat_train.reshape(-1,1)

            X_concat_test = X_concat_test.values.astype(float)
            y_concat_test = y_concat_test.values.astype(float)
            y_concat_test = y_concat_test.reshape(-1,1)
            
            # Obtain QWK score
            score = qwk_svm_easyadapt(X_concat_train, X_concat_test, y_concat_train, y_concat_test)
            scores.append(score)
            
        all_scores.append(scores)
            
    # Averaging the qwk scores for the 5 folds
    averages = np.array(all_scores).mean(axis=0)
    
    return source_set, target_set, averages[0], averages[1], averages[2], averages[3]

In [140]:
concat_scores = []
for source_no in range(1, 9, 2):
    concat_scores.append(concat(source_no, source_no+1))

In [141]:
concat_scores_df = pd.DataFrame(concat_scores, columns=['Source', 'Target', 'n=10', 'n=25', 'n=50', 'n=100'])
concat_scores_df

Unnamed: 0,Source,Target,n=10,n=25,n=50,n=100
0,1,2,0.292901,0.431105,0.520595,0.494084
1,3,4,0.004714,0.00671,0.014554,0.045623
2,5,6,0.29604,0.344871,0.423413,0.576497
3,7,8,0.04036,0.068491,0.103795,0.175286


### Averages and Median of Concat with 11 rounds

In [145]:
all_df = []
for i in range(11):
    concat_scores = []
    for source_no in range(1, 9, 2):
        concat_scores.append(concat(source_no, source_no+1))
    df = pd.DataFrame(concat_scores, columns=['Source', 'Target', 'n=10', 'n=25', 'n=50', 'n=100'])
    all_df.append(df)

In [146]:
pd.concat(all_df).groupby(['Source', 'Target']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,n=10,n=25,n=50,n=100
Source,Target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2,0.327082,0.437958,0.492675,0.492684
3,4,0.002901,0.004907,0.011109,0.040046
5,6,0.300597,0.347977,0.430188,0.577973
7,8,0.045351,0.059748,0.099863,0.175944


In [147]:
pd.concat(all_df).groupby(['Source', 'Target']).median()

Unnamed: 0_level_0,Unnamed: 1_level_0,n=10,n=25,n=50,n=100
Source,Target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2,0.325597,0.453338,0.499309,0.496174
3,4,0.00225,0.005426,0.011794,0.041495
5,6,0.29983,0.353102,0.430996,0.579019
7,8,0.045059,0.057037,0.092837,0.176969
