# This notebook generates all the QWK scores for set 1 to 8 using the features csv files found in 'features' directory with the include of domain adapation

All implementation are obtained from maes.ipynb

Domain adaptation methods used below: 
- SourceOnly 
- TargetOnly
- EasyAdapt 

In [1]:
import numpy as np
import pandas as pd
import random
from collections import defaultdict

from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm #SVR is in SVM
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split, KFold

# NB, SVM and BLRR model qwk scores generation 

Put here for referencing purpose

In [2]:
def qwk_nb(x_train, x_test, y_train, y_test):
    # Preprocess
    x_trainNB = x_train
    y_trainNB = y_train
    x_testNB = x_test
    y_testNB = y_test

    # Fit the model
    model_nb = naive_bayes.MultinomialNB()
    model_nb.fit(x_trainNB, y_trainNB.ravel())
    
    # Get predicted scores
    y_predNB = model_nb.predict(x_testNB)
    
    # Get QWK score
    score = cohen_kappa_score(y_test, y_predNB, weights="quadratic")
    return score, y_predNB

In [3]:
def qwk_svm(x_train, x_test, y_train, y_test):
    # Preprocess
    sc_Xsvm = StandardScaler()
    sc_ysvm = StandardScaler()
    x_trainSVM = sc_Xsvm.fit_transform(x_train)
    y_trainSVM = sc_ysvm.fit_transform(y_train)
    x_testSVM = sc_Xsvm.transform(x_test)
    y_testSVM = sc_ysvm.transform(y_test)
    
    # Fit the model
    from sklearn.svm import SVR
    model_svm = SVR(kernel='rbf', gamma='auto', verbose=True)
    model_svm.fit(x_trainSVM, y_trainSVM.ravel())
    
    # Get predicted scores
    y_predSVM = model_svm.predict(x_testSVM)
    y_predSVM = sc_ysvm.inverse_transform(y_predSVM).round()

    # Get QWK score
    score = cohen_kappa_score(y_test, y_predSVM, weights="quadratic")
    return score, y_predSVM

In [4]:
def qwk_blrr(x_train, x_test, y_train, y_test):
    # Preprocess
    sc_Xblrr = StandardScaler()
    sc_yblrr = StandardScaler()
    x_trainBLRR = sc_Xblrr.fit_transform(x_train)
    y_trainBLRR = sc_yblrr.fit_transform(y_train)
    x_testBLRR = sc_Xblrr.transform(x_test)
    y_testBLRR = sc_yblrr.transform(y_test)
    
    # Fit the model
    from sklearn import linear_model
    model_blrr = linear_model.BayesianRidge()
    model_blrr.fit(x_trainBLRR, y_trainBLRR.ravel())
    
    # Get predicted scores
    y_predBLRR = model_blrr.predict(x_testBLRR)
    y_predBLRR = sc_yblrr.inverse_transform(y_predBLRR).round()

    # Get QWK score
    score = cohen_kappa_score(y_test, y_predBLRR, weights="quadratic")
    return score, y_predBLRR

# SourceOnly

Disclaimer: not too sure of the algorithm used by the SourceOnly to obtain the QWK score, the approach below tries to use all 3 methods to obtain the scores, but have some difference with the Phandi's scores

Since target set is not used, so 4-fold cross validation (10, 25, 50, 100) can't be used

Probably needs some refinement

In [5]:
def source_only(source_set, target_set):
    # target_set is omitted as only source is used
    source_file = 'features/features_set' + str(source_set) + '.csv'
    source_dataset = pd.read_csv(source_file)
    
    # Reshape data and model (source)
    X_train = source_dataset.iloc[:,:15].values.astype(float)
    y_train = source_dataset.iloc[:,17].values.astype(float)
    y_train = np.array(y_train).reshape(-1,1)
 
    # Getting the QWK scores for all methods
    nb_score, _ = qwk_nb(X_train, X_train, y_train, y_train)
    svm_score, _ = qwk_svm(X_train, X_train, y_train, y_train)
    blrr_score, _ = qwk_blrr(X_train, X_train, y_train, y_train)
    
    return source_set, target_set, nb_score, svm_score, blrr_score

In [6]:
scores = []
for source_no in range(1, 9, 2):
    scores.append(source_only(source_no, source_no+1))

[LibSVM][LibSVM][LibSVM][LibSVM]

In [7]:
scores_df = pd.DataFrame(scores, columns=['Source', 'Target', 'BLRR', 'SVM', 'NB'])
scores_df

Unnamed: 0,Source,Target,BLRR,SVM,NB
0,1,2,0.752705,0.854028,0.810284
1,3,4,0.641433,0.724488,0.648126
2,5,6,0.743947,0.833932,0.778948
3,7,8,0.692859,0.811989,0.705152


# TargetOnly

In [8]:
"""
Function to generate sub samples of 10, 25, 50, 100
"""
def sub_sample(data_set, size, selected_set):
    net_size = size - len(selected_set)
    for i in range(net_size):
        found = False
        while not found:
            index = random.choice(data_set)
            if index not in selected_set:
                selected_set.append(index)
                found = True
    return selected_set

In [9]:
"""
Function to obtain the qwk scores of 5 folds for target only based on the given algo (the model to be used)
"""
def target_only(source_set, target_set, algo_function):
    # source_set is omitted as only target is used
    target_file = 'features/features_set' + str(target_set) + '.csv'
    target_dataset = pd.read_csv(target_file)
    
    # Reshape data and model (target)
    X = target_dataset.iloc[:,:15].values.astype(float)
    y = target_dataset.iloc[:,17].values.astype(float)
    y = np.array(y).reshape(-1,1)
    
    # Apply 5-fold 
    cv = KFold(n_splits=5)
    sizes = [10, 25, 50, 100]
    all_scores = []
    
    # Split into train and test for the 5-fold
    for train_index, test_index in cv.split(X):
        
        # Extract one fold of testing data
        X_test, y_test = X[test_index], y[test_index]

        # Sub-sample four folds of training data
        sub_sample_index = []
        scores = []
        for size in sizes:
            sub_sample_index = sub_sample(train_index, size, sub_sample_index)
            X_train, y_train = X[sub_sample_index], y[sub_sample_index]
            
            # Getting the QWK scores for the given algo function
            score, _ = algo_function(X_train, X_test, y_train, y_test)
            scores.append(score)
            
        all_scores.append(scores)
    
    # Averaging the qwk scores for the 5 folds
    averages = np.array(all_scores).mean(axis=0)
    
    return source_set, target_set, averages[0], averages[1], averages[2], averages[3]

### TargetOnly for BLRR, SVM and NB methods

Not too sure which approach was used in the Phandi's paper hence decided to try all approaches. 

Can further refine if needed.

In [10]:
blrr_scores = []
svm_scores = []
nb_scores = []
for source_no in range(1, 9, 2):
    blrr_scores.append(target_only(source_no, source_no+1, qwk_blrr))
    svm_scores.append(target_only(source_no, source_no+1, qwk_svm))
    nb_scores.append(target_only(source_no, source_no+1, qwk_nb))

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [11]:
blrr_scores_df = pd.DataFrame(blrr_scores, columns=['Source', 'Target', 'n=10', 'n=25', 'n=50', 'n=100'])
blrr_scores_df

Unnamed: 0,Source,Target,n=10,n=25,n=50,n=100
0,1,2,0.446515,0.55191,0.581513,0.611588
1,3,4,0.535292,0.635263,0.600057,0.63384
2,5,6,0.479181,0.577733,0.585821,0.632691
3,7,8,0.43029,0.52272,0.562998,0.610911


In [12]:
svm_scores_df = pd.DataFrame(svm_scores, columns=['Source', 'Target', 'n=10', 'n=25', 'n=50', 'n=100'])
svm_scores_df

Unnamed: 0,Source,Target,n=10,n=25,n=50,n=100
0,1,2,0.25194,0.429193,0.474331,0.522236
1,3,4,0.345396,0.48975,0.578151,0.600809
2,5,6,0.200154,0.352701,0.413846,0.490666
3,7,8,0.135212,0.356849,0.486679,0.572888


In [13]:
nb_scores_df = pd.DataFrame(nb_scores, columns=['Source', 'Target', 'n=10', 'n=25', 'n=50', 'n=100'])
nb_scores_df

Unnamed: 0,Source,Target,n=10,n=25,n=50,n=100
0,1,2,0.368109,0.539057,0.541,0.587259
1,3,4,0.563427,0.610877,0.65551,0.69088
2,5,6,0.42916,0.576719,0.606874,0.629351
3,7,8,0.282465,0.320242,0.415089,0.502998


# EasyAdapt

Most source code is obtained from easyadapt_sample.ipynb (located in root folder)

Classification model here used is SVM, without the StandardScalar

For adding noise to dataset, source code obtain from easyadapt_sample.ipynb located in root folder

In [14]:
def add_noise(dataset):
    mu = 5
    sigma = 0.5      # for EasyAdapt, p is set to 0.5, refer to Phandi's paper
    nrow = dataset.shape[0]
    ncol = dataset.shape[1]
    for column in range(ncol):
        c_noise = dataset.iloc[:, column] + np.random.normal(mu, sigma, nrow) 
        dataset.iloc[:, column] = c_noise
    return dataset

In [15]:
def qwk_svm_easyadapt(x_train, x_test, y_train, y_test):
    """
    Commented out the preprocess of StandardScaler because it yields negative scores, 
    so decided to use the score given instead
    """
    # Preprocess
    # sc_Xsvm = StandardScaler()
    # sc_ysvm = StandardScaler()
    # x_trainSVM = sc_Xsvm.fit_transform(x_train)
    # y_trainSVM = sc_ysvm.fit_transform(y_train)
    # x_testSVM = sc_Xsvm.transform(x_test)
    # y_testSVM = sc_ysvm.transform(y_test)

    # Fit the model
    from sklearn.svm import SVR
    model_svm = SVR()
    model_svm.fit(x_train, y_train.ravel())

    # Get predicted scores
    y_predSVM = model_svm.predict(x_test)
    y_predSVM = y_predSVM.round()

    # Get QWK score
    score = cohen_kappa_score(y_test, y_predSVM, weights="quadratic")

    return score

### Preparing training and testing data 

In [16]:
def prepare_training(X_src_train, X_tgt_train, y_src_train, y_tgt_train):
    X1 =  pd.concat([X_src_train.add_prefix('g_'), 
                     X_src_train.add_prefix('s_')], 
                     axis = 1)
    
    X2 =  pd.concat([X_tgt_train.add_prefix('g_'), 
                     X_tgt_train.add_prefix('t_')], 
                     axis = 1)
    
    X_easyadapt_train = pd.concat([X1, X2], axis=0, ignore_index=True).fillna(0)
    y_easyadapt_train = pd.concat([y_src_train, y_tgt_train], axis=0, ignore_index=True)
    
    return X_easyadapt_train, y_easyadapt_train

In [17]:
def prepare_testing(x_easyadapt_train, x_tgt_test, y_tgt_test):
    X3 = pd.DataFrame(columns = x_easyadapt_train.columns)
    X4 = pd.concat([x_tgt_test.add_prefix('g_'), 
                    x_tgt_test.add_prefix('t_')], 
                    axis = 1)
    
    X_easyadapt_test = pd.concat([X3, X4], axis=0, ignore_index=True).fillna(0)
    y_easyadapt_test = y_tgt_test
    
    return X_easyadapt_test, y_easyadapt_test

### EasyAdapt 5-fold for sub-sample of 10,25,50,100

In [18]:
def easyadapt(source_set, target_set):
    source_file = 'features/features_set' + str(source_set) + '.csv'
    source_dataset = pd.read_csv(source_file)

    target_file = 'features/features_set' + str(target_set) + '.csv'
    target_dataset = pd.read_csv(target_file)
    
    # add noise to target data
    X_tgt = target_dataset.iloc[:,:15]
    X_tgt = add_noise(X_tgt)
    y_tgt = target_dataset.iloc[:,17]
    
    # Apply 5-fold 
    cv = KFold(n_splits=5)
    sizes = [10, 25, 50, 100]
    all_scores = []
    
    for train_index, test_index in cv.split(X_tgt):
        
        # Extract one fold of target-testing data
        X_tgt_test, y_tgt_test = X_tgt.iloc[test_index], y_tgt.iloc[test_index]
        
        # Sub-sample four folds of target-training data
        sub_sample_index = []
        scores = []
        
        for size in sizes:
            
            # Obtain target-training data
            sub_sample_index = sub_sample(train_index, size, sub_sample_index)
            X_tgt_train, y_tgt_train = X_tgt.iloc[sub_sample_index], y_tgt.iloc[sub_sample_index]
            
            # Obtain source-training data
            X_src_train = source_dataset.iloc[:,:15]
            y_src_train = source_dataset.iloc[:,17]
            
            # Prepare training data by combining source-training and target-training data
            X_src_train = source_dataset.iloc[:,:15]
            y_src_train = source_dataset.iloc[:,17]
            X_easyadapt_train, y_easyadapt_train = prepare_training(X_src_train, X_tgt_train, y_src_train, y_tgt_train)
            
            # Prepare testing data from target-testing data
            X_easyadapt_test, y_easyadapt_test = prepare_testing(X_easyadapt_train, X_tgt_test, y_tgt_test)
            
            # Reshape data and model
            X_easyadapt_train = X_easyadapt_train.values.astype(float)
            y_easyadapt_train = y_easyadapt_train.values.astype(float)
            y_easyadapt_train = y_easyadapt_train.reshape(-1,1)

            X_easyadapt_test = X_easyadapt_test.values.astype(float)
            y_easyadapt_test = y_easyadapt_test.values.astype(float)
            y_easyadapt_test = y_easyadapt_test.reshape(-1,1)
            
            # Obtain QWK score
            score = qwk_svm_easyadapt(X_easyadapt_train, X_easyadapt_test, y_easyadapt_train, y_easyadapt_test)
            scores.append(score)
            
        all_scores.append(scores)
            
    # Averaging the qwk scores for the 5 folds
    averages = np.array(all_scores).mean(axis=0)
    
    return source_set, target_set, averages[0], averages[1], averages[2], averages[3]

In [19]:
easyadapt_scores = []
for source_no in range(1, 9, 2):
    easyadapt_scores.append(easyadapt(source_no, source_no+1))

In [20]:
easyadapt_scores_df = pd.DataFrame(easyadapt_scores, columns=['Source', 'Target', 'n=10', 'n=25', 'n=50', 'n=100'])
easyadapt_scores_df

Unnamed: 0,Source,Target,n=10,n=25,n=50,n=100
0,1,2,0.344211,0.438998,0.506372,0.487062
1,3,4,0.612535,0.644616,0.628283,0.633504
2,5,6,0.593338,0.59151,0.569038,0.603321
3,7,8,0.039133,0.214019,0.336371,0.465422
