# XGBoost Modelling
- Data
    - Domain 1
    - Domain 2
    - Domain 1 & 2

- Data Processing
    - Sequence
    - Bag of Words
        - with n-gram
    - TF-IDF
        - with n-gram

- Parameters
    - Tree Depth

- Validation
    - 5-fold cross-validation
    
- Visualisation
    - Accuracy Graph
    - Confusion Matrix
    - ROC


In [1]:
from xgboost import XGBClassifier
import numpy as np
import json

## 1. Load Data

In [10]:
def _read_json_lines(path):
    """Yield one decoded JSON object per non-blank line of the file at *path*."""
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if line.strip():
                yield json.loads(line)


def load_data(index, data_dir="../../data"):
    """Load texts and their labels from the JSON-lines data files.

    Args:
        index: Which data to load.
            1 -> ``domain1_train.json``
            2 -> ``domain2_train.json``
            3 -> domain 1 followed by domain 2
            4 -> ``test_set.json`` (texts only)
            Any other value loads nothing.
        data_dir: Directory containing the data files. Defaults to the
            location used by the notebook.

    Returns:
        A ``(text, label, model)`` tuple of lists.

        * ``text``  - the ``"text"`` field of every record.
        * ``label`` - one binary label per training record: 0 (machine) when
          the record's ``"label"`` is 0, otherwise 1 (human).
        * ``model`` - one fine-grained source code per training record:
          domain 1 uses 0 (machine) / 1 (human); domain 2 uses
          ``record["model"] + 2`` for machine text and 9 for human text.

        For ``index == 4`` both ``label`` and ``model`` are empty.
    """
    MACHINE = 0
    HUMAN = 1
    HUMAN2 = 9
    text = []
    label = []
    model = []

    if index == 1 or index == 3:
        for data in _read_json_lines(f"{data_dir}/domain1_train.json"):
            is_machine = data["label"] == 0
            text.append(data["text"])
            # Exactly one label per text: the original appended the raw label
            # AND the mapped constant, making ``label`` twice as long as
            # ``text`` and ``model``.
            label.append(MACHINE if is_machine else HUMAN)
            model.append(MACHINE if is_machine else HUMAN)

    if index == 2 or index == 3:
        for data in _read_json_lines(f"{data_dir}/domain2_train.json"):
            is_machine = data["label"] == 0
            text.append(data["text"])
            label.append(MACHINE if is_machine else HUMAN)
            # "model" is only read for machine-generated records; the offset
            # keeps these codes clear of the domain-1 codes 0 and 1.
            model.append(data["model"] + 2 if is_machine else HUMAN2)

    if index == 4:
        for data in _read_json_lines(f"{data_dir}/test_set.json"):
            text.append(data["text"])

    return text, label, model


In [11]:
# Report how many texts each training selection contains.
for title, selection in (("Domain1", 1), ("Domain2", 2), ("Domain1&2", 3)):
    n_texts = len(load_data(selection)[0])
    print(f"{title} length:", n_texts)

Domain1 length: 19500
Domain2 length: 14900
Domain1&2 length: 34400


In [9]:
# Optional: inspect the loaded data as a DataFrame (left disabled).
# import pandas as pd
# df = pd.DataFrame({"text":text, "label":label, "model":model})

# Load domain 1 and domain 2 together (index 3) into notebook-level variables.
text, label, model = load_data(3)

## 2. Data Preprocessing

#### Vectorize the data

In [12]:
# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def toStr(n):
    """Return the ``str()`` representation of *n*.

    Passed to the vectorizer as its ``preprocessor`` so that each document is
    turned into a string before tokenisation.
    """
    return f"{n!s}"

def count_vec(text, n_features):
    """Fit a word-level TF-IDF vectorizer on *text* and encode it.

    Despite the name, this builds a ``TfidfVectorizer`` (not a
    ``CountVectorizer``) over 1- to 3-grams.

    Args:
        text: Iterable of documents. Each one is passed through ``toStr``
            before tokenisation, so documents need not already be strings.
        n_features: Value for the vectorizer's ``max_features``.

    Returns:
        ``(vector, vectorizer)``: the encoded documents and the fitted
        vectorizer, so that other data can be encoded with the same
        vocabulary via ``vectorizer.transform``.
    """
    vectorizer = TfidfVectorizer(
        # NOTE(review): per the scikit-learn docs a custom preprocessor
        # overrides the default lowercase/strip_accents stage - confirm that
        # is intended.
        preprocessor=toStr,
        analyzer="word",
        # Unlike the default pattern, this also keeps single-character tokens.
        token_pattern=r"(?u)\b\w+\b",
        ngram_range=(1, 3),
        max_features=n_features,
    )

    # Learn the vocabulary and encode in one pass; equivalent to fit()
    # followed by transform() but tokenises the corpus only once.
    vector = vectorizer.fit_transform(text)

    return vector, vectorizer


## 3. Set Parameters & Train the models

In [13]:
# Hyper-parameter grid for the experiments.
# Domain selectors understood by load_data: 1, 2, 3.
DOMAINS = range(1, 4)
# TF-IDF vocabulary sizes (max_features): 5000, 10000, 15000.
N_FEATURES = range(5000, 20000, 5000)
# XGBoost max_depth values: 1, 3, 5, 7, 9.
T_DEPTHS = range(1, 10, 2)


In [52]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from collections import Counter
import pandas as pd
import os

def _to_binary_labels(y):
    """Map source codes to binary labels: 1 for codes 1 and 9 (human), else 0."""
    y = np.asarray(y)
    return np.isin(y, (1, 9)).astype(y.dtype)


def tfidf_training(domain, features, t_depth):
    """Train and evaluate an XGBoost human-vs-machine classifier on TF-IDF features.

    Steps:
        1. Load the data for *domain* and encode it with ``count_vec``.
        2. Hold out 20% of the rows as a test split.
        3. Oversample the training split so every source code is equally
           frequent, collapse the codes to binary labels, then undersample so
           both binary classes are equally frequent.
        4. Report 5-fold cross-validation scores on the resampled data, fit on
           all of it, and print the hold-out confusion matrix.
        5. Append one row of results to ``./XGB_Records.csv``.
        6. Predict the Kaggle test set and overwrite
           ``./result_XGB_predictions.csv``.

    Args:
        domain: Data selector passed to ``load_data`` (1, 2 or 3).
        features: ``max_features`` for the TF-IDF vectorizer.
        t_depth: ``max_depth`` for the XGBoost trees.

    Returns:
        ``(clf, X_test, y_test)``: the fitted classifier together with the
        hold-out features and their binary labels.
    """
    text, _, model = load_data(domain)

    # Vectorize the text data.
    # NOTE(review): the vectorizer is fitted on all rows before the split, so
    # the hold-out rows influence the vocabulary/IDF weights.
    X, vectorizer = count_vec(text, features)
    X = X.toarray()
    y = np.array(model).ravel()

    # Split data to train and test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Resampling
    print(f'Original dataset shape {X_train.shape}')
    print(f'Original dataset samples per class {Counter(y_train)}')

    # Balance the fine-grained source codes first ...
    ros = RandomOverSampler(random_state=42)
    X_res, y_res = ros.fit_resample(X_train, y_train)
    print(f'Resampled dataset samples per class {Counter(y_res)}')

    # ... then collapse to machine (0) vs human (1) and balance those.
    y_res = _to_binary_labels(y_res)
    rus = RandomUnderSampler(random_state=42)
    X_res, y_res = rus.fit_resample(X_res, y_res)
    print(f'Resampled dataset samples per class {Counter(y_res)}')

    # Train the XGBoost model
    clf = XGBClassifier(max_depth=t_depth, random_state=0, objective='binary:logistic')

    # Cross-validation scores.
    # NOTE(review): the folds are drawn from data that was oversampled above,
    # so duplicated rows can land in both the train and validation folds;
    # treat these scores as optimistic next to the hold-out confusion matrix.
    cv_score = cross_val_score(clf, X_res, y_res, cv=5)
    print(cv_score)

    # Confusion matrix on the hold-out split
    clf.fit(X_res, y_res)
    y_test = _to_binary_labels(y_test)
    y_pred = clf.predict(X_test)
    # Pin the label order so the matrix is always 2x2 and unpacks cleanly,
    # even if one class is absent from the hold-out split.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    print(cm)
    tn, fp, fn, tp = cm.ravel()

    # Record on csv
    record = pd.DataFrame({
        'n_features': [features],
        't_depth': [t_depth],
        '5-fold_cv_score': [sum(cv_score)/len(cv_score)],
        'tn': [tn],
        'fp': [fp],
        'fn': [fn],
        'tp': [tp],
    })
    path = "./XGB_Records.csv"
    # Only write the header when the file is being created.
    write_header = not os.path.exists(path)
    record.to_csv(path, index=False, header=write_header, mode='a')

    # For Kaggle prediction.
    # Use separate names so the hold-out X_test/y_test pair returned below
    # stays consistent, and densify the features: the model was trained on
    # dense arrays, and XGBoost's tree booster treats absent entries of a
    # sparse matrix as missing values rather than zeros.
    kaggle_text, _, _ = load_data(4)
    X_kaggle = vectorizer.transform(kaggle_text).toarray()
    kaggle_pred = clf.predict(X_kaggle)

    result = pd.DataFrame({"class": kaggle_pred}).reset_index().rename(columns={'index': 'id'})
    result.to_csv("./result_XGB_predictions.csv", index=False)

    return clf, X_test, y_test

## 4. Visualisation

In [54]:
# Train on the combined domains (3) for every vocabulary-size / tree-depth
# pair, in the same order as the nested loops: depth varies fastest.
grid = [
    (domain_id, n_feat, depth)
    for domain_id in [3]
    for n_feat in N_FEATURES
    for depth in T_DEPTHS
]
for domain_id, n_feat, depth in grid:
    # For tfidf
    print(f"For domain {domain_id}, {n_feat} features, {depth} depth")
    tfidf_training(domain_id, n_feat, depth)

For domain 3, 5000 features, 9 depth
Original dataset shape (27520, 5000)
Original dataset samples per class Counter({1: 7795, 0: 7754, 2: 1917, 3: 1894, 5: 1885, 4: 1864, 9: 1714, 8: 1437, 6: 634, 7: 626})
Resampled dataset samples per class Counter({4: 7795, 0: 7795, 8: 7795, 1: 7795, 9: 7795, 3: 7795, 5: 7795, 2: 7795, 6: 7795, 7: 7795})
Resampled dataset samples per class Counter({0: 15590, 1: 15590})
[0.96279666 0.9634381  0.96327774 0.95846697 0.96071199]
[[3888  601]
 [ 342 2049]]
