# XGBoost Modelling
- Data
    - Domain 1
    - Domain 2
    - Domain 1 & 2

- Data Processing
    - Sequence
    - Bag of Words
        - with n-gram
    - TFIDF
        - with n-gram

- Parameters
    - Tree Depth

- Validation
    - 10-fold cross validation
    
- Visualisation
    - Accuracy Graph
    - Confusion Matrix


In [1]:
# from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.datasets import make_classification
import numpy as np
import json

## 1. Load Data

In [2]:
def load_data(index):
    """Load samples from the project's JSON-lines data files.

    Parameters
    ----------
    index : int
        Which data to load:
        1 -> domain 1 training set,
        2 -> domain 2 training set,
        3 -> domain 1 followed by domain 2,
        4 -> unlabelled test set.
        Any other value loads nothing and returns three empty lists.

    Returns
    -------
    text : list
        The "text" field of every record, in file order.
    label : list
        The "label" field of every training record (empty for index 4).
    model : list
        Generator id per training record (empty for index 4): records whose
        label is 0 get 7 in domain 1 and their own "model" field in domain 2;
        every other record gets 8 (human).
    """
    MACHINE1 = 7
    HUMAN = 8
    text = []
    label = []
    model = []

    def read_records(path):
        # Explicit UTF-8 keeps parsing independent of the platform's default
        # encoding; blank lines (e.g. a trailing newline) are skipped instead
        # of crashing json.loads.
        with open(path, encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    yield json.loads(line)

    if index in (1, 3):
        for data in read_records("../../data/domain1_train.json"):
            text.append(data["text"])
            label.append(data["label"])
            # Domain 1 has a single machine generator, so label 0 maps to a fixed id.
            model.append(MACHINE1 if data["label"] == 0 else HUMAN)

    if index in (2, 3):
        for data in read_records("../../data/domain2_train.json"):
            text.append(data["text"])
            label.append(data["label"])
            # Domain 2 records carry their own generator id for machine samples.
            model.append(data["model"] if data["label"] == 0 else HUMAN)

    if index == 4:
        # The test set is only read for its text; label/model stay empty.
        for data in read_records("../../data/test_set.json"):
            text.append(data["text"])

    return text, label, model


In [3]:
# Report how many documents each training configuration yields.
for _caption, _index in (
    ("Domain1 length:", 1),
    ("Domain2 length:", 2),
    ("Domain1&2 length:", 3),
):
    print(_caption, len(load_data(_index)[0]))

Domain1 length: 19500
Domain2 length: 14900
Domain1&2 length: 34400


In [4]:
# import pandas as pd

# Load the combined domain 1 + domain 2 training set (index 3) into notebook-level names.
# NOTE(review): the training functions below call load_data() themselves, so these
# globals look unused in the visible cells — confirm before relying on them.
text, label, model = load_data(3)

# df3 = pd.DataFrame({"text":text, "label":label, "model":model})

## 2. Data Preprocessing

#### Create sample data for making vector space

In [5]:
# def vector_sample(text):
#     # Number of words: 5000
#     sample = [np.arange(5000).tolist()]

#     # For n-gram
#     sample += text

#     return sample

In [6]:
# # Data domain
# d = 1
# print(f"sample size for D{d}:", len(vector_sample(load_data(d)[0])))

#### Vectorize the data

In [7]:
# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def toStr(n):
    """Return ``str(n)``.

    Handed to ``TfidfVectorizer`` as its ``preprocessor`` so that each document
    is converted to a string before tokenisation.
    """
    return str(n)

def count_vec(text, n_features):
    """Fit a TF-IDF vectorizer on ``text`` and return the encoded corpus.

    Despite the name, this builds TF-IDF features (not raw counts) over word
    1- to 3-grams.

    Parameters
    ----------
    text : iterable
        The documents. Each one is passed through ``toStr`` before tokenising,
        so non-string documents are accepted (presumably lists of token ids —
        confirm against the data files).
    n_features : int
        Upper bound on the vocabulary size (``max_features``).

    Returns
    -------
    vector : sparse matrix
        Document-term matrix for ``text``.
    vectorizer : TfidfVectorizer
        The fitted vectorizer, reusable to ``transform`` unseen documents.
    """
    vectorizer = TfidfVectorizer(
        preprocessor=toStr,
        analyzer="word",
        # \b\w+\b also keeps single-character tokens, unlike the library default.
        token_pattern=r"(?u)\b\w+\b",
        ngram_range=(1, 3),
        max_features=n_features,
    )

    # fit_transform learns the vocabulary/idf and encodes the corpus in one
    # pass; it is documented as equivalent to fit() followed by transform(),
    # without tokenising every document twice.
    vector = vectorizer.fit_transform(text)

    return vector, vectorizer


In [8]:
# Data domain
d = 1
# Sanity-check the encoded matrix dimensions. The sparse matrix exposes .shape
# directly, so there is no need to materialise a dense copy just to read it.
count_vec(load_data(d)[0], 10000)[0].shape

(19500, 10000)

## 3. Set Parameters & Train the models

In [9]:
# Hyper-parameter grid for the experiments below.
DOMAINS = range(1, 4)  # load_data indices 1, 2, 3 (domain 1, domain 2, both)
N_FEATURES = range(5000, 20000, 5000)  # TF-IDF max_features: 5000, 10000, 15000
T_DEPTHS = range(1, 10, 2)  # XGBoost max_depth: 1, 3, 5, 7, 9


In [10]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter

def tfidf_training(domain, features, t_depth):
    """Train an XGBoost human-vs-machine classifier and return a held-out set.

    Steps: load the data for ``domain``, hold out 20% of it, fit TF-IDF on the
    remaining 80%, oversample the per-generator classes with SMOTE, collapse
    the target to binary (1 = generator id 8, 0 = anything else) and fit the
    classifier.

    Parameters
    ----------
    domain : int
        Index forwarded to ``load_data``.
    features : int
        Vocabulary size forwarded to ``count_vec``.
    t_depth : int
        ``max_depth`` of the XGBoost trees.

    Returns
    -------
    clf : XGBClassifier
        The fitted classifier.
    X_test : ndarray
        Dense TF-IDF matrix of the held-out documents.
    y_test : ndarray
        Binary target of the held-out documents.
    """
    corpus, _labels, generators = load_data(domain)
    train_docs, held_out_docs, train_gen, held_out_gen = train_test_split(
        corpus, generators, test_size=0.2, random_state=42
    )

    # The vectorizer only ever sees the training portion.
    tfidf_matrix, vectorizer = count_vec(train_docs, features)
    X = tfidf_matrix.toarray()
    y = np.array(train_gen).ravel()

    print(f'Original dataset shape {X.shape}')
    print(f'Original dataset samples per class {Counter(y)}')

    # Oversample so that every generator class has the same number of rows.
    X_res, y_multi = SMOTE(random_state=42).fit_resample(X, y)
    print(f'Resampled dataset samples per class {Counter(y_multi)}')

    # Collapse the multi-class generator ids into human (1) vs machine (0).
    y_res = (y_multi == 8).astype(int)

    clf = XGBClassifier(max_depth=t_depth, random_state=0, objective='binary:logistic')
    clf.fit(X_res, y_res)

    X_test = vectorizer.transform(held_out_docs).toarray()
    y_test = (np.array(held_out_gen).ravel() == 8).astype(int)
    return clf, X_test, y_test

## 4. Validation

In [11]:
# from sklearn.model_selection import cross_val_score
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import accuracy_score
# import pandas as pd

# n = 10
# # cv_scores = []
# # r2_scores = []
# acc_scores = []
# c_matrix = []


# for d in [3]: #DOMAINS:
#     for f in [15000]:#N_FEATURES:
#         for t in [5]:#T_DEPTHS:
#             # For BoW
#             clf, X, y = tfidf_training(d, f, t)

#             print(f"For Domain{d} & {f} Features & {t} Depth:")
#             print("Shape of X:", X.shape)
#             print("Shape of y:", y.shape)
    
#             # acc score for categorical ddata
#             y_pred = clf.predict(X)
#             acc_scores.append(accuracy_score(y, y_pred))
#             print("Accuracy Score\n", acc_scores[-1])

#             # Confusion matrix
#             c_matrix.append(confusion_matrix(y, y_pred))
#             print("Confusion Matirx\n", c_matrix[-1])

#             # Cross-validation score
#             # cv_scores.append(cross_val_score(clf, X, y, cv=n))
#             # print("CV Score\n", cv_scores[-1], "\n")

#             # To csv
#             cv_result = pd.DataFrame([acc_scores[-1]], index=["acc"], columns=["scores"])
#             cv_result.to_csv(f"../../data/results/XGB_mc/XGB-CV-D{d}_F{f}T{t}-tfidf.csv")
#             cm_result = pd.DataFrame(c_matrix[-1])
#             cm_result.to_csv(f"../../data/results/XGB_mc/XGB-CM-D{d}_F{f}T{t}-tfidf.csv")


In [12]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter
import pandas as pd

def tfidf_training_kaggle(domain, features, t_depth):
    """Train on the full training set and write test-set predictions to CSV.

    Steps: load all data for ``domain``, fit TF-IDF on it, oversample the
    per-generator classes with SMOTE, collapse the target to binary
    (1 = human, 0 = machine), fit XGBoost, predict the unlabelled test set
    (``load_data(4)``) and save the predictions.

    Parameters
    ----------
    domain : int
        Index forwarded to ``load_data`` for the training data.
    features : int
        Vocabulary size forwarded to ``count_vec``.
    t_depth : int
        ``max_depth`` of the XGBoost trees.

    Returns
    -------
    clf : XGBClassifier
        The fitted classifier.
    X_res : ndarray
        Resampled dense training matrix.
    y_res : ndarray
        Binary training target matching ``X_res``.

    Side effects
    ------------
    Prints dataset statistics and the predictions, and writes
    ``./result_XGB_res_models.csv`` with columns ``id`` (row position) and
    ``class`` (predicted label).
    """
    HUMAN = 8  # generator id that load_data assigns to human-written samples

    text, _, model = load_data(domain)

    X, vectorizer = count_vec(text, features)
    X = X.toarray()
    y = np.array(model).ravel()

    # Resampling
    print(f'Original dataset shape {X.shape}')
    print(f'Original dataset samples per class {Counter(y)}')

    # Balance the individual generator classes.
    sm = SMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X, y)
    print(f'Resampled dataset samples per class {Counter(y_res)}')

    # Collapse to human (1) vs machine (0).
    # NOTE(review): balancing happens per generator *before* this step, so the
    # binary target is skewed towards machine (0) afterwards — confirm that
    # this is intended.
    y_res = (y_res == HUMAN).astype(int)

    clf = XGBClassifier(max_depth=t_depth, random_state=0, objective='binary:logistic')
    clf.fit(X_res, y_res)

    text_test, _, _ = load_data(4)
    # Densify to match the training matrix: XGBoost's tree booster treats
    # unstored entries of a sparse matrix as *missing*, not as 0, so predicting
    # on sparse input with a model trained on dense input can route rows
    # differently.
    X_test = vectorizer.transform(text_test).toarray()
    y_pred = clf.predict(X_test)

    print(y_pred)

    result = pd.DataFrame({"class": y_pred}).reset_index().rename(columns={'index': 'id'})
    result.to_csv("./result_XGB_res_models.csv", index=False)

    return clf, X_res, y_res

In [13]:
# Single hand-picked configuration; swap the literal lists for
# DOMAINS / N_FEATURES / T_DEPTHS to sweep the full grid.
configs = [
    (d, f, t)
    for d in [3]        # DOMAINS
    for f in [15000]    # N_FEATURES
    for t in [5]        # T_DEPTHS
]
for d, f, t in configs:
    # For tfidf
    tfidf_training_kaggle(d, f, t)

Original dataset shape (34400, 15000)
Original dataset samples per class Counter({8: 11900, 7: 9750, 0: 2364, 3: 2358, 1: 2357, 2: 2339, 6: 1763, 4: 789, 5: 780})
Resampled dataset samples per class Counter({8: 11900, 7: 11900, 1: 11900, 5: 11900, 3: 11900, 4: 11900, 0: 11900, 6: 11900, 2: 11900})


: 

## 5. Visualisation

In [None]:
# y_pred
# y
# 0 vs 1 
# tfidf


array([1, 1, 1, ..., 0, 0, 0])