# XGBoost BoW Modelling
- Data
    - Domain 1
    - Domain 2
    - Domain 1 & 2

- Data Processing
    - Sequence
    - Bag of Words
        - with n-gram
    - TFIDF
        - with n-gram

- Parameters
    - Tree Depth

- Validation
    - 10-fold cross validation
    
- Visualisation
    - Accuracy Graph
    - Confusion Matrix


In [11]:
# from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.datasets import make_classification
import numpy as np
import json

## 1. Load Data

In [12]:
def _read_json_lines(path, text, label):
    """Append the "text" and "label" fields of every JSON line in *path*."""
    # Explicit encoding: the platform default is not UTF-8 on every system.
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. an extra newline at the end of the file).
            if not line.strip():
                continue
            data = json.loads(line)
            text.append(data["text"])
            label.append(data["label"])


def load_data(index):
    """Load the training records of one or both domains.

    Each input file holds one JSON object per line with a "text" and a
    "label" field.

    Args:
        index: 1 for domain 1, 2 for domain 2, 3 for domain 1 followed by
            domain 2.

    Returns:
        A ``(text, label)`` tuple of two parallel lists, in file order.

    Raises:
        ValueError: If *index* is not 1, 2 or 3 (previously this silently
            produced two empty lists).
    """
    if index not in (1, 2, 3):
        raise ValueError(f"index must be 1, 2 or 3, got {index!r}")

    text = []
    label = []

    if index in (1, 3):
        _read_json_lines("../../data/domain1_train.json", text, label)
    if index in (2, 3):
        _read_json_lines("../../data/domain2_train.json", text, label)

    return text, label


In [13]:
# Report how many records each data set selector yields (1, 2, then both).
for tag, selector in (("Domain1", 1), ("Domain2", 2), ("Domain1&2", 3)):
    print(f"{tag} length:", len(load_data(selector)[0]))

Domain1 length: 19500
Domain2 length: 14900
Domain1&2 length: 34400


## 2. Data Preprocessing

#### Create sample data for building the vector space

In [14]:
def vector_sample(text, n_words=5000):
    """Build the corpus used to fit the vectorizer's vocabulary.

    A synthetic document listing every word id ``0 .. n_words - 1`` is placed
    in front of the real documents, so each of those ids occurs at least once
    in the returned corpus. The input is not modified.

    Args:
        text: Iterable of documents (presumably lists of integer word ids --
            confirm against the data files).
        n_words: Number of word ids in the synthetic document. Defaults to
            5000, the value that used to be hard-coded.

    Returns:
        A new list: the synthetic document followed by the items of ``text``.
    """
    return [list(range(n_words)), *text]

In [15]:
# Data set selector used for this size check
d = 1
sample_size = len(vector_sample(load_data(d)[0]))
print(f"sample size for D{d}:", sample_size)

sample size for D1: 19501


#### Vectorize the data

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

def toStr(n):
    """Return the ``str()`` form of *n*; used as the ``CountVectorizer`` preprocessor."""
    as_text = str(n)
    return as_text

def count_vec(vector_sample, text, n_features):
    """Encode documents as word n-gram count vectors.

    Args:
        vector_sample: Corpus the vocabulary is learned from.
        text: Documents to encode with that vocabulary.
        n_features: Passed to ``CountVectorizer`` as ``max_features``.

    Returns:
        The matrix produced by ``CountVectorizer.transform(text)``.
    """
    options = {
        "preprocessor": toStr,
        "analyzer": "word",
        # One or more word characters, so single-character tokens are kept.
        "token_pattern": r"(?u)\b\w+\b",
        # Unigrams, bigrams and trigrams.
        "ngram_range": (1, 3),
        "max_features": n_features,
    }
    bow = CountVectorizer(**options)

    # fit() returns the vectorizer itself, so learning the vocabulary and
    # encoding the documents can be chained.
    return bow.fit(vector_sample).transform(text)


In [17]:
# Data domain
d = 1
# Load the domain once and reuse it for both fitting and encoding instead of
# reading and parsing the training file twice.
domain_text = load_data(d)[0]
# Kept as a bare final expression so the notebook displays the shape.
count_vec(vector_sample(domain_text), domain_text, 10000).toarray().shape

(19500, 10000)

## 3. Set Parameters & Train the models

In [18]:
# Parameter grid iterated by the validation loop.
# Data set selectors passed to load_data: 1, 2, 3.
DOMAINS = range(1, 4)
# Vocabulary sizes: 5000, 10000, 15000 (the stop value 20000 is excluded).
N_FEATURES = range(5000, 20000, 5000)
# Tree depths: 1, 3, 5, 7, 9.
T_DEPTHS = range(1, 10, 2)


In [19]:
def BoW_training(domain, features, t_depth):
    """Fit an XGBoost classifier on bag-of-words counts for one configuration.

    Args:
        domain: Data set selector forwarded to ``load_data``.
        features: Vocabulary size forwarded to ``count_vec``.
        t_depth: ``max_depth`` given to ``XGBClassifier``.

    Returns:
        ``(clf, X, y)``: the fitted classifier, the dense count matrix and
        the flattened label array it was fitted on.
    """
    documents, targets = load_data(domain)
    corpus = vector_sample(documents)

    # Dense feature matrix and flat label vector.
    X = count_vec(corpus, documents, features).toarray()
    y = np.array(targets).ravel()

    clf = XGBClassifier(objective='binary:logistic', max_depth=t_depth, random_state=0)
    clf.fit(X, y)
    return clf, X, y

## 4. Validation

In [21]:
import os

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
import pandas as pd

# Number of cross-validation folds.
n = 10
# Directory that receives one CV file and one confusion-matrix file per configuration.
RESULTS_DIR = "../../data/results/XGB"

# Results accumulated across all configurations, in loop order.
cv_scores = []
r2_scores = []
c_matrix = []

# Create the output directory up front so the run cannot fail at the first
# to_csv call after a model has already been fitted and cross-validated.
os.makedirs(RESULTS_DIR, exist_ok=True)

for d in DOMAINS:
    for f in N_FEATURES:
        for t in T_DEPTHS:
            # For BoW
            clf, X, y = BoW_training(d, f, t)

            print(f"For Domain{d} & {f} Features & {t} Depth:")
            print("Shape of X:", X.shape)
            print("Shape of y:", y.shape)

            # r2 score
            # NOTE: clf was fitted on this same X, y inside BoW_training, so
            # this score and the confusion matrix below describe the training
            # data; only the cross-validation scores use held-out folds.
            y_pred = clf.predict(X)
            r2 = r2_score(y, y_pred)
            r2_scores.append(r2)
            print("R2 Score\n", r2)

            # Confusion matrix
            cm = confusion_matrix(y, y_pred)
            c_matrix.append(cm)
            print("Confusion Matirx\n", cm)

            # Cross-validation score
            fold_scores = cross_val_score(clf, X, y, cv=n)
            cv_scores.append(fold_scores)
            print("CV Score\n", fold_scores, "\n")

            # To csv
            # Fold labels follow n, so changing the fold count keeps the
            # index the same length as the data.
            cv_result = pd.DataFrame(
                [r2, sum(fold_scores) / n, *fold_scores.tolist()],
                index=["r2", "cv_avg", *(str(i) for i in range(1, n + 1))],
                columns=["scores"],
            )
            cv_result.to_csv(f"{RESULTS_DIR}/XGB-CV-D{d}_F{f}T{t}.csv")

            # Rows are true labels and columns are predictions, so the
            # flattened binary matrix reads tn, fp, fn, tp.
            tn, fp, fn, tp = cm.ravel()
            cm_result = pd.DataFrame({"tn": [tn], "fn": [fn], "tp": [tp], "fp": [fp]})
            cm_result.to_csv(f"{RESULTS_DIR}/XGB-CM-D{d}_F{f}T{t}.csv")


For Domain1 & 5000 Features & 1 Depth:
Shape of X: (19500, 5000)
Shape of y: (19500,)
R2 Score
 0.6888205128205128
Confusion Matirx
 [[8366 1384]
 [ 133 9617]]
CV Score
 [0.90717949 0.92205128 0.92666667 0.91333333 0.92512821 0.92615385
 0.92717949 0.92410256 0.91384615 0.91846154] 

For Domain1 & 5000 Features & 3 Depth:
Shape of X: (19500, 5000)
Shape of y: (19500,)
R2 Score
 0.7727179487179487
Confusion Matirx
 [[8752  998]
 [ 110 9640]]
CV Score
 [0.91846154 0.93538462 0.93897436 0.93076923 0.93948718 0.93641026
 0.94051282 0.93538462 0.92974359 0.93384615] 

For Domain1 & 5000 Features & 5 Depth:
Shape of X: (19500, 5000)
Shape of y: (19500,)
R2 Score
 0.8129230769230769
Confusion Matirx
 [[8920  830]
 [  82 9668]]
CV Score
 [0.92       0.94461538 0.94205128 0.93538462 0.94615385 0.93692308
 0.94461538 0.94205128 0.93282051 0.93692308] 

For Domain1 & 5000 Features & 7 Depth:
Shape of X: (19500, 5000)
Shape of y: (19500,)
R2 Score
 0.8644102564102564
Confusion Matirx
 [[9139  611]

KeyboardInterrupt: 

## 5. Visualisation

In [None]:
# Inspection of the globals left by the validation loop. A notebook cell
# echoes only its last expression, so presumably just `y` is shown here and
# the bare `y_pred` line has no visible effect.
y_pred
y

array([1, 1, 1, ..., 0, 0, 0])