# KNN

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import json
import pandas as pd

In [2]:
data1 = []
data2 = []
data3 = []
# Assuming the NDJSON data is stored in a file called 'data.ndjson'
with open("../data/domain1_train.json", 'r') as file:
    for line in file:
        data1.append(json.loads(line.strip()))
data_strings1 = [' '.join(map(str, sample['text'])) for sample in data1]


# Assuming the NDJSON data is stored in a file called 'data.ndjson'
with open("../data/domain2_train.json", 'r') as file:
    for line in file:
        data2.append(json.loads(line.strip()))
data_strings2 = [' '.join(map(str, sample['text'])) for sample in data2]

with open("../data/domain1_train.json", 'r') as file:
    for line in file:
        data3.append(json.loads(line.strip()))
with open("../data/domain2_train.json", 'r') as file:
    for line in file:
        data3.append(json.loads(line.strip()))
data_strings4 = [' '.join(map(str, sample['text'])) for sample in data3]


test = []
# Assuming the NDJSON data is stored in a file called 'data.ndjson'
with open("../data/test_set.json", 'r') as file:
    for line in file:
        test.append(json.loads(line.strip()))


In [3]:
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

print(df1['label'])

0        1
1        1
2        1
3        1
4        1
        ..
19495    0
19496    0
19497    0
19498    0
19499    0
Name: label, Length: 19500, dtype: int64


## Preprocessing

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Using BoW:
bow_vectorizer = CountVectorizer(max_features=5000)
X_bow = bow_vectorizer.fit_transform(data_strings1)

# Using TFIDF:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(data_strings1)

y1 = df1['label']
y2 = df2['label']


## Balancing

In [28]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Random OverSampler
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_bow, y) # Repeat for TFIDF

# SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_bow, y) # Repeat for TFIDF

# Random UnderSampler
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_bow, y) # Repeat for TFIDF


## Model Building & Evaluation:

In [38]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

# Create a KNN classifier instance
knn = KNeighborsClassifier()

# Using Random OverSampling as an example:

scoring = ['accuracy', 'precision', 'recall', 'f1']
scores = cross_validate(knn, X_ros, y_ros, cv=5, scoring=scoring)

avg_accuracy_ros = scores['test_accuracy'].mean()
avg_precision_ros = scores['test_precision'].mean()
avg_recall_ros = scores['test_recall'].mean()
avg_f1_ros = scores['test_f1'].mean()


## Result

In [39]:
results = []

result = {
    "Imbalanced handle": "RandomOverSampler",
    "Data": "domain1",
    "bow / TFIDF": "BoW",
    "n-gram": "(1,1)",
    "parameters": "default KNN",
    "cross validation (avg. accuracy)": avg_accuracy_ros,
    "cross validation (avg. precision)": avg_precision_ros,
    "cross validation (avg. recall)": avg_recall_ros,
    "cross validation (avg. F1)": avg_f1_ros,
    "additional notes": ""
}

results.append(result)


In [40]:
df_results = pd.DataFrame(results)

# Save the dataframe to a CSV file
df_results.to_csv('../data/preprocessed/results_table.csv', index=False)

In [41]:
df_results

Unnamed: 0,Imbalanced handle,Data,bow / TFIDF,n-gram,parameters,cross validation (avg. accuracy),cross validation (avg. precision),cross validation (avg. recall),cross validation (avg. F1),additional notes
0,RandomOverSampler,domain1,BoW,"(1,1)",default KNN,0.689744,0.717215,0.622154,0.663145,Any other observations or notes


## Make a Function

In [69]:
def train_KNN(domain, data, labels,gram):
    preprocess_methods = ["bow", "tfidf"]
    balance_methods = ["NA", "RandomOverSampler", "SMOTE", "RandomUnderSampler"]

    for preprocess in preprocess_methods:
        if preprocess == "bow":
            bow_vectorizer = CountVectorizer(max_features=5000, ngram_range=(1,gram))
            X = bow_vectorizer.fit_transform(data)
        elif preprocess == "tfidf":
            tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,gram))
            X = tfidf_vectorizer.fit_transform(data)

        for balance in balance_methods:
            X_train, y_train = X, labels  # default to original data
            if balance == "RandomOverSampler":
                ros = RandomOverSampler(random_state=42)
                X_train, y_train = ros.fit_resample(X, labels)
            elif balance == "SMOTE":
                smote = SMOTE(random_state=42)
                X_train, y_train = smote.fit_resample(X, labels)
            elif balance == "RandomUnderSampler":
                rus = RandomUnderSampler(random_state=42)
                X_train, y_train = rus.fit_resample(X, labels)

            # Create a KNN classifier instance
            knn = KNeighborsClassifier()

            scoring = ['accuracy', 'precision', 'recall', 'f1']
            scores = cross_validate(knn, X_train, y_train, cv=5, scoring=scoring)

            avg_accuracy = scores['test_accuracy'].mean()
            avg_precision = scores['test_precision'].mean()
            avg_recall = scores['test_recall'].mean()
            avg_f1 = scores['test_f1'].mean()

            result = {
                "Model": "KNN",
                "Domain": domain,
                "Preprocess": preprocess,
                "Imbalanced handle": balance,
                "n-gram": f'(1,{gram})',  
                "parameters": "default KNN",
                "cross validation (avg. accuracy)": avg_accuracy,
                "cross validation (avg. precision)": avg_precision,
                "cross validation (avg. recall)": avg_recall,
                "cross validation (avg. F1)": avg_f1,
                "additional notes": ""
            }

            results.append(result)


# Use the function
# Assuming `texts` is a list of tokenized texts and `labels` is their corresponding labels.
#texts = [" ".join(map(str, text)) for text in dataset['text']] # Convert token lists to space separated strings
#labels = dataset['label']


In [74]:
results = []
train_KNN("domain1", data_strings1, y1,1)
train_KNN("domain2", data_strings2, y2,1)
train_KNN("domain1", data_strings1, y1,3)
train_KNN("domain2", data_strings2, y2,3)
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,Domain,Preprocess,Imbalanced handle,n-gram,parameters,cross validation (avg. accuracy),cross validation (avg. precision),cross validation (avg. recall),cross validation (avg. F1),additional notes
0,domain1,bow,,"(1,1)",default KNN,0.689744,0.717215,0.622154,0.663145,
1,domain1,bow,RandomOverSampler,"(1,1)",default KNN,0.689744,0.717215,0.622154,0.663145,
2,domain1,bow,SMOTE,"(1,1)",default KNN,0.689744,0.717215,0.622154,0.663145,
3,domain1,bow,RandomUnderSampler,"(1,1)",default KNN,0.69441,0.722974,0.629436,0.670254,
4,domain1,tfidf,,"(1,1)",default KNN,0.591846,0.619617,0.498769,0.517459,
5,domain1,tfidf,RandomOverSampler,"(1,1)",default KNN,0.591846,0.619617,0.498769,0.517459,
6,domain1,tfidf,SMOTE,"(1,1)",default KNN,0.591846,0.619617,0.498769,0.517459,
7,domain1,tfidf,RandomUnderSampler,"(1,1)",default KNN,0.588872,0.612073,0.489846,0.524473,
8,domain2,bow,,"(1,1)",default KNN,0.845235,0.16157,0.016744,0.030159,
9,domain2,bow,RandomOverSampler,"(1,1)",default KNN,0.775843,0.711367,0.928471,0.805504,


In [75]:
df_results['Model'] = 'KNN'
df_results

Unnamed: 0,Domain,Preprocess,Imbalanced handle,n-gram,parameters,cross validation (avg. accuracy),cross validation (avg. precision),cross validation (avg. recall),cross validation (avg. F1),additional notes,Model
0,domain1,bow,,"(1,1)",default KNN,0.689744,0.717215,0.622154,0.663145,,KNN
1,domain1,bow,RandomOverSampler,"(1,1)",default KNN,0.689744,0.717215,0.622154,0.663145,,KNN
2,domain1,bow,SMOTE,"(1,1)",default KNN,0.689744,0.717215,0.622154,0.663145,,KNN
3,domain1,bow,RandomUnderSampler,"(1,1)",default KNN,0.69441,0.722974,0.629436,0.670254,,KNN
4,domain1,tfidf,,"(1,1)",default KNN,0.591846,0.619617,0.498769,0.517459,,KNN
5,domain1,tfidf,RandomOverSampler,"(1,1)",default KNN,0.591846,0.619617,0.498769,0.517459,,KNN
6,domain1,tfidf,SMOTE,"(1,1)",default KNN,0.591846,0.619617,0.498769,0.517459,,KNN
7,domain1,tfidf,RandomUnderSampler,"(1,1)",default KNN,0.588872,0.612073,0.489846,0.524473,,KNN
8,domain2,bow,,"(1,1)",default KNN,0.845235,0.16157,0.016744,0.030159,,KNN
9,domain2,bow,RandomOverSampler,"(1,1)",default KNN,0.775843,0.711367,0.928471,0.805504,,KNN


For domain1, recommend using BoW with RandomUnderSampler.

For domain2, BoW with RandomOverSampler seems to be the best approach.


## Result Analysis of KNN

### Top 5 accuracy
May be due to overfitting as D2 is imbalanced

In [95]:
top_5_accuracies = df_results.nlargest(5, 'cross validation (avg. accuracy)')
top_5_accuracies

Unnamed: 0,Domain,Preprocess,Imbalanced handle,n-gram,parameters,cross validation (avg. accuracy),cross validation (avg. precision),cross validation (avg. recall),cross validation (avg. F1),additional notes,Model
8,domain2,bow,,"(1,1)",default KNN,0.845235,0.16157,0.016744,0.030159,,KNN
24,domain2,bow,,"(1,3)",default KNN,0.843423,0.138221,0.016279,0.029017,,KNN
28,domain2,tfidf,,"(1,3)",default KNN,0.835705,0.14585,0.020465,0.033337,,KNN
12,domain2,tfidf,,"(1,1)",default KNN,0.829195,0.096048,0.02093,0.033423,,KNN
9,domain2,bow,RandomOverSampler,"(1,1)",default KNN,0.775843,0.711367,0.928471,0.805504,,KNN


### 3 gram or 1 gram
3 gram better

In [83]:
agg_metrics = df_results.groupby(['n-gram']).agg({
    'cross validation (avg. accuracy)': ['mean', 'std'],
    'cross validation (avg. precision)': ['mean', 'std'],
    'cross validation (avg. recall)': ['mean', 'std'],
    'cross validation (avg. F1)': ['mean', 'std']
})
agg_metrics
# 3 gram better than 1 gram

Unnamed: 0_level_0,cross validation (avg. accuracy),cross validation (avg. accuracy),cross validation (avg. precision),cross validation (avg. precision),cross validation (avg. recall),cross validation (avg. recall),cross validation (avg. F1),cross validation (avg. F1)
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std
n-gram,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
"(1,1)",0.641012,0.10773,0.557568,0.186061,0.587686,0.295854,0.54175,0.219382
"(1,3)",0.645262,0.106982,0.543083,0.169517,0.669418,0.294394,0.581496,0.225542


Both models seem to have comparable performances, with the (1,3) n-gram model having a slight edge in terms of recall and F1 score. 

### Bow / TFIDF
bow better than tfidf

In [91]:
agg_metrics = df_results[df_results['n-gram'] == "(1,3)"].groupby(['Domain',"Preprocess"]).agg({
    'cross validation (avg. accuracy)': ['mean', 'std'],
    'cross validation (avg. precision)': ['mean', 'std'],
    'cross validation (avg. recall)': ['mean', 'std'],
    'cross validation (avg. F1)': ['mean', 'std']
})
agg_metrics
#bow better than tfidf

Unnamed: 0_level_0,Unnamed: 1_level_0,cross validation (avg. accuracy),cross validation (avg. accuracy),cross validation (avg. precision),cross validation (avg. precision),cross validation (avg. recall),cross validation (avg. recall),cross validation (avg. F1),cross validation (avg. F1)
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
Domain,Preprocess,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
domain1,bow,0.699756,0.002282,0.675838,0.002153,0.767051,0.00241,0.717918,0.002315
domain1,tfidf,0.606872,0.000103,0.6015,0.000138,0.635846,0.001744,0.60958,0.000592
domain2,bow,0.664401,0.162274,0.467328,0.234281,0.628719,0.450535,0.511908,0.338062
domain2,tfidf,0.610021,0.151058,0.427665,0.187951,0.646054,0.461736,0.486579,0.309384


### Imbalance handle
D1: UnderSampler; D2: NA (may be overfitting) and RandomOverSampler

In [93]:
agg_metrics = df_results.groupby(['Domain',"Imbalanced handle"]).agg({
    'cross validation (avg. accuracy)': ['mean', 'std'],
    'cross validation (avg. precision)': ['mean', 'std'],
    'cross validation (avg. recall)': ['mean', 'std'],
    'cross validation (avg. F1)': ['mean', 'std']
})
agg_metrics


Unnamed: 0_level_0,Unnamed: 1_level_0,cross validation (avg. accuracy),cross validation (avg. accuracy),cross validation (avg. precision),cross validation (avg. precision),cross validation (avg. recall),cross validation (avg. recall),cross validation (avg. F1),cross validation (avg. F1)
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
Domain,Imbalanced handle,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
domain1,,0.646782,0.055194,0.653256,0.052823,0.630436,0.109181,0.626662,0.085002
domain1,RandomOverSampler,0.646782,0.055194,0.653256,0.052823,0.630436,0.109181,0.626662,0.085002
domain1,RandomUnderSampler,0.648295,0.058875,0.653956,0.057383,0.632103,0.114724,0.631646,0.084615
domain1,SMOTE,0.646782,0.055194,0.653256,0.052823,0.630436,0.109181,0.626662,0.085002
domain2,,0.838389,0.007393,0.135422,0.027992,0.018605,0.002432,0.031484,0.002239
domain2,RandomOverSampler,0.666088,0.118216,0.618592,0.097032,0.958451,0.031558,0.747025,0.062136
domain2,RandomUnderSampler,0.516686,0.01111,0.516497,0.010821,0.529535,0.047119,0.520434,0.02727
domain2,SMOTE,0.535294,0.009784,0.518366,0.005325,0.998412,0.000711,0.682411,0.004535


# NN

In [18]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

results2 = []

class SimpleNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

def train_NN(domain, data, labels, gram):
    preprocess_methods = ["bow", "tfidf"]
    balance_methods = ["NA", "RandomOverSampler", "SMOTE", "RandomUnderSampler"]
    
    # Preprocess labels for NN
    encoder = LabelEncoder()
    labels_encoded = encoder.fit_transform(labels)
    labels_encoded = torch.tensor(labels_encoded, dtype=torch.long)
    
    # Tokenize the data (assuming data is a list of texts)
    vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, gram))
    X = vectorizer.fit_transform(data).toarray()
    X = torch.tensor(X, dtype=torch.float32)
    
    for preprocess in preprocess_methods:
        for balance in balance_methods:
            X_train, y_train = X, labels_encoded
            
            if balance == "RandomOverSampler":
                ros = RandomOverSampler(random_state=42)
                X_train, y_train = ros.fit_resample(X, labels_encoded)
            elif balance == "SMOTE":
                smote = SMOTE(random_state=42)
                X_train, y_train = smote.fit_resample(X, labels_encoded)
            elif balance == "RandomUnderSampler":
                rus = RandomUnderSampler(random_state=42)
                X_train, y_train = rus.fit_resample(X, labels_encoded)

            # Convert to torch tensors
            X_train = torch.tensor(X_train, dtype=torch.float32)
            y_train = torch.tensor(y_train, dtype=torch.long)

            # DataLoader
            dataset = TensorDataset(X_train, y_train)
            loader = DataLoader(dataset, batch_size=32, shuffle=True)

            # Define NN model
            model = SimpleNN(X_train.shape[1], len(np.unique(labels_encoded)))

            # Loss and optimizer
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=0.001)

            # Training loop
            for epoch in range(10):
                for batch_x, batch_y in loader:
                    # Forward pass
                    outputs = model(batch_x)
                    loss = criterion(outputs, batch_y)

                    # Backward pass and optimize
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
            scores = cross_validate_pytorch(model, X_train, y_train, cv=5)


            result = {
                "Model": "NN",
                "Domain": domain,
                "Preprocess": preprocess,
                "Imbalanced handle": balance,
                "n-gram": f'(1,{gram})',  # Adjust if you change ngram_range in vectorizers
                "parameters": "default NN",
                "cross validation (avg. accuracy)": scores,
                "additional notes": ""
            }
            results2.append(result)

# Usage example:
# train_NN('sample_domain', data, labels, 1)


In [19]:
results2 = []
train_NN("domain1", data_strings1, y1,1)
#train_NN("domain2", data_strings2, y2,1)
#train_NN("domain1", data_strings1, y1,3)
#train_NN("domain2", data_strings2, y2,3)
#df_results2 = pd.DataFrame(results2)
df_results2

  X_train = torch.tensor(X_train, dtype=torch.float32)
  y_train = torch.tensor(y_train, dtype=torch.long)


NameError: name 'cross_validate_pytorch' is not defined