# KNN

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import json
import pandas as pd

In [56]:
data1 = []
data2 = []
data3 = []
# Assuming the NDJSON data is stored in a file called 'data.ndjson'
with open("../data/domain1_train.json", 'r') as file:
    for line in file:
        data1.append(json.loads(line.strip()))
data_strings1 = [' '.join(map(str, sample['text'])) for sample in data1]


# Assuming the NDJSON data is stored in a file called 'data.ndjson'
with open("../data/domain2_train.json", 'r') as file:
    for line in file:
        data2.append(json.loads(line.strip()))
data_strings2 = [' '.join(map(str, sample['text'])) for sample in data2]

with open("../data/domain1_train.json", 'r') as file:
    for line in file:
        data3.append(json.loads(line.strip()))
with open("../data/domain2_train.json", 'r') as file:
    for line in file:
        data3.append(json.loads(line.strip()))
data_strings4 = [' '.join(map(str, sample['text'])) for sample in data3]


test = []
# Assuming the NDJSON data is stored in a file called 'data.ndjson'
with open("../data/test_set.json", 'r') as file:
    for line in file:
        test.append(json.loads(line.strip()))


In [26]:
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

print(df1['label'])

0        1
1        1
2        1
3        1
4        1
        ..
19495    0
19496    0
19497    0
19498    0
19499    0
Name: label, Length: 19500, dtype: int64


## Preprocessing

In [61]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Using BoW:
bow_vectorizer = CountVectorizer(max_features=5000)
X_bow = bow_vectorizer.fit_transform(data_strings1)

# Using TFIDF:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(data_strings1)

y1 = df1['label']
y2 = df2['label']


## Balancing

In [28]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Random OverSampler
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_bow, y) # Repeat for TFIDF

# SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_bow, y) # Repeat for TFIDF

# Random UnderSampler
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_bow, y) # Repeat for TFIDF


## Model Building & Evaluation:

In [38]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

# Create a KNN classifier instance
knn = KNeighborsClassifier()

# Using Random OverSampling as an example:

scoring = ['accuracy', 'precision', 'recall', 'f1']
scores = cross_validate(knn, X_ros, y_ros, cv=5, scoring=scoring)

avg_accuracy_ros = scores['test_accuracy'].mean()
avg_precision_ros = scores['test_precision'].mean()
avg_recall_ros = scores['test_recall'].mean()
avg_f1_ros = scores['test_f1'].mean()


## Result

In [39]:
results = []

result = {
    "Imbalanced handle": "RandomOverSampler",
    "Data": "domain1",
    "bow / TFIDF": "BoW",
    "n-gram": "(1,1)",
    "parameters": "default KNN",
    "cross validation (avg. accuracy)": avg_accuracy_ros,
    "cross validation (avg. precision)": avg_precision_ros,
    "cross validation (avg. recall)": avg_recall_ros,
    "cross validation (avg. F1)": avg_f1_ros,
    "additional notes": ""
}

results.append(result)


In [40]:
df_results = pd.DataFrame(results)

# Save the dataframe to a CSV file
df_results.to_csv('../data/preprocessed/results_table.csv', index=False)

In [41]:
df_results

Unnamed: 0,Imbalanced handle,Data,bow / TFIDF,n-gram,parameters,cross validation (avg. accuracy),cross validation (avg. precision),cross validation (avg. recall),cross validation (avg. F1),additional notes
0,RandomOverSampler,domain1,BoW,"(1,1)",default KNN,0.689744,0.717215,0.622154,0.663145,Any other observations or notes


## Make a Function

In [69]:

def train_KNN(domain, data, labels,gram):
    preprocess_methods = ["bow", "tfidf"]
    balance_methods = ["NA", "RandomOverSampler", "SMOTE", "RandomUnderSampler"]

    for preprocess in preprocess_methods:
        if preprocess == "bow":
            bow_vectorizer = CountVectorizer(max_features=5000, ngram_range=(1,gram))
            X = bow_vectorizer.fit_transform(data)
        elif preprocess == "tfidf":
            tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,gram))
            X = tfidf_vectorizer.fit_transform(data)

        for balance in balance_methods:
            X_train, y_train = X, labels  # default to original data
            if balance == "RandomOverSampler":
                ros = RandomOverSampler(random_state=42)
                X_train, y_train = ros.fit_resample(X, labels)
            elif balance == "SMOTE":
                smote = SMOTE(random_state=42)
                X_train, y_train = smote.fit_resample(X, labels)
            elif balance == "RandomUnderSampler":
                rus = RandomUnderSampler(random_state=42)
                X_train, y_train = rus.fit_resample(X, labels)

            # Create a KNN classifier instance
            knn = KNeighborsClassifier()

            scoring = ['accuracy', 'precision', 'recall', 'f1']
            scores = cross_validate(knn, X_train, y_train, cv=5, scoring=scoring)

            avg_accuracy = scores['test_accuracy'].mean()
            avg_precision = scores['test_precision'].mean()
            avg_recall = scores['test_recall'].mean()
            avg_f1 = scores['test_f1'].mean()

            result = {
                "Model": "KNN",
                "Domain": domain,
                "Preprocess": preprocess,
                "Imbalanced handle": balance,
                "n-gram": f'(1,{gram})',  # Adjust if you change ngram_range in vectorizers
                "parameters": "default KNN",
                "cross validation (avg. accuracy)": avg_accuracy,
                "cross validation (avg. precision)": avg_precision,
                "cross validation (avg. recall)": avg_recall,
                "cross validation (avg. F1)": avg_f1,
                "additional notes": ""
            }

            results.append(result)


# Use the function
# Assuming `texts` is a list of tokenized texts and `labels` is their corresponding labels.
#texts = [" ".join(map(str, text)) for text in dataset['text']] # Convert token lists to space separated strings
#labels = dataset['label']


In [74]:
results = []
train_KNN("domain1", data_strings1, y1,1)
train_KNN("domain2", data_strings2, y2,1)
train_KNN("domain1", data_strings1, y1,3)
train_KNN("domain2", data_strings2, y2,3)
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,Domain,Preprocess,Imbalanced handle,n-gram,parameters,cross validation (avg. accuracy),cross validation (avg. precision),cross validation (avg. recall),cross validation (avg. F1),additional notes
0,domain1,bow,,"(1,1)",default KNN,0.689744,0.717215,0.622154,0.663145,
1,domain1,bow,RandomOverSampler,"(1,1)",default KNN,0.689744,0.717215,0.622154,0.663145,
2,domain1,bow,SMOTE,"(1,1)",default KNN,0.689744,0.717215,0.622154,0.663145,
3,domain1,bow,RandomUnderSampler,"(1,1)",default KNN,0.69441,0.722974,0.629436,0.670254,
4,domain1,tfidf,,"(1,1)",default KNN,0.591846,0.619617,0.498769,0.517459,
5,domain1,tfidf,RandomOverSampler,"(1,1)",default KNN,0.591846,0.619617,0.498769,0.517459,
6,domain1,tfidf,SMOTE,"(1,1)",default KNN,0.591846,0.619617,0.498769,0.517459,
7,domain1,tfidf,RandomUnderSampler,"(1,1)",default KNN,0.588872,0.612073,0.489846,0.524473,
8,domain2,bow,,"(1,1)",default KNN,0.845235,0.16157,0.016744,0.030159,
9,domain2,bow,RandomOverSampler,"(1,1)",default KNN,0.775843,0.711367,0.928471,0.805504,


In [75]:
df_results['Model'] = 'KNN'
df_results

Unnamed: 0,Domain,Preprocess,Imbalanced handle,n-gram,parameters,cross validation (avg. accuracy),cross validation (avg. precision),cross validation (avg. recall),cross validation (avg. F1),additional notes,Model
0,domain1,bow,,"(1,1)",default KNN,0.689744,0.717215,0.622154,0.663145,,KNN
1,domain1,bow,RandomOverSampler,"(1,1)",default KNN,0.689744,0.717215,0.622154,0.663145,,KNN
2,domain1,bow,SMOTE,"(1,1)",default KNN,0.689744,0.717215,0.622154,0.663145,,KNN
3,domain1,bow,RandomUnderSampler,"(1,1)",default KNN,0.69441,0.722974,0.629436,0.670254,,KNN
4,domain1,tfidf,,"(1,1)",default KNN,0.591846,0.619617,0.498769,0.517459,,KNN
5,domain1,tfidf,RandomOverSampler,"(1,1)",default KNN,0.591846,0.619617,0.498769,0.517459,,KNN
6,domain1,tfidf,SMOTE,"(1,1)",default KNN,0.591846,0.619617,0.498769,0.517459,,KNN
7,domain1,tfidf,RandomUnderSampler,"(1,1)",default KNN,0.588872,0.612073,0.489846,0.524473,,KNN
8,domain2,bow,,"(1,1)",default KNN,0.845235,0.16157,0.016744,0.030159,,KNN
9,domain2,bow,RandomOverSampler,"(1,1)",default KNN,0.775843,0.711367,0.928471,0.805504,,KNN


For domain1, recommend using BoW with RandomUnderSampler.

For domain2, BoW with RandomOverSampler seems to be the best approach.
