In [None]:
#Import packages
import json
import sklearn
import numpy as np
import pandas as pd

In [2]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA, PCA, SparsePCA
from sklearn.feature_selection import mutual_info_classif, f_classif, SelectKBest, chi2, SelectPercentile
from sklearn.model_selection import KFold

In [3]:
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer

In [4]:
#Load datasets
with open("domain1_train_data.json", "r") as f:
    dataset_1 = [ json.loads(line, parse_int = str) for line in f ]

with open("domain2_train_data.json", "r") as f:
    dataset_2 = [ json.loads(line, parse_int = str) for line in f ]

with open("test_data.json", "r") as f:
    testset = [ json.loads(line, parse_int = str) for line in f ]

n_samples_1 = len(dataset_1)
n_samples_2 = len(dataset_2)
n_tests = len(testset)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def feature_select( texts: list[str], *, vocabulary: dict = None, method="countvectorize", sparse=False, **kwargs ):
    """From a list of texts, output a dataframe of features with shape (n_samples, n_features).

     Args:
         texts (list[str]): list of strings, each item corresponding to a text.
         vocabulary (dict, optional): _description_. Defaults to None.
         method (str, optional): Method to select features. Defaults to "count-vectorizer".
         **kwargs: kwarg arguments to pass to Vectorizer classes of sklearn.
    Raises:
        ValueError: If passing an non-specified method of text feature extraction

     Returns:
         pd.DataFrame: dataframe of shape (n_samples, n_features)
    """
    #We want single digits to tokenized. This regex considers everything as a token except whitespace.
    kwargs['token_pattern'] = r'\S+' 
    if method == "countvectorize":
        vectorizer = CountVectorizer(vocabulary = vocabulary, **kwargs) if vocabulary else CountVectorizer(**kwargs)
    elif method == "tfidf":
        vectorizer = TfidfVectorizer(vocabulary = vocabulary, **kwargs) if vocabulary else TfidfVectorizer(**kwargs)
    else:
        raise ValueError(f"{method} is not a supported method.")
    if not sparse:
        X = vectorizer.fit_transform(texts)
        feature_names = vectorizer.get_feature_names_out()
        df = pd.DataFrame.sparse.from_spmatrix(data=X, columns = feature_names)
        return df, vectorizer
    else:
        X = vectorizer.fit_transform(texts)
        return X, vectorizer

In [6]:
# For separating the domains, create a binary classifier (call it domain classifier), 
# label 0 for domain 1, label 1 for domain 2.
datatexts_1 = []
datatexts_2 = []
for instance in dataset_1:
    datatexts_1 += [ " ".join(instance["text"]) ]
for instance in dataset_2:
    datatexts_2 += [ " ".join(instance["text"]) ]

In [7]:
#It is probably enough to only capture single-words (1-grams) for domain classifying.
X, vectorizer_domain = feature_select(texts = datatexts_1 + datatexts_2, 
                method='tfidf',
                ngram_range=(1,1),
                sparse=True,
                max_df=0.9975, #Ignore vocabulary appearing too frequently, probably words like "is", "are", "and", "this" etc.
                min_df=0.0025, #Ignore vocabulary that is too infrequent, as this may lead to low prediction accuracy.
                )
n_samples, n_attrs = X.shape
print(f"no. samples: {n_samples}, no. attributes: {n_attrs}")

no. samples: 18000, no. attributes: 5304


In [8]:
#Domain labels:
y = np.array( [ 0 ]*len(datatexts_1) + [ 1 ]*len(datatexts_2) )

In [9]:
my_selector = SelectKBest( f_classif, k = 5304 )
X_reduced = my_selector.fit_transform( X.toarray(), y ) #X_reduced is not a sparse array, unlike X

In [10]:
classifier = GaussianNB()
scores = []
crossvalid = KFold(n_splits = 10, shuffle=True, random_state=20241)
for i, (train_index, test_index) in enumerate(crossvalid.split(X_reduced)):
    print(f"Fold {i}:")
    classifier.fit(X_reduced[train_index,:], y[train_index])
    score = classifier.score(X_reduced[test_index,:], y[test_index])
    scores += [score]
    print(f"score: {score}")

Fold 0:
score: 0.9816666666666667
Fold 1:
score: 0.9916666666666667
Fold 2:
score: 0.9866666666666667
Fold 3:
score: 0.9866666666666667
Fold 4:
score: 0.9911111111111112
Fold 5:
score: 0.9927777777777778
Fold 6:
score: 0.9866666666666667
Fold 7:
score: 0.9888888888888889
Fold 8:
score: 0.9894444444444445
Fold 9:
score: 0.9905555555555555


In [11]:
#Gaussian NB does pretty well at domain classifying, so let's train on the whole dataset:
domain_classifier = GaussianNB()
domain_classifier.fit(X_reduced, y)

In [12]:
#Now that we have a domain classifyer, we can build a separate ML model for each domain:
#This time include up to n-grams.
n = 2
X_1, vectorizer_1 = feature_select(texts = datatexts_1, 
                method='tfidf',
                ngram_range=(1,n), 
                sparse=False,
                max_df=0.9975, #Ignore vocabulary appearing too frequently, probably words like "is", "are", "and", "this" etc.
                min_df=0.0025, #Ignore vocabulary that is too infrequent, as this may lead to low prediction accuracy.
                )
n_samples, n_attrs = X_1.shape
print(f"no. samples: {n_samples}, no. attributes: {n_attrs}")

no. samples: 5000, no. attributes: 11106


In [13]:
y_1 = np.array([ dataset_1[i]['label'] for i in range(n_samples) ])

In [62]:
classifier = MultinomialNB()
scores = []
for p in np.arange(5,105,5):
    my_selector_1 = SelectPercentile( chi2, percentile = p )
    X_reduced_1 = my_selector_1.fit_transform( X_1, y_1 ).toarray()
    crossvalid = KFold(n_splits = 5, shuffle=True, random_state=2024)
    for i, (train_index, test_index) in enumerate(crossvalid.split(X_reduced_1)):
        print(f"\tFold {i}:")
        classifier.fit(X_reduced_1[train_index,:], y_1[train_index])
        score = classifier.score(X_reduced_1[test_index,:], y_1[test_index])
        scores += [score]
        print(f"\tscore: {score}")
    print( f"p: {p}, Mean: {np.mean(scores)}" )

	Fold 0:
	score: 0.772
	Fold 1:
	score: 0.793
	Fold 2:
	score: 0.757
	Fold 3:
	score: 0.782
	Fold 4:
	score: 0.768
p: 5, Mean: 0.7744
	Fold 0:
	score: 0.797
	Fold 1:
	score: 0.803
	Fold 2:
	score: 0.797
	Fold 3:
	score: 0.826
	Fold 4:
	score: 0.797
p: 10, Mean: 0.7891999999999999
	Fold 0:
	score: 0.819
	Fold 1:
	score: 0.824
	Fold 2:
	score: 0.8
	Fold 3:
	score: 0.836
	Fold 4:
	score: 0.821
p: 15, Mean: 0.7994666666666667
	Fold 0:
	score: 0.827
	Fold 1:
	score: 0.823
	Fold 2:
	score: 0.814
	Fold 3:
	score: 0.839
	Fold 4:
	score: 0.831
p: 20, Mean: 0.8063
	Fold 0:
	score: 0.823
	Fold 1:
	score: 0.806
	Fold 2:
	score: 0.807
	Fold 3:
	score: 0.847
	Fold 4:
	score: 0.821
p: 25, Mean: 0.8092
	Fold 0:
	score: 0.823
	Fold 1:
	score: 0.818
	Fold 2:
	score: 0.815
	Fold 3:
	score: 0.846
	Fold 4:
	score: 0.828
p: 30, Mean: 0.812
	Fold 0:
	score: 0.82
	Fold 1:
	score: 0.819
	Fold 2:
	score: 0.816
	Fold 3:
	score: 0.843
	Fold 4:
	score: 0.829
p: 35, Mean: 0.8139142857142857
	Fold 0:
	score: 0.818
	

In [14]:
#Take p = 50, then train on full dataset:
classifier_1 = MultinomialNB()
p=55
my_selector_1 = SelectPercentile( chi2, percentile = p )
X_reduced_1 = my_selector_1.fit_transform( X_1, y_1 ).toarray()
classifier_1.fit(X_reduced_1, y_1)

In [15]:
#Now repeat for domain 2:
n = 2
X_2, vectorizer_2 = feature_select(texts = datatexts_2, 
                method='tfidf',
                ngram_range=(1,n), 
                sparse=False,
                max_df=0.9975, #Ignore vocabulary appearing too frequently, probably words like "is", "are", "and", "this" etc.
                min_df=0.0025, #Ignore vocabulary that is too infrequent, as this may lead to low prediction accuracy.
                )
n_samples, n_attrs = X_2.shape
print(f"no. samples: {n_samples}, no. attributes: {n_attrs}")

no. samples: 13000, no. attributes: 15477


In [16]:
y_2 = np.array([ dataset_2[i]['label'] for i in range(n_samples) ])

In [24]:
classifier = MultinomialNB()
scores = []
for p in np.arange(10,105,10):
    my_selector_2 = SelectPercentile( chi2, percentile = p )
    crossvalid = KFold(n_splits = 5, shuffle=True, random_state=2024)
    X_reduced_2 = my_selector_2.fit_transform( X_2, y_2 ).toarray()
    for i, (train_index, test_index) in enumerate(crossvalid.split(X_reduced_2)):
        print(f"\tFold {i}:")
        classifier.fit(X_reduced_2[train_index,:], y_2[train_index])
        score = classifier.score(X_reduced_2[test_index,:], y_2[test_index])
        scores += [score]
        print(f"\tscore: {score}")
    print( f"p: {p}, Mean: {np.mean(scores)}" )
    print()

	Fold 0:
	score: 0.8830769230769231
	Fold 1:
	score: 0.8938461538461538
	Fold 2:
	score: 0.8946153846153846
	Fold 3:
	score: 0.88
	Fold 4:
	score: 0.8734615384615385
p: 10, Mean: 0.885

	Fold 0:
	score: 0.8826923076923077
	Fold 1:
	score: 0.895
	Fold 2:
	score: 0.895
	Fold 3:
	score: 0.88
	Fold 4:
	score: 0.8738461538461538
p: 20, Mean: 0.8851538461538461

	Fold 0:
	score: 0.8826923076923077
	Fold 1:
	score: 0.8946153846153846
	Fold 2:
	score: 0.8946153846153846
	Fold 3:
	score: 0.8792307692307693
	Fold 4:
	score: 0.8738461538461538
p: 30, Mean: 0.885102564102564

	Fold 0:
	score: 0.8823076923076923
	Fold 1:
	score: 0.8942307692307693
	Fold 2:
	score: 0.8946153846153846
	Fold 3:
	score: 0.8792307692307693
	Fold 4:
	score: 0.8734615384615385
p: 40, Mean: 0.8850192307692308

	Fold 0:
	score: 0.8819230769230769
	Fold 1:
	score: 0.8942307692307693
	Fold 2:
	score: 0.8946153846153846
	Fold 3:
	score: 0.8784615384615385
	Fold 4:
	score: 0.8734615384615385
p: 50, Mean: 0.8849230769230769

	Fo

In [20]:
labels_0 = []
labels_1 = []
for inst in dataset_2:
    if inst['label']=="0":
        labels_0 += [ inst ]
    else:
        labels_1 += [ inst ]

In [21]:
from random import shuffle, seed
seed(2024)

In [22]:
shuffle(labels_0)

In [23]:
dataset_undersampled = labels_0[:1500] + labels_1
len(dataset_undersampled)

3000

In [24]:
dataset_undersampled_texts = [] 
y_undersampled = []
for instance in dataset_undersampled:
    dataset_undersampled_texts += [ " ".join(instance["text"]) ]
    y_undersampled += [ int(instance['label']) ]
y_undersampled = np.array( y_undersampled )

In [25]:
X_undersampled_2 = vectorizer_2.transform( dataset_undersampled_texts )

In [70]:
classifier = MultinomialNB()
scores = []
for p in np.arange(5,105,5):
    my_selector_2 = SelectPercentile( chi2, percentile = p )
    crossvalid = KFold(n_splits = 5, shuffle=True, random_state=2024)
    X_reduced_2 = my_selector_2.fit_transform( X_undersampled_2, y_undersampled ).toarray()
    for i, (train_index, test_index) in enumerate(crossvalid.split(X_reduced_2)):
        print(f"\tFold {i}:")
        classifier.fit(X_reduced_2[train_index,:], y_2[train_index])
        score = classifier.score(X_reduced_2[test_index,:], y_2[test_index])
        scores += [score]
        print(f"\tscore: {score}")
    print( f"p: {p}, Mean: {np.mean(scores)}" )
    print()

	Fold 0:
	score: 0.845
	Fold 1:
	score: 0.82
	Fold 2:
	score: 0.7883333333333333
	Fold 3:
	score: 0.8116666666666666
	Fold 4:
	score: 0.7933333333333333
p: 5, Mean: 0.8116666666666665

	Fold 0:
	score: 0.8466666666666667
	Fold 1:
	score: 0.83
	Fold 2:
	score: 0.815
	Fold 3:
	score: 0.81
	Fold 4:
	score: 0.8
p: 10, Mean: 0.8160000000000002

	Fold 0:
	score: 0.8566666666666667
	Fold 1:
	score: 0.825
	Fold 2:
	score: 0.8066666666666666
	Fold 3:
	score: 0.8133333333333334
	Fold 4:
	score: 0.8066666666666666
p: 15, Mean: 0.817888888888889

	Fold 0:
	score: 0.8633333333333333
	Fold 1:
	score: 0.8266666666666667
	Fold 2:
	score: 0.8116666666666666
	Fold 3:
	score: 0.8133333333333334
	Fold 4:
	score: 0.81
p: 20, Mean: 0.8196666666666668

	Fold 0:
	score: 0.8633333333333333
	Fold 1:
	score: 0.8316666666666667
	Fold 2:
	score: 0.8066666666666666
	Fold 3:
	score: 0.8066666666666666
	Fold 4:
	score: 0.82
p: 25, Mean: 0.8208666666666667

	Fold 0:
	score: 0.8533333333333334
	Fold 1:
	score: 0.831666

In [26]:
#Take p = 45, then train on full dataset:
classifier_2 = MultinomialNB()
p=45
my_selector_2 = SelectPercentile( chi2, percentile = p )
X_reduced_2 = my_selector_2.fit_transform( X_undersampled_2, y_undersampled ).toarray()
classifier_2.fit(X_reduced_2, y_undersampled)

In [27]:
test_texts = []
for instance in testset:
    test_texts += [" ".join(instance["text"])] 
    
#First, domain classify
X_test = vectorizer_domain.transform( test_texts ).toarray()
X_test = my_selector.transform(X_test)
domain_predictions = domain_classifier.predict(X_test)
X_test.shape,domain_predictions.shape

((4000, 5304), (4000,))

In [28]:
#Next, classify according to domain 1 and 2 models:
test_1 = vectorizer_1.transform( test_texts ).toarray()
test_1 = my_selector_1.transform(test_1)
if_domain1_predicted = classifier_1.predict( test_1 )
if_domain1_predicted.shape



(4000,)

In [29]:
#Next, classify according to domain 1 and 2 models:
test_2 = vectorizer_2.transform( test_texts ).toarray()
test_2 = my_selector_2.transform(test_2)
if_domain2_predicted = classifier_2.predict( test_2 )
if_domain2_predicted.shape

(4000,)

In [30]:
final_predictions = []
for i in range(4000):
    if domain_predictions[i] == 0: #instance must be from domain 1 according to domain classifier
        final_predictions += [ int(if_domain1_predicted[i]) ]
    elif domain_predictions[i] == 1: #instance must be from domain 2 according to domain classifier
        final_predictions += [ int(if_domain2_predicted[i]) ]
len(final_predictions)

4000

In [31]:
predictions = pd.DataFrame( final_predictions, index=range(n_tests), columns=[ "class" ])

In [32]:
predictions.value_counts()

class
1        2253
0        1747
dtype: int64

In [34]:
predictions.to_csv("sample.csv", sep=",", header=True, index_label="id")

In [97]:
!kaggle competitions submit -c comp90051-2024s1-project-1 -f sample.csv -m "classify domain first and train 2 different models for each domain. All models use Naive Bayes."

100%|██████████████████████████████████████| 26.3k/26.3k [00:01<00:00, 24.4kB/s]
Successfully submitted to COMP90051 2024S1 Project 1