In [1]:
#Import packages
import json
import sklearn
import numpy as np
import pandas as pd

In [2]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA, PCA, SparsePCA
from sklearn.feature_selection import mutual_info_classif, f_classif, SelectKBest, chi2, SelectPercentile
from sklearn.model_selection import KFold

In [3]:
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer

In [4]:
#Load datasets
with open("domain1_train_data.json", "r") as f:
    dataset_1 = [ json.loads(line, parse_int = str) for line in f ]

with open("domain2_train_data.json", "r") as f:
    dataset_2 = [ json.loads(line, parse_int = str) for line in f ]

with open("test_data.json", "r") as f:
    testset = [ json.loads(line, parse_int = str) for line in f ]

n_samples_1 = len(dataset_1)
n_samples_2 = len(dataset_2)
n_tests = len(testset)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def get_vectorizer( texts: list[str], *, method="countvectorize", **kwargs ):
    """From a list of texts, output an appropriate vectorizer either using CountVectorizer or TF-IDF depending on method argument. 

     Args:
         texts (list[str]): list of strings, each item corresponding to a text.
         method (str, optional): Method to select features. Defaults to "count-vectorizer".
         **kwargs: kwarg arguments to pass to Vectorizer classes of sklearn.
    Raises:
        ValueError: If passing an non-specified method of text feature extraction

     Returns:
         pd.DataFrame: dataframe of shape (n_samples, n_features)
    """
    
    #We want single digits to tokenized. This regex considers everything as a token except whitespace.
    kwargs['token_pattern'] = r'\S+' 
    if method == "countvectorize":
        vectorizer = CountVectorizer(**kwargs)
    elif method == "tfidf":
        vectorizer = TfidfVectorizer(**kwargs)
    else:
        raise ValueError(f"{method} is not a supported method.")
    
    #Use texts to initialize vocabulary of vectorizer
    vectorizer.fit(texts)
    return vectorizer

In [6]:
#First, create sentences (documents) from list of words for each instance in two domains
datatexts_1 = []
datatexts_2 = []
for instance in dataset_1:
    datatexts_1 += [ " ".join(instance["text"]) ]
for instance in dataset_2:
    datatexts_2 += [ " ".join(instance["text"]) ]

# label 0 for domain 1, label 1 for domain 2.
y = np.array( [ 0 ]*len(datatexts_1) + [ 1 ]*len(datatexts_2) )

#For later: do the same for test set
test_texts = []
for instance in testset:
    test_texts += [" ".join(instance["text"])] 

### For separating the domains, let's first create a binary classifier that predicts the domain an instance is from. (call it domain classifier). Then let's construct two more classifiers for *each* domain separately.

In [7]:
#First, our feature extraction for domain classifier,
method = 'tfidf'
vectorizer_domain = get_vectorizer( texts = datatexts_1 + datatexts_2 + test_texts, #Include vocabulary from all dataset
                                    method=method,
                                    ngram_range=(1,1), #It should be enough to use single-words (1-grams) for domain classifying.
                                    max_df=0.9995, #Ignore vocabulary appearing too frequently, probably words like "is", "are", "and", "this" etc.
                                    min_df=0.0005, #Ignore vocabulary that is too infrequent, as this may lead to low prediction accuracy.
                                    )

#Two more vectorizers for each domain:
n_gram_max = 2 #length of sequence of words to extract
vectorizer_1 = get_vectorizer(texts = datatexts_1 + test_texts, #This time, include only vocab from domain 1 and test set
                                method='tfidf',
                                ngram_range=(1,n_gram_max), 
                                max_df=0.9975, #Ignore vocabulary appearing too frequently, probably words like "is", "are", "and", "this" etc.
                                min_df=0.0025, #Ignore vocabulary that is too infrequent, as this may lead to low prediction accuracy.
                                )

vectorizer_2 = get_vectorizer( texts = datatexts_2 + test_texts, #This time, include only vocab from domain 2 and test set
                                method='tfidf',
                                ngram_range=(1,n_gram_max),
                                max_df=0.9975, #Ignore vocabulary appearing too frequently, probably words like "is", "are", "and", "this" etc.
                                min_df=0.0025, #Ignore vocabulary that is too infrequent, as this may lead to low prediction accuracy.
                                )

n_domain_features = vectorizer_domain.get_feature_names_out().size
n_classifier_1_features = vectorizer_1.get_feature_names_out().size
n_classifier_2_features = vectorizer_2.get_feature_names_out().size
print("number of features that is generated by each extractor:")
print(f"\tdomain classifier: {n_domain_features}")
print(f"\tdomain 1 classifier: {n_classifier_1_features}")
print(f"\tdomain 2 classifier: {n_classifier_2_features}")

number of features that is generated by each extractor:
	domain classifier: 13307
	domain 1 classifier: 11836
	domain 2 classifier: 14723


In [8]:
#Now we can extract features from our dataset:
X_domain = vectorizer_domain.transform( datatexts_1 + datatexts_2 ).toarray()
y_domain = np.array( [0]*len(datatexts_1) + [1]*len(datatexts_2) ) #label 0 for domain 1, label 1 for domain 2

X_1 = vectorizer_1.transform( datatexts_1 ).toarray()
y_1 = np.array( [ instance['label'] for instance in dataset_1 ] )

X_2 = vectorizer_2.transform( datatexts_2 ).toarray()
y_2 = np.array( [ instance['label'] for instance in dataset_2 ] )

print("Feature array sizes (n_samples x n_features):")
print(f"\tcombined set: {X_domain.shape}")
print(f"\tdomain 1 set: {X_1.shape}")
print(f"\tdomain 2 set: {X_2.shape}")

Feature array sizes (n_samples x n_features):
	combined set: (18000, 13307)
	domain 1 set: (5000, 11836)
	domain 2 set: (13000, 14723)


### Domain classifier

In [9]:
from IPython.display import clear_output

In [10]:
#For now using SelectPercentile with chi2 metric:
alphas = np.arange(0.1,1.1,0.1)
percentiles = np.arange(10,60,5)
mean_scores = np.zeros( (alphas.size, percentiles.size) )
n_cv_splits = 10
for m, alpha in enumerate(alphas):
    for n, p in enumerate(percentiles):
        my_selector = SelectPercentile( chi2, percentile = p )
        X_reduced = my_selector.fit_transform( X_domain, y_domain )
        classifier = MultinomialNB(alpha=alpha)
        crossvalid = KFold(n_splits = n_cv_splits, shuffle=True, random_state=2024+m+n)
        scores = []
        for i, (train_index, test_index) in enumerate(crossvalid.split(X_reduced)):
            print(f"\tFold {i}:")
            classifier.fit(X_reduced[train_index,:], y_domain[train_index])
            score = classifier.score(X_reduced[test_index,:], y_domain[test_index])
            scores += [score]
            print(f"\tscore: {score}")
        mean_scores[m,n] = np.mean( scores )
        clear_output(wait = True)
        print(f"\talpha: {alpha}, percentile: {p}, mean score: {mean_scores[m,n]}")
        print()

	alpha: 1.0, percentile: 55, mean score: 0.9981666666666665



### NB does pretty well at domain classifying, so let's take the best hyperparameters from grid search and train on whole dataset.
#### Consistently high C.V. accuracies tell us that the two domains are very distringuishable from just looking at its single word contents!

In [11]:
#Corresponding indices of where the best mean score is obtained from C.V.
a_index = np.where( mean_scores == mean_scores.max() )[0][0]
p_index = np.where( mean_scores == mean_scores.max() )[1][0]
print(f"Best mean score: {mean_scores.max()}")
print(f"Best alpha: {alphas[a_index]}")
print(f"Best percentile: {percentiles[p_index]}")

Best mean score: 0.9987777777777778
Best alpha: 0.1
Best percentile: 45


In [12]:
#Now construct feature selector according to best percentile, and MNB with laplace smooth using best alpha.
domain_selector = SelectPercentile( chi2, percentile = float(percentiles[p_index]) )
domain_MNBclassifier = MultinomialNB( alpha = float(alphas[a_index]) )

#Choose reduced features then fit MNB.
X_reduced = domain_selector.fit_transform( X_domain, y_domain )
domain_MNBclassifier.fit(X_reduced, y_domain)

### AI (0) vs Human (1) classifier for domain 1

In [13]:
scores = []
alphas = np.arange(0.1,1.1,0.1)
percentiles = np.arange(5,105,5)
mean_scores = np.zeros( (alphas.size, percentiles.size) )
n_cv_splits = 5
for m, alpha in enumerate(alphas):
    classifier = MultinomialNB(alpha=alpha)
    for n, p in enumerate(percentiles):
        my_selector = SelectPercentile( chi2, percentile = p )
        X_reduced_1 = my_selector.fit_transform( X_1, y_1 )
        crossvalid = KFold(n_splits = 5, shuffle=True, random_state=2024+m+n)
        scores = []
        for i, (train_index, test_index) in enumerate(crossvalid.split(X_reduced_1)):
            print(f"\tFold {i}:")
            classifier.fit(X_reduced_1[train_index,:], y_1[train_index])
            score = classifier.score(X_reduced_1[test_index,:], y_1[test_index])
            scores += [score]
            print(f"\tscore: {score}")
        mean_scores[m,n] = np.mean( scores )
        clear_output(wait = True)
        print(f"\talpha: {alpha}, percentile: {p}, mean score: {mean_scores[m,n]}")
        print()

	alpha: 1.0, percentile: 100, mean score: 0.7484



In [14]:
#Corresponding indices of where the best mean score is obtained from C.V.
a_index = np.where( mean_scores == mean_scores.max() )[0][0]
p_index = np.where( mean_scores == mean_scores.max() )[1][0]
print(f"Best mean score: {mean_scores.max()}")
print(f"Best alpha: {alphas[a_index]}")
print(f"Best percentile: {percentiles[p_index]}")

Best mean score: 0.8413999999999999
Best alpha: 0.1
Best percentile: 25


In [15]:
classifier_1 = MultinomialNB( alpha= float(alphas[a_index]) )
my_selector_1 = SelectPercentile( chi2, percentile = float(percentiles[p_index]) )
X_reduced_1 = my_selector_1.fit_transform( X_1, y_1 )
classifier_1.fit(X_reduced_1, y_1)

### AI (0) vs Human (1) classifier for domain 2

In [16]:
from random import shuffle, seed

In [17]:
#First, let's try undersampling by splitting dataset into 0 and 1 labels and choosing equal number from each, shuffling label 0 dataset to choose 1500 out of 16500
seed(20124)
labels_0 = []
labels_1 = []
for inst in dataset_2:
    if inst['label']=="0":
        labels_0 += [ inst ]
    else:
        labels_1 += [ inst ]
shuffle(labels_0)
dataset_undersampled = labels_0[:len(labels_1)] + labels_1 #Choose equal lengths
dataset_undersampled_texts = [] 
y_undersampled = []
for instance in dataset_undersampled:
    dataset_undersampled_texts += [ " ".join(instance["text"]) ]
    y_undersampled += [ int(instance['label']) ]

#Now extract features as before.
X_undersampled_2 = vectorizer_2.transform( dataset_undersampled_texts )
y_undersampled_2 = np.array( y_undersampled )

In [18]:
scores = []
alphas = np.arange(0.1,1.1,0.1)
percentiles = np.arange(5,105,5)
mean_scores = np.zeros( (alphas.size, percentiles.size) )
n_cv_splits = 5
for m, alpha in enumerate(alphas):
    classifier = MultinomialNB(alpha=alpha)
    for n, p in enumerate(percentiles):
        my_selector = SelectPercentile( chi2, percentile = p )
        X_reduced_2 = my_selector.fit_transform( X_undersampled_2, y_undersampled_2 )
        crossvalid = KFold(n_splits = 5, shuffle=True, random_state=2024+m+n)
        scores = []
        for i, (train_index, test_index) in enumerate(crossvalid.split(X_reduced_2)):
            print(f"\tFold {i}:")
            classifier.fit(X_reduced_2[train_index,:], y_2[train_index])
            score = classifier.score(X_reduced_2[test_index,:], y_2[test_index])
            scores += [score]
            print(f"\tscore: {score}")
        mean_scores[m,n] = np.mean( scores )
        clear_output(wait = True)
        print(f"\talpha: {alpha}, percentile: {p}, mean score: {mean_scores[m,n]}")
        print()

	alpha: 1.0, percentile: 100, mean score: 0.7713333333333334



In [19]:
#Corresponding indices of where the best mean score is obtained from C.V.
a_index = np.where( mean_scores == mean_scores.max() )[0][0]
p_index = np.where( mean_scores == mean_scores.max() )[1][0]
print(f"Best mean score: {mean_scores.max()}")
print(f"Best alpha: {alphas[a_index]}")
print(f"Best percentile: {percentiles[p_index]}")

Best mean score: 0.8653333333333334
Best alpha: 0.1
Best percentile: 30


In [20]:
classifier_2 = MultinomialNB( alpha=float(alphas[a_index]) )
my_selector_2 = SelectPercentile( chi2, percentile = float(percentiles[p_index]) )
X_reduced_2 = my_selector_2.fit_transform( X_undersampled_2, y_undersampled_2 )
classifier_2.fit(X_reduced_2, y_undersampled)

In [21]:
#Let's try testing on the other of label 0:
test_for_0labels = []
for instance in labels_0[1500:]:
    test_for_0labels += [ " ".join(instance["text"]) ]
test_for_0labels = vectorizer_2.transform( test_for_0labels )
test_for_0labels = my_selector_2.transform(test_for_0labels)
predict_0labels = classifier_2.predict(test_for_0labels)

In [22]:
#Percentage of wrong labels
predict_0labels.sum() / predict_0labels.size

0.311

In [24]:
#First, domain classify
X_test = vectorizer_domain.transform( test_texts )
X_test = domain_selector.transform( X_test )
domain_predictions = domain_MNBclassifier.predict(X_test)
X_test.shape,domain_predictions.shape

((4000, 5988), (4000,))

In [25]:
#Next, classify according to domain 1 and 2 models:
test_1 = vectorizer_1.transform( test_texts ).toarray()
test_1 = my_selector_1.transform(test_1)
if_domain1_predicted = classifier_1.predict( test_1 )
if_domain1_predicted.shape

(4000,)

In [26]:
#Next, classify according to domain 1 and 2 models:
test_2 = vectorizer_2.transform( test_texts ).toarray()
test_2 = my_selector_2.transform(test_2)
if_domain2_predicted = classifier_2.predict( test_2 )
if_domain2_predicted.shape

(4000,)

In [27]:
final_predictions = []
for i in range(4000):
    if domain_predictions[i] == 0: #instance must be from domain 1 according to domain classifier
        final_predictions += [ int(if_domain1_predicted[i]) ]
    elif domain_predictions[i] == 1: #instance must be from domain 2 according to domain classifier
        final_predictions += [ int(if_domain2_predicted[i]) ]
len(final_predictions)

4000

In [28]:
predictions = pd.DataFrame( final_predictions, index=range(n_tests), columns=[ "class" ])

In [29]:
predictions.value_counts()

class
1        2191
0        1809
dtype: int64

In [30]:
predictions.to_csv("sample.csv", sep=",", header=True, index_label="id")

In [31]:
!kaggle competitions submit -c comp90051-2024s1-project-1 -f sample.csv -m "Same models as before with tuned hyperparameters."

100%|██████████████████████████████████████| 26.3k/26.3k [00:02<00:00, 12.9kB/s]
400 - Bad Request - Submission not allowed:  Your team has used its daily Submission allowance (10) today, please try again tomorrow UTC (3.3 hours from now).
