In [1]:
from sklearn.metrics import accuracy_score, top_k_accuracy_score, average_precision_score

from compression_classifier.data.factory import EData, data_factory

from compression_classifier.compressor.factory import ECompressor

from compression_classifier.classifier.conc_dict.kmer_scoring import EScoring
from compression_classifier.classifier.conc_dict.utils import choose_k_value, choose_segment_size

from compression_classifier.classifier.models.tfidf import TfIdfSvmAutonomousClassifier
from compression_classifier.classifier.models.amdl import AmdlDictClassifier
from compression_classifier.classifier.models.ncd import NcdClassifier
from compression_classifier.classifier.models.lzjd import LzjdClassifier
from compression_classifier.classifier.models.zest import ZestClassifier
from compression_classifier.classifier.models.zest import ZestClassifier
from compression_classifier.classifier.models.conc_dict_lz4 import LZ4ConcDictLengthClassifier, LZ4ConcDictFeatureClassifier
from compression_classifier.classifier.models.conc_dict_zstd import ZstdConcDictClassifier
from compression_classifier.classifier.models.conc_dict_lz77 import LZ77ConcDictFeatureClassifier

In [2]:
# Dataset location and the fraction of the dataset kept after trimming.
# Both are named so the printed message and the trim() call cannot drift apart.
DATA_DIR = '/datasets/'
TRIM_FRACTION = 0.25

data = data_factory(DATA_DIR, EData.AG_NEWS)
print('Full dataset statistics:')
print(data)

# Trim Dataset
# `data` is rebuilt at the top of this cell, so re-running the cell starts from
# the full dataset again before trimming.
# NOTE(review): trim() presumably shrinks `data` in place (its return value is
# discarded and later cells keep using `data`) -- confirm against the data module.
print(f'Trimming dataset to {TRIM_FRACTION:.0%}')
data.trim(TRIM_FRACTION)

Full dataset statistics:
Total Train: 119915, Total Train: 7597
"World" (0), Train: 29988, Test: 1899, Min: 98, Avg: 241±63, Max: 857
"Sports" (1), Train: 29985, Test: 1900, Min: 98, Avg: 223±50, Max: 852
"Business" (2), Train: 29974, Test: 1898, Min: 99, Avg: 239±63, Max: 1005
"Sci/Tech" (3), Train: 29968, Test: 1900, Min: 98, Avg: 235±81, Max: 1011
Avg Length: 234±64
Trimming dataset to 25%


# Tf-Idf SVM (Baseline Model)

In [3]:
# Baseline: fit the Tf-Idf SVM classifier on the training split.
clf = TfIdfSvmAutonomousClassifier()
clf.fit(data.X_train, data.y_train)

# Score hard predictions on the held-out test split.
y_pred = clf.predict(data.X_test)
accuracy = accuracy_score(data.y_test, y_pred)
print(f'Accuracy: {accuracy:.5f}')

Searching for best paramters!
Best Params, k: 6, Sublinear TF: True, C: 1
Accuracy: 0.91096


# AMDL

In [4]:
# AMDL classifier configured with the LZ4 compressor.
clf = AmdlDictClassifier(ECompressor.LZ4)
clf.fit(data.X_train, data.y_train)

# Score hard predictions on the held-out test split.
y_pred = clf.predict(data.X_test)
accuracy = accuracy_score(data.y_test, y_pred)
print(f'Accuracy: {accuracy:.5f}')

Set Max Bytes to 65536
Accuracy: 0.89410


# NCD

In [5]:
# NCD classifier configured with the LZ4 compressor and n_neighbors=3.
clf = NcdClassifier(ECompressor.LZ4, n_neighbors=3)
clf.fit(data.X_train, data.y_train)

# Score hard predictions on the held-out test split.
y_pred = clf.predict(data.X_test)
accuracy = accuracy_score(data.y_test, y_pred)
print(f'Accuracy: {accuracy:.5f}')

Accuracy: 0.87829


# Lempel-Ziv Jaccard Distance

In [6]:
# Lempel-Ziv Jaccard Distance classifier with its default settings.
clf = LzjdClassifier()
clf.fit(data.X_train, data.y_train)

# Score hard predictions on the held-out test split.
y_pred = clf.predict(data.X_test)
accuracy = accuracy_score(data.y_test, y_pred)
print(f'Accuracy: {accuracy:.5f}')

Accuracy: 0.64647


# Zest

In [7]:
# Zest classifier with its default settings.
clf = ZestClassifier()
clf.fit(data.X_train, data.y_train)

# Score hard predictions on the held-out test split.
y_pred = clf.predict(data.X_test)
accuracy = accuracy_score(data.y_test, y_pred)
print(f'Accuracy: {accuracy:.5f}')

Dictionary Sizes: [16384, 145092, 273800, 402508]
Accuracy: 0.87566


# Concentrated Dictionary

In [3]:
# Hyper-parameters shared by the Concentrated Dictionary classifiers in this notebook.
dict_size = 2**16
k = choose_k_value(data)
# Use the same dict_size that is handed to the classifiers (previously a second,
# hard-coded 2**16), so changing dict_size keeps the segment size in sync.
segment_size = choose_segment_size(dict_size, data)
print(f'Dict Size: {dict_size}, k-mer Size: {k}, Segment Size: {segment_size}')

Dict Size: 65536, k-mer Size: 8, Segment Size: 16


## Concentrated Dictionary (Length) (LZ4)

Term Frequency (tf) Scoring

In [5]:
# LZ4 concentrated-dictionary (length) classifier with TF k-mer scoring.
clf = LZ4ConcDictLengthClassifier(
    EScoring.TF,
    dict_size=dict_size,
    k=k,
    segment_size=segment_size,
)
clf.fit(data.X_train, data.y_train)
y_score = clf.predict_proba(data.X_test)

# Top-1 / top-2 accuracy and average precision on the test split.
p_at_1 = top_k_accuracy_score(data.y_test, y_score, k=1)
p_at_2 = top_k_accuracy_score(data.y_test, y_score, k=2)
avg_precision = average_precision_score(data.y_test, y_score)
print(f'P@1: {p_at_1:.5f}, P@2: {p_at_2:.5f}, AP: {avg_precision:.5f}')

P@1: 0.86670, P@2: 0.97050, AP: 0.89692


PPMI sublinear Scoring

In [6]:
# LZ4 concentrated-dictionary (length) classifier with PPMI-sublinear k-mer scoring.
clf = LZ4ConcDictLengthClassifier(
    EScoring.PPMI_SUBLINEAR,
    dict_size=dict_size,
    k=k,
    segment_size=segment_size,
)
clf.fit(data.X_train, data.y_train)
y_score = clf.predict_proba(data.X_test)

# Top-1 / top-2 accuracy and average precision on the test split.
p_at_1 = top_k_accuracy_score(data.y_test, y_score, k=1)
p_at_2 = top_k_accuracy_score(data.y_test, y_score, k=2)
avg_precision = average_precision_score(data.y_test, y_score)
print(f'P@1: {p_at_1:.5f}, P@2: {p_at_2:.5f}, AP: {avg_precision:.5f}')

P@1: 0.88567, P@2: 0.96891, AP: 0.91957


## Concentrated Dictionary (Length) (zstd)

Term Frequency (tf) Scoring

In [7]:
# Zstd concentrated-dictionary classifier with TF k-mer scoring.
clf = ZstdConcDictClassifier(
    EScoring.TF,
    dict_size=dict_size,
    k=k,
    segment_size=segment_size,
)
clf.fit(data.X_train, data.y_train)
y_score = clf.predict_proba(data.X_test)

# Top-1 / top-2 accuracy and average precision on the test split.
p_at_1 = top_k_accuracy_score(data.y_test, y_score, k=1)
p_at_2 = top_k_accuracy_score(data.y_test, y_score, k=2)
avg_precision = average_precision_score(data.y_test, y_score)
print(f'P@1: {p_at_1:.5f}, P@2: {p_at_2:.5f}, AP: {avg_precision:.5f}')

P@1: 0.86091, P@2: 0.96891, AP: 0.88783


PPMI sublinear Scoring

In [8]:
# Zstd concentrated-dictionary classifier with PPMI-sublinear k-mer scoring.
clf = ZstdConcDictClassifier(
    EScoring.PPMI_SUBLINEAR,
    dict_size=dict_size,
    k=k,
    segment_size=segment_size,
)
clf.fit(data.X_train, data.y_train)
y_score = clf.predict_proba(data.X_test)

# Top-1 / top-2 accuracy and average precision on the test split.
p_at_1 = top_k_accuracy_score(data.y_test, y_score, k=1)
p_at_2 = top_k_accuracy_score(data.y_test, y_score, k=2)
avg_precision = average_precision_score(data.y_test, y_score)
print(f'P@1: {p_at_1:.5f}, P@2: {p_at_2:.5f}, AP: {avg_precision:.5f}')

P@1: 0.87039, P@2: 0.96681, AP: 0.91477


## Concentrated Dictionary (Features) (LZ4)

Term Frequency (tf) Scoring

In [9]:
# LZ4 concentrated-dictionary (features) classifier with TF k-mer scoring.
clf = LZ4ConcDictFeatureClassifier(
    EScoring.TF,
    dict_size=dict_size,
    k=k,
    segment_size=segment_size,
)
clf.fit(data.X_train, data.y_train)
y_score = clf.predict_proba(data.X_test)

# Top-1 / top-2 accuracy and average precision on the test split.
p_at_1 = top_k_accuracy_score(data.y_test, y_score, k=1)
p_at_2 = top_k_accuracy_score(data.y_test, y_score, k=2)
avg_precision = average_precision_score(data.y_test, y_score)
print(f'P@1: {p_at_1:.5f}, P@2: {p_at_2:.5f}, AP: {avg_precision:.5f}')

Best Params, C: 0.1
P@1: 0.87039, P@2: 0.96839, AP: 0.92419


PPMI sublinear Scoring

In [10]:
# LZ4 concentrated-dictionary (features) classifier with PPMI-sublinear k-mer scoring.
clf = LZ4ConcDictFeatureClassifier(
    EScoring.PPMI_SUBLINEAR,
    dict_size=dict_size,
    k=k,
    segment_size=segment_size,
)
clf.fit(data.X_train, data.y_train)
y_score = clf.predict_proba(data.X_test)

# Top-1 / top-2 accuracy and average precision on the test split.
p_at_1 = top_k_accuracy_score(data.y_test, y_score, k=1)
p_at_2 = top_k_accuracy_score(data.y_test, y_score, k=2)
avg_precision = average_precision_score(data.y_test, y_score)
print(f'P@1: {p_at_1:.5f}, P@2: {p_at_2:.5f}, AP: {avg_precision:.5f}')

Best Params, C: 0.1
P@1: 0.88936, P@2: 0.97155, AP: 0.93444


## Concentrated Dictionary (Features) (LZ77)

Term Frequency (tf) Scoring

In [13]:
# LZ77 concentrated-dictionary (features) classifier with TF k-mer scoring.
clf = LZ77ConcDictFeatureClassifier(
    EScoring.TF,
    dict_size=dict_size,
    k=k,
    segment_size=segment_size,
)
clf.fit(data.X_train, data.y_train)
y_score = clf.predict_proba(data.X_test)

# Top-1 / top-2 accuracy and average precision on the test split.
p_at_1 = top_k_accuracy_score(data.y_test, y_score, k=1)
p_at_2 = top_k_accuracy_score(data.y_test, y_score, k=2)
avg_precision = average_precision_score(data.y_test, y_score)
print(f'P@1: {p_at_1:.5f}, P@2: {p_at_2:.5f}, AP: {avg_precision:.5f}')

Best Params, C: 0.1
P@1: 0.87039, P@2: 0.96681, AP: 0.91974


PPMI sublinear Scoring

In [12]:
# LZ77 concentrated-dictionary (features) classifier with PPMI-sublinear k-mer scoring.
clf = LZ77ConcDictFeatureClassifier(
    EScoring.PPMI_SUBLINEAR,
    dict_size=dict_size,
    k=k,
    segment_size=segment_size,
)
clf.fit(data.X_train, data.y_train)
y_score = clf.predict_proba(data.X_test)

# Top-1 / top-2 accuracy and average precision on the test split.
p_at_1 = top_k_accuracy_score(data.y_test, y_score, k=1)
p_at_2 = top_k_accuracy_score(data.y_test, y_score, k=2)
avg_precision = average_precision_score(data.y_test, y_score)
print(f'P@1: {p_at_1:.5f}, P@2: {p_at_2:.5f}, AP: {avg_precision:.5f}')

Best Params, C: 0.01
P@1: 0.89146, P@2: 0.96628, AP: 0.92802
