# Imports

In [1]:
from sklearn.metrics import accuracy_score, top_k_accuracy_score, average_precision_score

from compression_classifier.data.factory import EData, data_factory

from compression_classifier.compressor.factory import ECompressor

from compression_classifier.classifier.conc_dict.kmer_scoring import EScoring
from compression_classifier.classifier.conc_dict.utils import choose_k_value, choose_segment_size

from compression_classifier.classifier.models.tfidf import TfIdfSvmAutonomousClassifier
from compression_classifier.classifier.models.amdl import AmdlDictClassifier
from compression_classifier.classifier.models.ncd import NcdClassifier
from compression_classifier.classifier.models.lzjd import LzjdClassifier
from compression_classifier.classifier.models.zest import ZestClassifier
from compression_classifier.classifier.models.zest import ZestClassifier
from compression_classifier.classifier.models.conc_dict_lz4 import LZ4ConcDictLengthClassifier, LZ4ConcDictFeatureClassifier
from compression_classifier.classifier.models.conc_dict_zstd import ZstdConcDictClassifier
from compression_classifier.classifier.models.conc_dict_lz77 import LZ77ConcDictFeatureClassifier

# Load Data

In [2]:
# Run configuration, kept as named constants so the dataset location and the
# trim fraction are each defined in exactly one place.
DATA_DIR = '/datasets/'  # NOTE(review): absolute path; adjust for your environment
TRIM_FRACTION = 0.25     # value handed to data.trim() below

# Load the 20 Newsgroups dataset and show its statistics before trimming.
data = data_factory(DATA_DIR, EData.TWENTY_NEWS)
print('Full dataset statistics:')
print(data)

# Trim Dataset
# The percentage in the message is formatted from TRIM_FRACTION, so the log
# line cannot disagree with the value actually passed to trim().
# Presumably trim() shrinks `data` in place (its return value is unused), so
# every later cell works on the trimmed dataset -- verify against the data class.
print(f'Trimming dataset to {TRIM_FRACTION:.0%}')
data.trim(TRIM_FRACTION)

Full dataset statistics:
Total Train: 11314, Total Train: 7532
"alt.atheism" (0), Train: 480, Test: 319, Min: 225, Avg: 2104±3448, Max: 48329
"comp.graphics" (1), Train: 584, Test: 389, Min: 119, Avg: 1479±4506, Max: 58367
"comp.os.ms-windows.misc" (2), Train: 591, Test: 394, Min: 156, Avg: 2935±9926, Max: 62483
"comp.sys.ibm.pc.hardware" (3), Train: 590, Test: 392, Min: 157, Avg: 1277±1887, Max: 25172
"comp.sys.mac.hardware" (4), Train: 578, Test: 385, Min: 117, Avg: 1160±1946, Max: 44904
"comp.windows.x" (5), Train: 593, Test: 395, Min: 159, Avg: 1993±5879, Max: 64538
"misc.forsale" (6), Train: 585, Test: 390, Min: 139, Avg: 945±940, Max: 14484
"rec.autos" (7), Train: 594, Test: 396, Min: 127, Avg: 1394±1514, Max: 26006
"rec.motorcycles" (8), Train: 598, Test: 398, Min: 189, Avg: 1295±1574, Max: 34586
"rec.sport.baseball" (9), Train: 597, Test: 397, Min: 170, Avg: 1355±1386, Max: 15254
"rec.sport.hockey" (10), Train: 600, Test: 399, Min: 158, Avg: 1818±3321, Max: 68051
"sci.crypt" (1

# Tf-Idf SVM (Baseline) Model

In [3]:
# Baseline: fit the Tf-Idf SVM classifier on the training split and report
# plain accuracy on the test split.
clf = TfIdfSvmAutonomousClassifier()
clf.fit(data.X_train, data.y_train)

y_pred = clf.predict(data.X_test)
test_accuracy = accuracy_score(data.y_test, y_pred)
print(f'Accuracy: {test_accuracy:.5f}')

Searching for best paramters!
Best Params, k: 6, Sublinear TF: True, C: 10
Accuracy: 0.79448


# AMDL Model

In [4]:
# AMDL model configured with the LZ4 compressor; scored by test accuracy.
clf = AmdlDictClassifier(ECompressor.LZ4)
clf.fit(data.X_train, data.y_train)

y_pred = clf.predict(data.X_test)
test_accuracy = accuracy_score(data.y_test, y_pred)
print(f'Accuracy: {test_accuracy:.5f}')

Set Max Bytes to 65536
Accuracy: 0.70685


# NCD Model

In [5]:
# NCD model configured with the LZ4 compressor and a single neighbour
# (n_neighbors=1); scored by test accuracy.
clf = NcdClassifier(ECompressor.LZ4, n_neighbors=1)
clf.fit(data.X_train, data.y_train)

y_pred = clf.predict(data.X_test)
test_accuracy = accuracy_score(data.y_test, y_pred)
print(f'Accuracy: {test_accuracy:.5f}')

Accuracy: 0.48646


# Lempel-Ziv Jaccard Distance Model

In [6]:
# Lempel-Ziv Jaccard Distance model with its default settings; scored by
# test accuracy.
clf = LzjdClassifier()
clf.fit(data.X_train, data.y_train)

y_pred = clf.predict(data.X_test)
test_accuracy = accuracy_score(data.y_test, y_pred)
print(f'Accuracy: {test_accuracy:.5f}')

Accuracy: 0.36006


# Zest Model

In [7]:
# Zest model with its default settings; scored by test accuracy.
clf = ZestClassifier()
clf.fit(data.X_train, data.y_train)

y_pred = clf.predict(data.X_test)
test_accuracy = accuracy_score(data.y_test, y_pred)
print(f'Accuracy: {test_accuracy:.5f}')

Dictionary Sizes: [16384, 49709, 83034, 116359]
Accuracy: 0.64950


# Concentrated Dictionary Models

In [8]:
# Hyper-parameters passed to every concentrated-dictionary classifier below.
dict_size = 1 << 16  # identical to 2**16 (65536)
# k-mer size and segment size are chosen by the project helpers from the data
# (and, for the segment size, from the dictionary size as well).
k = choose_k_value(data)
segment_size = choose_segment_size(dict_size, data)

summary = f'Dict Size: {dict_size}, k-mer Size: {k}, Segment Size: {segment_size}'
print(summary)

Dict Size: 65536, k-mer Size: 8, Segment Size: 1024


## Concentrated Dictionary (Length) (LZ4) Model

### Term Frequency (tf) Scoring

In [9]:
# LZ4 concentrated-dictionary (length) classifier with TF k-mer scoring.
clf = LZ4ConcDictLengthClassifier(
    EScoring.TF,
    dict_size=dict_size,
    k=k,
    segment_size=segment_size,
)
clf.fit(data.X_train, data.y_train)
y_score = clf.predict_proba(data.X_test)

# Top-1 / top-2 accuracy (printed as P@1 / P@2) and average precision (AP),
# all computed from the per-class scores.
top1_acc = top_k_accuracy_score(data.y_test, y_score, k=1)
top2_acc = top_k_accuracy_score(data.y_test, y_score, k=2)
avg_precision = average_precision_score(data.y_test, y_score)
print(f'P@1: {top1_acc:.5f}, P@2: {top2_acc:.5f}, AP: {avg_precision:.5f}')

P@1: 0.64843, P@2: 0.78173, AP: 0.54052


### PPMI sublinear Scoring

In [10]:
# LZ4 concentrated-dictionary (length) classifier with sublinear PPMI k-mer scoring.
clf = LZ4ConcDictLengthClassifier(
    EScoring.PPMI_SUBLINEAR,
    dict_size=dict_size,
    k=k,
    segment_size=segment_size,
)
clf.fit(data.X_train, data.y_train)
y_score = clf.predict_proba(data.X_test)

# Top-1 / top-2 accuracy (printed as P@1 / P@2) and average precision (AP),
# all computed from the per-class scores.
top1_acc = top_k_accuracy_score(data.y_test, y_score, k=1)
top2_acc = top_k_accuracy_score(data.y_test, y_score, k=2)
avg_precision = average_precision_score(data.y_test, y_score)
print(f'P@1: {top1_acc:.5f}, P@2: {top2_acc:.5f}, AP: {avg_precision:.5f}')

P@1: 0.63994, P@2: 0.76049, AP: 0.53619


## Concentrated Dictionary (Length) (zstd) Model

### Term Frequency (tf) Scoring

In [11]:
# zstd concentrated-dictionary classifier with TF k-mer scoring.
clf = ZstdConcDictClassifier(
    EScoring.TF,
    dict_size=dict_size,
    k=k,
    segment_size=segment_size,
)
clf.fit(data.X_train, data.y_train)
y_score = clf.predict_proba(data.X_test)

# Top-1 / top-2 accuracy (printed as P@1 / P@2) and average precision (AP),
# all computed from the per-class scores.
top1_acc = top_k_accuracy_score(data.y_test, y_score, k=1)
top2_acc = top_k_accuracy_score(data.y_test, y_score, k=2)
avg_precision = average_precision_score(data.y_test, y_score)
print(f'P@1: {top1_acc:.5f}, P@2: {top2_acc:.5f}, AP: {avg_precision:.5f}')

P@1: 0.63091, P@2: 0.76155, AP: 0.50912


### PPMI sublinear Scoring

In [12]:
# zstd concentrated-dictionary classifier with sublinear PPMI k-mer scoring.
clf = ZstdConcDictClassifier(
    EScoring.PPMI_SUBLINEAR,
    dict_size=dict_size,
    k=k,
    segment_size=segment_size,
)
clf.fit(data.X_train, data.y_train)
y_score = clf.predict_proba(data.X_test)

# Top-1 / top-2 accuracy (printed as P@1 / P@2) and average precision (AP),
# all computed from the per-class scores.
top1_acc = top_k_accuracy_score(data.y_test, y_score, k=1)
top2_acc = top_k_accuracy_score(data.y_test, y_score, k=2)
avg_precision = average_precision_score(data.y_test, y_score)
print(f'P@1: {top1_acc:.5f}, P@2: {top2_acc:.5f}, AP: {avg_precision:.5f}')

P@1: 0.61391, P@2: 0.74615, AP: 0.51080


## Concentrated Dictionary (Features) (LZ4) Model

### Term Frequency (tf) Scoring

In [13]:
# LZ4 concentrated-dictionary (features) classifier with TF k-mer scoring.
clf = LZ4ConcDictFeatureClassifier(
    EScoring.TF,
    dict_size=dict_size,
    k=k,
    segment_size=segment_size,
)
clf.fit(data.X_train, data.y_train)
y_score = clf.predict_proba(data.X_test)

# Top-1 / top-2 accuracy (printed as P@1 / P@2) and average precision (AP),
# all computed from the per-class scores.
top1_acc = top_k_accuracy_score(data.y_test, y_score, k=1)
top2_acc = top_k_accuracy_score(data.y_test, y_score, k=2)
avg_precision = average_precision_score(data.y_test, y_score)
print(f'P@1: {top1_acc:.5f}, P@2: {top2_acc:.5f}, AP: {avg_precision:.5f}')

Best Params, C: 0.1
P@1: 0.59958, P@2: 0.73447, AP: 0.60044


### PPMI sublinear Scoring

In [14]:
# LZ4 concentrated-dictionary (features) classifier with sublinear PPMI k-mer scoring.
clf = LZ4ConcDictFeatureClassifier(
    EScoring.PPMI_SUBLINEAR,
    dict_size=dict_size,
    k=k,
    segment_size=segment_size,
)
clf.fit(data.X_train, data.y_train)
y_score = clf.predict_proba(data.X_test)

# Top-1 / top-2 accuracy (printed as P@1 / P@2) and average precision (AP),
# all computed from the per-class scores.
top1_acc = top_k_accuracy_score(data.y_test, y_score, k=1)
top2_acc = top_k_accuracy_score(data.y_test, y_score, k=2)
avg_precision = average_precision_score(data.y_test, y_score)
print(f'P@1: {top1_acc:.5f}, P@2: {top2_acc:.5f}, AP: {avg_precision:.5f}')

Best Params, C: 0.1
P@1: 0.59320, P@2: 0.73340, AP: 0.59578


## Concentrated Dictionary (Features) (LZ77) Model

### Term Frequency (tf) Scoring

In [15]:
# LZ77 concentrated-dictionary (features) classifier with TF k-mer scoring.
clf = LZ77ConcDictFeatureClassifier(
    EScoring.TF,
    dict_size=dict_size,
    k=k,
    segment_size=segment_size,
)
clf.fit(data.X_train, data.y_train)
y_score = clf.predict_proba(data.X_test)

# Top-1 / top-2 accuracy (printed as P@1 / P@2) and average precision (AP),
# all computed from the per-class scores.
top1_acc = top_k_accuracy_score(data.y_test, y_score, k=1)
top2_acc = top_k_accuracy_score(data.y_test, y_score, k=2)
avg_precision = average_precision_score(data.y_test, y_score)
print(f'P@1: {top1_acc:.5f}, P@2: {top2_acc:.5f}, AP: {avg_precision:.5f}')

Best Params, C: 0.1
P@1: 0.60223, P@2: 0.74243, AP: 0.60914


### PPMI sublinear Scoring

In [16]:
# LZ77 concentrated-dictionary (features) classifier with sublinear PPMI k-mer scoring.
clf = LZ77ConcDictFeatureClassifier(
    EScoring.PPMI_SUBLINEAR,
    dict_size=dict_size,
    k=k,
    segment_size=segment_size,
)
clf.fit(data.X_train, data.y_train)
y_score = clf.predict_proba(data.X_test)

# Top-1 / top-2 accuracy (printed as P@1 / P@2) and average precision (AP),
# all computed from the per-class scores.
top1_acc = top_k_accuracy_score(data.y_test, y_score, k=1)
top2_acc = top_k_accuracy_score(data.y_test, y_score, k=2)
avg_precision = average_precision_score(data.y_test, y_score)
print(f'P@1: {top1_acc:.5f}, P@2: {top2_acc:.5f}, AP: {avg_precision:.5f}')

Best Params, C: 0.1
P@1: 0.60011, P@2: 0.72013, AP: 0.59295
