In [1]:
import json
import pickle as pkl

import numpy as np
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
def load_dataset(filepath):
    """Load a JSON Lines file into a list of parsed records.

    Every non-blank line of the file is parsed as one JSON document.
    Integer literals are returned as ``str`` instead of ``int``
    (``parse_int=str``).

    Args:
        filepath: path of the JSON Lines file to read.

    Returns:
        list: one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Read explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(filepath, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline at end of file) carry no record.
        dataset = [ json.loads(line, parse_int = str) for line in f if line.strip() ]
    return dataset

def get_vectorizer( texts: list[str], *, method="countvectorize", **kwargs ) -> CountVectorizer:
    """Build a text vectorizer (raw counts or TF-IDF) and fit its vocabulary on ``texts``.

    Args:
        texts (list[str]): list of strings, each item corresponding to a text.
        method (str, optional): "countvectorize" to use CountVectorizer or
            "tfidf" to use TfidfVectorizer. Defaults to "countvectorize".
        **kwargs: keyword arguments forwarded to the sklearn vectorizer class.
            When ``token_pattern`` is not supplied, a pattern matching any run
            of non-whitespace characters is used.

    Raises:
        ValueError: If ``method`` is not one of the supported methods.

    Returns:
        CountVectorizer: the vectorizer, already fitted on ``texts``
            (a TfidfVectorizer instance when ``method == "tfidf"``).
    """
    # We want single digits to be tokenized. This regex considers everything as
    # a token except whitespace. setdefault keeps an explicit caller choice
    # instead of silently overwriting it.
    kwargs.setdefault('token_pattern', r'\S+')
    if method == "countvectorize":
        vectorizer = CountVectorizer(**kwargs)
    elif method == "tfidf":
        vectorizer = TfidfVectorizer(**kwargs)
    else:
        raise ValueError(f"{method} is not a supported method.")
    # Use texts to initialize the vocabulary of the vectorizer.
    vectorizer.fit(texts)
    return vectorizer

def sentencify(text: list) -> str:
    """Concatenate the string items of ``text`` into one space-separated string."""
    separator = " "
    return separator.join(text)

def loss(clf, X, y):
    """Mean negative log-likelihood of binary labels ``y`` under ``clf``.

    Column 1 of ``clf.predict_log_proba(X)`` is weighted by ``y`` and column 0
    by ``1 - y``; the per-sample terms are summed and divided by ``y.size``.
    """
    log_probs = clf.predict_log_proba(X)
    log_p_zero, log_p_one = log_probs[:, 0], log_probs[:, 1]
    per_sample = -y * log_p_one - (1 - y) * log_p_zero
    return per_sample.sum() / y.size

def balanced_acc(clf, X, y):
    """Score ``clf`` on ``(X, y)`` with the unadjusted balanced accuracy."""
    return balanced_accuracy_score(y, clf.predict(X), adjusted=False)

class DenseTransformer(TransformerMixin):
    """Stateless transformer that turns a sparse matrix into a dense array.

    Input that is already dense (has no ``toarray`` method) is passed through
    ``np.asarray`` instead of raising ``AttributeError``.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense array."""
        if hasattr(X, "toarray"):
            return X.toarray()
        # Already dense (e.g. a numpy array or nested list): no conversion needed.
        return np.asarray(X)

In [3]:
# Read the training records, then join each record's 'text' list into a single string.
dataset_2 = load_dataset("domain2_train_data.json")
datatexts_2 = list(map(sentencify, (instance['text'] for instance in dataset_2)))

In [4]:
# Settings forwarded to the TF-IDF vectorizer.
tfidf_params = dict(
    use_idf=True,
    ngram_range=(1,3),
    max_df=0.999,  # Ignore vocabulary appearing too frequently, probably words like "is", "are", "and", "this" etc.
    min_df=5,      # Ignore vocabulary that is too infrequent, as this may lead to low prediction accuracy.
)
tfidf_vectorizer = get_vectorizer(texts=datatexts_2, method='tfidf', **tfidf_params)
n_features = tfidf_vectorizer.get_feature_names_out().size
print(f"no features: {n_features}")

no features: 162886


In [5]:
# Dense TF-IDF matrix, one row per document.
# NOTE: .toarray() materialises every (document, feature) cell, which is
# memory-hungry when the vocabulary is large.
X2 = tfidf_vectorizer.transform( datatexts_2 ).toarray()
n_samples = X2.shape[0]

# Labels are assigned purely by position: 1500 ones followed by 11500 zeros.
# NOTE(review): presumably the data file lists the positive class first — confirm.
y = np.array([1]*1500 + [0]*11500)
assert y.size == n_samples, f"label count {y.size} does not match sample count {n_samples}"

# Two extra features per document: the length of its 'text' list and that
# length squared, each scaled by its own column maximum.
text_lengths = np.array([len(instance['text']) for instance in dataset_2], dtype=float)
lengths = np.column_stack( (text_lengths, text_lengths**2) )
lengths /= lengths.max(axis=0)
X2 = np.hstack( (X2, lengths) )

In [6]:
# Fit a logistic regression on all features; SelectFromModel uses this fitted
# model to decide which features to keep.
selector_clf = LogisticRegression(C=0.01, random_state=0)
selector_clf.fit(X2, y)
# prefit=True tells SelectFromModel that the estimator is already fitted, so no
# separate selector.fit(...) call is needed here (older scikit-learn releases
# raise NotFittedError when fit is called on a prefit selector).
selector = SelectFromModel(selector_clf, prefit=True)

In [9]:
# Feature selection followed by a class-weighted logistic regression.
# Every Pipeline step must be a (name, estimator) pair, so the "selector" step
# carries the SelectFromModel instance built above.
pipeline = Pipeline( [("selector", selector),
                      ("clf", LogisticRegression(C=1e-2, tol=1e-6, random_state=1, class_weight="balanced"))] )

In [10]:
# 5-fold stratified cross-validation: refit the pipeline on each training split
# and print the balanced accuracy obtained on the held-out split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_idces, test_idces in cv.split(X2, y):
    pipeline.fit(X2[train_idces, :], y[train_idces])
    fold_score = balanced_acc(pipeline, X2[test_idces, :], y[test_idces])
    print(fold_score)

0.8796376811594202


In [None]:
# Refit the pipeline on all rows of X2 / y (no held-out split).
pipeline.fit(X2, y)

In [None]:
# Persist the fitted vectorizer together with the fitted pipeline as a
# two-element list. `pkl` is imported in the notebook's import cell.
# NOTE: pickle files can execute arbitrary code when loaded; only load this
# file from a trusted location.
with open("domain2_lr.mdl", "wb") as f:
    pkl.dump( [tfidf_vectorizer, pipeline], f )