In [1]:
import json
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.base import TransformerMixin
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold

In [2]:
def load_dataset(filepath):
    with open(filepath, "r") as f:
        dataset = [ json.loads(line, parse_int = str) for line in f ]
    return dataset

def get_vectorizer( texts: list[str], *, method="countvectorize", **kwargs ) -> CountVectorizer:
    """From a list of texts, output an appropriate vectorizer either using CountVectorizer or TF-IDF depending on method argument. 

     Args:
         texts (list[str]): list of strings, each item corresponding to a text.
         method (str, optional): Method to select features. Defaults to "count-vectorizer".
         **kwargs: kwarg arguments to pass to Vectorizer classes of sklearn.
    Raises:
        ValueError: If passing an non-specified method of text feature extraction

     Returns:
         pd.DataFrame: dataframe of shape (n_samples, n_features)
    """
    #We want single digits to be tokenized. This regex considers everything as a token except whitespace.
    kwargs['token_pattern'] = r'\S+' 
    if method == "countvectorize":
        vectorizer = CountVectorizer(**kwargs)
    elif method == "tfidf":
        vectorizer = TfidfVectorizer(**kwargs)
    else:
        raise ValueError(f"{method} is not a supported method.")
    #Use texts to initialize vocabulary of vectorizer
    vectorizer.fit(texts)
    return vectorizer

def sentencify(text: list) -> str:
    sentence = " ".join(text) 
    return sentence

def loss(clf, X, y):
    probs = clf.predict_log_proba(X)
    y0 = probs[:,1]
    y1 = probs[:,0]
    loss = -y*y0 - (1-y)*y1
    loss = loss.sum()/y.size
    return loss

def balanced_acc(clf, X, y):
    y_pred = clf.predict(X)
    return balanced_accuracy_score(y, y_pred, adjusted=False)

class DenseTransformer(TransformerMixin):
    def fit(self, X, y=None, **fit_params):
        return self
    def transform(self, X, y=None, **fit_params):
        return X.toarray()

In [3]:
dataset_1 = load_dataset("domain1_train_data.json")
dataset_2 = load_dataset("domain2_train_data.json")
datatexts_1 = [ sentencify(instance['text']) for instance in dataset_1 ]
datatexts_2 = [ sentencify(instance['text']) for instance in dataset_2 ]

In [4]:
tfidf_vectorizer = get_vectorizer( texts = datatexts_1+datatexts_2,
                                    method='tfidf',
                                    use_idf=True,
                                    ngram_range=(1,3),
                                    max_df=0.995, #Ignore vocabulary appearing too frequently, probably words like "is", "are", "and", "this" etc.
                                    min_df=10, #Ignore vocabulary that is too infrequent, as this may lead to low prediction accuracy.
                                    )
print(f"no features: {tfidf_vectorizer.get_feature_names_out().size}")

no features: 110827


In [5]:
labels = {0:"domain1_ai",
          1:"domain1_human",
          2:"domain2_ai",
          3:"domain2_human"}

In [6]:
X = tfidf_vectorizer.transform( datatexts_1 + datatexts_2 ).toarray()
y = np.array([1]*2500 + [0]*2500 + [3]*1500 + [2]*11500)
lengths = np.zeros( (18000, 2) )
for i in range(5000):
    for k in range(2):
        lengths[i,k] = len(dataset_1[i]['text'])**(k+1)
for i in range(13000):
    for k in range(2):
        lengths[5000+i,k] = len(dataset_2[i]['text'])**(k+1)
lengths[:,0] /= lengths.max(axis=0)[0]
lengths[:,1] /= lengths.max(axis=0)[1]
X = np.hstack( (X, lengths) )

In [7]:
# selector_clf = LogisticRegression(C=0.1, random_state=0)
# selector_clf.fit(X, y)

# import pickle as pkl
# with open("temp_selector_clf.mdl", "wb") as f:
#     pkl.dump(selector_clf, f)

import pickle as pkl
with open("temp_selector_clf.mdl", "rb") as f:
    selector_clf = pkl.load(f)

In [8]:
selector = SelectFromModel(selector_clf, prefit=True)
selector.fit(X, y)
selector.get_feature_names_out().size

26799

In [9]:
X = selector.transform(X)

In [10]:
X.shape

(18000, 26799)

In [19]:
clf = LogisticRegression(C=10.0, class_weight="balanced", random_state=0, tol=1e-4, n_jobs=-1)

In [20]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_idces, test_idces in cv.split(X, y):
    X_train, y_train = X[train_idces,:], y[train_idces]
    X_test, y_test = X[test_idces,:], y[test_idces]
    clf.fit(X_train, y_train)
    mnb = balanced_acc( clf, X_test, y_test )
    print(mnb)

0.8632608695652174


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8677028985507246
0.8664420289855073


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8802536231884057


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8722463768115941


In [21]:
clf.fit(X, y)

In [22]:
pipeline = Pipeline([ ("selector", selector), ("clf", clf) ])

In [23]:
import pickle as pkl
with open("domain12_LR.mdl", "wb") as f:
    pkl.dump( [tfidf_vectorizer, pipeline], f )