In [1]:
import argparse
import json
import pickle

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

from feature import Length, AboveMean, Question, HasNum, NumEntities
from preprocessing import RemoveStopwords, Lemmatize, Lowercase

def parse_args():
    """Parse the command-line options of the baseline script.

    Returns:
        argparse.Namespace with ``input`` (required, path to the JSONL input)
        and ``output`` (optional, path for the JSONL predictions; ``None``
        when the flag is omitted).
    """
    cli = argparse.ArgumentParser(
        description='This is a baseline for task 1 that predicts that each clickbait post warrants a passage spoiler.'
    )

    # (flag, help text, required?) for every string-valued option.
    options = (
        ('--input', 'The input data (expected in jsonl format).', True),
        ('--output', 'The classified output in jsonl format.', False),
    )
    for flag, help_text, is_required in options:
        cli.add_argument(flag, type=str, help=help_text, required=is_required)

    return cli.parse_args()

def load_input(df):
    """Read a JSON Lines file into a pandas DataFrame.

    Args:
        df: Path of the ``.jsonl`` file to read. Despite its name this is a
            file path, not a DataFrame; the name is kept for callers that
            pass it by keyword. The file holds one JSON document per line.

    Returns:
        pandas.DataFrame built from the parsed documents, one row per
        non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which differs between systems.
    with open(df, 'r', encoding='utf-8') as inp:
        # Blank lines (typically a trailing newline at the end of the file)
        # carry no record; skipping them avoids a JSONDecodeError.
        records = [json.loads(line) for line in inp if line.strip()]
    return pd.DataFrame(records)

def get_preprocessing_pipeline():
    """Assemble the text-cleaning pipeline that runs before feature extraction.

    Returns:
        An unfitted sklearn ``Pipeline`` that chains, in this order, the
        project transformers ``RemoveStopwords``, ``Lemmatize`` and
        ``Lowercase``.
    """
    steps = [
        ("RemoveStopwords", RemoveStopwords()),
        ("Lemmatize", Lemmatize()),
        ("Lowercase", Lowercase()),
    ]
    return Pipeline(steps=steps)

def get_feature_pipeline(C=1e5, max_iter=1000):
    """Build the feature-extraction + classification pipeline.

    Every feature is computed from the ``"text"`` column of the input frame:
    unigram counts, TF-IDF over uni- and bigrams, and the project transformers
    ``Length``, ``AboveMean``, ``Question``, ``HasNum`` and ``NumEntities``.
    All other columns are dropped. The combined features feed a
    ``LogisticRegression`` classifier.

    Args:
        C: Inverse regularization strength passed to ``LogisticRegression``.
            Defaults to the previously hard-coded ``1e5``.
        max_iter: Iteration cap for the solver. The classifier used to be
            built with scikit-learn's default cap and the recorded training
            run stopped at that limit with a convergence warning, so the
            default here is raised to 1000.

    Returns:
        An unfitted sklearn ``Pipeline`` with the steps ``"preprocessing"``
        (the ``ColumnTransformer``) and ``"classify"`` (the classifier).
    """
    # The single-member FeatureUnion wrappers are kept so that nested parameter
    # names (e.g. for set_params or a grid search) stay exactly as before.
    unigrams_feature = FeatureUnion(transformer_list=[("unigrams", CountVectorizer())])
    tfidf_feature = FeatureUnion(transformer_list=[("tf-idf", TfidfVectorizer(min_df=10, ngram_range=(1, 2)))])

    # A scalar column name ("text" rather than ["text"]) makes the
    # ColumnTransformer pass a 1-D column to each transformer, which is what
    # the text vectorizers require.
    column_trans = ColumnTransformer(
        [
            ("unigrams", unigrams_feature, "text"),
            ("tfidf", tfidf_feature, "text"),

            ("Length", Length(), "text"),
            ("AboveMean", AboveMean(), "text"),
            ("Question", Question(), "text"),
            ("HasNum", HasNum(), "text"),
            ("NumEntities", NumEntities(), "text"),
        ],
        remainder="drop",
        verbose=True,
    )

    pipeline = Pipeline(
        [
            ("preprocessing", column_trans),
            ("classify", LogisticRegression(n_jobs=1, C=C, max_iter=max_iter)),
        ],
        verbose=True
    )
    return pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Load the training split and take the labels from the "tags" column.
input_file = "../Data/webis-clickbait-22/train.jsonl"
X = load_input(input_file)
# explode() emits one row per list element, so y only lines up with X when
# every post carries exactly one tag -- TODO confirm for this dataset.
y = X['tags'].explode()
# Concatenate the two columns row by row, join the pieces with spaces, and
# keep the result as a one-column frame named "text".
X = (
    (X['postText'] + X['targetParagraphs'])
    .apply(" ".join)
    .to_frame(name="text")
)

In [10]:
# Fit the text-cleaning pipeline on the training text and replace the raw text
# with its output.
# NOTE: this overwrites X["text"] in place, so the cell is not idempotent --
# re-running it without first re-running the cell that builds X feeds already
# cleaned text through the pipeline again.
preprocessor = get_preprocessing_pipeline()
X["text"] = preprocessor.fit_transform(X["text"])

[nltk_data] Downloading package stopwords to /home/jueri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Build the feature + classifier pipeline and fit it on the cleaned training
# text and its labels. The fit call is the cell's last expression, so the
# fitted pipeline is also the cell's displayed value.
pipeline = get_feature_pipeline()
pipeline.fit(X, y)

[ColumnTransformer] ...... (1 of 7) Processing unigrams, total=   0.8s
[ColumnTransformer] ......... (2 of 7) Processing tfidf, total=   3.6s
[ColumnTransformer] ........ (3 of 7) Processing Length, total=   0.0s
[ColumnTransformer] ..... (4 of 7) Processing AboveMean, total=   0.0s
[ColumnTransformer] ...... (5 of 7) Processing Question, total=   0.0s
[ColumnTransformer] ........ (6 of 7) Processing HasNum, total=   0.0s
[ColumnTransformer] ... (7 of 7) Processing NumEntities, total=  25.3s
[Pipeline] ..... (step 1 of 2) Processing preprocessing, total=  32.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[Pipeline] .......... (step 2 of 2) Processing classify, total=   9.4s


In [15]:
# Load the validation split and take its labels from the "tags" column.
validation_file = "../Data/webis-clickbait-22/validation.jsonl"
evaluate = load_input(validation_file)
# One label row per list element; assumes exactly one tag per post -- TODO confirm.
y_evaluate = evaluate["tags"].explode()

In [16]:
# Build the validation text the same way as the training text: concatenate the
# two columns row by row, join with spaces, keep a one-column "text" frame.
X_evaluate = (
    (evaluate['postText'] + evaluate['targetParagraphs'])
    .apply(" ".join)
    .to_frame(name="text")
)

In [31]:
# Rebind `preprocessor` to the object pickled at ../preprocessor.pkl.
# NOTE(review): no cell visible in this notebook writes that file, so a fresh
# "Restart & Run All" depends on an artifact produced elsewhere -- confirm its
# provenance and that it matches the preprocessing used for training.
# pickle.load can execute arbitrary code; only load files you trust.
with open("../preprocessor.pkl", "rb") as f:
    preprocessor = pickle.load(f)

In [37]:
# Rebind `pipeline` to the object pickled at ../model.pkl.
# NOTE(review): no cell visible in this notebook writes that file, so the
# score reported further down comes from this stored model rather than from a
# pipeline fitted in this session -- confirm where the artifact comes from.
# pickle.load can execute arbitrary code; only load files you trust.
with open("../model.pkl", "rb") as f:
    pipeline = pickle.load(f)

In [34]:
# Clean the validation text with the already-fitted preprocessor (transform
# only, no refit). NOTE: overwrites X_evaluate["text"] in place, so re-running
# this cell alone applies the cleaning a second time.
X_evaluate["text"] = preprocessor.transform(X_evaluate["text"])

In [38]:
# Predict a label for every validation row from its cleaned "text" column.
y_pred = pipeline.predict(X_evaluate)

In [41]:
# Micro-averaged F1 of the predictions against the validation labels.
# `f1_score` is imported in the import cell at the top of the notebook.
# For single-label multiclass predictions micro-averaged F1 is the same number
# as plain accuracy, so read this value as the share of correct predictions.
f1_score(y_evaluate, y_pred, average='micro')

0.46875