In [None]:
import sys, os
# True when the notebook runs inside Google Colab (its module is pre-loaded there).
ON_COLAB = 'google.colab' in sys.modules

if ON_COLAB:
    # On Colab the database lives in the working directory.
    newsticker_db = 'heise-articles-2020.db'
    # The existence check is done in Python: in a shell, `a || b && c` groups as
    # `(a || b) && c`, so a one-liner `test -f db || wget ... && gunzip ...`
    # would still run gunzip (and fail) when the database is already present.
    if not os.path.isfile(newsticker_db):
        os.system("wget  https://datanizing.com/heiseacademy/nlp-course/blob/main/99_Common/heise-articles-2020.db.gz && gunzip heise-articles-2020.db.gz")
else:
    # Local checkout: use the copy shared between the course notebooks.
    newsticker_db = '../99_Common/heise-articles-2020.db'

In [None]:
import sqlite3 
import pandas as pd

# Open the SQLite database and load every article published before 2021,
# ordered by publication date. Rows are indexed by "id" and the
# datePublished column is parsed into datetimes.
sql = sqlite3.connect(newsticker_db)
df = pd.read_sql(
    "SELECT * FROM nlp_articles WHERE datePublished<'2021-01-01' ORDER BY datePublished",
    con=sql,
    index_col="id",
    parse_dates=["datePublished"],
)

In [None]:
# The 20 authors with the most articles (counted via non-null titles),
# most prolific first; the frame keeps a single "title" column holding the count.
top_authors = (
    df.groupby("author")
      .count()
      .sort_values(by="title", ascending=False)
      .head(20)
      .loc[:, ["title"]]
)

# Balance the classes: draw the same number of articles for every top author,
# namely the article count of the least prolific one among them.
min_articles = int(top_authors["title"].min())
adf = pd.concat(
    [
        df.loc[df["author"] == author].sample(n=min_articles, random_state=42)
        for author in top_authors.index.values
    ]
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.de.stop_words import STOP_WORDS as stop_words
from sklearn.linear_model import SGDClassifier

# Text classification pipeline: TF-IDF features followed by a linear model
# trained with SGD (hinge loss).
# spaCy ships its German stop words as a set, while scikit-learn's parameter
# validation accepts only 'english', a list or None for `stop_words`;
# converting to a (sorted, hence reproducible) list keeps fit() from raising.
text_pipe = Pipeline([("vect", TfidfVectorizer(stop_words=sorted(stop_words))),
                     ("clf", SGDClassifier(loss='hinge', max_iter=1000, tol=1e-3, random_state=42))
                     ])

In [None]:
# Hyperparameter grid for the pipeline. Keys follow scikit-learn's
# "<step>__<parameter>" convention: "vect" is the TF-IDF vectorizer,
# "clf" the classifier.
parameters = dict(
    vect__min_df=(2, 5, 10),
    vect__ngram_range=((1, 1), (1, 2)),
    vect__use_idf=(True, False),
    vect__sublinear_tf=(True, False),
    clf__alpha=(0.0001, 0.0002),
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `parameters` with 3-fold cross-validation, using all
# available cores; training scores are recorded as well. The model is fitted
# on the "nav" column to predict the author.
grid_search = GridSearchCV(
    estimator=text_pipe,
    param_grid=parameters,
    n_jobs=-1,
    cv=3,
    verbose=1,
    return_train_score=True,
)
grid_search.fit(adf["nav"], adf["author"].values)

In [None]:
# Best mean cross-validated score found by the grid search.
print("Bester Score (hier Accuracy): %0.3f" % grid_search.best_score_)

# get_params() returns every parameter of the refitted best pipeline;
# only the ones that were part of the search grid are listed, alphabetically.
print("Bestes Parameter Set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
# All evaluated parameter combinations as a table, best mean test score first.
(
    pd.DataFrame(grid_search.cv_results_)
      .sort_values(by="mean_test_score", ascending=False)
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Same grid search as for the "nav" column (3-fold cross-validation, all cores,
# training scores recorded), this time fitted on the "full_text" column.
grid_search = GridSearchCV(
    estimator=text_pipe,
    param_grid=parameters,
    n_jobs=-1,
    cv=3,
    verbose=1,
    return_train_score=True,
)
grid_search.fit(adf["full_text"], adf["author"].values)

In [None]:
# Best mean cross-validated score found by the grid search.
print("Bester Score (hier Accuracy): %0.3f" % grid_search.best_score_)

# get_params() returns every parameter of the refitted best pipeline;
# only the ones that were part of the search grid are listed, alphabetically.
print("Bestes Parameter Set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
from sklearn.model_selection import train_test_split

# Vectorize the full article text with fixed TF-IDF settings
# (presumably chosen from the grid search results -- confirm against its output).
# spaCy's German stop words are a set, but scikit-learn's parameter validation
# accepts only 'english', a list or None for `stop_words`, so a sorted list is passed.
tfidf_vectorizer = TfidfVectorizer(stop_words=sorted(stop_words), min_df=10, sublinear_tf=True, use_idf=False)
# NOTE(review): the vectorizer is fitted on all of adf before the split below,
# so the vocabulary (min_df filtering) also sees the test articles.
tfidf_vectors = tfidf_vectorizer.fit_transform(adf["full_text"])

# Hold out 25% of the articles for testing, stratified so every author keeps
# the same share in both parts.
(X_train, X_test, y_train, y_test) = train_test_split(tfidf_vectors, adf["author"].values, 
                                                      train_size=0.75, random_state=42,
                                                      stratify=adf["author"].values)

# Linear classifier with hinge loss trained via SGD on the training part only.
clf = SGDClassifier(loss='hinge', max_iter=1000, tol=1e-3, alpha=0.0002, random_state=42)
clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report

# Predict the author of every held-out article and print the per-class
# metrics computed by classification_report against the true labels.
pred_test = clf.predict(X_test)
print(
    classification_report(y_true=y_test, y_pred=pred_test)
)