In [None]:
import sys, os
# True when the notebook runs inside Google Colab (the colab package is preloaded there).
ON_COLAB = 'google.colab' in sys.modules

if ON_COLAB:
    # Fetch and unpack the database only when it is not already present.
    # The parentheses are required: sh parses `a || b && c` as `(a || b) && c`,
    # which would run gunzip (and fail) even when the .db file already exists.
    os.system("test -f heise-articles-2020.db || "
              "(wget  https://datanizing.com/heiseacademy/nlp-course/blob/main/99_Common/heise-articles-2020.db.gz"
              " && gunzip heise-articles-2020.db.gz)")
    newsticker_db = 'heise-articles-2020.db'
else:
    # Local checkout: the database lives in the shared 99_Common directory.
    newsticker_db = '../99_Common/heise-articles-2020.db'

In [None]:
import sqlite3 
import pandas as pd

# Connect to the SQLite file whose path was chosen earlier (newsticker_db).
sql = sqlite3.connect(newsticker_db)

# Read all articles published before 2021-01-01, ordered by publication date.
# The "id" column becomes the index and datePublished is parsed as datetime.
query = "SELECT * FROM nlp_articles WHERE datePublished<'2021-01-01' ORDER BY datePublished"
df = pd.read_sql(query, con=sql, index_col="id", parse_dates=["datePublished"])

In [None]:
# Number of articles (non-null titles) per author, restricted to the 20 authors
# with the most articles, in descending order.
top_authors = (
    df.groupby("author")[["title"]]
      .count()
      .sort_values("title", ascending=False)
      .head(20)
)

In [None]:
# Balance the data set: draw the same number of articles from every top author,
# namely as many as the least prolific of them has written.
min_articles = top_authors["title"].min()

author_samples = []
for author in top_authors.index.values:
    articles_by_author = df[df["author"] == author]
    # Fixed seed so the drawn sample is the same on every run.
    author_samples.append(articles_by_author.sample(min_articles, random_state=42))

adf = pd.concat(author_samples)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.de.stop_words import STOP_WORDS as stop_words
# spaCy ships its German stop words as a set, but scikit-learn's `stop_words`
# parameter only accepts 'english', a list, or None (newer releases reject a set
# with InvalidParameterError) -- hence the conversion to a list.
# min_df=2 drops terms that occur in fewer than two documents.
tfidf_vectorizer = TfidfVectorizer(stop_words=list(stop_words), min_df=2)

# Learn the vocabulary from the "nav" column of the balanced sample and
# transform it into a sparse TF-IDF document-term matrix.
tfidf_vectors = tfidf_vectorizer.fit_transform(adf["nav"])
tfidf_vectors

In [None]:
from sklearn.model_selection import train_test_split
# Author names are the classification targets.
authors = adf["author"].values

# 75 % of the documents go to training; stratifying on the author keeps the
# class proportions equal in both partitions, and the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_vectors,
    authors,
    train_size=0.75,
    random_state=42,
    stratify=authors,
)

In [None]:
from sklearn.svm import SVC
# Train a support-vector classifier with default hyperparameters.
svm = SVC()
svm.fit(X_train, y_train)
# fit() returns the estimator itself, so `svc` names the same fitted model.
svc = svm

In [None]:
import numpy as np
# Predict the author of every test document with the fitted SVC.
pred_test = svc.predict(X_test)

# Tally wrong (False) versus correct (True) predictions.
is_correct = pred_test == y_test
np.unique(is_correct, return_counts=True)

In [None]:
from sklearn.metrics import accuracy_score
# Share of test documents whose author was predicted correctly.
accuracy_score(y_true=y_test, y_pred=pred_test)

In [None]:
from sklearn.metrics import classification_report
# Per-author precision, recall and F1 for the current predictions.
report = classification_report(y_true=y_test, y_pred=pred_test)
print(report)

In [None]:
from sklearn.metrics import precision_score, recall_score
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Classifier families to compare, each trained with default hyperparameters.
candidates = [
    SVC,
    SGDClassifier,
    MultinomialNB,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GradientBoostingClassifier,
]

for clf_class in candidates:
    # MultinomialNB is built without a seed; every other candidate gets random_state=42.
    if clf_class is MultinomialNB:
        clf = clf_class()
    else:
        clf = clf_class(random_state=42)

    clf.fit(X_train, y_train)
    # Note: this rebinds the notebook-level `pred_test` on every iteration.
    pred_test = clf.predict(X_test)

    print(clf_class.__name__)
    # Accuracy, then support-weighted precision and recall, on one line.
    scores = (
        accuracy_score(y_test, pred_test),
        precision_score(y_test, pred_test, average='weighted'),
        recall_score(y_test, pred_test, average='weighted'),
    )
    print(*scores)
    print("\n")

In [None]:
# SGDClassifier shuffles the training data between epochs, so it must be seeded
# for the report to be reproducible. The seed matches the one used for
# SGDClassifier in the comparison loop, so both cells describe the same model.
sgd = SGDClassifier(random_state=42)
sgd.fit(X_train, y_train)

# Detailed per-author metrics for the SGD model on the held-out test set.
pred_test = sgd.predict(X_test)
print(classification_report(y_test, pred_test))