In [None]:
import sys, os

# True when the notebook runs inside Google Colab (the runtime pre-loads the
# `google.colab` package), False for a local checkout.
ON_COLAB = 'google.colab' in sys.modules

# Shell command that fetches and unpacks the article database on Colab.
# `||` and `&&` have equal precedence in the shell and associate to the left,
# so without the parentheses `gunzip` would also run (and fail) whenever the
# database file is already present. Grouping makes download + unpack a single
# step that is skipped entirely once the file exists.
fetch_db_cmd = (
    "test -f heise-articles-2020.db || "
    "(wget https://datanizing.com/heiseacademy/nlp-course/blob/main/99_Common/heise-articles-2020.db.gz"
    " && gunzip heise-articles-2020.db.gz)"
)

if ON_COLAB:
    os.system(fetch_db_cmd)
    newsticker_db = 'heise-articles-2020.db'
else:
    # Local runs read the database from the shared course directory.
    newsticker_db = '../99_Common/heise-articles-2020.db'

In [None]:
import sqlite3
import pandas as pd

# Open the SQLite database and pull all articles published before 2021,
# oldest first. Rows are indexed by `id` and `datePublished` is parsed
# into datetimes.
sql = sqlite3.connect(newsticker_db)
article_query = (
    "SELECT * FROM nlp_articles WHERE datePublished<'2021-01-01' ORDER BY datePublished"
)
df = pd.read_sql(article_query, sql, index_col="id", parse_dates=["datePublished"])

In [None]:
# Count rows per author, order authors by their number of titles (largest
# first) and keep the 20 leading authors with just the `title` count column.
articles_per_author = df.groupby("author").count()
ranked_authors = articles_per_author.sort_values("title", ascending=False)
top_authors = ranked_authors.head(20)[["title"]]

In [None]:
# Smallest article count among the selected authors; every author is
# down-sampled to this size so all classes end up equally large.
min_articles = min(top_authors["title"])

author_samples = []
for author_name in top_authors.index.values:
    author_articles = df[df["author"] == author_name]
    # Fixed random_state keeps the drawn sample reproducible across runs.
    author_samples.append(author_articles.sample(min_articles, random_state=42))

adf = pd.concat(author_samples)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.de.stop_words import STOP_WORDS as stop_words

# spaCy provides STOP_WORDS as a set, but scikit-learn's parameter validation
# only accepts 'english', a list, or None for `stop_words` and rejects a set
# in recent releases. Converting to a list works with old and new versions.
# min_df=2 drops terms that occur in fewer than two documents.
tfidf_vectorizer = TfidfVectorizer(stop_words=list(stop_words), min_df=2)

# Fit the vocabulary on the `nav` column of the balanced sample and transform
# it into a TF-IDF document-term matrix.
tfidf_vectors = tfidf_vectorizer.fit_transform(adf["nav"])
tfidf_vectors

In [None]:
from sklearn.model_selection import train_test_split

# The author names are the class labels; they also drive the stratification
# so both partitions keep the same author proportions.
author_labels = adf["author"].values

# 75 % of the documents go to training, the rest to testing (seeded split).
split_parts = train_test_split(
    tfidf_vectors,
    author_labels,
    train_size=0.75,
    random_state=42,
    stratify=author_labels,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.svm import SVC

# Support vector classifier with scikit-learn's default settings, trained on
# the TF-IDF vectors of the training partition. `fit` returns the estimator
# itself, so `svc` and `svm` refer to the same fitted model.
svm = SVC()
svc = svm.fit(X=X_train, y=y_train)

In [None]:
# Predict authors for the training and the test partition (in that order)
# with the fitted classifier.
pred_train, pred_test = (svc.predict(features) for features in (X_train, X_test))

In [None]:
import numpy as np

# Element-wise comparison of predictions and true labels on the training
# data; the unique values with their counts show how many were wrong/right.
train_hits = pred_train == y_train
np.unique(train_hits, return_counts=True)

In [None]:
# Same hit/miss tally as for the training data, now on the held-out test set.
test_hits = pred_test == y_test
np.unique(test_hits, return_counts=True)

In [None]:
# One row per test prediction plus a helper column of ones, so that counting
# per author yields how often each author was predicted.
pt = pd.DataFrame({"author": pred_test})
pt["count"] = 1

# Horizontal bar chart of the number of predictions per author.
predictions_per_author = pt.groupby("author").count()
predictions_per_author.plot.barh()

In [None]:
from sklearn.metrics import confusion_matrix

# Use the top-author index as label order so rows/columns of the matrix
# follow the same author ranking as `top_authors`.
matrix_labels = top_authors.index.values
confusion_matrix(y_test, pred_test, labels=matrix_labels)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix on the test set, with rows and columns ordered like the
# top-author index; the same names label both heatmap axes.
heatmap_labels = top_authors.index.values
author_confusion = confusion_matrix(y_test, pred_test, labels=heatmap_labels)

# Open a 10x8 inch figure; the heatmap is drawn into its current axes.
plt.figure(figsize=(10,8))
sns.heatmap(author_confusion, xticklabels=heatmap_labels, yticklabels=heatmap_labels)