In [None]:
import sys, os
# Detect Google Colab by checking whether its package has been loaded into this interpreter.
ON_COLAB = 'google.colab' in sys.modules

if ON_COLAB:
    # Fetch and unpack the database only when it is not present yet.
    # The parentheses matter: `a || b && c` is evaluated as `(a || b) && c` by the
    # shell, which would run gunzip (and fail) even when the file already exists.
    # NOTE(review): the URL is kept as-is; confirm it serves the raw .gz file.
    os.system("test -f heise-articles-2020.db || (wget  https://datanizing.com/heiseacademy/nlp-course/blob/main/99_Common/heise-articles-2020.db.gz && gunzip heise-articles-2020.db.gz)")
    newsticker_db = 'heise-articles-2020.db'
else:
    # Local checkout: use the copy in the shared directory next to this notebook's folder.
    newsticker_db = '../99_Common/heise-articles-2020.db'

In [None]:
import sqlite3 
import pandas as pd

# Open the SQLite database and load every article published before 2021,
# ordered by publication date, indexed by `id`, with `datePublished` parsed as datetime.
sql = sqlite3.connect(newsticker_db)
df = pd.read_sql(
    sql="SELECT * FROM nlp_articles WHERE datePublished<'2021-01-01' ORDER BY datePublished",
    con=sql,
    index_col="id",
    parse_dates=["datePublished"],
)

In [None]:
# The 20 authors with the most articles (counted via non-null `title` values),
# as a one-column frame indexed by author and sorted by descending count.
top_authors = (
    df.groupby("author")[["title"]]
    .count()
    .sort_values("title", ascending=False)
    .head(20)
)

In [None]:
# Smallest article count among the top authors; used as the per-author sample size.
min_articles = int(top_authors["title"].min())

In [None]:
# Balanced data set: draw the same number of articles (reproducibly, seed 42)
# from every top author and stack the samples into one frame.
adf = pd.concat(
    [
        df.loc[df["author"].eq(name)].sample(n=min_articles, random_state=42)
        for name in top_authors.index
    ]
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.de.stop_words import STOP_WORDS as stop_words
# Build TF-IDF features from the `nav` column of the balanced author sample.
# list(...) converts the imported stop-word collection into the list type that
# scikit-learn's parameter validation expects; min_df=2 drops terms that occur
# in fewer than two documents.
tfidf_vectorizer = TfidfVectorizer(stop_words=list(stop_words), min_df=2)
tfidf_vectors = tfidf_vectorizer.fit_transform(adf["nav"])
# Display the resulting matrix summary as the cell output.
tfidf_vectors

In [None]:
from sklearn.svm import SVC
# Train a support vector classifier (default settings) to predict the author
# from the TF-IDF vectors.
svc = SVC()
svc.fit(X=tfidf_vectors, y=adf["author"])

In [None]:
# Predict the author for the very same vectors the model was trained on,
# so the comparison below measures fit on the training data only.
adf["predicted_author"] = svc.predict(X=tfidf_vectors)

In [None]:
# Number of correctly predicted articles, then number of wrongly predicted ones.
print((adf["author"] == adf["predicted_author"]).sum())
print((adf["author"] != adf["predicted_author"]).sum())

In [None]:
from collections import Counter
# Count how often each keyword occurs over all articles. The `keywords` column
# holds ", "-separated keyword strings. Missing values are dropped first so that
# they are not counted as the pseudo-keywords "None"/"nan", and empty entries are
# skipped because an empty keyword would match every article later on.
keywords = Counter(
    keyword
    for keyword_string in df["keywords"].dropna()
    for keyword in str(keyword_string).split(", ")
    if keyword
)
# The 20 most frequent keywords, most common first.
top_keywords = [keyword for keyword, _count in keywords.most_common(20)]

In [None]:
# For each of the top keywords: build a balanced data set of articles with and
# without the keyword, train an SVC on TF-IDF features of the `nav` column, and
# print how many articles of that same data set are classified correctly/wrongly.

# Loop-invariant: string form of the keyword column, computed once for all keywords.
keyword_text = df["keywords"].map(str)

for keyword in top_keywords:
    # Split the articles into those with / without the keyword.
    # regex=False matches the keyword literally; the default would interpret it
    # as a regular expression, so a keyword such as "C++" would raise or mis-match.
    has_keyword = keyword_text.str.contains(keyword, regex=False)
    k_pos = df[has_keyword].copy()
    k_pos["keyword"] = 1
    k_neg = df[~has_keyword].copy()
    k_neg["keyword"] = 0

    # Balance the classes: sample the size of the smaller group from both.
    min_keyword = min(len(k_pos), len(k_neg))
    kdf = pd.concat([k_pos.sample(min_keyword, random_state=42),
                     k_neg.sample(min_keyword, random_state=42)])

    # Vectorize; list(...) gives scikit-learn the list type it expects for stop words.
    tfidf_vectorizer = TfidfVectorizer(stop_words=list(stop_words), min_df=2)
    tfidf_vectors = tfidf_vectorizer.fit_transform(kdf["nav"])

    # Train.
    svc = SVC()
    svc.fit(tfidf_vectors, kdf["keyword"])

    # Predict on the same data the model was trained on.
    kdf["predicted_keyword"] = svc.predict(tfidf_vectors)

    # Report the keyword, the number of correct and the number of wrong predictions.
    print(keyword)
    print(len(kdf[kdf["keyword"] == kdf["predicted_keyword"]]))
    print(len(kdf[kdf["keyword"] != kdf["predicted_keyword"]]))

In [None]:
# Comment count as an integer: missing values become 0 and everything above 500
# is capped at 500.
df["normalizedCommentCount"] = (
    df["commentCount"].fillna(0).map(int).clip(upper=500)
)

In [None]:
# Articles with more than 50 comments are labelled as successful (1) ...
df_success = df.loc[df["normalizedCommentCount"].gt(50)].assign(success=1)

# ... articles with fewer than 10 comments as not successful (0).
# Everything in between is left out of both groups.
df_no_success = df.loc[df["normalizedCommentCount"].lt(10)].assign(success=0)

In [None]:
# Size of the smaller of the two groups; both are sampled down to this size.
min_success = min(map(len, (df_success, df_no_success)))

In [None]:
# Balanced data set: the same number of successful and unsuccessful articles,
# drawn reproducibly (seed 42), successful ones first.
sdf = pd.concat(
    [
        group.sample(n=min_success, random_state=42)
        for group in (df_success, df_no_success)
    ]
)

In [None]:
# Build TF-IDF features from the `nav` column of the balanced success sample.
# list(...) converts the imported stop-word collection into the list type that
# scikit-learn's parameter validation expects; min_df=2 drops terms that occur
# in fewer than two documents.
tfidf_vectorizer = TfidfVectorizer(stop_words=list(stop_words), min_df=2)
tfidf_vectors = tfidf_vectorizer.fit_transform(sdf["nav"])
# Display the resulting matrix summary as the cell output.
tfidf_vectors

In [None]:
# Train a fresh support vector classifier (default settings) on the success labels.
svc = SVC()
svc.fit(X=tfidf_vectors, y=sdf["success"])

In [None]:
# Predict the success label for the same vectors the model was trained on,
# so the comparison below measures fit on the training data only.
sdf["predicted_success"] = svc.predict(X=tfidf_vectors)

In [None]:
# Number of correctly predicted articles, then number of wrongly predicted ones.
print((sdf["success"] == sdf["predicted_success"]).sum())
print((sdf["success"] != sdf["predicted_success"]).sum())