In [18]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to local helper modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
# Install the tokenizer dependencies into the environment of the running
# kernel. `%pip` (unlike `!pip`) is guaranteed to target the kernel's own
# interpreter rather than whichever `pip` happens to be first on PATH.
# NOTE(review): versions are unpinned; pin them once known-good versions are settled.
%pip install nltk
%pip install mosestokenizer

You should consider upgrading via the '/lnet/aic/personal/kydliceh/Articles_Analysis/venv/bin/python3 -m pip install --upgrade pip' command.[0m[33m
You should consider upgrading via the '/lnet/aic/personal/kydliceh/Articles_Analysis/venv/bin/python3 -m pip install --upgrade pip' command.[0m[33m
[0m

In [2]:
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from tqdm import tqdm
import pandas as pd
import re
from datetime import datetime
import pytz
import functools
import matplotlib.pyplot as plt
import numpy as np

In [3]:

# Columns (as produced by create_df) that get one histogram / box plot each,
# listed in the order they fill the subplot grid.
used_cols = ["article length", "headline length", "brief length", "num words", "avg word length", "num of non-alphanumeric"]
# NOTE: the names below are swapped relative to how they are used. The plotting
# helpers call plt.subplots(plot_cols, plot_rows, ...), so plot_cols is the
# number of grid ROWS (2) and plot_rows the number of grid COLUMNS (3).
plot_cols = 2
plot_rows = 3
# Passed as figsize= to plt.subplots by every plotting helper.
fig_size = (20, 10)


def create_hist_plots(df: pd.DataFrame):
    """Draw one 150-bin histogram per entry of ``used_cols`` on a shared figure.

    The figure is a ``plot_cols`` x ``plot_rows`` grid of subplots (that is the
    argument order given to ``plt.subplots``) and is titled with ``df.Name``,
    so the frame must carry a ``Name`` attribute.

    Args:
        df: Frame containing every column listed in ``used_cols``.
    """
    figure, grid = plt.subplots(plot_cols, plot_rows, figsize=fig_size)
    figure.suptitle(f"{df.Name} histogram plots")
    # plot_cols is the number of grid rows, plot_rows the number of grid columns.
    for grid_row in range(plot_cols):
        for grid_col in range(plot_rows):
            column_name = used_cols[grid_row * plot_rows + grid_col]
            target_ax = grid[grid_row][grid_col]
            df[column_name].hist(ax=target_ax, bins=150, legend=True)


def create_whisker_plots(df: pd.DataFrame):
    """Draw one box-and-whisker plot per entry of ``used_cols`` on a shared figure.

    The figure is a ``plot_cols`` x ``plot_rows`` grid of subplots (that is the
    argument order given to ``plt.subplots``) and is titled with ``df.Name``,
    so the frame must carry a ``Name`` attribute.

    Args:
        df: Frame containing every column listed in ``used_cols``.
    """
    figure, grid = plt.subplots(plot_cols, plot_rows, figsize=fig_size)
    figure.suptitle(f"{df.Name} whisker plots")
    # Walk the grid cell by cell; the flat position doubles as the index
    # into used_cols.
    for position in range(plot_cols * plot_rows):
        grid_row, grid_col = divmod(position, plot_rows)
        df.boxplot(used_cols[position], ax=grid[grid_row][grid_col])


def create_date_plot(df: pd.DataFrame):
    """Plot per-date aggregates of the frame on a shared figure.

    Rows are grouped by the ``"date"`` column. The first subplot shows the
    number of ``"url"`` values per group as a line plot; the remaining five
    show the per-group mean of a length/word column as area plots. The layout
    indexes axes up to ``[1][2]``, so the grid created from ``plot_cols`` and
    ``plot_rows`` must be at least 2 x 3. The figure title uses ``df.Name``.

    Args:
        df: Frame with ``"date"``, ``"url"`` and the plotted numeric columns.
    """
    figure, grid = plt.subplots(plot_cols, plot_rows, figsize=fig_size)
    figure.suptitle(f"{df.Name} date plots")
    by_date = df.groupby("date")

    # Article count per date.
    by_date["url"].count().plot(ax=grid[0][0], legend=True)

    # Mean of each column per date, paired with the axes it is drawn on.
    mean_panels = (
        ("article length", grid[0][1]),
        ("headline length", grid[0][2]),
        ("brief length", grid[1][0]),
        ("num words", grid[1][1]),
        ("avg word length", grid[1][2]),
    )
    for column_name, target_ax in mean_panels:
        by_date[column_name].mean().plot.area(ax=target_ax, legend=True)



In [4]:

def create_exploratory_plots(df):
    """Produce the three exploratory figures for ``df``.

    Runs, in order, the histogram, whisker and date plotting helpers on the
    same frame.
    """
    plotters = (create_hist_plots, create_whisker_plots, create_date_plot)
    for draw in plotters:
        draw(df)

In [5]:

from preprocess_utils import num_of_lines, load_jsonb
from mosestokenizer import MosesTokenizer
# Module-level tokenizer instances, created once when the cell runs.
# NOTE(review): `moses` is not referenced anywhere else in the visible code.
# NOTE(review): the ISO 639-1 code for Czech is "cs"; confirm that "cz" is
# what MosesTokenizer expects here.
moses = MosesTokenizer("cz")
# Tokenizer used by create_df to split article text into tokens.
toktok = ToktokTokenizer()

def flatten(l):
    """Concatenate the items of every sub-iterable in ``l`` into one list.

    Only one level of nesting is removed; the order of items is preserved.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

@functools.cache
def create_df(file):
    """Build a DataFrame with one row of statistics per article in ``file``.

    Records are read with ``load_jsonb(file)``; a tqdm progress bar is sized
    with ``num_of_lines(file)``. Every record must provide the keys ``url``,
    ``content``, ``brief``, ``headline``, ``publication_date`` and
    ``comments_num``.

    Columns of the result:
        url, article length, headline length, brief length (character counts
        of the stripped strings), num words (number of Toktok tokens of the
        article), avg word length, num of non-alphanumeric, date,
        comments_num.

    ``publication_date`` is parsed with ``datetime.fromisoformat``; a parsed
    value without tzinfo is tagged as UTC (no conversion); ``None`` stays
    ``None``.

    The result is memoised by ``functools.cache`` keyed on ``file``, so
    repeated calls with an equal argument return the very same DataFrame
    object. ``file`` must be hashable and have a ``.name`` attribute, which is
    stored on the frame as ``df.Name``.
    """
    total_records = num_of_lines(file)
    columns = ["url", "article length", "headline length", "brief length", "num words", "avg word length", "num of non-alphanumeric", "date", "comments_num"]
    rows = []
    for record in tqdm(load_jsonb(file), total=total_records):
        url = record["url"]
        article_text = record["content"].strip()
        brief_text = record["brief"].strip()
        headline_text = record["headline"].strip()

        # Toktok is used because it is fast compared with the other tokenizers.
        tokens = toktok.tokenize(article_text)
        mean_token_length = get_average_word_length(tokens)
        # NOTE(review): the token list (not the raw text) is passed, so this
        # counts tokens rather than characters -- confirm that is intended.
        non_alnum_count = count_non_alpha(tokens)

        raw_date = record["publication_date"]
        published = None if raw_date is None else datetime.fromisoformat(raw_date)
        # Naive timestamps are labelled as UTC so all dates are tz-aware.
        if published is not None and published.tzinfo is None:
            published = published.replace(tzinfo=pytz.UTC)

        rows.append([
            url,
            len(article_text),
            len(headline_text),
            len(brief_text),
            len(tokens),
            mean_token_length,
            non_alnum_count,
            published,
            record["comments_num"],
        ])

    frame = pd.DataFrame(rows, columns=columns)
    # The plotting helpers read this attribute for their figure titles.
    frame.Name = file.name
    return frame


def get_average_word_length(tokenized_article):
    """Return the mean length of the tokens in ``tokenized_article``.

    Args:
        tokenized_article: Sequence of tokens (anything supporting ``len``).

    Returns:
        The arithmetic mean of the token lengths as a float, or ``0.0`` when
        the sequence is empty (e.g. an article with no content), instead of
        raising ``ZeroDivisionError``.
    """
    token_count = len(tokenized_article)
    if token_count == 0:
        # An empty article has no words; avoid dividing by zero.
        return 0.0
    return sum(len(token) for token in tokenized_article) / token_count


def count_non_alpha(article):
    """Count the elements of ``article`` that are neither alphanumeric nor whitespace.

    ``article`` is iterated element by element and each element must provide
    ``isalnum()`` and ``isspace()``. For a string that means characters; for a
    list of tokens it means whole tokens.
    """
    # Open question kept from the original: should newlines be counted too?
    total = 0
    for piece in article:
        if not (piece.isalnum() or piece.isspace()):
            total += 1
    return total



    

    



In [6]:

from ipywidgets import interact, interactive, fixed, interact_manual
from IPython.display import display

In [7]:

from pathlib import Path
from dataclasses import dataclass

# Folder scanned for input files; relative, so it is resolved against the
# current working directory.
author_folder = Path("article_json_author")
# Choices for the file dropdown: None first (interact_main returns immediately
# for None), then every regular file directly inside author_folder.
# NOTE: Path.iterdir() yields entries in arbitrary order, so index-based access
# such as possible_files[1] is not guaranteed to select the same file everywhere.
possible_files = [None] + [x for x  in author_folder.iterdir() if x.is_file()]
# Modes handled by interact_main.
modes = ["explore", "best_authors"]

class CachedData:
    def __init__(self, file: Path | None):
        self.file = file
        self.df = None
        if file != None:
            self.df = create_df(file)

    def update(self, file: Path):
        if file != self.file and file != None:
            self.df = create_df(file)

# Shared cache instance used by interact_main; starts with no file loaded.
cached_df = CachedData(None)

def best_x_authors(file, num):
    """Print the total author count and the ``num`` highest-valued authors.

    ``get_unique_authors(file)`` must return a mapping; its items are sorted
    by value in descending order and printed as ``(key, value)`` tuples.
    NOTE(review): ``get_unique_authors`` is not defined or imported in the
    visible cells -- confirm where it comes from.

    Args:
        file: Passed through unchanged to ``get_unique_authors``.
        num: Maximum number of entries to print. If fewer authors exist, all
            of them are printed; a non-positive value prints none.
    """
    auths = get_unique_authors(file)
    sorted_auths = sorted(auths.items(), key=lambda x: x[1], reverse=True)
    print(f"Total authors: {len(sorted_auths)}")
    print(f"Top {num} authors: ")
    # Slicing (instead of indexing range(num)) cannot run past the end of the
    # list; max() keeps a negative num from turning into a from-the-end slice.
    for entry in sorted_auths[:max(num, 0)]:
        print(entry)


def interact_main(mode, file):
    """Widget callback: run the action selected by ``mode`` on ``file``.

    Does nothing when ``file`` is ``None``. ``"best_authors"`` prints the top
    100 authors of the file; ``"explore"`` loads the file through the shared
    ``cached_df`` and draws the exploratory plots. Any other mode is ignored.
    """
    if file is None:
        return

    if mode == "best_authors":
        best_x_authors(file, 100)
    elif mode == "explore":
        cached_df.update(file)
        create_exploratory_plots(cached_df.df)


# Build the widget UI for interact_main: the two lists become dropdowns for
# `mode` and `file` (see the rendered output below), then show it in the cell.
interactive_plot = interactive(interact_main, mode=modes, file=possible_files)
display(interactive_plot)

interactive(children=(Dropdown(description='mode', options=('explore', 'best_authors'), value='explore'), Drop…

In [75]:
from preprocess_utils import show_outlier_by_percentiles, show_df_lines, show_outliers, pick_indexes
# IROZHLAS INSPECTION
# NOTE(review): this assumes possible_files[1] is the iRozhlas file; that
# depends on the directory listing order -- confirm before relying on it.

# Select rows by "headline length" using the 0.99 percentile argument, capped
# at 50 (exact semantics live in preprocess_utils -- presumably the longest
# headlines; verify against the helper).
df = show_outlier_by_percentiles(create_df(possible_files[1]), "headline length", 0.99, limit=50)
# Alternative selection, currently disabled:
#df = show_outliers(create_df(possible_files[1]), "headline length", 60, "lower", limit=20, random=False)
# Accessor handed to show_df_lines: picks the "headline" field of an article record.
mod = lambda art : art["headline"]
show_df_lines(df, possible_files[1] ,mod)

Přísahá, že nerozdělí národ, a hned ve druhé větě to poruší? Toho nebudu svědkem, vysvětlila Němcová odchod
Češi žijí déle a v důchodu tráví přes 24 let. Dnešní dvacátníci mají podle úřadu s prací končit v 67 letech
VIDEO: Jestli si to veřejnost přeje, tak klidně rezignuji, řekl Prymula ke schůzce s Faltýnkem v restauraci
140 kilometrů v hodině, hlavou napřed. Fernstädtovou čeká skeletonový šampionát na nejrychlejší dráze světa
V Británii draží klíč k místnosti, kde zemřel Napoleon. Z paláce na Svaté Heleně ho přivezl voják pro matku
Nedostatek toaletního papíru ukázal krizi centrálního plánování. Nejhůře bylo po požáru papíren v roce 1988
Choupenitchova matka: Synovy zápasy jsme neviděli, běloruská televize je nevysílala. Modlila jsem se za něj
Ahoj, tady Navalnyj, píše kritik Kremlu k první fotce z nemocnice. Dýchá bez přístrojů a občas opustí lůžko
Indonéské Borneo přišlo o celou čtvrtinu deštných pralesů za jediný rok. Kvůli palmovému oleji, říká ekolog
Nizozemská firma chce po Čes

In [None]:
#df = show_outliers(create_df(possible_files[1]), "headline length", 60, "lower", limit=20, random=False)

Unnamed: 0_level_0,url,article length,brief length,num words,avg word length,num of non-alphanumeric,date,comments_num
headline length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,5,5,5,5,5,5,5,0
2,9,9,9,9,9,9,9,0
3,20,20,20,20,20,20,20,0
4,28,28,28,28,28,28,28,0
5,39,39,39,39,39,39,39,0
6,42,42,42,42,42,42,42,0
7,49,49,49,49,49,49,49,0
8,58,58,58,58,58,58,58,0
9,57,57,57,57,57,57,57,0
10,67,67,67,67,67,67,67,0
