In [29]:
import re
import pickle

import spacy
import numpy as np
import pandas as pd

from src.cleaner.model import Cleaner
import plotly.express as px


In [30]:
import plotly.io as pio

pio.renderers.default = "notebook_connected"


In [31]:
df = pd.read_parquet("../data/processed/scrapped_cleaned_valid_and_fake_balanced.gzip")


In [4]:
df["date"] = [pd.Timestamp(x) for x in df["date"]]


In [4]:
import json
import pydantic


class CeleryConfig(pydantic.BaseModel):
    pass


class SiteConfig(pydantic.BaseModel):
    url: str
    type: str
    routes: list[str]
    confidence: float


class MongoDB(pydantic.BaseModel):
    pass


class Config(pydantic.BaseModel):
    mongodb: MongoDB
    celery: CeleryConfig
    site: list[SiteConfig]


with open("../cluster/configs/celery-beat/config.json", "r") as f:
    sites_cfg = Config.parse_raw(json.dumps(dict(json.load(f))))

sites_cfg_site_routes = {}
for site in sites_cfg.site:
    sites_cfg_site_routes[site.confidence] = set(site.routes)


def valid_or_fake(domain_name: str):
    for confidence, domains in sites_cfg_site_routes.items():
        if domain_name in domains:
            if confidence >= 0.5:
                return True
            else:
                return False


# Words and sentences frequencies and count

In [16]:
domain_number = []

for domain in df["domain"].unique():
    domain_number.append(
        {
            "domain": domain,
            "length": len(df[df.domain == domain]),
            "type": "valid" if valid_or_fake(domain) else "fake",
        }
    )


fig = px.histogram(domain_number, x="domain", y="length", color="type", text_auto=".2s")
fig.write_html(f"../visualization/charts/article_count_per_domain.html")
fig.write_image(f"../visualization/charts/article_count_per_domain.png")


In [17]:
domain_number = []
time_ = pd.Timestamp("2010-01-01 00:00:00+0000", tz="UTC")
year = pd.Timedelta(days=366)
while time_ < pd.Timestamp.utcnow():
    df_ = df[(df.date > time_) & (df.date <= time_ + year)]
    for domain in df["domain"].unique():
        domain_number.append(
            {
                "domain": domain,
                "length": len(df_[df_.domain == domain]),
                "type": "valid" if valid_or_fake(domain) else "fake",
                "year": time_,
            }
        )
    time_ += year


fig = px.histogram(domain_number, x="domain", y="length", color="year", text_auto=".2s")
fig.write_html(f"../visualization/charts/article_count_per_domain_per_year.html")
fig.write_image(f"../visualization/charts/article_count_per_domain_per_year.png")


In [18]:
domain_number = []
time_ = pd.Timestamp("2010-01-01 00:00:00+0000", tz="UTC")
year = pd.Timedelta(days=366)
while time_ < pd.Timestamp.utcnow():
    df_ = df[(df.date > time_) & (df.date <= time_ + year)]
    for domain in df["domain"].unique():
        domain_number.append(
            {
                "domain": domain,
                "length": len(df_[df_.domain == domain]),
                "type": "valid" if valid_or_fake(domain) else "fake",
                "year": time_,
            }
        )
    time_ += year


fig = px.histogram(
    domain_number, x="domain", y="length", color="year", text_auto=".2s", barmode="group"
)
fig.write_html(f"../visualization/charts/article_count_per_domain_per_year_groupbar.html")
fig.write_image(f"../visualization/charts/article_count_per_domain_per_year_groupbar.png")


word count by domain

In [19]:
domain_word_count = []
for domain in df["domain"].unique():
    word_count: int = 0
    for news in df[df.domain == domain]["content"]:
        word_count += len(news.split(" "))
    domain_word_count.append(
        {
            "domain": domain,
            "word_count": word_count,
            "type": "valid" if valid_or_fake(domain) else "fake",
        }
    )


fig = px.histogram(domain_word_count, x="domain", y="word_count", color="type", text_auto=".2s")
fig.write_html(f"../visualization/charts/word_count_per_domain.html")
fig.write_image(f"../visualization/charts/word_count_per_domain.png")


In [20]:
domain_number = []
time_ = pd.Timestamp("2010-01-01 00:00:00+0000", tz="UTC")
year = pd.Timedelta(days=366)
while time_ < pd.Timestamp.utcnow():
    df_ = df[(df.date > time_) & (df.date <= time_ + year)]
    for domain in df["domain"].unique():
        word_count: int = 0
        for news in df_[df_.domain == domain]["content"]:
            word_count += len(news.split(" "))
        domain_number.append(
            {
                "domain": domain,
                "word_count": word_count,
                "type": "valid" if valid_or_fake(domain) else "fake",
                "year": time_,
            }
        )
    time_ += year


fig = px.histogram(domain_number, x="domain", y="word_count", color="year", text_auto=".2s")
fig.write_html(f"../visualization/charts/word_count_per_domain_per_year.html")
fig.write_image(f"../visualization/charts/word_count_per_domain_per_year.png")


In [21]:
domain_number = []
time_ = pd.Timestamp("2010-01-01 00:00:00+0000", tz="UTC")
year = pd.Timedelta(days=366)
while time_ < pd.Timestamp.utcnow():
    df_ = df[(df.date > time_) & (df.date <= time_ + year)]
    for domain in df["domain"].unique():
        word_count: int = 0
        for news in df_[df_.domain == domain]["content"]:
            word_count += len(news.split(" "))
        domain_number.append(
            {
                "domain": domain,
                "word_count": word_count,
                "type": "valid" if valid_or_fake(domain) else "fake",
                "year": time_,
            }
        )
    time_ += year


fig = px.histogram(
    domain_number, x="domain", y="word_count", color="year", text_auto=".2s", barmode="group"
)
fig.write_html(f"../visualization/charts/word_count_per_domain_per_year_groupbar.html")
fig.write_image(f"../visualization/charts/word_count_per_domain_per_year_groupbar.png")


sentence count by domain

In [22]:
domain_sentence_count = []
for domain in df["domain"].unique():
    sentence_count: int = 0
    for news in df[df.domain == domain]["content"]:
        sentence_count += len(news.split("."))
    domain_sentence_count.append(
        {
            "domain": domain,
            "sentence_count": sentence_count,
            "type": "valid" if valid_or_fake(domain) else "fake",
        }
    )


fig = px.histogram(
    domain_sentence_count, x="domain", y="sentence_count", color="type", text_auto=".2s"
)
fig.write_html(f"../visualization/charts/sentence_count_per_domain.html")
fig.write_image(f"../visualization/charts/sentence_count_per_domain.png")


In [23]:
domain_sentence_count = []
time_ = pd.Timestamp("2010-01-01 00:00:00+0000", tz="UTC")
year = pd.Timedelta(days=366)
while time_ < pd.Timestamp.utcnow():
    df_ = df[(df.date > time_) & (df.date <= time_ + year)]
    for domain in df["domain"].unique():
        sentence_count: int = 0
        for news in df_[df_.domain == domain]["content"]:
            sentence_count += len(news.split("."))
        domain_sentence_count.append(
            {
                "domain": domain,
                "sentence_count": sentence_count,
                "type": "valid" if valid_or_fake(domain) else "fake",
                "year": time_,
            }
        )
    time_ += year


fig = px.histogram(
    domain_sentence_count,
    x="domain",
    y="sentence_count",
    color="year",
    text_auto=".2s",
    barmode="group",
)
fig.write_html(f"../visualization/charts/sentence_count_per_domain_per_year_groupbar.html")
fig.write_image(f"../visualization/charts/sentence_count_per_domain_per_year_groupbar.png")


In [24]:
domain_sentence_count = []
time_ = pd.Timestamp("2010-01-01 00:00:00+0000", tz="UTC")
year = pd.Timedelta(days=366)
while time_ < pd.Timestamp.utcnow():
    df_ = df[(df.date > time_) & (df.date <= time_ + year)]
    for domain in df["domain"].unique():
        sentence_count: int = 0
        for news in df_[df_.domain == domain]["content"]:
            sentence_count += len(news.split("."))
        domain_sentence_count.append(
            {
                "domain": domain,
                "sentence_count": sentence_count,
                "type": "valid" if valid_or_fake(domain) else "fake",
                "year": time_,
            }
        )
    time_ += year


fig = px.histogram(
    domain_sentence_count, x="domain", y="sentence_count", color="year", text_auto=".2s"
)
fig.write_html(f"../visualization/charts/sentence_count_per_domain_per_year.html")
fig.write_image(f"../visualization/charts/sentence_count_per_domain_per_year.png")


average word length by domain

In [25]:
domain_word_count = []
for domain in df["domain"].unique():
    word_count: list[int] = []
    for news in df[df.domain == domain]["content"]:
        word_count.append(np.mean([len(news) for news in news.split(" ")]))
    domain_word_count.append(
        {
            "domain": domain,
            "word_length_average": np.mean(word_count),
            "type": "valid" if valid_or_fake(domain) else "fake",
        }
    )


fig = px.histogram(
    domain_word_count, x="domain", y="word_length_average", color="type", text_auto=".2s"
)
fig.write_html(f"../visualization/charts/average_word_length_by_domain.html")
fig.write_image(f"../visualization/charts/average_word_length_by_domain.png")


average sentence length by domain

In [26]:
domain_word_count = []
for domain in df["domain"].unique():
    sentence_average: list[int] = []
    for news in df[df.domain == domain]["content"]:
        sentence_average.append(np.mean([len(news) for news in news.split(".")]))
    domain_word_count.append(
        {
            "domain": domain,
            "sentence_length_average": np.mean(sentence_average),
            "type": "valid" if valid_or_fake(domain) else "fake",
        }
    )


fig = px.histogram(
    domain_word_count, x="domain", y="sentence_length_average", color="type", text_auto=".2s"
)
fig.write_html(f"../visualization/charts/average_sentence_length_by_domain.html")
fig.write_image(f"../visualization/charts/average_sentence_length_by_domain.png")


sentiment analysis per domains

```
pip install -U pip setuptools wheel
pip install -U 'spacy[cuda113]'
python -m spacy ro_core_news_sm
```

# NER by domain

In [5]:
df["domain"].unique()

array(['stiri/actualitate', 'stiri/externe', 'stiri/actualitate/politica',
       'stiri/economie', 'stiri/sci-tech', 'stiri/sport',
       'category/politica/', 'monden/', 'category/news/',
       'category/externe/', 'politic/', 'category/opinii/', 'life-death/',
       'category/stiri-politica/', 'it-stiinta/',
       'category/stiri-sanatate-stiri-fitness-stiri-slabire/',
       'category/analize/', 'category/stirile-zilei/',
       'category/stiri-internationale/', 'stiri/', 'category/life/',
       'covid/', 'sport/', '', 'category/stiri-de-pe-mapamond/',
       'category/utile/', 'category/timp-liber/', 'category/money/',
       'category/sanatate-si-medicina/',
       'category/mistere-de-pe-mapamond/', 'category/stiri-din-romania/',
       'category/actualitate/', 'category/curiozitati-staiti-ca/'],
      dtype=object)

In [8]:
import tqdm
import gzip
import pickle

nlp = spacy.load("ro_core_news_sm")

for domain in [
    "stiri/economie",
    "stiri/sci-tech",
    "stiri/sport",
    "category/politica/",
    "monden/",
    "category/news/",
    "category/externe/",
    "politic/",
    "category/opinii/",
    "life-death/",
    "category/stiri-politica/",
    "it-stiinta/",
    "category/stiri-sanatate-stiri-fitness-stiri-slabire/",
    "category/analize/",
    "category/stirile-zilei/",
    "category/stiri-internationale/",
    "stiri/",
    "category/life/",
    "covid/",
    "sport/",
    "",
    "category/stiri-de-pe-mapamond/",
    "category/utile/",
    "category/timp-liber/",
    "category/money/",
    "category/sanatate-si-medicina/",
    "category/mistere-de-pe-mapamond/",
    "category/stiri-din-romania/",
    "category/actualitate/",
    "category/curiozitati-staiti-ca/",
]:  # df["domain"].unique():
    ner_object_frequecy: dict[str, int] = {}
    ner_tag_object: dict[str, list[str]] = {}

    print(domain)

    docs = []

    for news in tqdm.tqdm(df[df.domain == domain]["content"]):
        docs.append(nlp(news))

    domain_dict = {
        "domain": domain,
        "docs": docs,
    }

    with gzip.open(
        f"../models/pickles/docs_per_domain_{domain.replace('/', '_')}_balanced.gzip", "wb"
    ) as o:
        pickle.dump(domain_dict, o)


stiri/economie


100%|██████████| 8388/8388 [13:16<00:00, 10.54it/s]


stiri/sci-tech


100%|██████████| 822/822 [01:15<00:00, 10.83it/s]


stiri/sport


100%|██████████| 1500/1500 [02:12<00:00, 11.33it/s]


category/politica/


100%|██████████| 19032/19032 [24:52<00:00, 12.75it/s] 


monden/


100%|██████████| 2779/2779 [01:51<00:00, 25.02it/s]


category/news/


100%|██████████| 11063/11063 [13:37<00:00, 13.52it/s]


category/externe/


100%|██████████| 2970/2970 [03:02<00:00, 16.23it/s]


politic/


100%|██████████| 1628/1628 [01:03<00:00, 25.57it/s]


category/opinii/


100%|██████████| 126/126 [00:14<00:00,  8.70it/s]


life-death/


100%|██████████| 2227/2227 [01:25<00:00, 26.04it/s]


category/stiri-politica/


100%|██████████| 288/288 [00:19<00:00, 14.87it/s]


it-stiinta/


100%|██████████| 807/807 [00:31<00:00, 25.64it/s]


category/stiri-sanatate-stiri-fitness-stiri-slabire/


100%|██████████| 569/569 [00:37<00:00, 15.14it/s]


category/analize/


100%|██████████| 711/711 [01:21<00:00,  8.75it/s]


category/stirile-zilei/


100%|██████████| 1718/1718 [01:42<00:00, 16.83it/s]


category/stiri-internationale/


100%|██████████| 277/277 [00:16<00:00, 16.84it/s]


stiri/


100%|██████████| 2865/2865 [01:34<00:00, 30.23it/s]


category/life/


100%|██████████| 284/284 [00:15<00:00, 18.47it/s]


covid/


100%|██████████| 188/188 [00:05<00:00, 35.45it/s]


sport/


100%|██████████| 339/339 [00:13<00:00, 25.72it/s]





100%|██████████| 672/672 [00:39<00:00, 16.99it/s]


category/stiri-de-pe-mapamond/


100%|██████████| 4/4 [00:00<00:00, 19.98it/s]


category/utile/


100%|██████████| 337/337 [00:21<00:00, 15.74it/s]


category/timp-liber/


100%|██████████| 154/154 [00:12<00:00, 11.93it/s]


category/money/


100%|██████████| 125/125 [00:09<00:00, 12.85it/s]


category/sanatate-si-medicina/


100%|██████████| 2/2 [00:00<00:00,  7.88it/s]


category/mistere-de-pe-mapamond/


100%|██████████| 2/2 [00:00<00:00,  6.33it/s]


category/stiri-din-romania/


100%|██████████| 4/4 [00:00<00:00, 22.07it/s]


category/actualitate/


100%|██████████| 4/4 [00:00<00:00, 30.30it/s]


category/curiozitati-staiti-ca/


100%|██████████| 4/4 [00:00<00:00,  9.92it/s]


In [9]:
import tqdm
import gzip
import pickle

for domain in df["domain"].unique():
    ner_object_frequecy: dict[str, int] = {}
    ner_tag_object: dict[str, list[str]] = {}

    with gzip.open(f"../models/pickles/docs_per_domain_{domain.replace('/', '_')}_balanced.gzip", "rb") as o:
        domain_dict = pickle.load(o)

    print(domain)

    for doc in tqdm.tqdm(domain_dict["docs"]):
        for ent in doc.ents:

            if ent.text not in ner_object_frequecy.keys():
                ner_object_frequecy[ent.text] = 1
            else:
                ner_object_frequecy[ent.text] += 1

            if ent.label_ not in ner_tag_object.keys():
                ner_tag_object[ent.label_] = [ent.text]
            else:
                ner_tag_object[ent.label_].append(ent.text)

    with gzip.open(f"../models/pickles/ner_per_domain_{domain.replace('/', '_')}_balanced.gzip", "wb") as o:
        pickle.dump(
            {
                "domain": domain,
                "ner_object_frequecy": ner_object_frequecy,
                "ner_tag_object": ner_tag_object,
                "type": "valid" if ("stiri" in domain) or ("" == domain) else "fake",
            },
            o,
        )


stiri/actualitate


100%|██████████| 14727/14727 [00:06<00:00, 2330.84it/s]


stiri/externe


100%|██████████| 8922/8922 [00:03<00:00, 2344.09it/s]


stiri/actualitate/politica


100%|██████████| 13476/13476 [00:05<00:00, 2392.08it/s]


stiri/economie


100%|██████████| 8388/8388 [00:03<00:00, 2286.69it/s]


stiri/sci-tech


100%|██████████| 822/822 [00:00<00:00, 2212.74it/s]


stiri/sport


100%|██████████| 1500/1500 [00:00<00:00, 1586.10it/s]


category/politica/


100%|██████████| 19032/19032 [00:08<00:00, 2310.88it/s]


monden/


100%|██████████| 2779/2779 [00:00<00:00, 4965.02it/s]


category/news/


100%|██████████| 11063/11063 [00:04<00:00, 2453.25it/s]


category/externe/


100%|██████████| 2970/2970 [00:01<00:00, 2628.29it/s]


politic/


100%|██████████| 1628/1628 [00:00<00:00, 4599.02it/s]


category/opinii/


100%|██████████| 126/126 [00:00<00:00, 1315.10it/s]


life-death/


100%|██████████| 2227/2227 [00:00<00:00, 5334.77it/s]


category/stiri-politica/


100%|██████████| 288/288 [00:00<00:00, 1711.16it/s]


it-stiinta/


100%|██████████| 807/807 [00:00<00:00, 3411.86it/s]


category/stiri-sanatate-stiri-fitness-stiri-slabire/


100%|██████████| 569/569 [00:00<00:00, 3182.28it/s]


category/analize/


100%|██████████| 711/711 [00:00<00:00, 1624.33it/s]


category/stirile-zilei/


100%|██████████| 1718/1718 [00:00<00:00, 2884.92it/s]


category/stiri-internationale/


100%|██████████| 277/277 [00:00<00:00, 1962.31it/s]


stiri/


100%|██████████| 2865/2865 [00:00<00:00, 5175.94it/s]


category/life/


100%|██████████| 284/284 [00:00<00:00, 2017.61it/s]


covid/


100%|██████████| 188/188 [00:00<00:00, 2390.29it/s]


sport/


100%|██████████| 339/339 [00:00<00:00, 2805.70it/s]





100%|██████████| 672/672 [00:00<00:00, 1641.30it/s]


category/stiri-de-pe-mapamond/


100%|██████████| 4/4 [00:00<00:00, 82.85it/s]


category/utile/


100%|██████████| 337/337 [00:00<00:00, 2970.52it/s]


category/timp-liber/


100%|██████████| 154/154 [00:00<00:00, 1371.23it/s]


category/money/


100%|██████████| 125/125 [00:00<00:00, 1359.34it/s]


category/sanatate-si-medicina/


100%|██████████| 2/2 [00:00<00:00, 36.06it/s]


category/mistere-de-pe-mapamond/


100%|██████████| 2/2 [00:00<00:00, 45.09it/s]


category/stiri-din-romania/


100%|██████████| 4/4 [00:00<00:00, 93.29it/s]


category/actualitate/


100%|██████████| 4/4 [00:00<00:00, 87.24it/s]


category/curiozitati-staiti-ca/


100%|██████████| 4/4 [00:00<00:00, 80.05it/s]


## Plot NER

In [10]:
import gzip
import pickle

domain_word_count = []
for domain in df["domain"].unique():
    with gzip.open(f"../models/pickles/ner_per_domain_{domain.replace('/', '_')}_balanced.gzip", "rb") as o:
        domain_dict = pickle.load(o)

    domain_word_count.append(domain_dict)


In [11]:
domains = []
for domain in domain_word_count:
    for tag in domain["ner_tag_object"]:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "tag": tag,
                "tag_length": len(domain["ner_tag_object"][tag]),
            }
        )

fig = px.histogram(domains, x="tag", y="tag_length", color="domain", text_auto=".2s")
fig.write_html(f"../visualization/charts/NER_domain_count_per_tag.html")
fig.write_image(f"../visualization/charts/NER_domain_count_per_tag.png")


In [12]:
domains = []
for domain in domain_word_count:
    for tag in domain["ner_tag_object"]:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "tag": tag,
                "tag_length": len(domain["ner_tag_object"][tag]),
            }
        )

fig = px.histogram(domains, x="domain", y="tag_length", color="tag", text_auto=".2s")
fig.write_html(f"../visualization/charts/NER_tag_count_per_domain.html")
fig.write_image(f"../visualization/charts/NER_tag_count_per_domain.png")


In [13]:
domains = []
for domain in domain_word_count:
    for tag in domain["ner_tag_object"]:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "tag": tag,
                "tag_length": len(domain["ner_tag_object"][tag]),
            }
        )

fig = px.histogram(
    domains, x="domain", y="tag_length", color="tag", text_auto=".2s", barmode="group"
)
fig.write_html(f"../visualization/charts/NER_tag_count_per_domain_groupbar.html")
fig.write_image(f"../visualization/charts/NER_tag_count_per_domain_groupbar.png")


In [14]:
domains = []
for domain in domain_word_count:
    sorted_tags = dict(
        sorted(domain["ner_object_frequecy"].items(), key=lambda item: item[1], reverse=True)
    )
    index: int = 10

    for word in sorted_tags:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "word": word,
                "word_freq": domain["ner_object_frequecy"][word],
            }
        )
        index -= 1
        if index == 0:
            break

fig = px.histogram(domains, x="domain", y="word_freq", color="word", text_auto=".2s")
fig.write_html(f"../visualization/charts/NER_word_count_per_domain_groupbar.html")
fig.write_image(f"../visualization/charts/NER_word_count_per_domain_groupbar.png")


In [15]:
domains = []
for domain in domain_word_count:
    sorted_tags = dict(
        sorted(domain["ner_object_frequecy"].items(), key=lambda item: item[1], reverse=True)
    )
    index: int = 10

    for word in sorted_tags:
        tag_: str = ""
        for tag in domain["ner_tag_object"]:
            if word in domain["ner_tag_object"][tag]:
                tag_ = tag
                break

        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "word": word,
                "word_freq": domain["ner_object_frequecy"][word],
                "tag": tag_,
            }
        )
        index -= 1
        if index == 0:
            break


fig = px.histogram(
    domains, x="domain", y="word_freq", color="word", text_auto=".2s", barmode="group"
)
fig.write_html(f"../visualization/charts/NER_top_10_words_by_frequency_per_domain_groupbar.html")
fig.write_image(f"../visualization/charts/NER_top_10_words_by_frequency_per_domain_groupbar.png")


In [16]:
domains = []
for domain in domain_word_count:
    sorted_tags = dict(
        sorted(domain["ner_object_frequecy"].items(), key=lambda item: item[1], reverse=True)
    )
    index: int = 10

    for word in sorted_tags:
        tag_: str = ""
        for tag in domain["ner_tag_object"]:
            if word in domain["ner_tag_object"][tag]:
                tag_ = tag
                break

        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "word": word,
                "word_freq": domain["ner_object_frequecy"][word],
                "tag": tag_,
            }
        )
        index -= 1
        if index == 0:
            break

fig = px.histogram(
    domains, x="domain", y="word_freq", color="tag", text_auto=".2s", barmode="group"
)

fig.write_html(f"../visualization/charts/NER_top_10_tags_by_frequency_per_domain_groupbar.html")
fig.write_image(f"../visualization/charts/NER_top_10_tags_by_frequency_per_domain_groupbar.png")


# N-grams

In [None]:
# TODO ?


# Keyword extraction

wait for domain_docs execution

In [17]:
import gzip
import tqdm
import spacy
import pickle
import string
import gc


In [18]:
import json
import pydantic


class CeleryConfig(pydantic.BaseModel):
    pass


class SiteConfig(pydantic.BaseModel):
    url: str
    type: str
    routes: list[str]
    confidence: float


class MongoDB(pydantic.BaseModel):
    pass


class Config(pydantic.BaseModel):
    mongodb: MongoDB
    celery: CeleryConfig
    site: list[SiteConfig]


with open("../cluster/configs/celery-beat/config.json", "r") as f:
    sites_cfg = Config.parse_raw(json.dumps(dict(json.load(f))))

sites_cfg_site_routes = {}
for site in sites_cfg.site:
    sites_cfg_site_routes[site.confidence] = set(site.routes)


def valid_or_fake(domain_name: str):
    for confidence, domains in sites_cfg_site_routes.items():
        if domain_name in domains:
            if confidence >= 0.5:
                return True
            else:
                return False


In [20]:
nlp = spacy.load("ro_core_news_sm")
pos_tag = ["PROPN", "ADJ", "NOUN"]

domain_word_count1 = []
for domain in df["domain"].unique():

    print(domain)

    with gzip.open(f"../models/pickles/docs_per_domain_{domain.replace('/', '_')}_balanced.gzip", "rb") as o:
        domain_word_count = pickle.load(o)

    ner_object_frequecy: dict[str, int] = {}
    ner_tag_object: dict[str, list[str]] = {}

    for doc in tqdm.tqdm(domain_word_count["docs"]):
        for doc, ent in zip(doc, doc.ents):

            if doc.text in nlp.Defaults.stop_words or doc.text in string.punctuation:
                continue

            if doc.pos_ in pos_tag:
                if ent.text not in ner_object_frequecy.keys():
                    ner_object_frequecy[ent.text] = 1
                else:
                    ner_object_frequecy[ent.text] += 1

                if ent.label_ not in ner_tag_object.keys():
                    ner_tag_object[ent.label_] = [ent.text]
                else:
                    ner_tag_object[ent.label_].append(ent.text)

    sorted_tags = dict(sorted(ner_object_frequecy.items(), key=lambda item: item[1], reverse=True))
    index: int = 10

    del domain_word_count

    gc.collect()

    domains = []
    for word in sorted_tags:
        tag_: str = ""
        for tag in ner_tag_object:
            if word in ner_tag_object[tag]:
                tag_ = tag
                break

        domains.append(
            {
                "domain": domain,
                "type": "valid" if valid_or_fake(domain) else "fake",
                "word": word,
                "word_freq": ner_object_frequecy[word],
                "tag": tag_,
            }
        )
        index -= 1
        if index == 0:
            break

    fig = px.histogram(
        domains, x="word", y="word_freq", color="tag", text_auto=".2s", barmode="group"
    )
    fig.write_html(f"../visualization/charts/KeyWord_{domain.replace('/', '_')}_NER_tag.html")
    fig.write_image(f"../visualization/charts/KeyWord_{domain.replace('/', '_')}_NER_tag.png")

    del fig
    gc.collect()


stiri/actualitate


100%|██████████| 14727/14727 [00:03<00:00, 3827.54it/s]


stiri/externe


100%|██████████| 8922/8922 [00:02<00:00, 3670.64it/s]


stiri/actualitate/politica


100%|██████████| 13476/13476 [00:03<00:00, 3896.38it/s]


stiri/economie


100%|██████████| 8388/8388 [00:02<00:00, 3323.20it/s]


stiri/sci-tech


100%|██████████| 822/822 [00:00<00:00, 3247.30it/s]


stiri/sport


100%|██████████| 1500/1500 [00:00<00:00, 2339.59it/s]


category/politica/


100%|██████████| 19032/19032 [00:05<00:00, 3493.73it/s]


monden/


100%|██████████| 2779/2779 [00:00<00:00, 7943.12it/s]


category/news/


100%|██████████| 11063/11063 [00:03<00:00, 3519.68it/s]


category/externe/


100%|██████████| 2970/2970 [00:00<00:00, 3677.03it/s]


politic/


100%|██████████| 1628/1628 [00:00<00:00, 6438.48it/s]


category/opinii/


100%|██████████| 126/126 [00:00<00:00, 1381.88it/s]


life-death/


100%|██████████| 2227/2227 [00:00<00:00, 7247.12it/s]


category/stiri-politica/


100%|██████████| 288/288 [00:00<00:00, 2309.82it/s]


it-stiinta/


100%|██████████| 807/807 [00:00<00:00, 5841.80it/s]


category/stiri-sanatate-stiri-fitness-stiri-slabire/


100%|██████████| 569/569 [00:00<00:00, 3640.49it/s]


category/analize/


100%|██████████| 711/711 [00:00<00:00, 1928.07it/s]


category/stirile-zilei/


100%|██████████| 1718/1718 [00:00<00:00, 3492.38it/s]


category/stiri-internationale/


100%|██████████| 277/277 [00:00<00:00, 2365.83it/s]


stiri/


100%|██████████| 2865/2865 [00:00<00:00, 5654.99it/s]


category/life/


100%|██████████| 284/284 [00:00<00:00, 2760.18it/s]


covid/


100%|██████████| 188/188 [00:00<00:00, 2372.14it/s]


sport/


100%|██████████| 339/339 [00:00<00:00, 2927.51it/s]





100%|██████████| 672/672 [00:00<00:00, 2526.45it/s]


category/stiri-de-pe-mapamond/


100%|██████████| 4/4 [00:00<00:00, 53.23it/s]


category/utile/


100%|██████████| 337/337 [00:00<00:00, 2983.40it/s]


category/timp-liber/


100%|██████████| 154/154 [00:00<00:00, 1396.54it/s]


category/money/


100%|██████████| 125/125 [00:00<00:00, 1221.88it/s]


category/sanatate-si-medicina/


100%|██████████| 2/2 [00:00<00:00, 28.17it/s]


category/mistere-de-pe-mapamond/


100%|██████████| 2/2 [00:00<00:00, 28.75it/s]


category/stiri-din-romania/


100%|██████████| 4/4 [00:00<00:00, 56.24it/s]


category/actualitate/


100%|██████████| 4/4 [00:00<00:00, 61.14it/s]


category/curiozitati-staiti-ca/


100%|██████████| 4/4 [00:00<00:00, 44.99it/s]


# TF-IDF

In [21]:
import tqdm
import sklearn.feature_extraction.text as skltext


In [22]:
tfidf_domain = []
for domain in tqdm.tqdm(df["domain"].unique()):

    vectorizer = skltext.TfidfVectorizer()
    tfidf = vectorizer.fit_transform(df[df.domain == domain]["content"])

    tfidf_domain.append(
        {
            "domain": domain,
            "tfidf": tfidf,
            "type": "valid" if valid_or_fake(domain) else "fake",
            "vectorizer": vectorizer,
        }
    )


100%|██████████| 33/33 [00:35<00:00,  1.08s/it]


In [24]:
# import gzip
# import pickle
# with gzip.open("../models/pickles/tfidf_per_domain_all_balanced.gzip", "wb") as o:
#     pickle.dump(tfidf_domain, o)


In [23]:
import gzip
import pickle

with gzip.open("../models/pickles/tfidf_per_domain_all_balanced.gzip", "rb") as o:
    tfidf_domain = pickle.load(o)


FileNotFoundError: [Errno 2] No such file or directory: '../models/pickles/tfidf_per_domain_all_balanced.gzip'

In [25]:
df_total: pd.DataFrame = pd.DataFrame(columns=["TF-IDF", "features", "domain", "type"])

for domain in tfidf_domain:
    tfidf_list = domain["tfidf"][0].T.todense()
    df = pd.DataFrame(
        list(
            zip(
                tfidf_list,
                domain["vectorizer"].get_feature_names(),
                [domain["domain"] for _ in range(len(tfidf_list))],
                [domain["type"] for _ in range(len(tfidf_list))],
            )
        ),
        columns=["TF-IDF", "features", "domain", "type"],
    )
    df = df.sort_values("TF-IDF", ascending=False)
    df_ = df.head(25)
    df_["TF-IDF"] = [elem.item(0) for elem in df_["TF-IDF"]]

    df_total = pd.concat([df_total, df_])



Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [26]:
for domain in df_total["domain"].unique():
    fig = px.histogram(
        df_total[df_total.domain == domain],
        x="domain",
        y="TF-IDF",
        color="features",
        text_auto=".2s",
        barmode="group",
        title=f"{domain}",
    )
    fig.write_html(f"../visualization/charts/TFIDF_{domain.replace('/', '_')}_NER_tag.html")
    fig.write_image(f"../visualization/charts/TFIDF_{domain.replace('/', '_')}_NER_tag.png")


# Plot POS

In [32]:
import tqdm


In [33]:
nlp = spacy.load("ro_core_news_sm")

domain_word_count = []
for domain in df["domain"].unique():
    print(domain)
    pos_object_frequecy: dict[str, int] = {}
    pos_tag_object: dict[str, list[str]] = {}

    for news in tqdm.tqdm(df[df.domain == domain]["content"]):
        doc = nlp(news)
        for ent in doc:

            if ent.text not in pos_object_frequecy.keys():
                pos_object_frequecy[ent.text] = 1
            else:
                pos_object_frequecy[ent.text] += 1

            if ent.pos_ not in pos_tag_object.keys():
                pos_tag_object[ent.pos_] = [ent.text]
            else:
                pos_tag_object[ent.pos_].append(ent.text)
    domain_word_count.append(
        {
            "domain": domain,
            "pos_object_frequecy": pos_object_frequecy,
            "pos_tag_object": pos_tag_object,
            "type": "valid" if valid_or_fake(domain) else "fake",
        }
    )


stiri/actualitate


100%|██████████| 14727/14727 [19:39<00:00, 12.49it/s]


stiri/externe


100%|██████████| 8922/8922 [10:04<00:00, 14.77it/s]


stiri/actualitate/politica


100%|██████████| 13476/13476 [16:16<00:00, 13.81it/s]


stiri/economie


100%|██████████| 8388/8388 [10:18<00:00, 13.56it/s]


stiri/sci-tech


100%|██████████| 822/822 [01:06<00:00, 12.28it/s]


stiri/sport


100%|██████████| 1500/1500 [01:59<00:00, 12.57it/s]


category/politica/


100%|██████████| 19032/19032 [24:01<00:00, 13.20it/s] 


monden/


100%|██████████| 2779/2779 [01:50<00:00, 25.24it/s]


category/news/


100%|██████████| 11063/11063 [15:19<00:00, 12.03it/s]


category/externe/


100%|██████████| 2970/2970 [03:13<00:00, 15.38it/s]


politic/


100%|██████████| 1628/1628 [01:07<00:00, 24.08it/s]


category/opinii/


100%|██████████| 126/126 [00:14<00:00,  8.53it/s]


life-death/


100%|██████████| 2227/2227 [01:32<00:00, 24.20it/s]


category/stiri-politica/


100%|██████████| 288/288 [00:20<00:00, 14.19it/s]


it-stiinta/


100%|██████████| 807/807 [00:34<00:00, 23.70it/s]


category/stiri-sanatate-stiri-fitness-stiri-slabire/


100%|██████████| 569/569 [00:39<00:00, 14.29it/s]


category/analize/


100%|██████████| 711/711 [01:28<00:00,  8.08it/s]


category/stirile-zilei/


100%|██████████| 1718/1718 [01:47<00:00, 16.04it/s]


category/stiri-internationale/


100%|██████████| 277/277 [00:17<00:00, 15.96it/s]


stiri/


100%|██████████| 2865/2865 [01:40<00:00, 28.37it/s]


category/life/


100%|██████████| 284/284 [00:16<00:00, 17.03it/s]


covid/


100%|██████████| 188/188 [00:05<00:00, 35.24it/s]


sport/


100%|██████████| 339/339 [00:13<00:00, 24.40it/s]





100%|██████████| 672/672 [00:41<00:00, 16.03it/s]


category/stiri-de-pe-mapamond/


100%|██████████| 4/4 [00:00<00:00, 17.26it/s]


category/utile/


100%|██████████| 337/337 [00:22<00:00, 14.87it/s]


category/timp-liber/


100%|██████████| 154/154 [00:13<00:00, 11.28it/s]


category/money/


100%|██████████| 125/125 [00:10<00:00, 12.08it/s]


category/sanatate-si-medicina/


100%|██████████| 2/2 [00:00<00:00,  7.58it/s]


category/mistere-de-pe-mapamond/


100%|██████████| 2/2 [00:00<00:00,  6.15it/s]


category/stiri-din-romania/


100%|██████████| 4/4 [00:00<00:00, 22.44it/s]


category/actualitate/


100%|██████████| 4/4 [00:00<00:00, 24.33it/s]


category/curiozitati-staiti-ca/


100%|██████████| 4/4 [00:00<00:00,  9.65it/s]


In [34]:
import gzip
import pickle

with gzip.open("../models/pickles/pos_per_domain_balanced.gzip", "wb") as o:
    pickle.dump(domain_word_count, o)


In [18]:
# import gzip
# import pickle

# with gzip.open("../models/pickles/pos_per_domain_balanced.gzip", "rb") as o:
#     domain_word_count_fake = pickle.load(o)


In [35]:
domains = []
for domain in domain_word_count:
    for tag in domain["pos_tag_object"]:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "tag": tag,
                "tag_length": len(domain["pos_tag_object"][tag]),
            }
        )

fig = px.histogram(domains, x="tag", y="tag_length", color="domain", text_auto=".2s")
fig.write_html(f"../visualization/charts/POS_domain_count_per_tag.html")
fig.write_image(f"../visualization/charts/POS_domain_count_per_tag.png")


In [36]:
domains = []
for domain in domain_word_count:
    for tag in domain["pos_tag_object"]:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "tag": tag,
                "tag_length": len(domain["pos_tag_object"][tag]),
            }
        )

fig = px.histogram(domains, x="domain", y="tag_length", color="tag", text_auto=".2s")
fig.write_html(f"../visualization/charts/POS_tag_count_per_domain.html")
fig.write_image(f"../visualization/charts/POS_tag_count_per_domain.png")


In [37]:
domains = []
for domain in domain_word_count:
    for tag in domain["pos_tag_object"]:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "tag": tag,
                "tag_length": len(domain["pos_tag_object"][tag]),
            }
        )

fig = px.histogram(
    domains, x="domain", y="tag_length", color="tag", text_auto=".2s", barmode="group"
)
fig.write_html(f"../visualization/charts/POS_tag_count_per_domain_groupbar.html")
fig.write_image(f"../visualization/charts/POS_tag_count_per_domain_groupbar.png")


In [38]:
domains = []
for domain in domain_word_count:
    sorted_tags = dict(
        sorted(domain["pos_object_frequecy"].items(), key=lambda item: item[1], reverse=True)
    )
    index: int = 10

    for word in sorted_tags:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "word": word,
                "word_freq": domain["pos_object_frequecy"][word],
            }
        )
        index -= 1
        if index == 0:
            break

fig = px.histogram(domains, x="domain", y="word_freq", color="word", text_auto=".2s")
fig.write_html(f"../visualization/charts/POS_top_10_words_by_frequency_per_tag_per_domain.html")
fig.write_image(f"../visualization/charts/POS_top_10_words_by_frequency_per_tag_per_domain.png")


In [39]:
domains = []
for domain in domain_word_count:
    sorted_tags = dict(
        sorted(domain["pos_object_frequecy"].items(), key=lambda item: item[1], reverse=True)
    )
    index: int = 10

    for word in sorted_tags:
        tag_: str = ""
        for tag in domain["pos_tag_object"]:
            if word in domain["pos_tag_object"][tag]:
                tag_ = tag
                break

        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "word": word,
                "word_freq": domain["pos_object_frequecy"][word],
                "tag": tag_,
            }
        )
        index -= 1
        if index == 0:
            break


fig = px.histogram(
    domains, x="domain", y="word_freq", color="word", text_auto=".2s", barmode="group"
)
fig.write_html(
    f"../visualization/charts/POS_top_10_words_by_frequency_per_tag_per_domain_groupbar.html"
)
fig.write_image(
    f"../visualization/charts/POS_top_10_words_by_frequency_per_tag_per_domain_groupbar.png"
)


In [40]:
domains = []
for domain in domain_word_count:
    sorted_tags = dict(
        sorted(domain["pos_object_frequecy"].items(), key=lambda item: item[1], reverse=True)
    )
    index: int = 10

    for word in sorted_tags:
        tag_: str = ""
        for tag in domain["pos_tag_object"]:
            if word in domain["pos_tag_object"][tag]:
                tag_ = tag
                break

        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "word": word,
                "word_freq": domain["pos_object_frequecy"][word],
                "tag": tag_,
            }
        )
        index -= 1
        if index == 0:
            break

fig = px.histogram(
    domains, x="domain", y="word_freq", color="tag", text_auto=".2s", barmode="group"
)
fig.write_html(
    f"../visualization/charts/POS_top_10_words_by_frequency_per_domain_per_tag_groupbar.html"
)
fig.write_image(
    f"../visualization/charts/POS_top_10_words_by_frequency_per_domain_per_tag_groupbar.png"
)
