In [1]:
import re
import tqdm
import gzip
import pickle


import spacy
import numpy as np
import pandas as pd

from src.cleaner.model import Cleaner
import plotly.express as px

In [2]:
import plotly.io as pio
pio.renderers.default = "notebook_connected"

In [3]:
df = pd.read_parquet("../data/processed/scrapped_cleaned_fake.gzip")

# View cleaned data

In [4]:
df.columns

Index(['_id', 'site', 'domain', 'link', 'title', 'content', 'date',
       'title_skipped_because_min_length', 'title_skipped_alpha_count',
       'title_skipped_because_max_numeric',
       'title_skipped_because_max_non_ascii',
       'title_skipped_because_forbidden_chars',
       'content_skipped_because_min_length', 'content_skipped_alpha_count',
       'content_skipped_because_max_numeric',
       'content_skipped_because_max_non_ascii',
       'content_skipped_because_forbidden_chars'],
      dtype='object')

In [5]:
with open("../data/processed/stats/scrapped_fake.pickle", "rb") as o:
    stats = pickle.load(o)

In [7]:
cleaner = Cleaner(columns=["title", "content"])

cleaner.print_stats(stats=stats)


Cleaning statistics:
title Skipped because line length was below minimum (lines/chars): [ 161 4453]
title Skipped because line had forbidden characters (lines/chars): [0 0]
title Skipped because alpha count was below minimum (lines/chars): [0 0]
title Skipped because digit count was above maximum (lines/chars): [  28 1779]
title Skipped because too many non-ascii characters (lines/chars): [0 0]
content Skipped because line length was below minimum (lines/chars): [ 3416 20137]
content Skipped because line had forbidden characters (lines/chars): [    5 29648]
content Skipped because alpha count was below minimum (lines/chars): [   3 5065]
content Skipped because digit count was above maximum (lines/chars): [   8 9059]
content Skipped because too many non-ascii characters (lines/chars): [0 0]


# Words and sentences frequencies and count

In [8]:
import json
import pydantic


class CeleryConfig(pydantic.BaseModel):
    pass


class SiteConfig(pydantic.BaseModel):
    url: str
    type: str
    routes: list[str]
    confidence: float


class MongoDB(pydantic.BaseModel):
    pass


class Config(pydantic.BaseModel):
    mongodb: MongoDB
    celery: CeleryConfig
    site: list[SiteConfig]


with open("../cluster/configs/celery-beat/config.json", "r") as f:
    sites_cfg = Config.parse_raw(json.dumps(dict(json.load(f))))


In [9]:
sites_cfg.site

[SiteConfig(url='https://www.timesnewroman.ro/', type='news', routes=['politic/', 'sport/', 'life-death/', 'it-stiinta/', 'monden/'], confidence=0.25),
 SiteConfig(url='https://www.biziday.ro/', type='news', routes=[], confidence=0.5),
 SiteConfig(url='https://www.biziday.ro/feed', type='rss', routes=[], confidence=0.5),
 SiteConfig(url='https://www.digi24.ro/', type='news', routes=['stiri/actualitate/politica', 'stiri/actualitate', 'stiri/economie', 'stiri/externe', 'stiri/sport', 'stiri/sci-tech'], confidence=0.59),
 SiteConfig(url='https://www.aktual24.ro/', type='news', routes=['category/news/', 'category/politica/', 'category/analize/', 'category/opinii/', 'category/externe/', 'category/money/', 'category/life/'], confidence=0.44),
 SiteConfig(url='https://www.activenews.ro/', type='news', routes=['stiri/', 'opinii/', 'covid/', 'externe/', 'cultura/', 'economie/', 'atitudine-civica/'], confidence=0.48),
 SiteConfig(url='https://infoalert.ro/', type='news', routes=['category/stiri-

In [10]:
sites_cfg_site_routes = {}
for site in sites_cfg.site:
    sites_cfg_site_routes[site.confidence] = set(site.routes)


In [11]:
def valid_or_fake(domain_name: str):
    for confidence, domains in sites_cfg_site_routes.items():
        if domain_name in domains:
            if confidence >= 0.5:
                return True
            else:
                return False


In [12]:
domain_number = []

for domain in df["domain"].unique():
    domain_number.append(
        {
            "domain": domain,
            "length": len(df[df.domain == domain]),
            "type": "valid" if valid_or_fake(domain) else "fake",
        }
    )


fig = px.histogram(domain_number, x="domain", y="length", color="type", text_auto=".2s")
fig.show()
fig.write_html(f"../visualization/charts_fake/article_count_per_domain.html")
fig.write_image(f"../visualization/charts_fake/article_count_per_domain.png")


In [15]:
domain_number = []
time_ = pd.Timestamp("2010-01-01 00:00:00+0000", tz="UTC")
year = pd.Timedelta(days=366)
while time_ < pd.Timestamp.utcnow():
    df_ = df[(df.date > time_) & (df.date <= time_ + year)]
    for domain in df["domain"].unique():
        domain_number.append(
            {
                "domain": domain,
                "length": len(df_[df_.domain == domain]),
                "type": "valid" if valid_or_fake(domain) else "fake",
                "year": time_,
            }
        )
    time_ += year


fig = px.histogram(domain_number, x="domain", y="length", color="year", text_auto=".2s")
fig.show()
fig.write_html(f"../visualization/charts_fake/article_count_per_domain_per_year.html")
fig.write_image(f"../visualization/charts_fake/article_count_per_domain_per_year.png")


In [16]:
domain_number = []
time_ = pd.Timestamp("2010-01-01 00:00:00+0000", tz="UTC")
year = pd.Timedelta(days=366)
while time_ < pd.Timestamp.utcnow():
    df_ = df[(df.date > time_) & (df.date <= time_ + year)]
    for domain in df["domain"].unique():
        domain_number.append(
            {
                "domain": domain,
                "length": len(df_[df_.domain == domain]),
                "type": "valid" if valid_or_fake(domain) else "fake",
                "year": time_,
            }
        )
    time_ += year


fig = px.histogram(
    domain_number, x="domain", y="length", color="year", text_auto=".2s", barmode="group"
)
fig.show()
fig.write_html(f"../visualization/charts_fake/article_count_per_domain_per_year_groupbar.html")
fig.write_image(f"../visualization/charts_fake/article_count_per_domain_per_year_groupbar.png")

word count by domain

In [17]:
domain_word_count = []
for domain in df["domain"].unique():
    word_count: int = 0
    for news in df[df.domain == domain]["content"]:
        word_count += len(news.split(" "))
    domain_word_count.append(
        {
            "domain": domain,
            "word_count": word_count,
            "type": "valid" if valid_or_fake(domain) else "fake",
        }
    )


fig = px.histogram(domain_word_count, x="domain", y="word_count", color="type", text_auto=".2s")
fig.show()
fig.write_html(f"../visualization/charts_fake/word_count_per_domain.html")
fig.write_image(f"../visualization/charts_fake/word_count_per_domain.png")


In [18]:
domain_number = []
time_ = pd.Timestamp("2010-01-01 00:00:00+0000", tz="UTC")
year = pd.Timedelta(days=366)
while time_ < pd.Timestamp.utcnow():
    df_ = df[(df.date > time_) & (df.date <= time_ + year)]
    for domain in df["domain"].unique():
        word_count: int = 0
        for news in df_[df_.domain == domain]["content"]:
            word_count += len(news.split(" "))
        domain_number.append(
            {
                "domain": domain,
                "word_count": word_count,
                "type": "valid" if valid_or_fake(domain) else "fake",
                "year": time_,
            }
        )
    time_ += year


fig = px.histogram(domain_number, x="domain", y="word_count", color="year", text_auto=".2s")
fig.show()
fig.write_html(f"../visualization/charts_fake/word_count_per_domain_per_year.html")
fig.write_image(f"../visualization/charts_fake/word_count_per_domain_per_year.png")


In [19]:
domain_number = []
time_ = pd.Timestamp("2010-01-01 00:00:00+0000", tz="UTC")
year = pd.Timedelta(days=366)
while time_ < pd.Timestamp.utcnow():
    df_ = df[(df.date > time_) & (df.date <= time_ + year)]
    for domain in df["domain"].unique():
        word_count: int = 0
        for news in df_[df_.domain == domain]["content"]:
            word_count += len(news.split(" "))
        domain_number.append(
            {
                "domain": domain,
                "word_count": word_count,
                "type": "valid" if valid_or_fake(domain) else "fake",
                "year": time_,
            }
        )
    time_ += year


fig = px.histogram(
    domain_number, x="domain", y="word_count", color="year", text_auto=".2s", barmode="group"
)
fig.show()
fig.write_html(f"../visualization/charts_fake/word_count_per_domain_per_year_groupbar.html")
fig.write_image(f"../visualization/charts_fake/word_count_per_domain_per_year_groupbar.png")


sentence count by domain

In [20]:
domain_sentence_count = []
for domain in df["domain"].unique():
    sentence_count: int = 0
    for news in df[df.domain == domain]["content"]:
        sentence_count += len(news.split("."))
    domain_sentence_count.append(
        {
            "domain": domain,
            "sentence_count": sentence_count,
            "type": "valid" if valid_or_fake(domain) else "fake",
        }
    )


fig = px.histogram(
    domain_sentence_count, x="domain", y="sentence_count", color="type", text_auto=".2s"
)
fig.show()
fig.write_html(f"../visualization/charts_fake/sentence_count_per_domain.html")
fig.write_image(f"../visualization/charts_fake/sentence_count_per_domain.png")


In [22]:
domain_sentence_count = []
time_ = pd.Timestamp("2010-01-01 00:00:00+0000", tz="UTC")
year = pd.Timedelta(days=366)
while time_ < pd.Timestamp.utcnow():
    df_ = df[(df.date > time_) & (df.date <= time_ + year)]
    for domain in df["domain"].unique():
        sentence_count: int = 0
        for news in df_[df_.domain == domain]["content"]:
            sentence_count += len(news.split("."))
        domain_sentence_count.append(
            {
                "domain": domain,
                "sentence_count": sentence_count,
                "type": "valid" if valid_or_fake(domain) else "fake",
                "year": time_,
            }
        )
    time_ += year


fig = px.histogram(
    domain_sentence_count,
    x="domain",
    y="sentence_count",
    color="year",
    text_auto=".2s",
    barmode="group",
)
fig.show()
fig.write_html(f"../visualization/charts_fake/sentence_count_per_domain_per_year_groupbar.html")
fig.write_image(f"../visualization/charts_fake/sentence_count_per_domain_per_year_groupbar.png")


In [17]:
# df["date"] = [pd.Timestamp(date, tz="UTC") for date in df["date"]]


In [23]:
domain_sentence_count = []
time_ = pd.Timestamp("2010-01-01 00:00:00+0000", tz="UTC")
year = pd.Timedelta(days=366)
while time_ < pd.Timestamp.utcnow():
    df_ = df[(df.date > time_) & (df.date <= time_ + year)]
    for domain in df["domain"].unique():
        sentence_count: int = 0
        for news in df_[df_.domain == domain]["content"]:
            sentence_count += len(news.split("."))
        domain_sentence_count.append(
            {
                "domain": domain,
                "sentence_count": sentence_count,
                "type": "valid" if valid_or_fake(domain) else "fake",
                "year": time_,
            }
        )
    time_ += year


fig = px.histogram(
    domain_sentence_count, x="domain", y="sentence_count", color="year", text_auto=".2s"
)
fig.show()
fig.write_html(f"../visualization/charts_fake/sentence_count_per_domain_per_year.html")
fig.write_image(f"../visualization/charts_fake/sentence_count_per_domain_per_year.png")


average word length by domain

In [24]:
domain_word_count = []
for domain in df["domain"].unique():
    word_count: list[int] = []
    for news in df[df.domain == domain]["content"]:
        word_count.append(np.mean([len(news) for news in news.split(" ")]))
    domain_word_count.append(
        {
            "domain": domain,
            "word_length_average": np.mean(word_count),
            "type": "valid" if valid_or_fake(domain) else "fake",
        }
    )


fig = px.histogram(
    domain_word_count, x="domain", y="word_length_average", color="type", text_auto=".2s"
)
fig.show()
fig.write_html(f"../visualization/charts_fake/average_word_length_by_domain.html")
fig.write_image(f"../visualization/charts_fake/average_word_length_by_domain.png")


average sentence length by domain

In [25]:
domain_word_count = []
for domain in df["domain"].unique():
    sentence_average: list[int] = []
    for news in df[df.domain == domain]["content"]:
        sentence_average.append(np.mean([len(news) for news in news.split(".")]))
    domain_word_count.append(
        {
            "domain": domain,
            "sentence_length_average": np.mean(sentence_average),
            "type": "valid" if valid_or_fake(domain) else "fake",
        }
    )


fig = px.histogram(
    domain_word_count, x="domain", y="sentence_length_average", color="type", text_auto=".2s"
)
fig.show()
fig.write_html(f"../visualization/charts_fake/average_sentence_length_by_domain.html")
fig.write_image(f"../visualization/charts_fake/average_sentence_length_by_domain.png")


sentiment analysis per domains

```
pip install -U pip setuptools wheel
pip install -U 'spacy[cuda113]'
python -m spacy ro_core_news_sm
```

# NER by domain

In [23]:
import tqdm
import gzip
import pickle

nlp = spacy.load("ro_core_news_sm")

domain_word_count1 = []
for domain in df["domain"].unique():
    ner_object_frequecy: dict[str, int] = {}
    ner_tag_object: dict[str, list[str]] = {}

    print(domain)

    for news in tqdm.tqdm(df[df.domain == domain]["content"]):
        doc = nlp(news)
        for ent in doc.ents:

            if ent.text not in ner_object_frequecy.keys():
                ner_object_frequecy[ent.text] = 1
            else:
                ner_object_frequecy[ent.text] += 1

            if ent.label_ not in ner_tag_object.keys():
                ner_tag_object[ent.label_] = [ent.text]
            else:
                ner_tag_object[ent.label_].append(ent.text)
    domain_word_count1.append(
        {
            "domain": domain,
            "ner_object_frequecy": ner_object_frequecy,
            "ner_tag_object": ner_tag_object,
            "type": "valid" if valid_or_fake(domain) else "fake",
        }
    )
with gzip.open(f"../models/pickles/docs_per_domain_fake.gzip", "wb") as o:
    pickle.dump(domain_word_count1, o)


category/analize/


100%|██████████| 1045/1045 [02:19<00:00,  7.47it/s]


category/news/


100%|██████████| 18684/18684 [23:36<00:00, 13.19it/s] 


category/externe/


100%|██████████| 5820/5820 [05:50<00:00, 16.59it/s]


category/life/


100%|██████████| 1209/1209 [01:07<00:00, 17.99it/s]


category/politica/


100%|██████████| 26386/26386 [33:02<00:00, 13.31it/s] 


category/opinii/


100%|██████████| 207/207 [00:23<00:00,  8.92it/s]


category/money/


100%|██████████| 260/260 [00:19<00:00, 13.10it/s]


stiri/


100%|██████████| 4176/4176 [02:28<00:00, 28.04it/s] 


category/curiozitati-staiti-ca/


100%|██████████| 17/17 [00:01<00:00, 12.04it/s]


category/actualitate/


100%|██████████| 5/5 [00:00<00:00, 27.40it/s]


category/stiri-din-romania/


100%|██████████| 14/14 [00:00<00:00, 18.05it/s]


category/mistere-de-pe-mapamond/


100%|██████████| 5/5 [00:00<00:00, 10.10it/s]


category/sanatate-si-medicina/


100%|██████████| 9/9 [00:00<00:00, 21.45it/s]


category/stiri-de-pe-mapamond/


100%|██████████| 4/4 [00:00<00:00, 15.91it/s]


category/stirile-zilei/


100%|██████████| 13793/13793 [11:20<00:00, 20.26it/s]


category/stiri-sanatate-stiri-fitness-stiri-slabire/


100%|██████████| 2241/2241 [02:20<00:00, 15.98it/s]


category/stiri-internationale/


100%|██████████| 1080/1080 [01:03<00:00, 17.06it/s]


category/timp-liber/


100%|██████████| 704/704 [00:54<00:00, 13.02it/s]


category/stiri-politica/


100%|██████████| 811/811 [00:50<00:00, 15.91it/s]


category/utile/


100%|██████████| 1257/1257 [01:19<00:00, 15.91it/s]





100%|██████████| 10/10 [00:00<00:00, 18.82it/s]


covid/


100%|██████████| 234/234 [00:05<00:00, 42.95it/s]


monden/


100%|██████████| 93/93 [00:04<00:00, 22.93it/s]


Create nlp objects

In [27]:
with gzip.open(f"../models/pickles/docs_per_domain_fake.gzip", "rb") as o:
    domain_word_count = pickle.load(o)

In [66]:
df["domain"].unique()

array(['monden/', 'it-stiinta/', 'sport/', 'politic/', 'life-death/', '',
       'stiri/economie', 'stiri/externe', 'stiri/actualitate',
       'stiri/actualitate/politica', 'stiri/sport', 'stiri/sci-tech'],
      dtype=object)

In [6]:
import tqdm
import gzip
import pickle

nlp = spacy.load("ro_core_news_sm")

for domain in df["domain"].unique():
    ner_object_frequecy: dict[str, int] = {}
    ner_tag_object: dict[str, list[str]] = {}

    print(domain)

    docs = []

    for news in tqdm.tqdm(df[df.domain == domain]["content"]):
        docs.append(nlp(news))

    domain_dict = {
        "domain": domain,
        "docs": docs,
    }

    with gzip.open(f"../models/pickles/docs_per_domain_{domain.replace('/', '_')}.gzip", "wb") as o:
        pickle.dump(domain_dict, o)


KeyboardInterrupt: 

## Plot NER

In [29]:
domains = []
for domain in domain_word_count:
    for tag in domain["ner_tag_object"]:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "tag": tag,
                "tag_length": len(domain["ner_tag_object"][tag]),
            }
        )
        
fig = px.histogram(
    domains, x="tag", y="tag_length", color="domain", text_auto=".2s"
)
fig.show()
fig.write_html(f"../visualization/charts_fake/NER_domain_count_per_tag.html")
fig.write_image(f"../visualization/charts_fake/NER_domain_count_per_tag.png")

In [30]:
domains = []
for domain in domain_word_count:
    for tag in domain["ner_tag_object"]:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "tag": tag,
                "tag_length": len(domain["ner_tag_object"][tag]),
            }
        )
        
fig = px.histogram(
    domains, x="domain", y="tag_length", color="tag", text_auto=".2s"
)
fig.show()
fig.write_html(f"../visualization/charts_fake/NER_tag_count_per_domain.html")
fig.write_image(f"../visualization/charts_fake/NER_tag_count_per_domain.png")

In [31]:
domains = []
for domain in domain_word_count:
    for tag in domain["ner_tag_object"]:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "tag": tag,
                "tag_length": len(domain["ner_tag_object"][tag]),
            }
        )
        
fig = px.histogram(
    domains, x="domain", y="tag_length", color="tag", text_auto=".2s", barmode='group'
)
fig.show()
fig.write_html(f"../visualization/charts_fake/NER_tag_count_per_domain_groupbar.html")
fig.write_image(f"../visualization/charts_fake/NER_tag_count_per_domain_groupbar.png")

In [32]:
domains = []
for domain in domain_word_count:
    sorted_tags = dict(
        sorted(domain["ner_object_frequecy"].items(), key=lambda item: item[1], reverse=True)
    )
    index: int = 10

    for word in sorted_tags:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "word": word,
                "word_freq": domain["ner_object_frequecy"][word],
            }
        )
        index -= 1
        if index == 0:
            break

fig = px.histogram(domains, x="domain", y="word_freq", color="word", text_auto=".2s")
fig.show()
fig.write_html(f"../visualization/charts_fake/NER_word_count_per_domain_groupbar.html")
fig.write_image(f"../visualization/charts_fake/NER_word_count_per_domain_groupbar.png")


In [34]:
domains = []
for domain in domain_word_count:
    sorted_tags = dict(
        sorted(domain["ner_object_frequecy"].items(), key=lambda item: item[1], reverse=True)
    )
    index: int = 10

    for word in sorted_tags:
        tag_: str = ""
        for tag in domain["ner_tag_object"]:
            if word in domain["ner_tag_object"][tag]:
                tag_ = tag
                break

        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "word": word,
                "word_freq": domain["ner_object_frequecy"][word],
                "tag": tag_,
            }
        )
        index -= 1
        if index == 0:
            break


fig = px.histogram(
    domains, x="domain", y="word_freq", color="word", text_auto=".2s", barmode="group"
)
fig.show()
fig.write_html(
    f"../visualization/charts_fake/NER_top_10_words_by_frequency_per_domain_groupbar.html"
)
fig.write_image(
    f"../visualization/charts_fake/NER_top_10_words_by_frequency_per_domain_groupbar.png"
)


In [35]:
domains = []
for domain in domain_word_count:
    sorted_tags = dict(
        sorted(domain["ner_object_frequecy"].items(), key=lambda item: item[1], reverse=True)
    )
    index: int = 10

    for word in sorted_tags:
        tag_: str = ""
        for tag in domain["ner_tag_object"]:
            if word in domain["ner_tag_object"][tag]:
                tag_ = tag
                break

        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "word": word,
                "word_freq": domain["ner_object_frequecy"][word],
                "tag": tag_,
            }
        )
        index -= 1
        if index == 0:
            break

fig = px.histogram(
    domains, x="domain", y="word_freq", color="tag", text_auto=".2s", barmode="group"
)
fig.show()
fig.write_html(
    f"../visualization/charts_fake/NER_top_10_tags_by_frequency_per_domain_groupbar.html"
)
fig.write_image(
    f"../visualization/charts_fake/NER_top_10_tags_by_frequency_per_domain_groupbar.png"
)


# N-grams

In [None]:
# TODO ?

# Keyword extraction

wait for domain_docs execution

In [4]:
import gzip
import tqdm
import spacy
import pickle
import string

In [None]:
import gc
gc.enable()

In [5]:
nlp = spacy.load("ro_core_news_sm")
pos_tag = ["PROPN", "ADJ", "NOUN"]

for domain in df["domain"].unique():

    print(domain)

    with gzip.open(f"../models/pickles/docs_per_domain_{domain.replace('/', '_')}.gzip", "rb") as o:
        domain_word_count = pickle.load(o)

    ner_object_frequecy: dict[str, int] = {}
    ner_tag_object: dict[str, list[str]] = {}

    for doc in tqdm.tqdm(domain_word_count["docs"]):
        for doc, ent in zip(doc, doc.ents):

            if doc.text in nlp.Defaults.stop_words or doc.text in string.punctuation:
                continue

            if doc.pos_ in pos_tag:
                if ent.text not in ner_object_frequecy.keys():
                    ner_object_frequecy[ent.text] = 1
                else:
                    ner_object_frequecy[ent.text] += 1

                if ent.label_ not in ner_tag_object.keys():
                    ner_tag_object[ent.label_] = [ent.text]
                else:
                    ner_tag_object[ent.label_].append(ent.text)

    sorted_tags = dict(sorted(ner_object_frequecy.items(), key=lambda item: item[1], reverse=True))
    index: int = 10

    del domain_word_count
    gc.collect()

    domains = []
    for word in sorted_tags:
        tag_: str = ""
        for tag in ner_tag_object:
            if word in ner_tag_object[tag]:
                tag_ = tag
                break

        domains.append(
            {
                "domain": domain,
                "type": "valid" if ("stiri" in domain) or ("" == domain) else "fake",
                "word": word,
                "word_freq": ner_object_frequecy[word],
                "tag": tag_,
            }
        )
        index -= 1
        if index == 0:
            break

    print(f"KeyWord_{domain.replace('/', '_')}_NER_tag")

    fig = px.histogram(
        domains,
        x="word",
        y="word_freq",
        color="tag",
        text_auto=".2s",
        barmode="group",
        title=f"KeyWord_{domain.replace('/', '_')}_NER_tag",
    )
    fig.show()
    fig.write_html(f"../visualization/charts_fake/KeyWord_{domain.replace('/', '_')}_NER_tag.html")
    fig.write_image(f"../visualization/charts_fake/KeyWord_{domain.replace('/', '_')}_NER_tag.png")


category/analize/


100%|██████████| 1045/1045 [00:00<00:00, 1641.07it/s]


KeyWord_category_analize__NER_tag


category/news/


100%|██████████| 18684/18684 [00:40<00:00, 465.90it/s] 


KeyWord_category_news__NER_tag


category/externe/


100%|██████████| 5820/5820 [00:02<00:00, 2596.84it/s]


KeyWord_category_externe__NER_tag


category/life/


100%|██████████| 1209/1209 [00:00<00:00, 3896.04it/s]


KeyWord_category_life__NER_tag


category/politica/


# TF-IDF

In [36]:
import tqdm
import sklearn.feature_extraction.text as skltext

In [23]:
tfidf_domain = []
for domain in tqdm.tqdm(df["domain"].unique()):
    
    vectorizer = skltext.TfidfVectorizer()
    tfidf = vectorizer.fit_transform(df[df.domain == domain]["content"])

    tfidf_domain.append(
        {
            "domain": domain,
            "tfidf": tfidf,
            "type": "valid" if ("stiri" in domain) or ("" == domain) else "fake",
            "vectorizer": vectorizer
        }
    )

100%|██████████| 23/23 [00:30<00:00,  1.31s/it]


In [25]:
# import gzip
# import pickle
# with gzip.open("../models/pickles/tfidf_per_domain.gzip_fake", "wb") as o:
#     pickle.dump(tfidf_domain, o)

In [38]:
import gzip
import pickle
with gzip.open("../models/pickles/tfidf_per_domain.gzip_fake", "rb") as o:
    tfidf_domain = pickle.load(o)

In [39]:
df_total: pd.DataFrame = pd.DataFrame(columns=["TF-IDF", "features", "domain", "type"])

for domain in tfidf_domain:
    tfidf_list = domain["tfidf"][0].T.todense()
    df = pd.DataFrame(
        list(
            zip(
                tfidf_list,
                domain["vectorizer"].get_feature_names(),
                [domain["domain"] for _ in range(len(tfidf_list))],
                [domain["type"] for _ in range(len(tfidf_list))],
            )
        ),
        columns=["TF-IDF", "features", "domain", "type"],
    )
    df = df.sort_values("TF-IDF", ascending=False)
    df_ = df.head(25)
    df_["TF-IDF"] = [elem.item(0) for elem in df_["TF-IDF"]]

    df_total = pd.concat([df_total, df_])



Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [40]:
for domain in df_total["domain"].unique():
    print(f"TFIDF_{domain.replace('/', '_')}_NER_tag")
    fig = px.histogram(
        df_total[df_total.domain == domain],
        x="domain",
        y="TF-IDF",
        color="features",
        text_auto=".2s",
        barmode="group",
        title=f"{domain}",
    )
    fig.show()
    fig.write_html(f"../visualization/charts_fake/TFIDF_{domain.replace('/', '_')}_NER_tag.html")
    fig.write_image(f"../visualization/charts_fake/TFIDF_{domain.replace('/', '_')}_NER_tag.png")


TFIDF_category_analize__NER_tag


TFIDF_category_news__NER_tag


TFIDF_category_externe__NER_tag


TFIDF_category_life__NER_tag


TFIDF_category_politica__NER_tag


TFIDF_category_opinii__NER_tag


TFIDF_category_money__NER_tag


TFIDF_stiri__NER_tag


TFIDF_category_curiozitati-staiti-ca__NER_tag


TFIDF_category_actualitate__NER_tag


TFIDF_category_stiri-din-romania__NER_tag


TFIDF_category_mistere-de-pe-mapamond__NER_tag


TFIDF_category_sanatate-si-medicina__NER_tag


TFIDF_category_stiri-de-pe-mapamond__NER_tag


TFIDF_category_stirile-zilei__NER_tag


TFIDF_category_stiri-sanatate-stiri-fitness-stiri-slabire__NER_tag


TFIDF_category_stiri-internationale__NER_tag


TFIDF_category_timp-liber__NER_tag


TFIDF_category_stiri-politica__NER_tag


TFIDF_category_utile__NER_tag


TFIDF__NER_tag


TFIDF_covid__NER_tag


TFIDF_monden__NER_tag


# Plot POS

In [41]:
import tqdm

In [44]:
nlp = spacy.load("ro_core_news_sm")

domain_word_count = []
for domain in df["domain"].unique():
    print(domain)
    pos_object_frequecy: dict[str, int] = {}
    pos_tag_object: dict[str, list[str]] = {}

    for news in tqdm.tqdm(df[df.domain == domain]["content"]):
        doc = nlp(news)
        for ent in doc:

            if ent.text not in pos_object_frequecy.keys():
                pos_object_frequecy[ent.text] = 1
            else:
                pos_object_frequecy[ent.text] += 1

            if ent.pos_ not in pos_tag_object.keys():
                pos_tag_object[ent.pos_] = [ent.text]
            else:
                pos_tag_object[ent.pos_].append(ent.text)
    domain_word_count.append(
        {
            "domain": domain,
            "pos_object_frequecy": pos_object_frequecy,
            "pos_tag_object": pos_tag_object,
            "type": "valid" if ("stiri" in domain) or ("" == domain) else "fake",
        }
    )



category/analize/


100%|██████████| 1045/1045 [02:15<00:00,  7.70it/s]


category/news/


100%|██████████| 18684/18684 [25:41<00:00, 12.12it/s] 


category/externe/


100%|██████████| 5820/5820 [05:37<00:00, 17.23it/s]


category/life/


100%|██████████| 1209/1209 [01:04<00:00, 18.89it/s]


category/politica/


100%|██████████| 26386/26386 [37:36<00:00, 11.69it/s] 


category/opinii/


100%|██████████| 207/207 [00:24<00:00,  8.35it/s]


category/money/


100%|██████████| 260/260 [00:21<00:00, 12.33it/s]


stiri/


100%|██████████| 4176/4176 [02:42<00:00, 25.72it/s] 


category/curiozitati-staiti-ca/


100%|██████████| 17/17 [00:01<00:00, 12.10it/s]


category/actualitate/


100%|██████████| 5/5 [00:00<00:00, 27.23it/s]


category/stiri-din-romania/


100%|██████████| 14/14 [00:00<00:00, 18.57it/s]


category/mistere-de-pe-mapamond/


100%|██████████| 5/5 [00:00<00:00, 10.23it/s]


category/sanatate-si-medicina/


100%|██████████| 9/9 [00:00<00:00, 22.68it/s]


category/stiri-de-pe-mapamond/


100%|██████████| 4/4 [00:00<00:00, 15.42it/s]


category/stirile-zilei/


100%|██████████| 13793/13793 [11:40<00:00, 19.70it/s]


category/stiri-sanatate-stiri-fitness-stiri-slabire/


100%|██████████| 2241/2241 [02:18<00:00, 16.21it/s]


category/stiri-internationale/


100%|██████████| 1080/1080 [00:58<00:00, 18.41it/s]


category/timp-liber/


100%|██████████| 704/704 [00:51<00:00, 13.61it/s]


category/stiri-politica/


100%|██████████| 811/811 [00:48<00:00, 16.58it/s]


category/utile/


100%|██████████| 1257/1257 [01:15<00:00, 16.64it/s]





100%|██████████| 10/10 [00:00<00:00, 19.62it/s]


covid/


100%|██████████| 234/234 [00:05<00:00, 44.67it/s]


monden/


100%|██████████| 93/93 [00:03<00:00, 23.64it/s]


In [45]:
import gzip
import pickle
with gzip.open("../models/pickles/pos_per_domain_fake.gzip", "wb") as o:
    pickle.dump(domain_word_count, o)

In [None]:
import gzip
import pickle
with gzip.open("../models/pickles/pos_per_domain_fake.gzip", "rb") as o:
    domain_word_count = pickle.dump(o)

In [46]:
domains = []
for domain in domain_word_count:
    for tag in domain["pos_tag_object"]:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "tag": tag,
                "tag_length": len(domain["pos_tag_object"][tag]),
            }
        )
        
fig = px.histogram(
    domains, x="tag", y="tag_length", color="domain", text_auto=".2s"
)
fig.show()
fig.write_html(f"../visualization/charts_fake/POS_domain_count_per_tag.html")
fig.write_image(f"../visualization/charts_fake/POS_domain_count_per_tag.png")


In [47]:
domains = []
for domain in domain_word_count:
    for tag in domain["pos_tag_object"]:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "tag": tag,
                "tag_length": len(domain["pos_tag_object"][tag]),
            }
        )
        
fig = px.histogram(
    domains, x="domain", y="tag_length", color="tag", text_auto=".2s"
)
fig.show()
fig.write_html(f"../visualization/charts_fake/POS_tag_count_per_domain.html")
fig.write_image(f"../visualization/charts_fake/POS_tag_count_per_domain.png")

In [48]:
domains = []
for domain in domain_word_count:
    for tag in domain["pos_tag_object"]:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "tag": tag,
                "tag_length": len(domain["pos_tag_object"][tag]),
            }
        )
        
fig = px.histogram(
    domains, x="domain", y="tag_length", color="tag", text_auto=".2s", barmode='group'
)
fig.show()
fig.write_html(f"../visualization/charts_fake/POS_tag_count_per_domain_groupbar.html")
fig.write_image(f"../visualization/charts_fake/POS_tag_count_per_domain_groupbar.png")

In [49]:
domains = []
for domain in domain_word_count:
    sorted_tags = dict(sorted(domain["pos_object_frequecy"].items(), key=lambda item: item[1],  reverse=True))
    index: int = 10

    for word in sorted_tags:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "word": word,
                "word_freq": domain["pos_object_frequecy"][word],
            }
        )
        index -= 1
        if index == 0:
            break

fig = px.histogram(
    domains, x="domain", y="word_freq", color="word", text_auto=".2s"
)
fig.show()
fig.write_html(f"../visualization/charts_fake/POS_top_10_words_by_frequency_per_tag_per_domain.html")
fig.write_image(f"../visualization/charts_fake/POS_top_10_words_by_frequency_per_tag_per_domain.png")

In [50]:
domains = []
for domain in domain_word_count:
    sorted_tags = dict(
        sorted(domain["pos_object_frequecy"].items(), key=lambda item: item[1], reverse=True)
    )
    index: int = 10

    for word in sorted_tags:
        tag_: str = ""
        for tag in domain["pos_tag_object"]:
            if word in domain["pos_tag_object"][tag]:
                tag_ = tag
                break

        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "word": word,
                "word_freq": domain["pos_object_frequecy"][word],
                "tag": tag_,
            }
        )
        index -= 1
        if index == 0:
            break


fig = px.histogram(
    domains, x="domain", y="word_freq", color="word", text_auto=".2s", barmode='group'
)
fig.show()
fig.write_html(f"../visualization/charts_fake/POS_top_10_words_by_frequency_per_tag_per_domain_groupbar.html")
fig.write_image(f"../visualization/charts_fake/POS_top_10_words_by_frequency_per_tag_per_domain_groupbar.png")

In [51]:
domains = []
for domain in domain_word_count:
    sorted_tags = dict(
        sorted(domain["pos_object_frequecy"].items(), key=lambda item: item[1], reverse=True)
    )
    index: int = 10

    for word in sorted_tags:
        tag_: str = ""
        for tag in domain["pos_tag_object"]:
            if word in domain["pos_tag_object"][tag]:
                tag_ = tag
                break

        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "word": word,
                "word_freq": domain["pos_object_frequecy"][word],
                "tag": tag_,
            }
        )
        index -= 1
        if index == 0:
            break

fig = px.histogram(domains, x="domain", y="word_freq", color="tag", text_auto=".2s", barmode='group')
fig.show()
fig.write_html(f"../visualization/charts_fake/POS_top_10_words_by_frequency_per_domain_per_tag_groupbar.html")
fig.write_image(f"../visualization/charts_fake/POS_top_10_words_by_frequency_per_domain_per_tag_groupbar.png")