In [5]:
import re
import pickle

import spacy
import numpy as np
import pandas as pd
from src.cleaner.model import Cleaner
import plotly.express as px

In [46]:
import json
with open("../data/processed/scrapped.json", "r") as f:
    data = json.load(f)

In [2]:
import plotly.io as pio
pio.renderers.default = "notebook_connected"

In [6]:
df1 = pd.read_parquet("../data/processed/scrapped_cleaned.gzip")

In [7]:
df2 = pd.read_parquet("../data/processed/scrapped_cleaned_fake.gzip")


In [8]:
df = pd.concat([df1, df2])

226748 - inainte de curatare si balansare

In [11]:
df.site.unique()

array(['https://www.timesnewroman.ro/', 'https://www.biziday.ro/',
       'https://www.biziday.ro/feed', 'https://www.digi24.ro/',
       'https://www.aktual24.ro/', 'https://www.activenews.ro/',
       'https://infoalert.ro/', 'https://caplimpede.ro/'], dtype=object)

In [42]:
len(df[df.site == "https://www.aktual24.ro/"])

53611

In [44]:
np.array([pd.Timestamp(x) for x in df[df.site == "https://www.aktual24.ro/"].date]).max()

Timestamp('2022-03-20 00:00:00+0000', tz='UTC')

In [10]:
len(df)

226748

# License stats

In [7]:
print(df.site.unique())

['https://www.timesnewroman.ro/' 'https://www.biziday.ro/'
 'https://www.biziday.ro/feed' 'https://www.digi24.ro/'
 'https://www.aktual24.ro/' 'https://www.activenews.ro/'
 'https://infoalert.ro/' 'https://caplimpede.ro/']


In [9]:
print(df["site"]["https://www.timesnewroman.ro/"].unique())

KeyError: 'https://www.timesnewroman.ro/'

In [9]:
df = df[["site", "title", "domain", "content", "date"]]

In [11]:
df["date"] = [pd.Timestamp(x) for x in df["date"]]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [12]:
df.to_parquet("../data/processed/scrapped_cleaned_valid_and_fake.gzip")

In [3]:
df = pd.read_parquet("../data/processed/scrapped_cleaned_valid_and_fake.gzip")


# View cleaned data

In [6]:
df.columns

Index(['_id', 'site', 'domain', 'title', 'content', 'date', 'link',
       'title_skipped_because_min_length', 'title_skipped_alpha_count',
       'title_skipped_because_max_numeric',
       'title_skipped_because_max_non_ascii',
       'title_skipped_because_forbidden_chars',
       'content_skipped_because_min_length', 'content_skipped_alpha_count',
       'content_skipped_because_max_numeric',
       'content_skipped_because_max_non_ascii',
       'content_skipped_because_forbidden_chars'],
      dtype='object')

# Words and sentences frequencies and count

In [5]:
domain_number = []

for domain in df["domain"].unique():
    domain_number.append(
        {
            "domain": domain,
            "length": len(df[df.domain == domain]),
            "type": "valid" if ("stiri" in domain) or ("" == domain) else "fake",
        }
    )


fig = px.histogram(domain_number, x="domain", y="length", color="type", text_auto=".2s")
fig.write_html(f"../visualization/charts/article_count_per_domain.html")
fig.write_image(f"../visualization/charts/article_count_per_domain.png")


In [6]:
domain_number = []
time_ = pd.Timestamp("2010-01-01 00:00:00+0000", tz="UTC")
year = pd.Timedelta(days=366)
while time_ < pd.Timestamp.utcnow():
    df_ = df[(df.date > time_) & (df.date <= time_ + year)]
    for domain in df["domain"].unique():
        domain_number.append(
            {
                "domain": domain,
                "length": len(df_[df_.domain == domain]),
                "type": "valid" if ("stiri" in domain) or ("" == domain) else "fake",
                "year": time_,
            }
        )
    time_ += year


fig = px.histogram(domain_number, x="domain", y="length", color="year", text_auto=".2s")
fig.write_html(f"../visualization/charts/article_count_per_domain_per_year.html")
fig.write_image(f"../visualization/charts/article_count_per_domain_per_year.png")


In [7]:
domain_number = []
time_ = pd.Timestamp("2010-01-01 00:00:00+0000", tz="UTC")
year = pd.Timedelta(days=366)
while time_ < pd.Timestamp.utcnow():
    df_ = df[(df.date > time_) & (df.date <= time_ + year)]
    for domain in df["domain"].unique():
        domain_number.append(
            {
                "domain": domain,
                "length": len(df_[df_.domain == domain]),
                "type": "valid" if ("stiri" in domain) or ("" == domain) else "fake",
                "year": time_,
            }
        )
    time_ += year


fig = px.histogram(
    domain_number, x="domain", y="length", color="year", text_auto=".2s", barmode="group"
)
fig.write_html(f"../visualization/charts/article_count_per_domain_per_year_groupbar.html")
fig.write_image(f"../visualization/charts/article_count_per_domain_per_year_groupbar.png")


word count by domain

In [8]:
domain_word_count = []
for domain in df["domain"].unique():
    word_count: int = 0
    for news in df[df.domain == domain]["content"]:
        word_count += len(news.split(" "))
    domain_word_count.append(
        {
            "domain": domain,
            "word_count": word_count,
            "type": "valid" if ("stiri" in domain) or ("" == domain) else "fake",
        }
    )


fig = px.histogram(domain_word_count, x="domain", y="word_count", color="type", text_auto=".2s")
fig.write_html(f"../visualization/charts/word_count_per_domain.html")
fig.write_image(f"../visualization/charts/word_count_per_domain.png")


In [9]:
domain_number = []
time_ = pd.Timestamp("2010-01-01 00:00:00+0000", tz="UTC")
year = pd.Timedelta(days=366)
while time_ < pd.Timestamp.utcnow():
    df_ = df[(df.date > time_) & (df.date <= time_ + year)]
    for domain in df["domain"].unique():
        word_count: int = 0
        for news in df_[df_.domain == domain]["content"]:
            word_count += len(news.split(" "))
        domain_number.append(
            {
                "domain": domain,
                "word_count": word_count,
                "type": "valid" if ("stiri" in domain) or ("" == domain) else "fake",
                "year": time_,
            }
        )
    time_ += year


fig = px.histogram(domain_number, x="domain", y="word_count", color="year", text_auto=".2s")
fig.write_html(f"../visualization/charts/word_count_per_domain_per_year.html")
fig.write_image(f"../visualization/charts/word_count_per_domain_per_year.png")


In [10]:
domain_number = []
time_ = pd.Timestamp("2010-01-01 00:00:00+0000", tz="UTC")
year = pd.Timedelta(days=366)
while time_ < pd.Timestamp.utcnow():
    df_ = df[(df.date > time_) & (df.date <= time_ + year)]
    for domain in df["domain"].unique():
        word_count: int = 0
        for news in df_[df_.domain == domain]["content"]:
            word_count += len(news.split(" "))
        domain_number.append(
            {
                "domain": domain,
                "word_count": word_count,
                "type": "valid" if ("stiri" in domain) or ("" == domain) else "fake",
                "year": time_,
            }
        )
    time_ += year


fig = px.histogram(
    domain_number, x="domain", y="word_count", color="year", text_auto=".2s", barmode="group"
)
fig.write_html(f"../visualization/charts/word_count_per_domain_per_year_groupbar.html")
fig.write_image(f"../visualization/charts/word_count_per_domain_per_year_groupbar.png")


sentence count by domain

In [11]:
domain_sentence_count = []
for domain in df["domain"].unique():
    sentence_count: int = 0
    for news in df[df.domain == domain]["content"]:
        sentence_count += len(news.split("."))
    domain_sentence_count.append(
        {
            "domain": domain,
            "sentence_count": sentence_count,
            "type": "valid" if ("stiri" in domain) or ("" == domain) else "fake",
        }
    )


fig = px.histogram(
    domain_sentence_count, x="domain", y="sentence_count", color="type", text_auto=".2s"
)
fig.write_html(f"../visualization/charts/sentence_count_per_domain.html")
fig.write_image(f"../visualization/charts/sentence_count_per_domain.png")


In [12]:
domain_sentence_count = []
time_ = pd.Timestamp("2010-01-01 00:00:00+0000", tz="UTC")
year = pd.Timedelta(days=366)
while time_ < pd.Timestamp.utcnow():
    df_ = df[(df.date > time_) & (df.date <= time_ + year)]
    for domain in df["domain"].unique():
        sentence_count: int = 0
        for news in df_[df_.domain == domain]["content"]:
            sentence_count += len(news.split("."))
        domain_sentence_count.append(
            {
                "domain": domain,
                "sentence_count": sentence_count,
                "type": "valid" if ("stiri" in domain) or ("" == domain) else "fake",
                "year": time_,
            }
        )
    time_ += year


fig = px.histogram(
    domain_sentence_count,
    x="domain",
    y="sentence_count",
    color="year",
    text_auto=".2s",
    barmode="group",
)
fig.write_html(f"../visualization/charts/sentence_count_per_domain_per_year_groupbar.html")
fig.write_image(f"../visualization/charts/sentence_count_per_domain_per_year_groupbar.png")


In [13]:
domain_sentence_count = []
time_ = pd.Timestamp("2010-01-01 00:00:00+0000", tz="UTC")
year = pd.Timedelta(days=366)
while time_ < pd.Timestamp.utcnow():
    df_ = df[(df.date > time_) & (df.date <= time_ + year)]
    for domain in df["domain"].unique():
        sentence_count: int = 0
        for news in df_[df_.domain == domain]["content"]:
            sentence_count += len(news.split("."))
        domain_sentence_count.append(
            {
                "domain": domain,
                "sentence_count": sentence_count,
                "type": "valid" if ("stiri" in domain) or ("" == domain) else "fake",
                "year": time_,
            }
        )
    time_ += year


fig = px.histogram(
    domain_sentence_count, x="domain", y="sentence_count", color="year", text_auto=".2s"
)
fig.write_html(f"../visualization/charts/sentence_count_per_domain_per_year.html")
fig.write_image(f"../visualization/charts/sentence_count_per_domain_per_year.png")


In [14]:
years = [domain["year"] for domain in domain_sentence_count]
fig = px.bar(domain_sentence_count, x="domain", y="sentence_count", color="domain", barmode="group",
             facet_row="domain", facet_col="year",
             category_orders={"day": [time_ ],
                              "domain": ["fake", "valid"]})
fig.write_html(f"../visualization/charts/sentence_count_per_domain_per_year_groupbar.html")
fig.write_image(f"../visualization/charts/sentence_count_per_domain_per_year_groupbar.png")


ValueError: Vertical spacing cannot be greater than (1 / (rows - 1)) = 0.029412.
The resulting plot would have 35 rows (rows=35).
Use the facet_row_spacing argument to adjust this spacing.

average word length by domain

In [15]:
domain_word_count = []
for domain in df["domain"].unique():
    word_count: list[int] = []
    for news in df[df.domain == domain]["content"]:
        word_count.append(np.mean([len(news) for news in  news.split(" ")]))
    domain_word_count.append(
        {
            "domain": domain,
            "word_length_average": np.mean(word_count),
            "type": "valid" if ("stiri" in domain) or ("" == domain) else "fake",
        }
    )


fig = px.histogram(
    domain_word_count, x="domain", y="word_length_average", color="type", text_auto=".2s"
)
fig.write_html(f"../visualization/charts/average_word_length_by_domain.html")
fig.write_image(f"../visualization/charts/average_word_length_by_domain.png")


average sentence length by domain

In [16]:
domain_word_count = []
for domain in df["domain"].unique():
    sentence_average: list[int] = []
    for news in df[df.domain == domain]["content"]:
        sentence_average.append(np.mean([len(news) for news in  news.split(".")]))
    domain_word_count.append(
        {
            "domain": domain,
            "sentence_length_average": np.mean(sentence_average),
            "type": "valid" if ("stiri" in domain) or ("" == domain) else "fake",
        }
    )


fig = px.histogram(
    domain_word_count, x="domain", y="sentence_length_average", color="type", text_auto=".2s"
)
fig.write_html(f"../visualization/charts/average_sentence_length_by_domain.html")
fig.write_image(f"../visualization/charts/average_sentence_length_by_domain.png")


sentiment analysis per domains

```
pip install -U pip setuptools wheel
pip install -U 'spacy[cuda113]'
python -m spacy ro_core_news_sm
```

# NER by domain

In [None]:
nlp = spacy.load("ro_core_news_sm")

domain_word_count1 = []
for domain in df["domain"].unique():
    ner_object_frequecy: dict[str, int] = {}
    ner_tag_object: dict[str, list[str]] = {}

    print(domain)

    for news in tqdm.tqdm(df[df.domain == domain]["content"]):
        doc = nlp(news)
        for ent in doc.ents:

            if ent.text not in ner_object_frequecy.keys():
                ner_object_frequecy[ent.text] = 1
            else:
                ner_object_frequecy[ent.text] += 1

            if ent.label_ not in ner_tag_object.keys():
                ner_tag_object[ent.label_] = [ent.text]
            else:
                ner_tag_object[ent.label_].append(ent.text)
    domain_word_count1.append(
        {
            "domain": domain,
            "ner_object_frequecy": ner_object_frequecy,
            "ner_tag_object": ner_tag_object,
            "type": "valid" if ("stiri" in domain) or ("" == domain) else "fake",
        }
    )


In [None]:
import gzip
import pickle
with gzip.open(f"../models/pickles/docs_per_domain_{domain.replace('/', '_')}.gzip", "wb") as o:
    pickle.dump(domain_word_count1, o)

In [25]:
domains = [domain for domain in df["domain"].unique()]

In [None]:
import gzip
import pickle
with gzip.open(f"../models/pickles/docs_per_domain_{domain.replace('/', '_')}.gzip", "rb") as o:
    domain_word_count = pickle.load(o)

Create nlp objects

In [66]:
df["domain"].unique()

array(['monden/', 'it-stiinta/', 'sport/', 'politic/', 'life-death/', '',
       'stiri/economie', 'stiri/externe', 'stiri/actualitate',
       'stiri/actualitate/politica', 'stiri/sport', 'stiri/sci-tech'],
      dtype=object)

In [6]:
import tqdm
import gzip
import pickle

nlp = spacy.load("ro_core_news_sm")

for domain in df["domain"].unique():
    ner_object_frequecy: dict[str, int] = {}
    ner_tag_object: dict[str, list[str]] = {}

    print(domain)

    docs = []

    for news in tqdm.tqdm(df[df.domain == domain]["content"]):
        docs.append(nlp(news))

    domain_dict = {
        "domain": domain,
        "docs": docs,
    }

    with gzip.open(f"../models/pickles/docs_per_domain_{domain.replace('/', '_')}.gzip", "wb") as o:
        pickle.dump(domain_dict, o)


stiri/externe


100%|██████████| 26892/26892 [37:00<00:00, 12.11it/s]  


In [4]:
import tqdm
import gzip
import pickle

for domain in df["domain"].unique():
    ner_object_frequecy: dict[str, int] = {}
    ner_tag_object: dict[str, list[str]] = {}

    with gzip.open(f"../models/pickles/docs_per_domain_{domain.replace('/', '_')}.gzip", "rb") as o:
        domain_dict = pickle.load(o)

    print(domain)

    for doc in tqdm.tqdm(domain_dict["docs"]):
        for ent in doc.ents:

            if ent.text not in ner_object_frequecy.keys():
                ner_object_frequecy[ent.text] = 1
            else:
                ner_object_frequecy[ent.text] += 1

            if ent.label_ not in ner_tag_object.keys():
                ner_tag_object[ent.label_] = [ent.text]
            else:
                ner_tag_object[ent.label_].append(ent.text)

    with gzip.open(f"../models/pickles/ner_per_domain_{domain.replace('/', '_')}.gzip", "wb") as o:
        pickle.dump(
            {
                "domain": domain,
                "ner_object_frequecy": ner_object_frequecy,
                "ner_tag_object": ner_tag_object,
                "type": "valid" if ("stiri" in domain) or ("" == domain) else "fake",
            },
            o,
        )


category/news/


100%|██████████| 18684/18684 [00:07<00:00, 2396.60it/s]


category/externe/


100%|██████████| 5820/5820 [00:02<00:00, 2777.67it/s]


category/life/


100%|██████████| 1209/1209 [00:00<00:00, 3461.16it/s]


category/politica/


100%|██████████| 26386/26386 [00:20<00:00, 1285.80it/s]


category/opinii/


100%|██████████| 207/207 [00:00<00:00, 921.11it/s]


category/money/


100%|██████████| 260/260 [00:00<00:00, 1606.95it/s]


stiri/


100%|██████████| 4176/4176 [00:00<00:00, 5329.61it/s]


category/curiozitati-staiti-ca/


100%|██████████| 17/17 [00:00<00:00, 212.54it/s]


category/actualitate/


100%|██████████| 5/5 [00:00<00:00, 99.89it/s]


category/stiri-din-romania/


100%|██████████| 14/14 [00:00<00:00, 275.50it/s]


category/mistere-de-pe-mapamond/


100%|██████████| 5/5 [00:00<00:00, 90.59it/s]


category/sanatate-si-medicina/


100%|██████████| 9/9 [00:00<00:00, 184.61it/s]


category/stiri-de-pe-mapamond/


100%|██████████| 4/4 [00:00<00:00, 80.78it/s]


category/stirile-zilei/


100%|██████████| 13793/13793 [00:03<00:00, 3589.88it/s]


category/stiri-sanatate-stiri-fitness-stiri-slabire/


100%|██████████| 2241/2241 [00:00<00:00, 4252.17it/s]


category/stiri-internationale/


100%|██████████| 1080/1080 [00:00<00:00, 2696.66it/s]


category/timp-liber/


100%|██████████| 704/704 [00:00<00:00, 1760.12it/s]


category/stiri-politica/


100%|██████████| 811/811 [00:00<00:00, 2267.35it/s]


category/utile/


100%|██████████| 1257/1257 [00:00<00:00, 3896.43it/s]


covid/


100%|██████████| 234/234 [00:00<00:00, 2540.93it/s]


## Plot NER

In [5]:
import gzip
import pickle
domain_word_count = []
for domain in df["domain"].unique():
    with gzip.open(f"../models/pickles/ner_per_domain_{domain.replace('/', '_')}.gzip", "rb") as o:
        domain_dict = pickle.load(o)

    domain_word_count.append(domain_dict)

In [6]:
domains = []
for domain in domain_word_count:
    for tag in domain["ner_tag_object"]:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "tag": tag,
                "tag_length": len(domain["ner_tag_object"][tag]),
            }
        )

fig = px.histogram(domains, x="tag", y="tag_length", color="domain", text_auto=".2s")
fig.write_html(f"../visualization/charts/NER_domain_count_per_tag.html")
fig.write_image(f"../visualization/charts/NER_domain_count_per_tag.png")


In [7]:
domains = []
for domain in domain_word_count:
    for tag in domain["ner_tag_object"]:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "tag": tag,
                "tag_length": len(domain["ner_tag_object"][tag]),
            }
        )
        
fig = px.histogram(
    domains, x="domain", y="tag_length", color="tag", text_auto=".2s"
)
fig.write_html(f"../visualization/charts/NER_tag_count_per_domain.html")
fig.write_image(f"../visualization/charts/NER_tag_count_per_domain.png")

In [8]:
domains = []
for domain in domain_word_count:
    for tag in domain["ner_tag_object"]:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "tag": tag,
                "tag_length": len(domain["ner_tag_object"][tag]),
            }
        )

fig = px.histogram(
    domains, x="domain", y="tag_length", color="tag", text_auto=".2s", barmode="group"
)
fig.write_html(f"../visualization/charts/NER_tag_count_per_domain_groupbar.html")
fig.write_image(f"../visualization/charts/NER_tag_count_per_domain_groupbar.png")


In [9]:
domains = []
for domain in domain_word_count:
    sorted_tags = dict(
        sorted(domain["ner_object_frequecy"].items(), key=lambda item: item[1], reverse=True)
    )
    index: int = 10

    for word in sorted_tags:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "word": word,
                "word_freq": domain["ner_object_frequecy"][word],
            }
        )
        index -= 1
        if index == 0:
            break

fig = px.histogram(domains, x="domain", y="word_freq", color="word", text_auto=".2s")
fig.write_html(f"../visualization/charts/NER_word_count_per_domain_groupbar.html")
fig.write_image(f"../visualization/charts/NER_word_count_per_domain_groupbar.png")


In [10]:
domains = []
for domain in domain_word_count:
    sorted_tags = dict(
        sorted(domain["ner_object_frequecy"].items(), key=lambda item: item[1], reverse=True)
    )
    index: int = 10

    for word in sorted_tags:
        tag_: str = ""
        for tag in domain["ner_tag_object"]:
            if word in domain["ner_tag_object"][tag]:
                tag_ = tag
                break

        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "word": word,
                "word_freq": domain["ner_object_frequecy"][word],
                "tag": tag_,
            }
        )
        index -= 1
        if index == 0:
            break


fig = px.histogram(
    domains, x="domain", y="word_freq", color="word", text_auto=".2s", barmode="group"
)
fig.write_html(
    f"../visualization/charts/NER_top_10_words_by_frequency_per_domain_groupbar.html"
)
fig.write_image(
    f"../visualization/charts/NER_top_10_words_by_frequency_per_domain_groupbar.png"
)


In [11]:
domains = []
for domain in domain_word_count:
    sorted_tags = dict(
        sorted(domain["ner_object_frequecy"].items(), key=lambda item: item[1], reverse=True)
    )
    index: int = 10

    for word in sorted_tags:
        tag_: str = ""
        for tag in domain["ner_tag_object"]:
            if word in domain["ner_tag_object"][tag]:
                tag_ = tag
                break

        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "word": word,
                "word_freq": domain["ner_object_frequecy"][word],
                "tag": tag_,
            }
        )
        index -= 1
        if index == 0:
            break

fig = px.histogram(
    domains, x="domain", y="word_freq", color="tag", text_auto=".2s", barmode="group"
)

fig.write_html(
    f"../visualization/charts/NER_top_10_tags_by_frequency_per_domain_groupbar.html"
)
fig.write_image(
    f"../visualization/charts/NER_top_10_tags_by_frequency_per_domain_groupbar.png"
)


# N-grams

In [None]:
# TODO ?

# Keyword extraction

wait for domain_docs execution

In [6]:
import gzip
import tqdm
import spacy
import pickle
import string
import gc


In [10]:
import json
import pydantic


class CeleryConfig(pydantic.BaseModel):
    pass


class SiteConfig(pydantic.BaseModel):
    url: str
    type: str
    routes: list[str]
    confidence: float


class MongoDB(pydantic.BaseModel):
    pass


class Config(pydantic.BaseModel):
    mongodb: MongoDB
    celery: CeleryConfig
    site: list[SiteConfig]


with open("../cluster/configs/celery-beat/config.json", "r") as f:
    sites_cfg = Config.parse_raw(json.dumps(dict(json.load(f))))

sites_cfg_site_routes = {}
for site in sites_cfg.site:
    sites_cfg_site_routes[site.confidence] = set(site.routes)


def valid_or_fake(domain_name: str):
    for confidence, domains in sites_cfg_site_routes.items():
        if domain_name in domains:
            if confidence >= 0.5:
                return True
            else:
                return False


In [7]:
nlp = spacy.load("ro_core_news_sm")
pos_tag = ["PROPN", "ADJ", "NOUN"]

domain_word_count1 = []
for domain in df["domain"].unique():

    print(domain)

    with gzip.open(f"../models/pickles/docs_per_domain_{domain.replace('/', '_')}.gzip", "rb") as o:
        domain_word_count = pickle.load(o)

    ner_object_frequecy: dict[str, int] = {}
    ner_tag_object: dict[str, list[str]] = {}

    for doc in tqdm.tqdm(domain_word_count["docs"]):
        for doc, ent in zip(doc, doc.ents):

            if doc.text in nlp.Defaults.stop_words or doc.text in string.punctuation:
                continue

            if doc.pos_ in pos_tag:
                if ent.text not in ner_object_frequecy.keys():
                    ner_object_frequecy[ent.text] = 1
                else:
                    ner_object_frequecy[ent.text] += 1

                if ent.label_ not in ner_tag_object.keys():
                    ner_tag_object[ent.label_] = [ent.text]
                else:
                    ner_tag_object[ent.label_].append(ent.text)

    sorted_tags = dict(sorted(ner_object_frequecy.items(), key=lambda item: item[1], reverse=True))
    index: int = 10

    del domain_word_count

    gc.collect()

    domains = []
    for word in sorted_tags:
        tag_: str = ""
        for tag in ner_tag_object:
            if word in ner_tag_object[tag]:
                tag_ = tag
                break

        domains.append(
            {
                "domain": domain,
                "type": "valid" if valid_or_fake(domain) else "fake",
                "word": word,
                "word_freq": ner_object_frequecy[word],
                "tag": tag_,
            }
        )
        index -= 1
        if index == 0:
            break

    fig = px.histogram(
        domains, x="word", y="word_freq", color="tag", text_auto=".2s", barmode="group"
    )
    fig.write_html(f"../visualization/charts/KeyWord_{domain.replace('/', '_')}_NER_tag.html")
    fig.write_image(f"../visualization/charts/KeyWord_{domain.replace('/', '_')}_NER_tag.png")

    del fig
    gc.collect()


stiri/actualitate/politica


100%|██████████| 32783/32783 [00:23<00:00, 1383.65it/s]


stiri/sport


100%|██████████| 3404/3404 [00:02<00:00, 1569.15it/s]


stiri/sci-tech


100%|██████████| 2208/2208 [00:00<00:00, 2713.78it/s]


category/analize/


100%|██████████| 1045/1045 [00:00<00:00, 1770.26it/s]


category/news/


100%|██████████| 18684/18684 [00:09<00:00, 1882.59it/s]


category/externe/


100%|██████████| 5820/5820 [00:01<00:00, 3308.14it/s]


category/life/


100%|██████████| 1209/1209 [00:00<00:00, 3796.58it/s]


category/politica/


100%|██████████| 26386/26386 [00:21<00:00, 1221.43it/s]


category/opinii/


100%|██████████| 207/207 [00:00<00:00, 470.70it/s]


category/money/


100%|██████████| 260/260 [00:00<00:00, 1589.16it/s]


stiri/


100%|██████████| 4176/4176 [00:00<00:00, 6633.80it/s]


category/curiozitati-staiti-ca/


100%|██████████| 17/17 [00:00<00:00, 200.99it/s]


category/actualitate/


100%|██████████| 5/5 [00:00<00:00, 82.92it/s]


category/stiri-din-romania/


100%|██████████| 14/14 [00:00<00:00, 180.17it/s]


category/mistere-de-pe-mapamond/


100%|██████████| 5/5 [00:00<00:00, 62.15it/s]


category/sanatate-si-medicina/


100%|██████████| 9/9 [00:00<00:00, 125.77it/s]


category/stiri-de-pe-mapamond/


100%|██████████| 4/4 [00:00<00:00, 60.67it/s]


category/stirile-zilei/


100%|██████████| 13793/13793 [00:03<00:00, 3920.98it/s]


category/stiri-sanatate-stiri-fitness-stiri-slabire/


100%|██████████| 2241/2241 [00:00<00:00, 4191.33it/s]


category/stiri-internationale/


100%|██████████| 1080/1080 [00:00<00:00, 3277.57it/s]


category/timp-liber/


100%|██████████| 704/704 [00:00<00:00, 2440.55it/s]


category/stiri-politica/


100%|██████████| 811/811 [00:00<00:00, 2481.16it/s]


category/utile/


100%|██████████| 1257/1257 [00:00<00:00, 5044.77it/s]


covid/


100%|██████████| 234/234 [00:00<00:00, 2490.05it/s]


# TF-IDF

In [8]:
import tqdm
import sklearn.feature_extraction.text as skltext

In [11]:
tfidf_domain = []
for domain in tqdm.tqdm(df["domain"].unique()):
    
    vectorizer = skltext.TfidfVectorizer()
    tfidf = vectorizer.fit_transform(df[df.domain == domain]["content"])

    tfidf_domain.append(
        {
            "domain": domain,
            "tfidf": tfidf,
            "type": "valid" if valid_or_fake(domain) else "fake",
            "vectorizer": vectorizer
        }
    )

100%|██████████| 33/33 [01:31<00:00,  2.77s/it]


In [12]:
# import gzip
# import pickle
# with gzip.open("../models/pickles/tfidf_per_domain_all.gzip", "wb") as o:
#     pickle.dump(tfidf_domain, o)

In [None]:
import gzip
import pickle
with gzip.open("../models/pickles/tfidf_per_domain_all.gzip", "rb") as o:
    tfidf_domain = pickle.load(o)

In [13]:
df_total: pd.DataFrame = pd.DataFrame(columns=["TF-IDF", "features", "domain", "type"])

for domain in tfidf_domain:
    tfidf_list = domain["tfidf"][0].T.todense()
    df = pd.DataFrame(
        list(
            zip(
                tfidf_list,
                domain["vectorizer"].get_feature_names(),
                [domain["domain"] for _ in range(len(tfidf_list))],
                [domain["type"] for _ in range(len(tfidf_list))],
            )
        ),
        columns=["TF-IDF", "features", "domain", "type"],
    )
    df = df.sort_values("TF-IDF", ascending=False)
    df_ = df.head(25)
    df_["TF-IDF"] = [elem.item(0) for elem in df_["TF-IDF"]]

    df_total = pd.concat([df_total, df_])



Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [15]:
for domain in df_total["domain"].unique():
    fig = px.histogram(
        df_total[df_total.domain == domain],
        x="domain",
        y="TF-IDF",
        color="features",
        text_auto=".2s",
        barmode="group",
        title=f"{domain}",
    )
    fig.write_html(f"../visualization/charts/TFIDF_{domain.replace('/', '_')}_NER_tag.html")
    fig.write_image(f"../visualization/charts/TFIDF_{domain.replace('/', '_')}_NER_tag.png")



# Plot POS

In [16]:
import tqdm

In [22]:
nlp = spacy.load("ro_core_news_sm")

domain_word_count = []
for domain in df["domain"].unique():
    print(domain)
    pos_object_frequecy: dict[str, int] = {}
    pos_tag_object: dict[str, list[str]] = {}

    for news in tqdm.tqdm(df[df.domain == domain]["content"]):
        doc = nlp(news)
        for ent in doc:

            if ent.text not in pos_object_frequecy.keys():
                pos_object_frequecy[ent.text] = 1
            else:
                pos_object_frequecy[ent.text] += 1

            if ent.pos_ not in pos_tag_object.keys():
                pos_tag_object[ent.pos_] = [ent.text]
            else:
                pos_tag_object[ent.pos_].append(ent.text)
    domain_word_count.append(
        {
            "domain": domain,
            "pos_object_frequecy": pos_object_frequecy,
            "pos_tag_object": pos_tag_object,
            "type": "valid" if valid_or_fake(domain) else "fake",
        }
    )


KeyboardInterrupt: 

In [81]:
# import gzip
# import pickle
# with gzip.open("../models/pickles/pos_per_domain.gzip", "wb") as o:
#     pickle.dump(domain_word_count, o)

In [17]:
import gzip
import pickle
with gzip.open("../models/pickles/pos_per_domain.gzip", "rb") as o:
    domain_word_count_valid = pickle.load(o)

In [18]:
import gzip
import pickle
with gzip.open("../models/pickles/pos_per_domain.gzip", "rb") as o:
    domain_word_count_fake = pickle.load(o)

In [19]:
domain_word_count = domain_word_count_valid

In [20]:
domain_word_count.extend(domain_word_count_fake)

In [21]:
domains = []
for domain in domain_word_count:
    for tag in domain["pos_tag_object"]:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "tag": tag,
                "tag_length": len(domain["pos_tag_object"][tag]),
            }
        )

fig = px.histogram(domains, x="tag", y="tag_length", color="domain", text_auto=".2s")
fig.write_html(f"../visualization/charts/POS_domain_count_per_tag.html")
fig.write_image(f"../visualization/charts/POS_domain_count_per_tag.png")


In [None]:
domains = []
for domain in domain_word_count:
    for tag in domain["pos_tag_object"]:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "tag": tag,
                "tag_length": len(domain["pos_tag_object"][tag]),
            }
        )
        
fig = px.histogram(
    domains, x="domain", y="tag_length", color="tag", text_auto=".2s"
)
fig.write_html(f"../visualization/charts/POS_tag_count_per_domain.html")
fig.write_image(f"../visualization/charts/POS_tag_count_per_domain.png")

In [23]:
domains = []
for domain in domain_word_count:
    for tag in domain["pos_tag_object"]:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "tag": tag,
                "tag_length": len(domain["pos_tag_object"][tag]),
            }
        )
        
fig = px.histogram(
    domains, x="domain", y="tag_length", color="tag", text_auto=".2s", barmode='group'
)
fig.write_html(f"../visualization/charts/POS_tag_count_per_domain_groupbar.html")
fig.write_image(f"../visualization/charts/POS_tag_count_per_domain_groupbar.png")

In [24]:
domains = []
for domain in domain_word_count:
    sorted_tags = dict(sorted(domain["pos_object_frequecy"].items(), key=lambda item: item[1],  reverse=True))
    index: int = 10

    for word in sorted_tags:
        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "word": word,
                "word_freq": domain["pos_object_frequecy"][word],
            }
        )
        index -= 1
        if index == 0:
            break

fig = px.histogram(
    domains, x="domain", y="word_freq", color="word", text_auto=".2s"
)
fig.write_html(f"../visualization/charts/POS_top_10_words_by_frequency_per_tag_per_domain.html")
fig.write_image(f"../visualization/charts/POS_top_10_words_by_frequency_per_tag_per_domain.png")

In [25]:
domains = []
for domain in domain_word_count:
    sorted_tags = dict(
        sorted(domain["pos_object_frequecy"].items(), key=lambda item: item[1], reverse=True)
    )
    index: int = 10

    for word in sorted_tags:
        tag_: str = ""
        for tag in domain["pos_tag_object"]:
            if word in domain["pos_tag_object"][tag]:
                tag_ = tag
                break

        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "word": word,
                "word_freq": domain["pos_object_frequecy"][word],
                "tag": tag_,
            }
        )
        index -= 1
        if index == 0:
            break


fig = px.histogram(
    domains, x="domain", y="word_freq", color="word", text_auto=".2s", barmode='group'
)
fig.write_html(f"../visualization/charts/POS_top_10_words_by_frequency_per_tag_per_domain_groupbar.html")
fig.write_image(f"../visualization/charts/POS_top_10_words_by_frequency_per_tag_per_domain_groupbar.png")

In [26]:
domains = []
for domain in domain_word_count:
    sorted_tags = dict(
        sorted(domain["pos_object_frequecy"].items(), key=lambda item: item[1], reverse=True)
    )
    index: int = 10

    for word in sorted_tags:
        tag_: str = ""
        for tag in domain["pos_tag_object"]:
            if word in domain["pos_tag_object"][tag]:
                tag_ = tag
                break

        domains.append(
            {
                "domain": domain["domain"],
                "type": domain["type"],
                "word": word,
                "word_freq": domain["pos_object_frequecy"][word],
                "tag": tag_,
            }
        )
        index -= 1
        if index == 0:
            break

fig = px.histogram(domains, x="domain", y="word_freq", color="tag", text_auto=".2s", barmode='group')
fig.write_html(f"../visualization/charts/POS_top_10_words_by_frequency_per_domain_per_tag_groupbar.html")
fig.write_image(f"../visualization/charts/POS_top_10_words_by_frequency_per_domain_per_tag_groupbar.png")