In [1]:
import logging
import pickle
import time
from statistics import mean, median

import en_core_web_sm
import gensim.utils
import gensim.models.keyedvectors
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import re
import seaborn as sns
import spacy
import string

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from imblearn.under_sampling import RandomUnderSampler
from keras_preprocessing.sequence import pad_sequences

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

# Fetch every NLTK resource this notebook uses BEFORE its first use, so the
# notebook survives "Restart & Run All" on a machine with no NLTK data:
#   punkt     -> word_tokenize / sent_tokenize
#   stopwords -> stopwords.words('english')
#   wordnet   -> WordNetLemmatizer
# nltk.download() is a cheap no-op when the package is already up to date.
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' for
# the tokenizers - confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Shared, module-level NLP helpers used by text_preprocessing().
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

# Root logger that prints bare messages (no level/timestamp prefix).
logging.basicConfig(format="%(message)s")
logger = logging.getLogger()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kika\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def text_preprocessing(text):
    """Turn one raw sentence into a list of cleaned, lemmatised tokens.

    Pipeline: lower-case -> strip @mentions, URLs, a leading "rt" and every
    character outside ``0-9A-Za-zÀ-ž``, space and tab -> drop tokens that
    contain digits -> collapse whitespace -> ``word_tokenize`` -> drop stop
    words -> lemmatise.

    Relies on the module-level ``stop_words`` and ``lemmatizer`` objects and
    on NLTK's ``word_tokenize``.

    Args:
        text (str): Raw sentence.

    Returns:
        list[str]: The surviving tokens; empty if nothing passes the filters.
    """
    # Convert to lower case.
    text = text.lower()

    # Strip noise. Alternatives are tried left to right at each position, so
    # the @mention branch must precede the "any other symbol" branch;
    # otherwise only the "@" is removed and the handle survives as a token.
    # (The character class was previously written as "\[", which matched a
    # literal "[" and therefore never matched a real mention.)
    text = re.sub(r"(@[A-Za-zÀ-ž0-9_]+)|([^0-9A-Za-zÀ-ž \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)

    # Remove every word that contains a digit. Because both \w* parts may be
    # empty this also removes bare numbers, so no separate digit pass is needed.
    text = re.sub(r'\w*\d\w*', '', text)

    # Collapse runs of whitespace into single spaces and trim the ends.
    text = re.sub(r'\s+', ' ', text).strip()

    # Split the sentence into word tokens.
    tokens = word_tokenize(text)

    # Set membership is O(1); stop_words itself is a plain list.
    stop_set = set(stop_words)

    # Drop empty strings and stop words, then lemmatise what is left.
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token != '' and token not in stop_set
    ]

## 1 - Výber stĺpca content a uloženie do súboru

In [4]:
# Data source:
# https://www.kaggle.com/code/anthonyc1/gathering-real-news-for-oct-dec-2016/data
# Load the news table and write only its "content" column to a text file,
# without the index and without a header row.
df = pd.read_csv(filepath_or_buffer="files/real_news.csv")
df.loc[:, "content"].to_csv(
    path_or_buf='files/real_news.txt',
    encoding='utf-8',
    header=False,
    index=False,
)

## ---------------------------------------------------------------------------------------------------

## 2 - Predspracovanie textu

In [7]:
# Read the exported text in one go, replacing line breaks with spaces so the
# sentence tokenizer sees one continuous stream. The context manager closes
# the file handle (the bare open(...).read() left it open).
with open('files/real_news.txt', "r", encoding='utf-8') as fp:
    input1 = fp.read().replace("\n", " ")

# Split the text into individual sentences.
lines = nltk.sent_tokenize(input1)

In [8]:
# Preprocess every raw sentence and keep only those that still contain at
# least one token afterwards.
sentences = []
for line in map(text_preprocessing, lines):
    if not line:
        # Preprocessing removed everything ([]) - skip this sentence.
        continue
    sentences.append(line)

In [9]:
# Report how many non-empty preprocessed sentences were collected.
print("Počet viet:", len(sentences))

Počet viet: 704357


In [10]:
# Náhodne vyberie 50 000 viet z celkového počtu viet
# import random
# random_sample = random.sample(sentences, 50000)

In [11]:
# Persist the sentences - only the first 10k are stored.
with open("files/sentences", mode="wb") as sentences_file:
    pickle.dump(sentences[:10000], sentences_file)
    
# with open("files/sentences", "wb") as fp:
#     pickle.dump(random_sample, fp)

## ---------------------------------------------------------------------------------------------------

## 4 - Duplikácia viet, náhrada slov ich synonymami (potrebujem synonymá z časti B, 3. krok)

## ---------------------------------------------------------------------------------------------------

In [12]:
# Load the stored sentences back from disk (unpickling).
# NOTE: pickle.load can execute arbitrary code - only use on trusted files.
with open("files/sentences", mode="rb") as sentences_file:
    sentences = pickle.load(sentences_file)

In [13]:
# Load the synonym table from disk (unpickling).
# It is passed to replace_synonyms(), which looks words up with `in` / `[]`
# and iterates over the stored value - presumably a dict mapping
# word -> iterable of synonyms; verify against the step that produces it.
# NOTE: pickle.load can execute arbitrary code - only use on trusted files.
with open("files/synonyms", mode="rb") as synonyms_file:
    syn = pickle.load(synonyms_file)

In [14]:
def replace_synonyms(sentences, synonyms):
    """Augment tokenised sentences by swapping single words for synonyms.

    Each input sentence is emitted unchanged (the same object, not a copy),
    followed by one variant per (token position, synonym) pair in which only
    that one token is replaced. Input sentences are never modified.

    Args:
        sentences: Iterable of token lists.
        synonyms: Mapping from a word to an iterable of replacement words.

    Returns:
        list: Original sentences interleaved with their single-substitution
        variants, in input order.
    """
    def _variant(tokens, position, replacement):
        # Copy first so the source sentence stays untouched.
        changed = tokens.copy()
        changed[position] = replacement
        return changed

    augmented = []
    for tokens in sentences:
        augmented.append(tokens)
        for position, token in enumerate(tokens):
            if token not in synonyms:
                continue
            augmented.extend(
                _variant(tokens, position, alternative)
                for alternative in synonyms[token]
            )
    return augmented

In [15]:
# Build the augmented corpus: every sentence plus its synonym variants.
sentences_new = replace_synonyms(sentences=sentences, synonyms=syn)

In [16]:
# Corpus size before vs. after synonym augmentation.
print(*(len(corpus) for corpus in (sentences, sentences_new)))

10000 175354


In [17]:
# Persist the augmented sentences (pickling).
with open("files/sentences_new", mode="wb") as augmented_file:
    pickle.dump(sentences_new, augmented_file)