In [1]:
import re
import string
from string import punctuation
import pandas as pd
import numpy as np
import unicodedata
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

import nltk
# One-time corpus download; uncomment if the 'stopwords' corpus is not available locally.
#nltk.download('stopwords')
from nltk.corpus import stopwords
# Base Romanian stopword list from NLTK; it is extended and adjusted further down in this file.
stopwords_list = stopwords.words('romanian')

# Snowball stemmer for Romanian.
from nltk.stem import SnowballStemmer
from nltk.stem.snowball import RomanianStemmer
# NOTE(review): `stemmer` is not referenced by any helper visible in this cell.
stemmer = RomanianStemmer()

# Tokenizer support.
import nltk
# One-time tokenizer model download; uncomment if 'punkt' is not available locally.
#nltk.download('punkt')

###########################################################################

# Lowercase the text
def lowercase_text(text):
    """Return ``text`` with every word lowercased.

    The text is split on whitespace and re-joined with single spaces, so
    leading/trailing whitespace and repeated separators are collapsed.
    """
    lowered_words = [word.lower() for word in text.split()]
    return " ".join(lowered_words)

# Remove punctuation marks
def remove_punctuation(text):
    """Replace every non-word, non-whitespace character with a space.

    Runs of consecutive spaces are then squeezed into a single space.
    Underscores count as word characters and are therefore kept.
    """
    non_word_symbol = re.compile(r'[^\w\s]')
    space_run = re.compile(' +')
    spaced_out = non_word_symbol.sub(' ', text)
    return space_run.sub(' ', spaced_out)

# Remove single-letter tokens
def remove_single_letter(text):
    """Drop every whitespace-separated token that is only one character long.

    The surviving tokens are joined back together with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if len(token) > 1:
            kept_tokens.append(token)
    return " ".join(kept_tokens)

# Remove HTML tags
def remove_tags(text):
    """Strip HTML markup from ``text`` and return the remaining plain text.

    The text inside elements is preserved (``<b>bun</b>`` becomes ``bun``);
    only the markup itself is dropped. The contents of ``<script>`` and
    ``<style>`` elements are removed entirely, since they hold code rather
    than readable text.

    Parameters:
        text: a string that may contain HTML markup.

    Returns:
        The text content as a ``str``.
    """
    soup = BeautifulSoup(text, 'html.parser')
    # Script/style bodies are not part of the readable text, so drop them whole.
    for node in soup.find_all(['script', 'style']):
        node.decompose()
    # get_text() yields decoded text; serialising the soup with str() would
    # re-escape '&', '<' and '>' as entities ("&amp;" ...), which later turn
    # into stray "amp"/"lt"/"gt" tokens once punctuation is stripped.
    return soup.get_text()

# Remove emoji
def remove_emoji(text):
    """Delete every character that falls in the code-point ranges listed below.

    Note that the last range (U+24C2 through U+1F251) is very wide: besides
    emoji it also covers many other characters in that span (CJK ideographs,
    for example), which are removed as well.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    matcher = re.compile("[" + code_point_ranges + "]+", flags=re.UNICODE)
    cleaned = matcher.sub(r'', text)
    return cleaned

# Remove square-bracketed spans
def remove_between_square_brackets(text):
    """Delete every ``[...]`` span, brackets included.

    A span runs from a ``[`` to the first following ``]``. An opening bracket
    with no closing bracket after it is left untouched.

    Parameters:
        text: the input string.

    Returns:
        ``text`` with all bracketed spans removed.
    """
    # Raw string: '\[' and '\]' are invalid escapes in a normal string literal
    # and trigger a SyntaxWarning on Python 3.12+.
    return re.sub(r'\[[^]]*\]', '', text)

# Remove digits
def remove_numbers(text):
    """Return ``text`` without any character for which ``str.isdigit()`` is true.

    All other characters, including whitespace, are kept in place.
    """
    kept_chars = []
    for ch in text:
        if ch.isdigit():
            continue
        kept_chars.append(ch)
    return "".join(kept_chars)

# Extra words to add to the stopword list.
# Named `extra_stopwords` rather than `list` so the builtin `list` is not
# shadowed for the rest of the module/notebook.
extra_stopwords = ['le', 'am', 'ti', 'ma', 'as', 'mi', 'va', 'sa', 'pt', 'sau', 'ul', 'nul',
      'aș', 'ți', 'și', 'vă', 'aceeași','același','aceștia','adică','ăla',
      'alți','alții','atât','atâta','atâtă','atâția','bă','când','căreia',
      'cărora','căruia','către','ceilalți','cealaltă','când','celălalt','câtă',
      'câteva','câțiva','cu','dă','decât','ești','fără','față','îi','în','înapoi',
      'înainte','încă','încât','însă','între','își','îți','îl','măi','maine','multă',
      'păi','parcă','sunteți']
stopwords_list.extend(extra_stopwords)
# "nu" is taken out of the stopword list so it survives stopword removal
# (presumably to keep negation available to downstream models -- confirm).
# Guarded so a stopword corpus without "nu" does not raise ValueError.
if 'nu' in stopwords_list:
    stopwords_list.remove('nu')
def remove_stopwords(text):
    """Remove every token of ``text`` that appears in ``stopwords_list``.

    The text is split on whitespace; matching is exact and case-sensitive.
    Remaining tokens are joined back with single spaces.

    Parameters:
        text: the input string.

    Returns:
        The text without stopword tokens.
    """
    # Build the lookup set on each call so later edits to ``stopwords_list``
    # are honoured, while each word test is a hash lookup instead of a
    # linear scan over the whole list.
    stopword_set = set(stopwords_list)
    return " ".join(word for word in text.split() if word not in stopword_set)
 
# Replace diacritics with plain letters
def remove_accents(text):
    """Return ``text`` with combining marks stripped.

    The text is NFKD-normalised, which splits accented letters into a base
    character plus combining marks; the combining marks are then discarded.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    base_chars = []
    for ch in decomposed:
        if unicodedata.combining(ch):
            continue
        base_chars.append(ch)
    return u"".join(base_chars)

# Remove special characters
def remove_special_characters(text, remove_digits=True):
    """Delete every character that is not an ASCII letter or whitespace.

    Parameters:
        text: the input string.
        remove_digits: when true (the default) the digits 0-9 are deleted as
            well; when false they are kept.

    Returns:
        The filtered string. Whitespace is preserved as-is.
    """
    # The upper-case range must be 'A-Z': 'A-z' also spans the ASCII symbols
    # between 'Z' and 'a' ([ \ ] ^ _ `), which would then be kept.
    if remove_digits:
        pattern = r'[^a-zA-Z\s]'
    else:
        pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)
# Collapse over-repeated characters - those appearing more than 3 times in a row
def remove_extra_chars(text):
    """Shrink any run of four or more identical characters to a single one.

    Runs of up to three identical characters are left unchanged. Newline
    characters are never matched by the pattern.
    """
    long_run = re.compile(r'(.)\1{3,}')
    return long_run.sub(lambda run: run.group(1), text)

# Plot training curves for the neural networks
def plot_graphs(history):
    """Draw accuracy and loss curves from a training history.

    ``history.history`` must provide the keys 'acc', 'val_acc', 'loss' and
    'val_loss'. Accuracy is drawn on the current figure, loss on a new one,
    and the figures are then shown.
    """
    metrics = history.history
    train_acc, valid_acc = metrics['acc'], metrics['val_acc']
    train_loss, valid_loss = metrics['loss'], metrics['val_loss']
    # Epochs are numbered from 1 on the x axis.
    epoch_axis = range(1, len(train_acc) + 1)

    accuracy_series = (
        (train_acc, 'bo', 'Training acc'),
        (valid_acc, 'b', 'Validation acc'),
    )
    for values, fmt, label in accuracy_series:
        plt.plot(epoch_axis, values, fmt, label=label)
    plt.title('Training and validation accuracy')
    plt.legend()

    # Loss goes on its own figure.
    plt.figure()
    loss_series = (
        (train_loss, 'bo', 'Training loss'),
        (valid_loss, 'b', 'Validation loss'),
    )
    for values, fmt, label in loss_series:
        plt.plot(epoch_axis, values, fmt, label=label)
    plt.title('Training and validation loss')
    plt.legend()
    plt.show()
    
# Maps each review to a feature vector by averaging the word embeddings of all the words in the review.
# Computes one mean for each vector
def avg_vect(vec, vector_size):
    """Return a list with one averaged vector per entry of ``vec``.

    For a non-empty array the mean along axis 0 is taken; an empty array
    (``size == 0``) is replaced by a float zero vector of length
    ``vector_size``.
    """
    return [
        item.mean(axis=0) if item.size else np.zeros(vector_size, dtype=float)
        for item in vec
    ]

def clean_text_pipe(text):
    """Run the full cleaning pipeline on one piece of text.

    Steps, in order: lowercase, strip HTML tags, drop ``[...]`` spans, strip
    punctuation, strip emoji, drop stopwords, drop digits, drop
    single-character tokens, strip diacritics, strip remaining special
    characters.

    Parameters:
        text: the raw input string.

    Returns:
        The cleaned string.
    """
    text = lowercase_text(text)
    text = remove_tags(text)
    # Must run before punctuation removal: once '[' and ']' have been replaced
    # by spaces there is nothing left for this step to match.
    text = remove_between_square_brackets(text)
    text = remove_punctuation(text)
    text = remove_emoji(text)
    # Stopwords are matched before diacritics are stripped, because the
    # stopword list contains accented forms.
    text = remove_stopwords(text)
    text = remove_numbers(text)
    text = remove_single_letter(text)
    text = remove_accents(text)
    text = remove_special_characters(text)
    return text

In [2]:
stopwords_list

['a',
 'abia',
 'acea',
 'aceasta',
 'această',
 'aceea',
 'aceeasi',
 'acei',
 'aceia',
 'acel',
 'acela',
 'acelasi',
 'acele',
 'acelea',
 'acest',
 'acesta',
 'aceste',
 'acestea',
 'acestei',
 'acestia',
 'acestui',
 'aceşti',
 'aceştia',
 'adica',
 'ai',
 'aia',
 'aibă',
 'aici',
 'al',
 'ala',
 'ale',
 'alea',
 'alt',
 'alta',
 'altceva',
 'altcineva',
 'alte',
 'altfel',
 'alti',
 'altii',
 'altul',
 'am',
 'anume',
 'apoi',
 'ar',
 'are',
 'as',
 'asa',
 'asta',
 'astea',
 'astfel',
 'asupra',
 'atare',
 'atat',
 'atata',
 'atatea',
 'atatia',
 'ati',
 'atit',
 'atita',
 'atitea',
 'atitia',
 'atunci',
 'au',
 'avea',
 'avem',
 'aveţi',
 'avut',
 'aş',
 'aţi',
 'ba',
 'ca',
 'cam',
 'cand',
 'care',
 'careia',
 'carora',
 'caruia',
 'cat',
 'catre',
 'ce',
 'cea',
 'ceea',
 'cei',
 'ceilalti',
 'cel',
 'cele',
 'celor',
 'ceva',
 'chiar',
 'ci',
 'cind',
 'cine',
 'cineva',
 'cit',
 'cita',
 'cite',
 'citeva',
 'citi',
 'citiva',
 'cu',
 'cui',
 'cum',
 'cumva',
 'cât',
 'câte