# Preprocesarea datelor
1. Transformarea in litere mici
2. Verifica lungimea cuvantului (> 3)
3. Verifica si elimina caracterele speciale de inceput si de sfarsit, precum si cratimele
4. Eliminarea cuvintelor de tip zgomot
5. Eliminarea stop_words
6. Verificarea partii de vorbire
7. Aplica lemmatizare
8. Eliminarea cuvintelor ce au numarul de aparitii sub sau peste threshold

## Imports

In [2]:
from collections import Counter 
from nltk.corpus import wordnet # To get words in dictionary with their parts of speech
from nltk.stem import WordNetLemmatizer # lemmatizes word based on it's parts of speech

import json
import gensim 
import pickle
import string

## 3-4. Eliminarea caracterelor de inceput/sfarsit si a cuvintelor de tip zgomot

In [1]:
def preprocess_word(word):
    """Strip one leading and one trailing punctuation mark and drop hyphens.

    At most one character is removed from each end: a leading quote,
    opening parenthesis or colon, and a trailing quote, closing
    parenthesis, period, comma, colon or semicolon.  Every '-' in the
    remaining text is then deleted.

    Empty input (or input that becomes empty after the leading character
    is stripped) is handled without raising and yields an empty string.
    """
    special_chars_start = ('\'', '(', '"', ':', '“')
    special_chars_end = ('\'', ')', '.', ',', '"', ':', ';', '”')

    # startswith/endswith are False for an empty string, so no IndexError.
    if word.startswith(special_chars_start):
        word = word[1:]

    if word.endswith(special_chars_end):
        word = word[:-1]

    return word.replace("-", "")

def is_noise(word):
    """Return True if *word* has any character other than a-z or '-'.

    An empty word contains no offending character and is therefore
    reported as not being noise.
    """
    return any(
        ch != '-' and ch not in string.ascii_lowercase
        for ch in word
    )


## 6. Verificarea partii de vorbire

In [3]:
def get_pos(word):
    """Return the part-of-speech tag most frequent among *word*'s synsets.

    Only the tags "n", "v", "a" and "r" are tallied; synsets reporting
    any other tag are ignored.  Ties are resolved in that listed order,
    so a word with no synsets at all yields "n".
    """
    synset_tags = [synset.pos() for synset in wordnet.synsets(word)]

    # max() keeps the first candidate on a tie, which reproduces the
    # n, v, a, r precedence of the tally.
    candidates = ("n", "v", "a", "r")
    return max(candidates, key=synset_tags.count)

def is_noun(pos):
    """Return True when *pos* is the noun tag "n", otherwise False."""
    return pos == "n"

def is_verb(pos):
    """Return True when *pos* is the verb tag "v", otherwise False."""
    return pos == "v"

## 7. Lemmatizare

In [4]:
def lemmatize_word(word, pos):
    """Return the lemma of *word* for the part-of-speech tag *pos*.

    A fresh WordNetLemmatizer is created on every call.
    """
    lemmatizer = WordNetLemmatizer()
    lemma = lemmatizer.lemmatize(word, pos)
    return lemma

# Apelarea preprocesarii pentru abstracte

### Fisierul 0

In [5]:
# Preprocess every abstract of input file 0 and write the papers back out,
# one JSON object per line, with 'abstract' replaced by its token list.
# The `with` block closes both files even if a line fails to parse.
with open('../../DBLP/dblp-ref/dblp-final-0.json', 'r', encoding='utf-8') as r, \
        open('dblp-vn-0.json', 'w', encoding='utf-8') as w:
    for i, line in enumerate(r):
        crt_paper = json.loads(line)
        abstract = crt_paper['abstract']
        # 1. Lowercase; tokens are split on the single space character only.
        abstract = abstract.lower()
        words_list = abstract.split(' ')
        new_list = []

        for word in words_list:
            # 2. Keep only raw tokens longer than 3 characters.
            if len(word) <= 3:
                continue

            # 3. Strip leading/trailing punctuation and remove hyphens.
            new_word = preprocess_word(word)

            # A token made only of hyphens/punctuation is empty by now;
            # skip it so no empty word reaches the output.
            if not new_word:
                continue

            # 4. Drop noise words (anything outside a-z).
            if is_noise(new_word):
                continue

            # 5. Drop stop words.
            if new_word in gensim.parsing.preprocessing.STOPWORDS:
                continue

            # 6. Keep nouns and verbs only.
            pos = get_pos(new_word)
            if not (is_noun(pos) or is_verb(pos)):
                continue

            # 7. Lemmatize with the detected part of speech.
            new_list.append(lemmatize_word(new_word, pos))

        crt_paper['abstract'] = new_list
        w.write(json.dumps(crt_paper))
        w.write('\n')
        # Progress indicator every 100000 papers.
        if i % 100000 == 0:
            print(i)

0
100000
200000
300000
400000
500000


### Fisierul 1

In [6]:
# Preprocess every abstract of input file 1 and write the papers back out,
# one JSON object per line, with 'abstract' replaced by its token list.
# The `with` block closes both files even if a line fails to parse.
with open('../../DBLP/dblp-ref/dblp-final-1.json', 'r', encoding='utf-8') as r, \
        open('dblp-vn-1.json', 'w', encoding='utf-8') as w:
    for i, line in enumerate(r):
        crt_paper = json.loads(line)
        abstract = crt_paper['abstract']
        # 1. Lowercase; tokens are split on the single space character only.
        abstract = abstract.lower()
        words_list = abstract.split(' ')
        new_list = []

        for word in words_list:
            # 2. Keep only raw tokens longer than 3 characters.
            if len(word) <= 3:
                continue

            # 3. Strip leading/trailing punctuation and remove hyphens.
            new_word = preprocess_word(word)

            # A token made only of hyphens/punctuation is empty by now;
            # skip it so no empty word reaches the output.
            if not new_word:
                continue

            # 4. Drop noise words (anything outside a-z).
            if is_noise(new_word):
                continue

            # 5. Drop stop words.
            if new_word in gensim.parsing.preprocessing.STOPWORDS:
                continue

            # 6. Keep nouns and verbs only.
            pos = get_pos(new_word)
            if not (is_noun(pos) or is_verb(pos)):
                continue

            # 7. Lemmatize with the detected part of speech.
            new_list.append(lemmatize_word(new_word, pos))

        crt_paper['abstract'] = new_list
        w.write(json.dumps(crt_paper))
        w.write('\n')
        # Progress indicator every 100000 papers.
        if i % 100000 == 0:
            print(i)

0
100000
200000
300000
400000
500000
600000
700000


### Fisierul 2

In [7]:
# Preprocess every abstract of input file 2 and write the papers back out,
# one JSON object per line, with 'abstract' replaced by its token list.
# The `with` block closes both files even if a line fails to parse.
with open('../../DBLP/dblp-ref/dblp-final-2.json', 'r', encoding='utf-8') as r, \
        open('dblp-vn-2.json', 'w', encoding='utf-8') as w:
    for i, line in enumerate(r):
        crt_paper = json.loads(line)
        abstract = crt_paper['abstract']
        # 1. Lowercase; tokens are split on the single space character only.
        abstract = abstract.lower()
        words_list = abstract.split(' ')
        new_list = []

        for word in words_list:
            # 2. Keep only raw tokens longer than 3 characters.
            if len(word) <= 3:
                continue

            # 3. Strip leading/trailing punctuation and remove hyphens.
            new_word = preprocess_word(word)

            # A token made only of hyphens/punctuation is empty by now;
            # skip it so no empty word reaches the output.
            if not new_word:
                continue

            # 4. Drop noise words (anything outside a-z).
            if is_noise(new_word):
                continue

            # 5. Drop stop words.
            if new_word in gensim.parsing.preprocessing.STOPWORDS:
                continue

            # 6. Keep nouns and verbs only.
            pos = get_pos(new_word)
            if not (is_noun(pos) or is_verb(pos)):
                continue

            # 7. Lemmatize with the detected part of speech.
            new_list.append(lemmatize_word(new_word, pos))

        crt_paper['abstract'] = new_list
        w.write(json.dumps(crt_paper))
        w.write('\n')
        # Progress indicator every 100000 papers.
        if i % 100000 == 0:
            print(i)

0
100000
200000
300000
400000
500000
600000


### Fisierul 3

In [8]:
# Preprocess every abstract of input file 3 and write the papers back out,
# one JSON object per line, with 'abstract' replaced by its token list.
# The `with` block closes both files even if a line fails to parse.
with open('../../DBLP/dblp-ref/dblp-final-3.json', 'r', encoding='utf-8') as r, \
        open('dblp-vn-3.json', 'w', encoding='utf-8') as w:
    for i, line in enumerate(r):
        crt_paper = json.loads(line)
        abstract = crt_paper['abstract']
        # 1. Lowercase; tokens are split on the single space character only.
        abstract = abstract.lower()
        words_list = abstract.split(' ')
        new_list = []

        for word in words_list:
            # 2. Keep only raw tokens longer than 3 characters.
            if len(word) <= 3:
                continue

            # 3. Strip leading/trailing punctuation and remove hyphens.
            new_word = preprocess_word(word)

            # A token made only of hyphens/punctuation is empty by now;
            # skip it so no empty word reaches the output.
            if not new_word:
                continue

            # 4. Drop noise words (anything outside a-z).
            if is_noise(new_word):
                continue

            # 5. Drop stop words.
            if new_word in gensim.parsing.preprocessing.STOPWORDS:
                continue

            # 6. Keep nouns and verbs only.
            pos = get_pos(new_word)
            if not (is_noun(pos) or is_verb(pos)):
                continue

            # 7. Lemmatize with the detected part of speech.
            new_list.append(lemmatize_word(new_word, pos))

        crt_paper['abstract'] = new_list
        w.write(json.dumps(crt_paper))
        w.write('\n')
        # Progress indicator every 100000 papers.
        if i % 100000 == 0:
            print(i)

0


## Creare lista de cuvinte unice din fisierul 0

In [9]:
# Collect the distinct words of preprocessed file 0 in first-occurrence order.
# Words are deduplicated as they are read (dict keys keep insertion order),
# so the full token stream never has to be held in memory at once.
unique_words0 = {}
with open('dblp-vn-0.json', 'r', encoding='utf-8') as r:
    for line in r:
        crt_paper = json.loads(line)
        abstract = crt_paper['abstract']
        unique_words0.update(dict.fromkeys(abstract))

word_list0 = list(unique_words0)

## Scriere lista 0 in fisier

In [10]:
# Persist the unique-word list of file 0.
# NOTE: despite the .json extension the file holds pickle data (it is opened
# in binary mode and written with pickle.dump).
with open('list-vn0.json','wb') as w:
    pickle.dump(word_list0, w)

In [11]:
print(len(word_list0))

433159


## Creare lista de cuvinte unice din fisierul 1

In [12]:
# Collect the distinct words of preprocessed file 1 in first-occurrence order.
# Words are deduplicated as they are read (dict keys keep insertion order),
# so the full token stream never has to be held in memory at once.
unique_words1 = {}
with open('dblp-vn-1.json', 'r', encoding='utf-8') as r:
    for line in r:
        crt_paper = json.loads(line)
        abstract = crt_paper['abstract']
        unique_words1.update(dict.fromkeys(abstract))

word_list1 = list(unique_words1)

## Scriere lista 1 in fisier

In [13]:
# Persist the unique-word list of file 1.
# NOTE: despite the .json extension the file holds pickle data (it is opened
# in binary mode and written with pickle.dump).
with open('list-vn1.json','wb') as w:
    pickle.dump(word_list1, w)

In [14]:
print(len(word_list1))

518426


## Creare lista de cuvinte unice din fisierul 2

In [15]:
# Collect the distinct words of preprocessed file 2 in first-occurrence order.
# Words are deduplicated as they are read (dict keys keep insertion order),
# so the full token stream never has to be held in memory at once.
unique_words2 = {}
with open('dblp-vn-2.json', 'r', encoding='utf-8') as r:
    for line in r:
        crt_paper = json.loads(line)
        abstract = crt_paper['abstract']
        unique_words2.update(dict.fromkeys(abstract))

word_list2 = list(unique_words2)

## Scriere lista 2 in fisier

In [16]:
# Persist the unique-word list of file 2.
# NOTE: despite the .json extension the file holds pickle data (it is opened
# in binary mode and written with pickle.dump).
with open('list-vn2.json','wb') as w:
    pickle.dump(word_list2, w)

In [17]:
print(len(word_list2))

487270


## Creare lista de cuvinte unice din fisierul 3

In [18]:
# Collect the distinct words of preprocessed file 3 in first-occurrence order.
# Words are deduplicated as they are read (dict keys keep insertion order),
# so the full token stream never has to be held in memory at once.
unique_words3 = {}
with open('dblp-vn-3.json', 'r', encoding='utf-8') as r:
    for line in r:
        crt_paper = json.loads(line)
        abstract = crt_paper['abstract']
        unique_words3.update(dict.fromkeys(abstract))

word_list3 = list(unique_words3)

## Scriere lista 3 in fisier

In [19]:
# Persist the unique-word list of file 3.
# NOTE: despite the .json extension the file holds pickle data (it is opened
# in binary mode and written with pickle.dump).
with open('list-vn3.json','wb') as w:
    pickle.dump(word_list3, w)

In [20]:
print(len(word_list3))

64073


## Combinarea listelor

In [21]:
# Load the four per-file unique-word lists (pickle data).
with open('list-vn0.json', 'rb') as r:
    wlist0 = pickle.load(r)
with open('list-vn1.json', 'rb') as r:
    wlist1 = pickle.load(r)
with open('list-vn2.json', 'rb') as r:
    wlist2 = pickle.load(r)
with open('list-vn3.json', 'rb') as r:
    wlist3 = pickle.load(r)

# Append list 1 to list 0, list 3 to list 2, then the 2+3 result to 0+1,
# so wlist0 ends up holding the words of all four files in file order.
wlist0.extend(wlist1)
wlist2.extend(wlist3)
wlist0.extend(wlist2)

# Drop duplicates while keeping the first occurrence of each word.
list_final = list(dict.fromkeys(wlist0))

# Persist the final vocabulary (pickle data).
with open('list-vn-final.json', 'wb') as w:
    pickle.dump(list_final, w)

NUM_OF_WORDS = len(list_final)

In [34]:
print(NUM_OF_WORDS)

988791


## Creare dictionar de cuvinte

In [22]:
# Load the final vocabulary (pickle data).
with open('list-vn-final.json', 'rb') as r:
    word_list = pickle.load(r)

# Map every word to its position in the vocabulary.  The index range is
# derived from the list that was just loaded rather than from a global set
# by another cell, so the mapping always covers every word.
lista_index = list(range(len(word_list)))
dict_words = dict(zip(word_list, lista_index))

# Persist the word -> index dictionary (pickle data).
with open('dict-vn.json', 'wb') as w:
    pickle.dump(dict_words, w)

## Creare lista de aparitii

In [23]:
# One zero-initialised occurrence counter per vocabulary word.
lista_aparitii = [0] * NUM_OF_WORDS

### Parcurge fisierul 0 si incrementeaza lista de aparitii

In [24]:
# Add the word occurrences of preprocessed file 0 to the counters.
# The `with` block closes the file even if a line fails to parse.
with open('dblp-vn-0.json', 'r', encoding='utf-8') as r:
    for line in r:
        crt_paper = json.loads(line)
        abstract = crt_paper['abstract']

        for word in abstract:
            lista_aparitii[dict_words[word]] += 1

### Parcurge fisierul 1 si incrementeaza lista de aparitii

In [25]:
# Add the word occurrences of preprocessed file 1 to the counters.
# The `with` block closes the file even if a line fails to parse.
with open('dblp-vn-1.json', 'r', encoding='utf-8') as r:
    for line in r:
        crt_paper = json.loads(line)
        abstract = crt_paper['abstract']

        for word in abstract:
            lista_aparitii[dict_words[word]] += 1

### Parcurge fisierul 2 si incrementeaza lista de aparitii

In [26]:
# Add the word occurrences of preprocessed file 2 to the counters.
# The `with` block closes the file even if a line fails to parse.
with open('dblp-vn-2.json', 'r', encoding='utf-8') as r:
    for line in r:
        crt_paper = json.loads(line)
        abstract = crt_paper['abstract']

        for word in abstract:
            lista_aparitii[dict_words[word]] += 1

### Parcurge fisierul 3 si incrementeaza lista de aparitii

In [27]:
# Add the word occurrences of preprocessed file 3 to the counters.
# The `with` block closes the file even if a line fails to parse.
with open('dblp-vn-3.json', 'r', encoding='utf-8') as r:
    for line in r:
        crt_paper = json.loads(line)
        abstract = crt_paper['abstract']

        for word in abstract:
            lista_aparitii[dict_words[word]] += 1

### Scrie in fisier lista de aparitii

In [28]:
# Persist the per-word occurrence counters.
# NOTE: despite the .json extension the file holds pickle data (it is opened
# in binary mode and written with pickle.dump).
with open('lista-aparitii-vn.json','wb') as w:
    pickle.dump(lista_aparitii, w)

# 8. Eliminarea cuvintelor ce au numarul de aparitii sub sau peste threshold

In [29]:
# Inclusive occurrence-count bounds: the filtering code keeps a word only
# when lower_bound <= occurrences <= upper_bound.
lower_bound = 3
upper_bound = 500000

In [30]:
# Load the occurrence counters (pickle data).
with open('lista-aparitii-vn.json', 'rb') as r:
    lista_aparitii = pickle.load(r)

# Load the word -> index dictionary (pickle data).
with open('dict-vn.json', 'rb') as r:
    dict_words = pickle.load(r)

# Rewrite each preprocessed file 0-3, keeping in every abstract only the
# words whose total occurrence count lies within [lower_bound, upper_bound].
# One loop handles all four files; the `with` block closes both handles
# even if a line fails to parse.
for file_no in range(4):
    src_path = 'dblp-vn-{}.json'.format(file_no)
    dst_path = 'dblp-reduced-vn-{}.json'.format(file_no)

    with open(src_path, 'r', encoding='utf-8') as r, \
            open(dst_path, 'w', encoding='utf-8') as w:
        for line in r:
            crt_paper = json.loads(line)
            abstract = crt_paper['abstract']

            new_abstract = [
                word for word in abstract
                if lower_bound <= lista_aparitii[dict_words[word]] <= upper_bound
            ]

            crt_paper['abstract'] = new_abstract
            w.write(json.dumps(crt_paper))
            w.write('\n')

In [31]:
## Build the reduced vocabulary

# Load the occurrence counters (pickle data).
with open('lista-aparitii-vn.json', 'rb') as r:
    lista_aparitii = pickle.load(r)

# Load the full vocabulary (pickle data).
with open('list-vn-final.json', 'rb') as r:
    lista_cuvinte = pickle.load(r)

# Load the word -> index dictionary (pickle data).
with open('dict-vn.json', 'rb') as r:
    dict_words = pickle.load(r)

# Keep the words whose occurrence count lies within the inclusive bounds,
# in vocabulary order.
new_list = [
    word for word in lista_cuvinte
    if lower_bound <= lista_aparitii[dict_words[word]] <= upper_bound
]

# Persist the reduced vocabulary (pickle data).
with open('list-reduced-vn.json', 'wb') as w:
    pickle.dump(new_list, w)

In [32]:
print(len(new_list))

343588


## Afisare statistici

In [5]:
## Frequency statistics: the 50 most and the 50 least frequent words

from operator import itemgetter

# Load the occurrence counters (pickle data).
with open('lista-aparitii-vn.json', 'rb') as r:
    lista_aparitii = pickle.load(r)

# Load the full vocabulary (pickle data).
with open('list-vn-final.json', 'rb') as r:
    lista_cuvinte = pickle.load(r)

# Load the word -> index dictionary (pickle data).
with open('dict-vn.json', 'rb') as r:
    dict_words = pickle.load(r)

# Pair every word with its occurrence count as [count, word].
listl = [[lista_aparitii[dict_words[word]], word] for word in lista_cuvinte]

# Ascending by count; the sort is stable, so equal counts stay in
# vocabulary order.
sl = sorted(listl, key=itemgetter(0))

# Last 50 entries of the ascending list, printed most frequent first.
most_frecv = sl[len(sl)-50:]
print(most_frecv[::-1])

# First 50 entries of the ascending list, then each word on its own line
# followed by a line holding a comma.
least_frecv = sl[0:50]
print(least_frecv)
for elem in least_frecv:
    print(elem[1])
    print(",")

[[1358651, 'model'], [1315297, 'propose'], [1240719, 'paper'], [1135446, 'method'], [1119199, 'algorithm'], [1079469, 'result'], [1025689, 'data'], [912367, 'network'], [902944, 'problem'], [880822, 'base'], [811460, 'present'], [808309, 'approach'], [692483, 'performance'], [617371, 'time'], [607517, 'process'], [600900, 'design'], [583243, 'information'], [572042, 'provide'], [568284, 'system'], [554874, 'study'], [547454, 'application'], [503180, 'image'], [485642, 'technique'], [472185, 'analysis'], [456653, 'user'], [441572, 'number'], [414262, 'control'], [388047, 'scheme'], [380112, 'feature'], [369423, 'function'], [356049, 'develop'], [352622, 'work'], [342529, 'test'], [340214, 'learn'], [337686, 'simulation'], [333818, 'order'], [332988, 'solution'], [326709, 'improve'], [323714, 'compare'], [320294, 'show'], [317988, 'structure'], [317644, 'consider'], [316016, 'service'], [302706, 'case'], [289518, 'apply'], [286910, 'give'], [284610, 'achieve'], [284138, 'obtain'], [28046