# Preprocesarea datelor
1. Transformarea in litere mici
2. Verifica lungimea cuvantului (> 3)
3. Verifica si elimina caracterele de inceput, de sfarsit si cratimele
4. Eliminarea cuvintelor de tip zgomot
5. Eliminarea stop_words
6. Verificarea partii de vorbire
7. Aplica lemmatizare

## Imports

In [2]:
from collections import Counter 
from nltk.corpus import wordnet # To get words in dictionary with their parts of speech
from nltk.stem import WordNetLemmatizer # lemmatizes word based on it's parts of speech

import json
import gensim 
import pickle
import string

## Eliminarea cuvintelor de tip zgomot

In [3]:
def preprocess_word(word):
    """Strip one leading and one trailing punctuation mark, then drop hyphens.

    At most a single character is removed from each end, so a word wrapped
    in several marks keeps the inner ones.

    Args:
        word: The raw token. May be empty.

    Returns:
        The cleaned token with every ``-`` removed. The result can be an
        empty string, e.g. for ``""``, ``"'"`` or ``"----"``.
    """
    special_chars_start = ('\'', '(', '"', ':', '“')
    special_chars_end = ('\'', ')', '.', ',', '"', ':', ';', '”')

    # startswith/endswith accept a tuple of candidates and simply return
    # False for an empty string, whereas indexing word[0] raised IndexError.
    if word.startswith(special_chars_start):
        word = word[1:]

    if word.endswith(special_chars_end):
        word = word[:-1]

    return word.replace("-", "")

def is_noise(word):
    """Tell whether ``word`` contains anything besides a-z and hyphens.

    Returns:
        True if at least one character is neither a lowercase ASCII letter
        nor ``-``; False otherwise (including for an empty string).
    """
    return any(
        ch != '-' and ch not in string.ascii_lowercase
        for ch in word
    )


## Verificarea partii de vorbire

In [4]:
def get_pos( word ):
    """Return the most frequent part-of-speech tag among the WordNet synsets of ``word``.

    Only the tags ``"n"``, ``"v"``, ``"a"`` and ``"r"`` are counted; synsets
    carrying any other tag are ignored. Ties go to the tag listed first in
    that order, so a word with no counted synsets yields ``"n"``.

    Args:
        word: The word to look up in WordNet.

    Returns:
        One of ``"n"``, ``"v"``, ``"a"``, ``"r"``.
    """
    synset_tags = [synset.pos() for synset in wordnet.synsets(word)]

    # Insertion order n, v, a, r matters: most_common keeps equal counts in
    # the order they were first inserted.
    pos_counts = Counter(
        {tag: synset_tags.count(tag) for tag in ("n", "v", "a", "r")}
    )

    ranked = pos_counts.most_common(3)
    top_tag, _ = ranked[0]
    return top_tag

def is_noun(pos):
    """Return True when ``pos`` is the noun tag ``"n"``, False otherwise."""
    return pos == "n"

def is_verb(pos):
    """Return True when ``pos`` is the verb tag ``"v"``, False otherwise."""
    return pos == "v"

## Lemmatizare

In [5]:
def lemmatize_word(word, pos):
    """Lemmatize ``word`` with NLTK's WordNetLemmatizer for the given tag.

    Args:
        word: The word to lemmatize.
        pos: The part-of-speech tag passed through to the lemmatizer.

    Returns:
        Whatever ``WordNetLemmatizer.lemmatize(word, pos)`` returns.
    """
    lemmatizer = WordNetLemmatizer()
    lemma = lemmatizer.lemmatize(word, pos)
    return lemma

# Apelarea preprocesarii pentru abstracte

### Fisierul 0

In [6]:
# Preprocess the abstracts of file 0: one JSON paper per input line, written
# back one JSON paper per line with 'abstract' replaced by a list of noun lemmas.
# The with-statement closes both files even if a line fails to parse.
with open('../../DBLP/dblp-ref/dblp-final-0.json','r',encoding='utf-8') as r, \
        open('dblp-n-0.json','w',encoding='utf-8') as w:
    i = 0

    for line in r:
        crt_paper = json.loads(line)
        # 1. Lowercase the abstract
        abstract = crt_paper['abstract'].lower()
        words_list = abstract.split(' ')
        new_list = []

        for word in words_list:
            # 2. Keep only raw tokens longer than 3 characters
            if len(word) <= 3:
                continue

            # 3. Strip one leading/trailing punctuation mark and remove hyphens
            new_word = preprocess_word(word)

            # A token made only of hyphens/punctuation (e.g. "----") is now
            # empty and is not caught by the noise check, so drop it here.
            if not new_word:
                continue

            # 4. Drop noise words (anything besides a-z and '-')
            if is_noise(new_word):
                continue

            # 5. Drop stop words
            if new_word in gensim.parsing.preprocessing.STOPWORDS:
                continue

            # 6. Keep nouns only
            pos = get_pos(new_word)
            if not is_noun(pos):
                continue

            # 7. Lemmatize
            new_word = lemmatize_word(new_word, pos)
            new_list.append(new_word)

        crt_paper['abstract'] = new_list
        w.write(json.dumps(crt_paper))
        w.write('\n')
        # Progress report every 100000 papers
        if i % 100000 == 0:
            print (i)
        i+=1

0
100000
200000
300000
400000
500000


### Fisierul 1

In [None]:
# Preprocess the abstracts of file 1: one JSON paper per input line, written
# back one JSON paper per line with 'abstract' replaced by a list of noun lemmas.
# The with-statement closes both files even if a line fails to parse.
with open('../../DBLP/dblp-ref/dblp-final-1.json','r',encoding='utf-8') as r, \
        open('dblp-n-1.json','w',encoding='utf-8') as w:
    i = 0

    for line in r:
        crt_paper = json.loads(line)
        # 1. Lowercase the abstract
        abstract = crt_paper['abstract'].lower()
        words_list = abstract.split(' ')
        new_list = []

        for word in words_list:
            # 2. Keep only raw tokens longer than 3 characters
            if len(word) <= 3:
                continue

            # 3. Strip one leading/trailing punctuation mark and remove hyphens
            new_word = preprocess_word(word)

            # A token made only of hyphens/punctuation (e.g. "----") is now
            # empty and is not caught by the noise check, so drop it here.
            if not new_word:
                continue

            # 4. Drop noise words (anything besides a-z and '-')
            if is_noise(new_word):
                continue

            # 5. Drop stop words
            if new_word in gensim.parsing.preprocessing.STOPWORDS:
                continue

            # 6. Keep nouns only
            pos = get_pos(new_word)
            if not is_noun(pos):
                continue

            # 7. Lemmatize
            new_word = lemmatize_word(new_word, pos)
            new_list.append(new_word)

        crt_paper['abstract'] = new_list
        w.write(json.dumps(crt_paper))
        w.write('\n')
        # Progress report every 100000 papers
        if i % 100000 == 0:
            print (i)
        i+=1

### Fisierul 2

In [None]:
# Preprocess the abstracts of file 2: one JSON paper per input line, written
# back one JSON paper per line with 'abstract' replaced by a list of noun lemmas.
# The with-statement closes both files even if a line fails to parse.
with open('../../DBLP/dblp-ref/dblp-final-2.json','r',encoding='utf-8') as r, \
        open('dblp-n-2.json','w',encoding='utf-8') as w:
    i = 0

    for line in r:
        crt_paper = json.loads(line)
        # 1. Lowercase the abstract
        abstract = crt_paper['abstract'].lower()
        words_list = abstract.split(' ')
        new_list = []

        for word in words_list:
            # 2. Keep only raw tokens longer than 3 characters
            if len(word) <= 3:
                continue

            # 3. Strip one leading/trailing punctuation mark and remove hyphens
            new_word = preprocess_word(word)

            # A token made only of hyphens/punctuation (e.g. "----") is now
            # empty and is not caught by the noise check, so drop it here.
            if not new_word:
                continue

            # 4. Drop noise words (anything besides a-z and '-')
            if is_noise(new_word):
                continue

            # 5. Drop stop words
            if new_word in gensim.parsing.preprocessing.STOPWORDS:
                continue

            # 6. Keep nouns only
            pos = get_pos(new_word)
            if not is_noun(pos):
                continue

            # 7. Lemmatize
            new_word = lemmatize_word(new_word, pos)
            new_list.append(new_word)

        crt_paper['abstract'] = new_list
        w.write(json.dumps(crt_paper))
        w.write('\n')
        # Progress report every 100000 papers
        if i % 100000 == 0:
            print (i)
        i+=1

### Fisierul 3

In [None]:
# Preprocess the abstracts of file 3: one JSON paper per input line, written
# back one JSON paper per line with 'abstract' replaced by a list of noun lemmas.
# The with-statement closes both files even if a line fails to parse.
with open('../../DBLP/dblp-ref/dblp-final-3.json','r',encoding='utf-8') as r, \
        open('dblp-n-3.json','w',encoding='utf-8') as w:
    i = 0

    for line in r:
        crt_paper = json.loads(line)
        # 1. Lowercase the abstract
        abstract = crt_paper['abstract'].lower()
        words_list = abstract.split(' ')
        new_list = []

        for word in words_list:
            # 2. Keep only raw tokens longer than 3 characters
            if len(word) <= 3:
                continue

            # 3. Strip one leading/trailing punctuation mark and remove hyphens
            new_word = preprocess_word(word)

            # A token made only of hyphens/punctuation (e.g. "----") is now
            # empty and is not caught by the noise check, so drop it here.
            if not new_word:
                continue

            # 4. Drop noise words (anything besides a-z and '-')
            if is_noise(new_word):
                continue

            # 5. Drop stop words
            if new_word in gensim.parsing.preprocessing.STOPWORDS:
                continue

            # 6. Keep nouns only
            pos = get_pos(new_word)
            if not is_noun(pos):
                continue

            # 7. Lemmatize
            new_word = lemmatize_word(new_word, pos)
            new_list.append(new_word)

        crt_paper['abstract'] = new_list
        w.write(json.dumps(crt_paper))
        w.write('\n')
        # Progress report every 100000 papers
        if i % 100000 == 0:
            print (i)
        i+=1

## Creare lista de cuvinte unice din fisierul 0

In [7]:
# Collect the unique words of the preprocessed file 0, in order of first
# appearance. A dict serves as an insertion-ordered set so duplicates are
# dropped as each abstract is read, instead of holding every token in memory
# and de-duplicating at the end; the resulting list is the same.
unique_words0 = {}
i = 0

# The with-statement closes the file even if a line fails to parse.
with open('dblp-n-0.json','r',encoding='utf-8') as r:
    for line in r:
        crt_paper = json.loads(line)
        abstract = crt_paper['abstract']

        # Existing keys keep their original position, new ones are appended.
        unique_words0.update(dict.fromkeys(abstract))
        # Progress report every 50000 papers
        if i % 50000 == 0:
            print (i)
        i+=1

word_list0 = list(unique_words0)


0
50000
100000
150000
200000
250000
300000
350000
400000
450000
500000
550000


## Scriere lista 0 in fisier

In [8]:
# Persist the unique-word list of file 0. The file holds pickle data even
# though its name ends in .json.
with open('list-n0.json', mode='wb') as out_file:
    pickle.dump(word_list0, out_file)

In [9]:
# Report how many unique words were collected from file 0.
print(len(word_list0))

429391


## Creare lista de cuvinte unice din fisierul 1

In [None]:
# Collect the unique words of the preprocessed file 1, in order of first
# appearance. A dict serves as an insertion-ordered set so duplicates are
# dropped as each abstract is read, instead of holding every token in memory
# and de-duplicating at the end; the resulting list is the same.
unique_words1 = {}
i = 0

# The with-statement closes the file even if a line fails to parse.
with open('dblp-n-1.json','r',encoding='utf-8') as r:
    for line in r:
        crt_paper = json.loads(line)
        abstract = crt_paper['abstract']

        # Existing keys keep their original position, new ones are appended.
        unique_words1.update(dict.fromkeys(abstract))
        # Progress report every 50000 papers
        if i % 50000 == 0:
            print (i)
        i+=1

word_list1 = list(unique_words1)


## Scriere lista 1 in fisier

In [None]:
# Persist the unique-word list of file 1. The file holds pickle data even
# though its name ends in .json.
with open('list-n1.json', mode='wb') as out_file:
    pickle.dump(word_list1, out_file)

In [None]:
# Report how many unique words were collected from file 1.
print(len(word_list1))

## Creare lista de cuvinte unice din fisierul 2

In [None]:
# Collect the unique words of the preprocessed file 2, in order of first
# appearance. A dict serves as an insertion-ordered set so duplicates are
# dropped as each abstract is read, instead of holding every token in memory
# and de-duplicating at the end; the resulting list is the same.
unique_words2 = {}
i = 0

# The with-statement closes the file even if a line fails to parse.
with open('dblp-n-2.json','r',encoding='utf-8') as r:
    for line in r:
        crt_paper = json.loads(line)
        abstract = crt_paper['abstract']

        # Existing keys keep their original position, new ones are appended.
        unique_words2.update(dict.fromkeys(abstract))
        # Progress report every 50000 papers
        if i % 50000 == 0:
            print (i)
        i+=1

word_list2 = list(unique_words2)


## Scriere lista 2 in fisier

In [None]:
# Persist the unique-word list of file 2. The file holds pickle data even
# though its name ends in .json.
with open('list-n2.json', mode='wb') as out_file:
    pickle.dump(word_list2, out_file)

In [None]:
# Report how many unique words were collected from file 2.
print(len(word_list2))

## Creare lista de cuvinte unice din fisierul 3

In [None]:
# Collect the unique words of the preprocessed file 3, in order of first
# appearance. A dict serves as an insertion-ordered set so duplicates are
# dropped as each abstract is read, instead of holding every token in memory
# and de-duplicating at the end; the resulting list is the same.
unique_words3 = {}
i = 0

# The with-statement closes the file even if a line fails to parse.
with open('dblp-n-3.json','r',encoding='utf-8') as r:
    for line in r:
        crt_paper = json.loads(line)
        abstract = crt_paper['abstract']

        # Existing keys keep their original position, new ones are appended.
        unique_words3.update(dict.fromkeys(abstract))
        # Progress report every 50000 papers
        if i % 50000 == 0:
            print (i)
        i+=1

word_list3 = list(unique_words3)


## Scriere lista 3 in fisier

In [None]:
# Persist the unique-word list of file 3. The file holds pickle data even
# though its name ends in .json.
with open('list-n3.json', mode='wb') as out_file:
    pickle.dump(word_list3, out_file)

In [None]:
# Report how many unique words were collected from file 3.
print(len(word_list3))

## Combinarea listelor

In [10]:
# Load the four per-file word lists. The files hold pickle data even though
# their names end in .json.
with open('list-n0.json','rb') as src:
    wlist0 = pickle.load(src)
with open('list-n1.json','rb') as src:
    wlist1 = pickle.load(src)
with open('list-n2.json','rb') as src:
    wlist2 = pickle.load(src)
with open('list-n3.json','rb') as src:
    wlist3 = pickle.load(src)

# Merge in place: list 1 onto list 0, list 3 onto list 2, and finally the
# combined 2+3 onto the combined 0+1.
wlist0.extend(wlist1)
wlist2.extend(wlist3)
wlist0.extend(wlist2)

# Drop duplicates, keeping the first occurrence of every word.
list_final = list(dict.fromkeys(wlist0))

# Write the final list to disk.
with open('list-n-final.json','wb') as dst:
    pickle.dump(list_final, dst)

NUM_OF_WORDS = len(list_final)

## Creare dictionar de cuvinte

In [11]:
# Load the final list of unique words. The file holds pickle data even though
# its name ends in .json.
with open('list-n-final.json','rb') as r:
    word_list = pickle.load(r)

# Map every word to its position in the list. The indices are derived from
# the list that was just loaded rather than from the NUM_OF_WORDS global, so
# this cell also works when the combining cell has not been run in the same
# session, and zip cannot silently truncate the mapping if the two disagree.
lista_index = list(range(len(word_list)))
dict_words = dict(zip(word_list, lista_index))

# Write the dictionary to disk.
with open('dict-n.json','wb') as w:
    pickle.dump(dict_words, w)