# Preprocesare venue pentru doc2vec
- Preprocesare fisiere 0, 1, 2 si 3
    - eliminare cuvinte cu lungimea mai mica decat 3
    - eliminare stop_words
    - creare lista de cuvinte 
- Elimina venue duplicate
- Elimina semnele de punctuatie
- Elimina cuvintele lipsite de informatie si cele care nu sunt in engleza
- Scrie lista rezultata in fisier


In [1]:
import json # folosit pentru citire fisiere
import gensim # folosit pentru eliminare stop_words
import itertools # folosit la sortare

## Preprocesare fisiere

In [4]:
def preprocess_files(inFile, outFile, *, stopwords=None, min_length=3):
    """Tokenize the ``venue`` field of every record in a JSON-lines file.

    Each line of ``inFile`` is parsed as a JSON object. Its ``venue`` string
    is lower-cased, split on single spaces and filtered; the surviving tokens
    replace the original string and the record is written to ``outFile`` as
    one JSON object per line.

    Args:
        inFile: Path of the UTF-8 JSON-lines file to read.
        outFile: Path of the UTF-8 JSON-lines file to write (overwritten).
        stopwords: Container of tokens to drop. Defaults to
            ``gensim.parsing.preprocessing.STOPWORDS``.
        min_length: Shortest token length that is kept.

    Raises:
        KeyError: If a record has no ``venue`` key.
        json.JSONDecodeError: If a line is not valid JSON.
    """
    if stopwords is None:
        # Resolved once per call instead of once per token.
        stopwords = gensim.parsing.preprocessing.STOPWORDS

    # Context managers close both files even when a record fails to parse.
    with open(inFile, 'r', encoding='utf-8') as reader, \
            open(outFile, 'w', encoding='utf-8') as writer:
        for line in reader:
            crt_paper = json.loads(line)
            tokens = crt_paper['venue'].lower().split(' ')
            crt_paper['venue'] = [
                token for token in tokens
                if len(token) >= min_length and token not in stopwords
            ]
            writer.write(json.dumps(crt_paper))
            writer.write('\n')

In [6]:
# Tokenize the venue field of the four dblp input files, printing a progress
# message after each one has been written.
jobs = (
    ('../dblp-ref/dblp-final-0.json', 'dblp-venue-0.json', "file0 done\n1 running..."),
    ('../dblp-ref/dblp-final-1.json', 'dblp-venue-1.json', "file1 done\n2 running..."),
    ('../dblp-ref/dblp-final-2.json', 'dblp-venue-2.json', "file2 done\n3 running..."),
    ('../dblp-ref/dblp-final-3.json', 'dblp-venue-3.json', "all files done"),
)
for source_path, target_path, progress in jobs:
    preprocess_files(source_path, target_path)
    print(progress)

file0 done
1 running...
file1 done
2 running...
file2 done
3 running...
all files done


## Pune toate venue intr-o lista

In [53]:
def read_from_file(file, venues):
    """Append the ``venue`` value of every JSON-lines record to ``venues``.

    Args:
        file: Path of a UTF-8 file holding one JSON object per line.
        venues: List extended in place, in file order, with each record's
            ``venue`` value.

    Raises:
        KeyError: If a record has no ``venue`` key.
        json.JSONDecodeError: If a line is not valid JSON.
    """
    # The context manager closes the file even when a record fails to parse.
    with open(file, 'r', encoding='utf-8') as reader:
        for line in reader:
            venues.append(json.loads(line)['venue'])
    
# List of venue token lists gathered from all four preprocessed files.
svenues = []
for venue_path in (
    'dblp-venue-0.json',
    'dblp-venue-1.json',
    'dblp-venue-2.json',
    'dblp-venue-3.json',
):
    read_from_file(venue_path, svenues)

In [51]:
# Report how many venue entries were loaded into svenues.
print(len(svenues))

1975474


## Sorteaza lista de venue si elimina duplicatele

In [90]:
def sort_and_remove_duplicates_venues(venues):
    """Sort ``venues`` in place and return a new list without repeated entries.

    Sorting puts equal entries next to each other, so taking the key of each
    ``itertools.groupby`` run yields every distinct entry exactly once, in
    sorted order.
    """
    venues.sort()
    return [representative for representative, _run in itertools.groupby(venues)]

# Sort svenues in place and keep a single copy of each distinct venue.
venues = sort_and_remove_duplicates_venues(svenues)
# Number of distinct venues.
print(len(venues))

4239


## Elimina empty venues

In [91]:
def remove_empty_venues(venues):
    """Return a new list holding every entry of ``venues`` except empty lists.

    Empty lists are dropped wherever they occur, so the result does not
    depend on the input being sorted with its empty entries first. The input
    itself is not modified.
    """
    return [venue for venue in venues if venue != []]
        
# Drop the empty venue entries from the sorted, de-duplicated list.
venues = remove_empty_venues(venues)
# Number of venues left after the removal.
print(len(venues))

4238


## Elimina cuvinte comune si care nu sunt in engleza

In [92]:
# Load the common-words collection from its JSON file.
with open('common_words_list.json', 'r') as words_file:
    common_words = json.load(words_file)
print("len common words list = " + str(len(common_words)))

len common words list = 408


In [93]:
def remove_common_words(venues, *, common=None):
    """Drop common words from every venue and trim one edge non-letter per token.

    Each token is handled as follows:

    * a token found in ``common`` is discarded;
    * an empty token is discarded (it has no characters to inspect);
    * otherwise, if its last character is not alphabetic that character is
      removed; else if its first character is not alphabetic that character
      is removed; else the token is kept unchanged.

    At most one character is trimmed per token, and the membership test is
    made on the untrimmed token.

    The number of tokens seen and the number found in ``common`` are printed,
    in that order.

    Args:
        venues: Iterable of token lists.
        common: Container of tokens to discard. Defaults to the module-level
            ``common_words``.

    Returns:
        A new list with one cleaned token list per entry of ``venues``.
    """
    if common is None:
        common = common_words

    new_venues = []
    com_w = 0
    all_w = 0
    for word_list in venues:
        aux_list = []
        for word in word_list:
            all_w += 1
            if word in common:
                com_w += 1
                continue
            if not word:
                # Indexing an empty token below would raise IndexError.
                continue
            if not word[-1].isalpha():
                aux_list.append(word[:-1])
            elif not word[0].isalpha():
                aux_list.append(word[1:])
            else:
                aux_list.append(word)
        new_venues.append(aux_list)
    print(all_w)
    print(com_w)
    return new_venues

# Remove the common words from every venue and trim edge punctuation.
venues2 = remove_common_words(venues)
# NOTE(review): this prints len(venues), not len(venues2); remove_common_words
# returns one list per input venue, so both lengths are the same.
print(len(venues))

15583
3827
4238


## Creare liste unice din listele ramase

In [94]:
# Sort the cleaned venues in place and keep one copy of each, using the
# sort_and_remove_duplicates_venues helper rather than repeating its
# sort + groupby logic inline.
venues_unique = sort_and_remove_duplicates_venues(venues2)

In [95]:
# Drop the empty venue entries and store the remaining unique venues as JSON.
venues = remove_empty_venues(venues_unique)
with open('venues_unique.json', 'w') as out_file:
    json.dump(venues, out_file)