# Editing the data
___

This file has been created to process the data file,  "videogames.json", and perform some basic operations. Also, his functions needs to be called before any other function in this script, Creates metadata and preprocesses the dataset in order to recover the information for the app.

### Loading the original dataset
---

In [1]:
import json

Videogames =""
with open('../../data/videogames.json','r') as data:
    for letter in data:    
        Videogames+=letter
Videogames=json.loads(Videogames)
print(len(Videogames))

20803


### Removing the missing plot
---

Some videogames doesn't have plots, instead, has the text: 'Add a Plot'. This block remove this text from all the games who has it.

In [2]:
print(len([a for a in Videogames if a['plot']=='Add a Plot']))

for a in Videogames:
    if a['plot'] == 'Add a Plot':
        a['plot']=''

print(len([a for a in Videogames if a['plot']=='Add a Plot']))
print(len([a for a in Videogames if a['plot']=='']))

8874
0
8874


### Removing the continous text from the long plots
---

Some plots are to large, and contains the text 'See full summary »' at the end of the plot. This is not ideal for presentation purposes. This block remove this messege

In [3]:
import string

errors=set()

for game in Videogames:
    game['plot']=game['plot'].replace('See full summary »','')
for game in Videogames:
    if game['plot'].__contains__('See full summary »'):
        errors.add(game['plot'])

print(len(errors))

0


### Removing duplicated elements in dataset
---

Some games are repited

In [4]:
aux=[]
setVideoGames=set()
for game in Videogames:
    if str(game) in setVideoGames:
        continue
    aux.append(game)
    setVideoGames.add(str(game))
#Removing the duplicates and converting them to dictionaries.
print(len(Videogames))
print(len(aux))
Videogames=aux

20803
20237


### Saving the edited dataset
---

Create a new json witdh the information edited

In [10]:
with open('../../data/videogames_edited.json','w') as file:
    json.dump(Videogames,file)

# Creating the vocabulary
---

### Declaring auxiliar methods
---

In [35]:
import nltk
import gensim

tokenized_docs = []
vector_repr = []
dictionary = {}
vocabulary = []

def tokenization_nltk(texts):
    return [nltk.tokenize.word_tokenize(doc) for doc in texts]

def remove_noise_nltk(tokenized_docs):
    print(type(tokenized_docs[0]))
    return [[word.lower() for word in doc if word.isalpha()] for doc in tokenized_docs]

def remove_stopwords(tokenized_docs):
    stop_words = set(nltk.corpus.stopwords.words('english'))
    return [
        [word for word in doc if word not in stop_words] for doc in tokenized_docs
    ]

def morphological_reduction_nltk(tokenized_docs, use_lemmatization=True):
    if use_lemmatization:
        lemmatizer = nltk.stem.WordNetLemmatizer()
        return [
            [lemmatizer.lemmatize(word) for word in doc]
            for doc in tokenized_docs
        ]
    else:
        stemmer = nltk.stem.PorterStemmer()
        return [
            [stemmer.stem(word) for word in doc] for doc in tokenized_docs
        ]

def filter_tokens_by_occurrence(tokenized_docs, no_below=5, no_above=0.5):
    global dictionary
    dictionary = gensim.corpora.Dictionary(tokenized_docs)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)

    filtered_words = [word for _, word in dictionary.iteritems()]
    return [
        [word for word in doc if word in filtered_words]
        for doc in tokenized_docs
    ]

def vector_representation(tokenized_docs, dictionary, vector_repr, use_bow=True):
    corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

    if use_bow:
        vector_repr = corpus
    else:
        tfidf = gensim.models.TfidfModel(corpus)
        vector_repr = [tfidf[doc] for doc in corpus]

    return vector_repr

def pos_tagger_nltk(tokenized_docs):
    return [nltk.pos_tag(doc) for doc in tokenized_docs]


In [12]:
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]
correspondencia_palabras = {id_: palabra for palabra, id_ in dictionary.token2id.items()}
new_corpus = []
for item in corpus:
    doc_item = []
    for id_, frecuencia in item:
        palabra = correspondencia_palabras[id_]
        doc_item.append({'Palabra': palabra, 'Identificador': id_, 'Frecuencia': frecuencia})
    new_corpus.append(doc_item)
with open('../../data/new_corpus.json','w') as file:
    json.dump(new_corpus,file)


[{'Palabra': 'city', 'Identificador': 0, 'Frecuencia': 2}, {'Palabra': 'collide', 'Identificador': 1, 'Frecuencia': 1}, {'Palabra': 'greater', 'Identificador': 2, 'Frecuencia': 1}, {'Palabra': 'love', 'Identificador': 3, 'Frecuencia': 1}, {'Palabra': 'must', 'Identificador': 4, 'Frecuencia': 1}, {'Palabra': 'new', 'Identificador': 5, 'Frecuencia': 2}, {'Palabra': 'parker', 'Identificador': 6, 'Frecuencia': 1}, {'Palabra': 'peter', 'Identificador': 7, 'Frecuencia': 1}, {'Palabra': 'rise', 'Identificador': 8, 'Frecuencia': 1}, {'Palabra': 'save', 'Identificador': 9, 'Frecuencia': 1}, {'Palabra': 'threatens', 'Identificador': 10, 'Frecuencia': 1}, {'Palabra': 'villain', 'Identificador': 11, 'Frecuencia': 1}, {'Palabra': 'world', 'Identificador': 12, 'Frecuencia': 1}, {'Palabra': 'york', 'Identificador': 13, 'Frecuencia': 1}]


### Creating the tokens, the vocabulary, the representative vector and the tags

In [36]:
texts=['"' + a['name'] + '" : ' + a['plot'] for a in Videogames]
dictionary_creator = morphological_reduction_nltk(remove_stopwords(remove_noise_nltk(tokenization_nltk(texts))))
tokenized_docs= filter_tokens_by_occurrence(dictionary_creator)
vocabulary = list(dictionary.token2id.keys())
vector_repr = vector_representation(tokenized_docs, dictionary, vector_repr,False)
pos_tags = pos_tagger_nltk(tokenized_docs)

<class 'list'>


In [40]:
with open('../../data/pos_tags.json','w') as file:
    json.dump(pos_tags,file)

with open('../../data/vector_repr.json','w') as file:
    json.dump(vector_repr,file)

with open('../../data/corpus.json','w') as file:
    json.dump([dictionary.doc2bow(doc) for doc in tokenized_docs],file)

with open('../../data/vocabulary.json','w') as file:
    json.dump(vocabulary,file)

with open('../../data/tokenized_docs.json','w') as file:
    json.dump(tokenized_docs,file)

with open('../../data/Recomendations.json','w') as file:
    json.dump([],file)