# <center>Brief Générateur de paroles de chansons</center>

## Importation des librairies

In [252]:
import numpy as np 
import pandas as pd

In [253]:
# with open('personal_access_token.txt') as f:
#     ACCESS_TOKEN = f.readline()
#     f.close()

## Importation des datasets

In [254]:
# !git clone -b jordan https://{ACCESS_TOKEN}@github.com/jnamor/nlp_generator>.git

In [255]:
df = pd.read_csv('new-dataset.csv', sep=",")
df.head()

Unnamed: 0,Artist,Genres,Songs,Popularity,Link,SName,SLink,Lyric,language
0,4 Non Blondes,Rock,15.0,10.1,/4-non-blondes/,What's Up,/4-non-blondes/whats-up.html,Twenty-five years and my life is still\nTrying...,en
1,4 Non Blondes,Rock,15.0,10.1,/4-non-blondes/,Spaceman,/4-non-blondes/spaceman.html,Starry night bring me down\nTill I realize the...,en
2,4 Non Blondes,Rock,15.0,10.1,/4-non-blondes/,Pleasantly Blue,/4-non-blondes/pleasantly-blue.html,Every time you wake in the mornin'\nAnd you st...,en
3,4 Non Blondes,Rock,15.0,10.1,/4-non-blondes/,I'm The One,/4-non-blondes/im-the-one.html,Ah-hah!\nWoo!\nAh-ha-ha-ha-ha-ha!\nWe came her...,en
4,4 Non Blondes,Rock,15.0,10.1,/4-non-blondes/,Dear Mr. President,/4-non-blondes/dear-mr-president.html,I'm looking outside of my windows\nThe view th...,en


In [256]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4672 entries, 0 to 4671
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Artist      4672 non-null   object 
 1   Genres      4672 non-null   object 
 2   Songs       4672 non-null   float64
 3   Popularity  4672 non-null   float64
 4   Link        4672 non-null   object 
 5   SName       4672 non-null   object 
 6   SLink       4672 non-null   object 
 7   Lyric       4672 non-null   object 
 8   language    4672 non-null   object 
dtypes: float64(2), object(7)
memory usage: 328.6+ KB


In [257]:
df.shape

(4672, 9)

In [258]:
lyrics = [worm for word in df['Lyric'] for worm in word.split('\n') if len(worm) > 0 and len(worm.split(' ')) <= 12]

In [259]:
from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)

In [260]:
context_and_target = list(grouper(lyrics, 2, 'x'))

In [261]:
context, target = zip(*context_and_target)

## Step 2. Preprocessing for text data
### 2-1. Words

In [262]:
from collections import Counter

def countWord(list_of_words):            
    count = Counter()
    for sentence in list_of_words:
        for word in sentence.split():
            count[word] += 1
    
    return count

In [263]:
counter = countWord(lyrics)
counter.most_common(5)

[('the', 33856), ('I', 27565), ('you', 23136), ('to', 18487), ('a', 17301)]

In [264]:
MAX_LEN = max([len(x.split(' ')) for x in lyrics])
# VOCAB_SIZE = len(counter)

### 2-2. Clean Text

In [265]:
import re

def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''

    text = text.lower()
    
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    
    return text

In [266]:
encoder_inputs = [clean_text(worm) for worm in context]
decoder_inputs = ["<BOS> " + clean_text(worm) + " <EOS>" for worm in target]

### 2-1. Make Vocabulary

In [267]:
complete_text = encoder_inputs + decoder_inputs

In [268]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(complete_text)
word_index = tokenizer.word_index

In [269]:
index_word = {v:k for k, v in word_index.items()}

In [270]:
VOCAB_SIZE = len(index_word) + 1

### 2-3. ONE-HOT VECTORIZER

In [305]:
encoder_sequences = tokenizer.texts_to_sequences(encoder_inputs[:len(encoder_inputs) // 3]) 
decoder_sequences = tokenizer.texts_to_sequences(decoder_inputs[:len(decoder_inputs) // 3])

In [306]:
num_samples = len(encoder_sequences) 

In [307]:
decoder_output_data = np.zeros((num_samples, MAX_LEN, VOCAB_SIZE), dtype="float32")

### 2-4. PADDING

In [308]:
from keras.preprocessing.sequence import pad_sequences

encoder_input_data = pad_sequences(encoder_sequences, maxlen=MAX_LEN, dtype='int32', padding='post', truncating='post')
decoder_input_data = pad_sequences(decoder_sequences, maxlen=MAX_LEN, dtype='int32', padding='post', truncating='post')

### 2-5. Word2Vec: pretrained glove vector

In [309]:
embeddings_index = {}
with open('../data/glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()

In [310]:
embedding_dimention = 50

def embedding_matrix_creater(embedding_dimention, word_index):
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dimention))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [311]:
embedding_matrix = embedding_matrix_creater(50, word_index=index_word)

In [312]:
from keras.layers import Embedding

embed_layer = Embedding(input_dim=VOCAB_SIZE, output_dim=50, trainable=True,)
embed_layer.build((None,))
embed_layer.set_weights([embedding_matrix])

ValueError: Layer weight shape (7523, 50) not compatible with provided weight shape (22569, 50)

### Step 3. Build Seq2Seq Model

In [None]:
from keras.layers import Input, Dense, LSTM, TimeDistributed
from keras.models import Model

def seq2seq_model_builder(HIDDEN_DIM=300):
    
    encoder_inputs = Input(shape=(MAX_LEN, ), dtype='int32',)
    encoder_embedding = embed_layer(encoder_inputs)
    encoder_LSTM = LSTM(HIDDEN_DIM, return_state=True)
    encoder_outputs, state_h, state_c = encoder_LSTM(encoder_embedding)
    
    decoder_inputs = Input(shape=(MAX_LEN, ), dtype='int32',)
    decoder_embedding = embed_layer(decoder_inputs)
    decoder_LSTM = LSTM(HIDDEN_DIM, return_state=True, return_sequences=True)
    decoder_outputs, _, _ = decoder_LSTM(decoder_embedding, initial_state=[state_h, state_c])
    
    outputs = TimeDistributed(Dense(VOCAB_SIZE, activation='softmax'))(decoder_outputs)
    model = Model([encoder_inputs, decoder_inputs], outputs)
    model.compile(optimizer='adam', loss ='categorical_crossentropy', metrics = ['accuracy'])
    
    return model

In [None]:
model = seq2seq_model_builder(HIDDEN_DIM=300)
model.summary()

## Step 4. Training Model

In [None]:
BATCH_SIZE = 32
EPOCHS = 5

In [None]:
history = model.fit([encoder_input_data, decoder_input_data],
                     decoder_output_data,
                     epochs=EPOCHS, 
                     batch_size=BATCH_SIZE)

In [None]:
with open('seq2seq.json',"w").write(model.to_json())

In [None]:
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)

In [None]:
model.save_weights("chatbot_model.h5")