# DATA PREPARATION

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from label_cleaning import label_cleaning
from tqdm import tqdm
import pickle
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
# Install a catch-all "ignore" filter: no warning of any category is shown
# for the rest of the session.
warnings.simplefilter('ignore')

In [4]:
%%time 
# Load the pickled DataFrame that the rest of the notebook works on.
df = pd.read_pickle(filepath_or_buffer='../data/doc_vectors.pkl')

CPU times: user 19.5 s, sys: 29.5 s, total: 49 s
Wall time: 1min


In [None]:
%%time
import pyLDAvis
import pyLDAvis.gensim
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel

### Drop duplicate and null texts

In [5]:
# Remove rows whose CONTENU repeats that of another row (in place).
df.drop_duplicates(subset=['CONTENU'], inplace=True)
# Discard rows whose CONTENU is the literal string 'null'.
df = df.loc[df['CONTENU'].ne('null')]

### Reducing the number of labels

In [None]:
# Number of distinct labels before the clean-up below.
df['main_labels'].nunique()

In [None]:
# Replace every label that has an entry in `label_cleaning` by its mapped
# value; labels without an entry (including missing ones) are left unchanged.
#
# The whole column is assigned at once rather than writing through
# `df.main_labels.iloc[i] = ...`: that chained assignment may write to a
# temporary object instead of `df`, and it costs one Python-level round trip
# per row.
df['main_labels'] = df['main_labels'].map(
    lambda label: label_cleaning.get(label, label)
)

In [None]:
# Number of distinct labels once the clean-up mapping has been applied.
df['main_labels'].nunique()

In [None]:
# Display one sample entry of the SCT column (key 3) for a visual check.
df['SCT'][3]

### Add start token

In [None]:
# Prepend the '<START>' marker to the token sequence of every document.
#
# - np.concatenate is used instead of np.insert because np.insert casts the
#   inserted value to the dtype of the existing array: on a fixed-width
#   string array narrower than 7 characters '<START>' would be truncated.
# - The column is rebuilt and assigned in one go; writing through
#   `df.tokens_rest.iloc[i] = ...` is a chained assignment that may modify a
#   temporary object instead of `df`.
# - dtype=object keeps one array per cell, even if all documents happen to
#   have the same length.
df['tokens_rest'] = pd.Series(
    [
        np.concatenate((['<START>'], doc_tokens))
        for doc_tokens in tqdm(df['tokens_rest'])
    ],
    index=df.index,
    dtype=object,
)

### Keep only rows with non-null labels

In [None]:
# Restrict the working set to the rows that carry a label.
data = df.loc[df['main_labels'].notna()]

In [None]:
# NOTE(review): displot is called without any data, so nothing is plotted;
# this looks like an unfinished cell - confirm what was meant to be shown.
sns.displot(data=None)

In [None]:
# Topic model (LDA) on TF-IDF weighted documents, visualised with pyLDAvis.
#
# NOTE(review): every other cell reads the `tokens_rest` column; confirm that
# a `tokens_rest32` column really exists in the loaded frame.
corpus = data.tokens_rest32

# The gensim dictionary is built first and is the single source of token ids.
id2word = Dictionary(corpus)
bow = [id2word.doc2bow(line) for line in corpus]

# Documents are already tokenised, hence the identity analyzer.
# The vectorizer is given the gensim token -> id mapping as its vocabulary so
# that column j of the TF-IDF matrix is the token with gensim id j. Without
# this, scikit-learn builds its own vocabulary with different ids, and the LDA
# model (trained on the TF-IDF columns) would be labelled and visualised with
# the wrong words.
vectorizer = TfidfVectorizer(
    lowercase=False,
    analyzer=lambda x: x,
    vocabulary=id2word.token2id,
)

tf_idf = vectorizer.fit_transform(corpus)
# Rows of the sparse matrix are documents, hence documents_columns=False.
tf_idf_sklearn = Sparse2Corpus(tf_idf, documents_columns=False)

lda2 = LdaModel(corpus=tf_idf_sklearn, id2word=id2word, num_topics=253, passes=10)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(topic_model=lda2, corpus=bow, dictionary=id2word)
vis

### Encoding labels

In [None]:
# Encode the label names as integers. The fitted encoder is kept so that the
# integer codes can later be mapped back to label names
# (label_encoder.inverse_transform / label_encoder.classes_).
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(data.main_labels)

### Encoding texts

In [None]:
def build_word_vector(tokens):
    """Encode tokenised documents as lists of integer word ids.

    Ids are handed out in order of first appearance (0, 1, 2, ...), so the
    same input always produces the same vocabulary. The documents are
    traversed once, and no intermediate list of all words is built.

    Parameters
    ----------
    tokens : iterable of iterables of hashable
        One inner iterable of tokens per document.

    Returns
    -------
    token_vectors : list of list of int
        For each document, the ids of its tokens in their original order.
    word_index : dict
        Mapping token -> id.
    word_index_reverse : dict
        Mapping id -> token (inverse of ``word_index``).
    """
    word_index = {}
    token_vectors = []
    for doc in tqdm(tokens):
        # setdefault returns the existing id, or registers the next free one
        # (the current vocabulary size) the first time a token is seen.
        token_vectors.append(
            [word_index.setdefault(token, len(word_index)) for token in doc]
        )

    word_index_reverse = {index: word for word, index in word_index.items()}

    return token_vectors, word_index, word_index_reverse

In [None]:
# Turn the token sequences of the labelled documents into integer ids and
# keep both directions of the vocabulary mapping.
(token_vectors, word_index, word_index_reverse) = build_word_vector(
    tokens=data['tokens_rest']
)

In [None]:
# Store the encoded documents next to the rows they were computed from.
data['token_vectors'] = pd.Series(token_vectors, index=data.index)

In [None]:
# Write the id -> word mapping as an importable Python module.
# The context manager closes the file even if the write fails, and the
# explicit UTF-8 encoding matches what Python expects when reading a .py
# source file, whatever the platform default is.
with open("word_index_reverse.py", "w", encoding="utf-8") as f:
    f.write('word_index_reverse = ' + str(word_index_reverse))

In [None]:
# Write the word -> id mapping as an importable Python module.
# The context manager closes the file even if the write fails, and the
# explicit UTF-8 encoding matches what Python expects when reading a .py
# source file, whatever the platform default is.
with open("word_index.py", "w", encoding="utf-8") as f:
    f.write('word_index = ' + str(word_index))

### Train-test split

In [None]:
# Features: document identifier plus encoded tokens. Target: encoded labels.
X = data.loc[:, ['ID', 'token_vectors']]
y = labels

In [None]:
# Hold out 10% of the documents for testing, stratified on the labels, with
# a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=labels,
)

In [None]:
# Report the shapes of the four arrays produced by the split.
split_summary = f'Training entries: {X_train.shape}, labels: {y_train.shape}, \nTest entries: {X_test.shape}, labels: {y_test.shape}'
print(split_summary)

### Saving

In [None]:
# Directory prefix (with trailing slash) used for every file saved below.
filepath = "../train_test_sets/"

In [None]:
# Persist the two feature frames as pickles under `filepath`.
X_train.to_pickle(path=''.join((filepath, 'X_train.pkl')))
X_test.to_pickle(path=''.join((filepath, 'X_test.pkl')))

In [None]:
# Persist the two label arrays as pickles under `filepath`.
# y_test.pkl must contain y_test (it previously received a second copy of
# y_train), and the files are opened in `with` blocks so that they are
# flushed and closed deterministically.
with open(filepath + 'y_train.pkl', "wb") as f:
    pickle.dump(y_train, f)
with open(filepath + 'y_test.pkl', "wb") as f:
    pickle.dump(y_test, f)

#### Similarity analysis

In [None]:
# Density estimate of "number of decisions per label".
# sns.kdeplot replaces the deprecated sns.distplot(..., hist=False), which
# drew only the KDE curve as well.
plt.figure()
sns.kdeplot(data.main_labels.value_counts())
plt.yticks([])
plt.title('Distribution of label frequencies', loc = 'left', fontweight = 'bold')
plt.xlabel(None)
plt.show()

In [None]:
# For each quartile of label frequency, plot the distribution of the mean
# pairwise cosine similarity between the document vectors of a label.
data = df[df.main_labels.notnull()]
labels_value_counts = data.main_labels.value_counts()
quantiles = [0, 0.25, 0.5, 0.75, 1]

plt.figure(figsize = (12, 10))
for i in range(len(quantiles) - 1):

    # The counts are computed once above and reused for both bounds.
    quantile_down = labels_value_counts.quantile(quantiles[i])
    quantile_up = labels_value_counts.quantile(quantiles[i + 1])

    # Bins are (down, up]; the first one is [down, up] so that the least
    # frequent labels are not excluded from every panel. Both conditions are
    # combined into one mask instead of chaining two boolean selections.
    if i == 0:
        above_lower = labels_value_counts >= quantile_down
    else:
        above_lower = labels_value_counts > quantile_down
    in_bin = above_lower & (labels_value_counts <= quantile_up)
    D = data[data.main_labels.isin(labels_value_counts[in_bin].index)]

    plt.subplot(2, 2, i + 1)
    plt.title(f'Number of decisions per labels between {int(quantile_down)} and {int(quantile_up)}', fontweight = 'bold')

    M_mean = list()
    for label in D.main_labels.unique():
        label_vectors = D.doc_vectors[D.main_labels == label]
        # A label with a single decision has no pair to compare.
        if len(label_vectors) < 2:
            continue
        # Stack the per-document vectors into one (documents x features) matrix.
        M = cosine_similarity(np.vstack(label_vectors.to_list()))
        # k=-1 selects the strictly lower triangle: every distinct pair once,
        # without the diagonal of self-similarities.
        M_values = M[np.tril_indices_from(M, -1)]
        M_mean.append(np.mean(M_values))

    # Nothing to draw when no label of the bin has at least two decisions.
    if M_mean:
        sns.kdeplot(M_mean)
        plt.xlabel(f'Mean: {np.mean(M_mean):.2f}')
    plt.yticks([])

plt.suptitle('Distribution of cosine similarity', fontsize=16, fontweight = 'bold')
plt.show()