# DATA PREPARATION

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from label_cleaning import label_cleaning
from tqdm import tqdm
import pickle

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
warnings.filterwarnings('ignore')

In [4]:
%%time 
df = pd.read_pickle('../data/doc_vectors.pkl')

CPU times: user 17.2 s, sys: 15.4 s, total: 32.6 s
Wall time: 46.2 s


In [8]:
%%time
import pyLDAvis
import pyLDAvis.gensim
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel

CPU times: user 47 µs, sys: 44 µs, total: 91 µs
Wall time: 124 µs


In [9]:
corpus = df.tokens_rest
vectorizer = TfidfVectorizer(lowercase=False, analyzer=lambda x: x)

tf_idf = vectorizer.fit_transform(corpus)
tf_idf_sklearn = Sparse2Corpus(tf_idf, documents_columns=False)

In [11]:
id2word = Dictionary(corpus)
bow = [id2word.doc2bow(line) for line in corpus] 

In [12]:
lda2 = LdaModel(corpus=tf_idf_sklearn, id2word=id2word, num_topics=5, passes=10)

In [13]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(topic_model=lda2, corpus=bow, dictionary=id2word)
vis

### Drop duplicated and null texts

In [5]:
df.drop_duplicates(subset = 'CONTENU', inplace = True)
df = df[df.CONTENU != 'null']

### Reduction of label number

In [6]:
df.main_labels.nunique()

311

In [7]:
for i in range(len(df)):
    if df.main_labels.iloc[i] in label_cleaning.keys():
        df.main_labels.iloc[i] = label_cleaning[df.main_labels.iloc[i]]

In [8]:
df.main_labels.nunique()

253

### Add start token

In [9]:
for i in tqdm(range(len(df))):
    df.tokens_rest.iloc[i] = np.insert(df.tokens_rest.iloc[i], 0, ['<START>'])

100%|██████████| 66277/66277 [02:28<00:00, 446.57it/s]


### Conserve non-null labels data

In [10]:
data = df[df.main_labels.notnull()]

### Encoding labels

In [11]:
labels = LabelEncoder().fit_transform(data.main_labels)

In [12]:
labels

array([ 75, 223, 191, ...,  34, 213, 187])

### Encoding texts

In [13]:
def build_word_vector(tokens):
    corpus = list()
    for doc in tqdm(tokens):
        for word in list(set(doc)):
            corpus.append(word)
            
    corpus = list(set(corpus))
    word_index = {value:index for index, value in enumerate(corpus)}
    word_index_reverse = {index:value for index, value in enumerate(corpus)}
    
    token_vectors = [[word_index[token] for token in text] for text in tqdm(tokens)]
    
    return token_vectors, word_index, word_index_reverse

In [14]:
token_vectors, word_index, word_index_reverse = build_word_vector(data.tokens_rest)

100%|██████████| 16066/16066 [00:07<00:00, 2083.71it/s]
100%|██████████| 16066/16066 [00:08<00:00, 1946.89it/s]


In [15]:
data['token_vectors'] = token_vectors

In [16]:
f = open("word_index_reverse.py","w")
f.write('word_index_reverse = ' + str(word_index_reverse))
f.close()

In [17]:
f = open("word_index.py","w")
f.write('word_index = ' + str(word_index))
f.close()

### Train-test split

In [18]:
X = data[['ID', 'token_vectors']]
y = labels

In [19]:
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size = 0.1, random_state = 42, stratify = labels)

In [20]:
print(f'Training entries: {X_train.shape}, labels: {y_train.shape}, \nTest entries: {X_test.shape}, labels: {y_test.shape}')

Training entries: (14459, 2), labels: (14459,), 
Test entries: (1607, 2), labels: (1607,)


### Saving

In [21]:
filepath = '../train_test_sets/'

In [22]:
X_train.to_pickle(filepath + 'X_train.pkl')
X_test.to_pickle(filepath + 'X_test.pkl')

In [23]:
pickle.dump(y_train, open(filepath + 'y_train.pkl', "wb" ))
pickle.dump(y_train, open(filepath + 'y_test.pkl', "wb" ))

#### Similarity analysis

In [None]:
plt.figure()
sns.distplot(train.main_labels.value_counts(), hist = False)
plt.yticks([])
plt.title('Distribution of label frequencies', loc = 'left', fontweight = 'bold')
plt.xlabel(None)
plt.show()

In [None]:
data = df[df.main_labels.notnull()]
labels_value_counts = data.main_labels.value_counts()
quantiles = [0, 0.25, 0.5, 0.75, 1]

plt.figure(figsize = (12, 10))
for i in range(len(quantiles) - 1):
    
    quantile_down = data.main_labels.value_counts().quantile(quantiles[i])
    quantile_up = data.main_labels.value_counts().quantile(quantiles[i + 1])
    D = data[data.main_labels.isin(labels_value_counts[labels_value_counts > quantile_down][labels_value_counts <= quantile_up].index)]
    
    plt.subplot(2, 2, i + 1)
    plt.title(f'Number of decisions per labels between {int(quantile_down)} and {int(quantile_up)}', fontweight = 'bold')

    
    M_mean = list()
    M_median = list()
    for label in D.main_labels.unique():
        M = cosine_similarity(D.doc_vectors[D.main_labels == label].apply(pd.Series))
        M_values = M[np.tril_indices_from(M, 1)]
        M_mean.append(np.mean(M_values))
        
    sns.distplot(M_mean, hist = False)
        
    plt.xlabel(f'Mean: {np.mean(M_mean), 2}')
    plt.yticks([])

plt.suptitle('Distribution of cosine similarity', fontsize=16, fontweight = 'bold')
plt.show()