# Vectorization Methods Comparison


Tested Methods:
* Binary BoW
* BoW
* TF-IDF
* Word2Vec pretrained
* Word2Vec trained from scratch
* Doc2Vec



## Imports

Models libraries:
*   gensim - 3.6.0
*   nltk - 3.2.5
*   sklearn



In [1]:
# Basic imports
import pandas as pd
import numpy as np
import time

# Preprocessing
import nltk
from nltk.corpus import stopwords

# Embeddings
from sklearn.model_selection import ParameterGrid
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

# Cluster
from sklearn.cluster import KMeans

# Cluster evaluation
from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score

# Visualization
import matplotlib.pyplot as plt

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Dataset

Corpus of judicial moves on TJRN (Tribunal de Justiça do Rio Grande do Norte)


**Metadata**
* Language - PT-BR
* Size - 30,000 documents
* 10 classes


In [9]:
!unzip tjrn_10_classes_pre.zip

Archive:  tjrn_10_classes_pre.zip
  inflating: 10_classes_preprocessado.csv  


In [10]:
# Load the unzipped corpus CSV into a DataFrame
data = pd.read_csv("10_classes_preprocessado.csv")

In [11]:
data.head()

Unnamed: 0,data,label
0,poder judiciário estado do rio grande do n...,196
1,poder judiciário do estado do rio grande do no...,196
2,poder judiciário do estado do rio grande do no...,196
3,poder judiciário do estado do rio grande do no...,196
4,processo n autor arnilton josino de ol...,196


In [12]:
# Separate the documents from their class labels; documents are forced to str
labels = data['label']
texts = data['data'].astype(str)

## Preprocessing


*   Lowercase
*   Remove punctuation
*   Remove special characters and numbers
*   Stopwords removal (NLTK)


In [13]:
# CountVectorizer's analyzer configured with NLTK's Portuguese stopword list
tokenizer = CountVectorizer(stop_words=stopwords.words('portuguese')).build_analyzer()


def remove_stopwords(text):
  """Run `text` through the analyzer and return the resulting tokens joined by single spaces."""
  kept_words = tokenizer(text)
  return " ".join(kept_words)


In [14]:
%%time
# Apply remove_stopwords to every document; `texts` is overwritten with the cleaned version
texts = texts.apply(remove_stopwords)

CPU times: user 7.38 s, sys: 71.9 ms, total: 7.45 s
Wall time: 7.48 s


In [15]:
texts.head()

0    poder judiciário estado rio grande norte comar...
1    poder judiciário estado rio grande norte juiza...
2    poder judiciário estado rio grande norte rua d...
3    poder judiciário estado rio grande norte ac un...
4    processo autor arnilton josino oliveira réu un...
Name: data, dtype: object

## Building Clustering models

**Test multiple numbers of clusters**

In [16]:
# Cluster counts are the powers of two from 2 to 256; one KMeans estimator per count
no_clusters = [2 ** exponent for exponent in range(1, 9)]
kmean_clts = [
  KMeans(n_clusters=cluster_count, n_init=5, random_state=214)
  for cluster_count in no_clusters
]

**Test no. clusters matching no. classes**

In [17]:
# n_init is pinned explicitly: the library default differs between scikit-learn
# releases, which would otherwise change the clustering obtained from this cell.
kmean_10_clt = KMeans(n_clusters=10, random_state=214, n_init=10)
# From here on only the 10-cluster model is evaluated (replaces any earlier kmean_clts)
kmean_clts = [kmean_10_clt]

## Training embedding models

In [18]:
def evaluate_sklearn_model(model, texts, cmodels, true_labels):
  """Vectorize `texts` with `model`, then score every clustering model on the result.

  Args:
    model: vectorizer exposing `fit_transform(texts)` and returning a sparse
      matrix (e.g. CountVectorizer or TfidfVectorizer).
    texts: documents handed to `model.fit_transform`.
    cmodels: sequence of clustering estimators exposing `fit_predict(X)`.
    true_labels: ground-truth class of each document.

  Returns:
    dict with two lists holding one score per clustering model, in `cmodels`
    order. "NMI" holds `adjusted_mutual_info_score` and "RI" holds
    `adjusted_rand_score`; the key names are kept as-is for the saved results.
  """
  results = { "NMI":[], "RI":[] }

  # Fitting
  print("Fitting model...")
  t1 = time.time()
  # toarray() gives a plain ndarray. todense() would give np.matrix, which
  # recent scikit-learn releases refuse as estimator input.
  X = model.fit_transform(texts).toarray()
  train_time = time.time() - t1
  print(f"Fit time - {train_time}")

  for i, cmodel in enumerate(cmodels):
    # Cluster Train/Prediction
    print(f"Fitting Cluster model {i+1}/{len(cmodels)}")
    pred_labels = cmodel.fit_predict(X)

    results["NMI"].append( adjusted_mutual_info_score(true_labels, pred_labels) )
    results["RI"].append( adjusted_rand_score(true_labels, pred_labels) )

  return results

In [19]:
def evaluate_X_transformation(X, cmodels, true_labels):
  """Cluster an already-computed feature matrix with each model and score the result.

  Args:
    X: feature matrix passed unchanged to every `fit_predict` call.
    cmodels: sequence of clustering estimators exposing `fit_predict(X)`.
    true_labels: ground-truth class of each row of `X`.

  Returns:
    dict with keys "NMI" (values from `adjusted_mutual_info_score`) and "RI"
    (values from `adjusted_rand_score`), one entry per clustering model.
  """
  nmi_scores = []
  ri_scores = []

  for position, clusterer in enumerate(cmodels, start=1):
    print(f"Fitting Cluster model {position}/{len(cmodels)}")
    assigned = clusterer.fit_predict(X)

    nmi_scores.append(adjusted_mutual_info_score(true_labels, assigned))
    ri_scores.append(adjusted_rand_score(true_labels, assigned))

  return {"NMI": nmi_scores, "RI": ri_scores}

### Binary BoW

*   max_features - 50, 100, 300, 600, 1000
* n_grams - unigrams (1,1), unigrams + bigrams (1,2)



In [27]:
# Hyper-parameter grid iterated by the vectorizer cells below
bow_params = ParameterGrid(dict(
  max_features=[50, 100, 300, 600, 1000],
  ngram_range=[(1, 1), (1, 2)],
))

In [28]:
# Describe each grid point as {'dim', 'mod'}, in the order ParameterGrid yields them
bbow_parameters = [
  {
    'dim': point['max_features'],
    'mod': 'bigram' if point['ngram_range'] != (1, 1) else 'unigram',
  }
  for point in bow_params
]
# Three copies of the grid description (presumably one each for the binary BoW,
# BoW and TF-IDF runs, which all iterate bow_params)
dimentions = bbow_parameters * 3

In [None]:
# One binary CountVectorizer per grid point, each scored against every clustering model
bbow_models = [CountVectorizer(binary=True, **grid_point) for grid_point in bow_params]
bbow_results = [
  evaluate_sklearn_model(vectorizer, texts, kmean_clts, labels)
  for vectorizer in bbow_models
]

In [22]:
# Tabulate the binary-BoW scores and tag every row with the method name
bbow_results_df = pd.DataFrame(bbow_results).assign(name='bbow')

### BoW

*   max_features - 50, 100, 300, 600, 1000
* n_grams - (1,1), (1,2)



In [23]:
# One count-based vectorizer per grid point, each scored against every clustering model
bow_models = [CountVectorizer(**grid_point) for grid_point in bow_params]
bow_results = [
  evaluate_sklearn_model(vectorizer, texts, kmean_clts, labels)
  for vectorizer in bow_models
]

Fitting model...
Fit time - 8.586455345153809
Fitting Cluster model 1/1
Fitting model...
Fit time - 26.80719518661499
Fitting Cluster model 1/1
Fitting model...
Fit time - 8.662347555160522
Fitting Cluster model 1/1
Fitting model...
Fit time - 26.420992136001587
Fitting Cluster model 1/1
Fitting model...
Fit time - 8.666850090026855
Fitting Cluster model 1/1
Fitting model...
Fit time - 26.970436096191406
Fitting Cluster model 1/1
Fitting model...
Fit time - 8.56110167503357
Fitting Cluster model 1/1
Fitting model...
Fit time - 26.32029700279236
Fitting Cluster model 1/1
Fitting model...
Fit time - 8.657768249511719
Fitting Cluster model 1/1
Fitting model...
Fit time - 26.135919094085693
Fitting Cluster model 1/1


In [24]:
# Tabulate the BoW scores and tag every row with the method name
bow_results_df = pd.DataFrame(bow_results).assign(name='bow')
bow_results_df.head()

Unnamed: 0,NMI,RI,name
0,[0.25775476416854404],[0.10741762693199527],bow
1,[0.26928727121742013],[0.10939811302776066],bow
2,[0.32517482271543274],[0.1302201323377686],bow
3,[0.27370501441169404],[0.10409786328373496],bow
4,[0.3277500948815257],[0.1383141419348717],bow


### TF-IDF

*   max_features - 50, 100, 300, 600, 1000
* n_grams - (1,1), (1,2)



In [25]:
# One TF-IDF vectorizer per grid point, each scored against every clustering model
tfidf_models = [TfidfVectorizer(**grid_point) for grid_point in bow_params]
tfidf_results = [
  evaluate_sklearn_model(vectorizer, texts, kmean_clts, labels)
  for vectorizer in tfidf_models
]

Fitting model...
Fit time - 8.52454423904419
Fitting Cluster model 1/1
Fitting model...
Fit time - 25.979973554611206
Fitting Cluster model 1/1
Fitting model...
Fit time - 8.47610354423523
Fitting Cluster model 1/1
Fitting model...
Fit time - 25.699411392211914
Fitting Cluster model 1/1
Fitting model...
Fit time - 8.52983021736145
Fitting Cluster model 1/1
Fitting model...
Fit time - 25.956193685531616
Fitting Cluster model 1/1
Fitting model...
Fit time - 8.556832313537598
Fitting Cluster model 1/1
Fitting model...
Fit time - 26.24940276145935
Fitting Cluster model 1/1
Fitting model...
Fit time - 8.601106882095337
Fitting Cluster model 1/1
Fitting model...
Fit time - 26.180219173431396
Fitting Cluster model 1/1


In [26]:
# Tabulate the TF-IDF scores and tag every row with the method name
tfidf_results_df = pd.DataFrame(tfidf_results).assign(name='tfidf')
tfidf_results_df.head()

Unnamed: 0,NMI,RI,name
0,[0.4334156225172421],[0.29408031288704173],tfidf
1,[0.38363052616975146],[0.25121024557296817],tfidf
2,[0.5213213455536525],[0.38752648012849883],tfidf
3,[0.46539795525483224],[0.3409916171919743],tfidf
4,[0.579513290330075],[0.4470603783921699],tfidf


### Word2vec

*   dim - 50, 100, 300, 600, 1000
* pretrained - true, false

**Pretrained**

Downloading Word2vec Skip-gram pretrained from NILC repository

In [None]:
!wget http://143.107.183.175:22980/download.php?file=embeddings/word2vec/skip_s50.zip
!wget http://143.107.183.175:22980/download.php?file=embeddings/word2vec/skip_s100.zip
!wget http://143.107.183.175:22980/download.php?file=embeddings/word2vec/skip_s300.zip
!wget http://143.107.183.175:22980/download.php?file=embeddings/word2vec/skip_s600.zip
!wget http://143.107.183.175:22980/download.php?file=embeddings/word2vec/skip_s1000.zip

In [34]:
!unzip download.php?file=embeddings%2Fword2vec%2Fskip_s50.zip
!unzip download.php?file=embeddings%2Fword2vec%2Fskip_s100.zip
!unzip download.php?file=embeddings%2Fword2vec%2Fskip_s300.zip
!unzip download.php?file=embeddings%2Fword2vec%2Fskip_s600.zip
!unzip download.php?file=embeddings%2Fword2vec%2Fskip_s1000.zip

Archive:  download.php?file=embeddings%2Fword2vec%2Fskip_s50.zip
replace skip_s50.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: Archive:  download.php?file=embeddings%2Fword2vec%2Fskip_s100.zip
  inflating: skip_s100.txt           
Archive:  download.php?file=embeddings%2Fword2vec%2Fskip_s300.zip
  inflating: skip_s300.txt           
Archive:  download.php?file=embeddings%2Fword2vec%2Fskip_s600.zip
  inflating: skip_s600.txt           
Archive:  download.php?file=embeddings%2Fword2vec%2Fskip_s1000.zip
  inflating: skip_s1000.txt          


Saving dimensions

In [None]:
# Record one {'dim', 'mod'} entry per pretrained Word2Vec run
w2v_dimentions = [dict(dim=size, mod='pre') for size in (50, 100, 300, 600, 1000)]
dimentions += w2v_dimentions

Splitting texts into tokens

In [35]:
# Each document becomes the list of its space-separated tokens
text_tokens = texts.str.split(' ')

In [36]:
def transform_tokens_w2v(tokens, kv, strategy="mean"):
  """Pool the word vectors of each tokenized document into one document vector.

  Args:
    tokens: iterable of token lists (e.g. a pandas Series), one per document.
    kv: word-vector lookup supporting `word in kv` and `kv[word]`
      (e.g. gensim KeyedVectors).
    strategy: "mean" averages the word vectors of a document; "max" takes
      their element-wise maximum.

  Returns:
    2-D numpy array with one row per document.

  Raises:
    ValueError: if `strategy` is neither "mean" nor "max".
  """
  poolers = {"mean": np.mean, "max": np.max}
  if strategy not in poolers:
    raise ValueError(f"Unknown strategy {strategy!r}; expected 'mean' or 'max'")
  pool = poolers[strategy]

  def embed(doc):
    # Words are looked up in the `kv` argument, so the function does not depend
    # on whichever vectors happen to be bound to a notebook-level variable.
    vectors = [kv[word] for word in doc if word in kv]
    if not vectors:
      # No known word in this document: use a zero vector so all rows keep the
      # same shape (relies on `kv.vector_size`, as provided by gensim KeyedVectors).
      return np.zeros(kv.vector_size, dtype=np.float32)
    return pool(vectors, axis=0)

  return np.stack([embed(doc) for doc in tokens])

In [37]:
# Score the pretrained skip-gram vectors at every available dimensionality
w2vp_results = []

for dim in (50, 100, 300, 600, 1000):
  print(f"Loading vectors dim-{dim}")
  vectors_path = f'skip_s{dim}.txt'
  w2v = KeyedVectors.load_word2vec_format(vectors_path)
  print("Finished vectors loading")

  X = transform_tokens_w2v(text_tokens, w2v)
  scores = evaluate_X_transformation(X, kmean_clts, labels)
  w2vp_results.append(scores)

Loading vectors dim-50
Finished vectors loading
Fitting Cluster model 1/1
Loading vectors dim-100
Finished vectors loading
Fitting Cluster model 1/1
Loading vectors dim-300
Finished vectors loading
Fitting Cluster model 1/1
Loading vectors dim-600
Finished vectors loading
Fitting Cluster model 1/1
Loading vectors dim-1000
Finished vectors loading
Fitting Cluster model 1/1


In [38]:
# Tabulate the pretrained-Word2Vec scores and tag every row with the method name
w2vp_results_df = pd.DataFrame(w2vp_results).assign(name='w2v-pre')
w2vp_results_df.head()

Unnamed: 0,NMI,RI,name
0,[0.2883190394689533],[0.1765721642111057],w2v-pre
1,[0.3027543922479643],[0.19242895048414949],w2v-pre
2,[0.3758918971978455],[0.24524458924242348],w2v-pre
3,[0.38406380576980553],[0.25018707741496504],w2v-pre
4,[0.3974927073874316],[0.2629978806093503],w2v-pre


**Trained on dataset**

In [30]:
# Record one {'dim', 'mod'} entry per Word2Vec model trained on the corpus
w2vt_dimentions = [dict(dim=size, mod='train') for size in (50, 100, 300, 600, 1000)]
dimentions += w2vt_dimentions

In [39]:
# Train skip-gram Word2Vec on the corpus at every dimensionality and score the pooled vectors
w2vt_results = []

for dim in (50, 100, 300, 600, 1000):
  print(f"Training vectors dim-{dim}")
  w2v = Word2Vec(
    sentences=text_tokens,
    size=dim,
    sg=1,
    iter=5,
    workers=4,
  ).wv
  print("Finished vectors training")

  X = transform_tokens_w2v(text_tokens, w2v)
  scores = evaluate_X_transformation(X, kmean_clts, labels)
  w2vt_results.append(scores)

Training vectors dim-50
Finished vectors training
Fitting Cluster model 1/1
Training vectors dim-100
Finished vectors training
Fitting Cluster model 1/1
Training vectors dim-300
Finished vectors training
Fitting Cluster model 1/1
Training vectors dim-600
Finished vectors training
Fitting Cluster model 1/1
Training vectors dim-1000
Finished vectors training
Fitting Cluster model 1/1


In [40]:
# Tabulate the trained-Word2Vec scores and tag every row with the method name
w2vt_results_df = pd.DataFrame(w2vt_results).assign(name='w2v-train')
w2vt_results_df.head()

Unnamed: 0,NMI,RI,name
0,[0.36952356965827043],[0.23796693882956388],w2v-train
1,[0.4044073062848645],[0.2629408788812099],w2v-train
2,[0.3826509775486679],[0.2239712350304055],w2v-train
3,[0.42158241634097154],[0.26959954645285067],w2v-train
4,[0.4173394101817505],[0.27037953076979004],w2v-train


In [41]:
# Save the trained-Word2Vec scores; index=False is not passed, so the row index is written too
w2vt_results_df.to_csv('w2v_train.csv')

### Doc2vec

* dim - 50, 100, 300, 600, 1000

In [31]:
# Record one {'dim', 'mod'} entry per Doc2Vec run
d2v_dimentions = [dict(dim=size, mod='-') for size in (50, 100, 300, 600, 1000)]
dimentions += d2v_dimentions

In [42]:
# Tag every tokenized document with its position so Doc2Vec can address it later
tag_text_tokens = [
  TaggedDocument(words=doc_tokens, tags=[doc_id])
  for doc_id, doc_tokens in enumerate(text_tokens)
]

In [43]:
# Train Doc2Vec at every dimensionality and score the learned document vectors
d2v_results = []

for dim in (50, 100, 300, 600, 1000):
  print(f"Training doc vectors dim-{dim}")
  d2v = Doc2Vec(documents=tag_text_tokens, vector_size=dim, epochs=5, workers=4)
  print("Finished doc vectors training")

  # Collect the trained vector of each document, in tag order
  doc_ids = range(len(tag_text_tokens))
  X = np.array([d2v.docvecs[doc_id] for doc_id in doc_ids])
  scores = evaluate_X_transformation(X, kmean_clts, labels)
  d2v_results.append(scores)

Training doc vectors dim-50
Finished doc vectors training
Fitting Cluster model 1/1
Training doc vectors dim-100
Finished doc vectors training
Fitting Cluster model 1/1
Training doc vectors dim-300
Finished doc vectors training
Fitting Cluster model 1/1
Training doc vectors dim-600
Finished doc vectors training
Fitting Cluster model 1/1
Training doc vectors dim-1000
Finished doc vectors training
Fitting Cluster model 1/1


In [44]:
# Tabulate the Doc2Vec scores and tag every row with the method name
d2v_results_df = pd.DataFrame(d2v_results).assign(name='d2v')
d2v_results_df.head()

Unnamed: 0,NMI,RI,name
0,[0.20374729350871496],[0.09299767058451501],d2v
1,[0.19522050204517646],[0.09035600966352729],d2v
2,[0.18971318070448337],[0.0849664399070562],d2v
3,[0.185029428883628],[0.08394909917200757],d2v
4,[0.18829321553236383],[0.08499762868504115],d2v


## Results consolidation

**Saving dimensions dataset**

In [35]:
# Turn the accumulated {'dim', 'mod'} records into a table and save it without the index column
dimentions_df = pd.DataFrame( dimentions )
dimentions_df.to_csv('dimentions_df.csv', index=False)

**Saving final results**

In [46]:
# Stack the per-method result tables into one frame, in the order listed here
result_frames = [
  bbow_results_df,
  bow_results_df,
  tfidf_results_df,
  w2vp_results_df,
  w2vt_results_df,
  d2v_results_df,
]
final_results = pd.concat(result_frames)

In [48]:
# Write the consolidated results to disk without the index column
final_results.to_csv('10_clusters_tests_result.csv', index=False)