https://fasttext.cc/docs/en/aligned-vectors.html  
Aligning the fastText vectors of 78 languages https://github.com/babylonhealth/fastText_multilingual

In [None]:
python_files_dir = './Python Codes'
import os
os.chdir(python_files_dir)

In [None]:
import pandas as pd
import numpy as np

import fasttext
import gensim
import nltk

import Import_and_clean_data as ic
import WCS_Clustering_MainFunction as cmf
import SlimWeightedVecData as swd
import OutputWarranty as ow
import WarrantyCluster as wc
import PreprocessText as pt

In [None]:
#!pip install seaborn

In [None]:
from fasttext import FastVector

In [None]:
import numpy as np


class FastVector:
    """
    Minimal wrapper for fastText word embeddings stored in the textual
    ``.vec`` format (a "<n_words> <n_dim>" header line followed by one
    "<word> <v1> ... <vn>" line per word).
    ```
    Usage:
        $ model = FastVector(vector_file='/path/to/wiki.en.vec')
        $ 'apple' in model
        > True
        $ model['apple'].shape
        > (300,)
    ```

    Attributes set by the constructor:
        word2id: dict mapping each word to its row number in ``embed``
        id2word: list of words in file order
        n_words, n_dim: the two integers read from the header line
        embed:   numpy array of shape (n_words, n_dim)
    """

    def __init__(self, vector_file='', transform=None):
        """
        Read in word vectors in fasttext format.

        :param vector_file: path of the ``.vec`` text file to read
        :param transform: optional transformation (filename or ndarray)
            handed to apply_transform() once the vectors are loaded
        """
        self.word2id = {}

        # Captures word order, only used for export(), so that more frequent words are earlier in the file
        self.id2word = []

        print('reading word vectors from %s' % vector_file)
        # Decode explicitly as UTF-8: the vocabulary holds non-ASCII words
        # (e.g. "køre"), so relying on the platform default encoding can
        # fail or silently corrupt the keys.
        with open(vector_file, 'r', encoding='utf-8') as f:
            # split() without an argument tolerates trailing whitespace in the header
            (self.n_words, self.n_dim) = (int(x) for x in f.readline().split())
            self.embed = np.zeros((self.n_words, self.n_dim))
            for i, line in enumerate(f):
                elems = line.rstrip('\n').split(' ')
                self.word2id[elems[0]] = i
                # The slice drops anything after the n_dim components,
                # such as the empty token left by a trailing space.
                self.embed[i] = elems[1:self.n_dim + 1]
                self.id2word.append(elems[0])

        if transform is not None:
            print('Applying transformation to embedding')
            self.apply_transform(transform)

    def apply_transform(self, transform):
        """
        Apply the given transformation to the vector space

        Right-multiplies given transform with embeddings E:
            E = E * transform

        Transform can either be a string with a filename to a
        text file containing a ndarray (compat. with np.loadtxt)
        or a numpy ndarray.
        """
        transmat = np.loadtxt(transform) if isinstance(transform, str) else transform
        self.embed = np.matmul(self.embed, transmat)

    def export(self, outpath):
        """
        Transforming a large matrix of WordVectors is expensive.
        This method lets you write the transformed matrix back to a file for future use

        :param outpath: The path to the output file to be written
        """
        # The context manager closes the file even if formatting a row fails;
        # UTF-8 matches the encoding used when reading.
        with open(outpath, "w", encoding='utf-8') as fout:
            # Header takes the guesswork out of loading by recording how many lines, vector dims
            fout.write(str(self.n_words) + " " + str(self.n_dim) + "\n")
            for token in self.id2word:
                vector_as_string = " ".join("%.6f" % number for number in self[token])
                fout.write(token + " " + vector_as_string + "\n")

    @classmethod
    def cosine_similarity(cls, vec_a, vec_b):
        """Compute cosine similarity between vec_a and vec_b"""
        return np.dot(vec_a, vec_b) / \
            (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))

    def __contains__(self, key):
        """Return True when `key` is one of the loaded words."""
        return key in self.word2id

    def __getitem__(self, key):
        """Return the embedding row of `key`; raises KeyError for unknown words."""
        return self.embed[self.word2id[key]]

In [None]:
# showing multiple outputs in one cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# compute instance = data-sci-experiment
df_path = '~/cloudfiles/code/Users/cla.Min.Liu/OTU_SAP/Data/df_ny_gl_system_1_azure.csv'
stopword_path = "~/cloudfiles/code/Users/cla.Min.Liu/OTU_SAP/Data/danska-stopwords.csv"
#fasttext_link = '/mnt/batch/tasks/shared/LS_root/mounts/clusters/data-sci-experiment/code/Users/cla.Min.Liu/cc.da.300.bin'
word_vec_da = '/mnt/batch/tasks/shared/LS_root/mounts/clusters/data-sci-experiment/code/Users/cla.Min.Liu/wiki.da.align.vec'
word_vec_sv = '/mnt/batch/tasks/shared/LS_root/mounts/clusters/data-sci-experiment/code/Users/cla.Min.Liu/wiki.sv.align.vec'
#fasttext_link = "wiki.da.align.vec"

In [None]:
# Load the aligned Danish word vectors (word2vec text format) with gensim.
# No `limit` argument is passed, so the whole file is read.
model = gensim.models.KeyedVectors.load_word2vec_format(word_vec_da)

In [None]:
model

In [None]:
#loading embeddings
da_dictionary = FastVector(vector_file = word_vec_da)
sv_dictionary = FastVector(vector_file = word_vec_sv)

In [None]:
word_1 = da_dictionary["køre"]
word_2 = sv_dictionary["kör"]
word_1.shape
word_2.shape
FastVector.cosine_similarity(word_1, word_2)

In [None]:
def cos_sim_align(emb1, emb2, w1, w2):
    """
    Print the cosine similarity between word `w1` looked up in `emb1` and
    word `w2` looked up in `emb2`.

    :param emb1: embedding supporting `emb1[word]` lookup (e.g. a FastVector)
    :param emb2: embedding supporting `emb2[word]` lookup (e.g. a FastVector)
    :param w1: word to look up in `emb1`
    :param w2: word to look up in `emb2`
    """
    # The original read `smb2` and `w_en`, which are not defined anywhere,
    # so every call raised NameError.
    vec_1 = emb1[w1]
    vec_2 = emb2[w2]
    print(FastVector.cosine_similarity(vec_1, vec_2))

In [None]:
df = ic.ImportAndCleanCSV(df_path, datenc = "utf-8", text_multi_var = True, text_var = "Beskrivelse")
df.shape
df.head()

In [None]:
#model.get_word_vector("slitage")   #fasttext
#model['slitage']  # aligned model
model.vectors.shape

Nearest neighbor queries  

A simple way to check the quality of a word vector is to look at its nearest neighbors. This gives an intuition of the type of semantic information the vectors are able to capture.

In [None]:
#model.get_nearest_neighbors('elg')
model.most_similar('körd')  # kørt 

In [None]:
df = ic.ImportAndCleanCSV(df_path, datenc = "utf-8", text_multi_var = 2, text_var = "Beskrivelse")
df.shape
df.head()

In [None]:
from nltk.corpus import stopwords
import itertools
def PreprocessData(dat, textcol, stopwordpath, stopenc):
    """
    Clean the text in `dat[textcol]` and store the result in `dat['Text']`.

    The stop-word list is the NLTK Swedish list, then the NLTK Danish list,
    then the 'stoppord' column of the space-separated file at `stopwordpath`
    (read with encoding `stopenc`). Each text is passed through
    pt.PreprocessText together with that list.

    `dat` is modified in place and also returned.
    """
    # Custom stop words from file
    custom_frame = pd.read_csv(stopwordpath, encoding = stopenc, sep = " ")
    custom_words = custom_frame['stoppord'].values.tolist()

    # Combined list: Swedish, Danish, custom
    swedish_words = stopwords.words('swedish')
    danish_words = stopwords.words('danish')
    all_stop_words = [*swedish_words, *danish_words, *custom_words]

    # Clean every text entry
    cleaned = []
    for raw_text in dat[textcol]:
        cleaned.append(pt.PreprocessText(raw_text, all_stop_words))
    dat['Text'] = cleaned

    return dat

df = PreprocessData(df, textcol = 'Text', stopwordpath = stopword_path, stopenc = "ISO 8859-1")

In [None]:
df.shape
df.head()

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import fasttext
import numpy as np

def CalculateWeightedSentenceVector(text, fasttextmodel):
    """
    Combine TF-IDF weights with word embeddings.

    Fits TF_Idf_WeightMatrix on `text`, builds the embedding matrix of the
    resulting vocabulary with EmbeddingMatrix, and returns the `*` product
    of the weight matrix and the embedding matrix.
    """
    # NOTE(review): presumably the weight matrix is a scipy sparse matrix,
    # for which `*` is a matrix product — confirm.
    tfidf_weights, vocabulary = TF_Idf_WeightMatrix(text)
    word_vectors = EmbeddingMatrix(vocabulary, fasttextmodel)
    return tfidf_weights * word_vectors


def CreateCorpus(textcol):
    """Return the entries of `textcol` as a plain Python list (via `.tolist()`)."""
    return textcol.tolist()

def EmbeddingMatrix(words, fasttextmodel):
    """
    Stack the vectors of `words` into one array, one row per word, in the
    order given.

    :param words: iterable of vocabulary words
    :param fasttextmodel: object supporting `fasttextmodel[word]` lookup
        (e.g. gensim KeyedVectors or a FastVector)
    :return: numpy array whose i-th row is the vector of the i-th word
    :raises KeyError: when the model raises it for an unknown word
    """
    # The original called `fasttextmodel.vectors(w)`; `vectors` is the raw
    # array attribute on the gensim model loaded in this file, not a
    # callable, so that raised TypeError. Indexing is the lookup that works.
    return np.array([fasttextmodel[w] for w in words])


def TF_Idf_WeightMatrix(text):
    """
    Fit a default TfidfVectorizer on `text`.

    :param text: object accepted by CreateCorpus (it must offer `.tolist()`)
    :return: tuple `(response, words)` — the matrix returned by
        `fit_transform` and the list of feature names in column order
    """
    corpus = CreateCorpus(text)
    vectorizer = TfidfVectorizer()
    response = vectorizer.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2. Prefer the
    # replacement and fall back for older installations. tolist() keeps
    # the return type a plain list, as before.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = vectorizer.get_feature_names_out().tolist()
    else:
        words = vectorizer.get_feature_names()

    return response, words


In [None]:
words
#test = CalculateWeightedSentenceVector(df['Text'], fasttextmodel = model)

In [None]:
import pandas as pd
import PreprocessText as pt
import WCS_tfidf as tfidf
import fasttext
from nltk.corpus import stopwords
import itertools

stop_words = pd.read_csv(stopword_path, encoding = "ISO 8859-1", sep = " ")
stopwords_custom = stop_words['stoppord'].values.tolist()
stopwords_swe = stopwords.words('swedish')
stopwords_dan = stopwords.words('danish')

stop_words = itertools.chain(stopwords_swe, stopwords_dan, stopwords_custom)
stop_words = list(stop_words)

# Process text
df['Text'] = [pt.PreprocessText(t, stop_words) for t in df["Text"]]

response, words = TF_Idf_WeightMatrix(text = df['Text'])

In [None]:
embmat = model.vectors
response.shape
#w_sentence_vec = response*embmat

In [None]:
import pandas as pd
import PreprocessText as pt
import WCS_tfidf as tfidf
import fasttext
from nltk.corpus import stopwords
import itertools
import gensim

def WeightedSentenceVector(dat, textcol, stopwordpath, stopenc, fasttextmodel):
    """
    Add cleaned text and a weighted sentence vector to every row of `dat`.

    Steps, in order:
      1. load word vectors from the word2vec-format file `fasttextmodel`;
      2. build the stop-word list (NLTK Swedish, NLTK Danish, then the
         'stoppord' column of the file at `stopwordpath`);
      3. store the pt.PreprocessText output for `dat[textcol]` in `dat['Text']`;
      4. store the rows returned by CalculateWeightedSentenceVector in
         `dat['WeightVec']`.

    `dat` is modified in place and also returned.
    """
    # Word vectors (word2vec text format)
    vectors = gensim.models.KeyedVectors.load_word2vec_format(fasttextmodel)

    # Stop words: custom file plus the two NLTK lists
    custom_frame = pd.read_csv(stopwordpath, encoding = stopenc, sep = " ")
    custom_words = custom_frame['stoppord'].values.tolist()
    swedish_words = stopwords.words('swedish')
    danish_words = stopwords.words('danish')
    all_stop_words = [*swedish_words, *danish_words, *custom_words]

    # Clean the text column
    cleaned = []
    for raw_text in dat[textcol]:
        cleaned.append(pt.PreprocessText(raw_text, all_stop_words))
    dat['Text'] = cleaned

    # One weighted vector per row
    sentence_vectors = CalculateWeightedSentenceVector(text = dat['Text'],
                                                       fasttextmodel = vectors)
    dat['WeightVec'] = list(sentence_vectors)

    return dat


In [None]:
# `fasttext_link` is never assigned in this notebook (both assignments are
# commented out), so passing it raised NameError. WeightedSentenceVector
# loads its model with load_word2vec_format, so pass the aligned Danish
# .vec file instead.
mydat = WeightedSentenceVector(df, textcol = "Text", stopwordpath = stopword_path, 
                               stopenc = "ISO 8859-1",
                               fasttextmodel = word_vec_da)
mydat

## Obtaining data for clustering

The function DataForClustering is applied to a subset of the data (in the following example, the subset where system = 1).   

The function returns the weighted sentence vectors and an index created for each row.

In [None]:
df_nlp = swd.DataForClustering(dat = df, group = False, 
                                      sort_var_1 = 'System', sort_var_2 = 'Meddelelsesdato',
                                      textcol = 'Text',
                                      stopwordpath = stopword_path, stopenc = "ISO 8859-1",
                                      fasttext_link = fasttext_link)

In [None]:
df_nlp.head()
df_nlp['Text'][0]  # the first row

In [None]:
df_nlp_system.to_csv('C:/Users/LIUM3478/OneDrive Corp/OneDrive - Atkins Ltd/Work_Atkins/otu sap wcs 2020 12/processed_df.csv', 
              sep=';', encoding='ISO 8859-1')

## Cosine Similarity

In [None]:
# list(dat_group.Cluster.loc[dat_group['AntalSimInds'] >= numdefects]) this removes small clusters that lie within the bigger ones
sim_cluster_output = cmf.WCSClustering(df_nlp, fasttext_link,
                                       group = False, groupvar = 'System',
                                       unique_cluster = True, cos_similarity = 0.9,
                                       numdefects = 2)
sim_cluster_output.shape

In [None]:
import numpy as np
import pandas
from itertools import compress, chain
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances
from pandas.testing import assert_frame_equal

def OutputWarranty(dat, wordvec, textvec, simil, group, groupvar, timevar):
    """
    Find similar rows and return only the rows that take part in a match.

    FindSimsMatrix is run either per `groupvar` group (when `group` is
    truthy) or on the whole frame. The result is filtered with
    SelectWarrantyRows, restricted to the index labels returned by
    FindIndices, and given an 'AntalSimInds' column holding the length of
    each row's 'SimInd' list.
    """
    def _find_sims(frame):
        # Same keyword arguments for the grouped and ungrouped call
        return FindSimsMatrix(frame, similarity = simil, wordvec = wordvec, textvec = textvec,
                              groupvar = groupvar, timevar = timevar)

    if group:
        matches = dat.groupby(groupvar).apply(func = _find_sims)
    else:
        matches = _find_sims(dat)

    # Keep rows that have matches or are matched by another row
    matches = SelectWarrantyRows(matches)

    # Restrict to index labels that are among the collected indices
    wanted = FindIndices(matches, simind = 'SimInd')
    matches = matches.loc[matches.index.intersection(wanted)]

    # Number of similar rows found for each row
    matches['AntalSimInds'] = list(map(len, matches['SimInd']))

    return matches

def FindSimsMatrix(g, similarity, wordvec, textvec, groupvar, timevar):
    """
    Build a frame describing, for every row of `g`, which rows it is
    similar to.

    The pairwise matrix comes from CompareBackwardsMatrix on `g[wordvec]`.
    For each matrix row, 'SimInd' holds the `g['Index']` values where the
    score exceeds `similarity`, and 'SimScore' holds those scores. The
    columns `textvec`, `wordvec`, 'Index', `groupvar`, `timevar` and
    'Meddelelse' are copied from `g`.
    """
    scores = CompareBackwardsMatrix(g[wordvec].tolist())
    index_col = g['Index']

    matched_indices = []
    matched_scores = []
    for score_row in scores:
        matched_indices.append(index_col[score_row > similarity].tolist())
        matched_scores.append(FindSimValue(score_row, similarity))

    columns = {'SimScore': matched_scores,
               'SimInd': matched_indices,
               textvec: g[textvec],
               wordvec: g[wordvec],
               'Index': g['Index'],
               groupvar: g[groupvar],
               timevar: g[timevar],
               'Meddelelse': g['Meddelelse']}
    return pandas.DataFrame(columns)

# vec = g[wordvec].tolist() where g = data, wordvec = 'WeightVec'
def CompareBackwardsMatrix(vec, metric='cosine'):
    """
    Return pairwise similarities (1 - distance) with everything on and
    above the diagonal zeroed, so row i only holds scores against rows
    before it.

    :param vec: sequence of equal-length numeric vectors
    :param metric: metric name forwarded to pairwise_distances

    It should compare backwards:
    >>> vec = [[1, 1, 1, 1], [1, 1, 1, 1]]
    >>> res = CompareBackwardsMatrix(vec, metric='cosine')
    >>> res
    array([[0., 0.],
           [1., 0.]])
    """
    # np.matrix is deprecated and rejected by current scikit-learn input
    # validation, so use a plain ndarray. atleast_2d keeps the old
    # handling of a single flat vector (treated as one row).
    m = np.atleast_2d(np.asarray(vec))
    d = 1 - pairwise_distances(m, metric=metric)
    # k=-1 keeps only the strictly lower triangle
    return np.tril(d, -1)

def SelectWarrantyRows(d):
    """
    Return the rows of `d` that have a non-empty 'SimInd' list, plus the
    rows whose label appears inside any 'SimInd' list.
    """
    # Start with rows that found at least one similar row
    mask = d.SimInd.astype(bool)
    # Then switch on every row named as a match by some other row.
    # NOTE(review): assignment is label based, so this assumes the SimInd
    # entries are labels of d's index — confirm.
    for matched in chain.from_iterable(d.SimInd):
        mask[matched] = True
    return d[mask]

def FindSimValue(sim, similarity):
    """Return the entries of `sim` that are strictly greater than `similarity`."""
    return sim[sim > similarity]

def FindSimIndex(sim, similarity):
    """
    Return the positions along the first axis where `sim` is strictly
    greater than `similarity`.
    """
    above_threshold = sim > similarity
    positions = np.where(above_threshold)
    return positions[0]

def FindIndices(d, simind):
    """
    Return the sorted, de-duplicated union of all values inside the list
    column `d[simind]` and all values of `d['Index']`.
    """
    matched = chain.from_iterable(d[simind].values.tolist())
    own = d['Index'].values.tolist()
    return sorted({*matched, *own})

In [None]:
cos_similarity = 0.8
similarity_output = OutputWarranty(dat = df_nlp_system,
                                  wordvec = 'WeightVec',
                                 textvec = 'Text', simil = cos_similarity,
                                 group = True, groupvar = 'System',
                                 timevar = 'Meddelelsesdato')
similarity_output.shape
similarity_output

In [None]:
out_fleet = pd.merge(similarity_output, df_nlp_system[["Index", "Tågset", "Vagn", 
                                            'System_text', 'Systemstatus', 'Beskrivelse']],
                                             on="Index", how='left')
out_fleet['AntalSimInds'] = [len(l) for l in out_fleet['SimInd']]
out_fleet

In [None]:
from itertools import compress, chain

def WarrantyCluster(dat):
    """
    Add a 'Cluster' column to `dat`, computed by AddCluster from the
    'Index' and 'SimInd' columns.

    `dat` is modified in place and also returned.
    """
    similar_lists = dat.SimInd.tolist()
    index_values = dat.Index.values.tolist()
    dat['Cluster'] = AddCluster(index_values, similar_lists)
    return dat

def AddCluster(inds, siminds):
    """
    Run the three clustering steps in sequence — MiddleCluster, NextStep,
    AddItself — and return the resulting list of cluster lists.
    """
    middle = MiddleCluster(inds, siminds)
    pruned = NextStep(middle, inds)
    return AddItself(pruned, inds)

def MiddleCluster(inds, siminds):
    """
    Check which clusters each ind belongs to.

    For every position i and every `sim` in `siminds[i]`, `inds[i]` is
    appended to the result list at the position of `sim` in `inds`.

    :param inds: list of hashable index values
    :param siminds: list, parallel to `inds`, of lists of index values
    :return: list of lists, one per entry of `inds`
    :raises ValueError: when a value in `siminds` is not present in `inds`
    """
    # Position lookup built once. The original called inds.index(sim) for
    # every pair, which is quadratic. setdefault keeps the first position
    # of a duplicated value, matching list.index.
    position = {}
    for pos, ind in enumerate(inds):
        position.setdefault(ind, pos)

    # Pre-sized, so a reference to a later index no longer raises IndexError.
    midclusters = [[] for _ in inds]
    for i, ind in enumerate(inds):
        for sim in siminds[i]:
            try:
                target = position[sim]
            except KeyError:
                # Keep the exception type list.index raised
                raise ValueError('%r is not in list' % (sim,)) from None
            midclusters[target].append(ind)
    return midclusters

def NextStep(midres, inds):
    """
    Remove the inds which are not clusters because they themselves belong
    to another cluster.

    Positions are visited from last to first. When the list at a position
    is non-empty, that position's index is removed from every earlier list
    that contains it and has more than one entry. `midres` is modified in
    place and also returned.
    """
    for pos in range(len(inds) - 1, -1, -1):
        if not midres[pos]:
            # An empty cluster list leaves the earlier lists untouched
            continue
        current = inds[pos]
        # The slice copies only the outer list; the inner lists are shared
        for earlier in midres[:pos]:
            if len(earlier) > 1 and current in earlier:
                earlier.remove(current)
    return midres

def AddItself(next, inds):
    """
    Append each index that occurs anywhere in `next` to the list at its
    own position (its position in `inds`).

    `next` is modified in place and also returned.
    """
    # Collect the members before mutating any list
    members = {ind for cluster in next for ind in cluster}
    for ind in members:
        next[inds.index(ind)].append(ind)
    return next

def test_itShouldAddClusterToEachSimInd():
    """Check MiddleCluster, NextStep and AddItself step by step on one example."""
    inds = [8, 9, 11, 13, 14, 22, 25, 33]
    siminds = [[], [], [8, 9], [11], [11, 13], [11, 13, 14], [14], [14, 25]]

    # Step 1: every index is listed under each index it is similar to
    expected_middle = [[11], [11], [13, 14, 22], [14, 22], [22, 25, 33], [], [33], []]
    middle = MiddleCluster(inds, siminds)
    assert middle == expected_middle

    # Step 2: indices that belong to another cluster are pruned
    expected_pruned = [[11], [11], [22], [22], [22, 33], [], [33], []]
    pruned = NextStep(middle, inds)
    assert pruned == expected_pruned

    # Step 3: every cluster index is added to its own list
    expected_final = [[11], [11], [22, 11], [22], [22, 33], [22], [33], [33]]
    final = AddItself(pruned, inds)
    assert final == expected_final

def FindClusters(clusters):
    """
    Return the distinct elements of `clusters` as a sorted list.

    The elements are taken as they are; nested lists are not flattened.
    """
    return sorted(set(clusters))


def UnstackListColum(df, lst_col):
    """
    Expand the list-valued column `lst_col` so each list element gets its
    own row.

    Values of the other columns are repeated once per element of the row's
    list. The returned frame has the same column order as `df` and a fresh
    default index.
    """
    # How many output rows each input row produces
    repeats = df[lst_col].str.len()

    repeated = {}
    for col in df.columns.difference([lst_col]):
        repeated[col] = np.repeat(df[col].values, repeats)

    # All list elements, laid end to end
    flattened = np.concatenate(df[lst_col].values)

    unstack_df = pd.DataFrame(repeated).assign(**{lst_col: flattened})
    return unstack_df[df.columns.tolist()]


In [None]:
out_fleet_cluster = wc.WarrantyCluster(out_fleet) #From WarrantyCluster
out_fleet_cluster

In [None]:
# Unstack clusters
def UnstackListColum(df, lst_col):
    """
    Return a copy of `df` in which every element of the lists in `lst_col`
    sits on a row of its own.

    The remaining columns are repeated to match, the original column order
    is kept, and the result carries a fresh default index.
    """
    list_lengths = df[lst_col].str.len()
    scalar_columns = df.columns.difference([lst_col])

    # Repeat each scalar value once per element of that row's list
    expanded = pd.DataFrame(
        {name: np.repeat(df[name].values, list_lengths) for name in scalar_columns}
    )
    # Attach the flattened list elements, then restore the column order
    expanded = expanded.assign(**{lst_col: np.concatenate(df[lst_col].values)})
    return expanded[df.columns.tolist()]

unst = UnstackListColum(df = out_fleet_cluster, lst_col = 'Cluster')
len(unst)
unst

In [None]:
# Count the rows in each cluster; after reset_index the count is held in the
# 'AntalSimInds' column of dat_group
dat_group = unst.groupby(["Cluster"])["AntalSimInds"].count().reset_index()

# Select all clusters whose row count is at least numdefects
numdefects = 1
clustersmall = list(dat_group.Cluster.loc[dat_group['AntalSimInds'] >= numdefects])
small = unst[unst['Cluster'].isin(clustersmall)]
    
len(small)
small

In [None]:
len(sim_cluster_output)
sim_cluster_output.sort_values(by=['Counts','Cluster'], ascending=False, inplace=True)
sim_cluster_output

In [None]:
sim_cluster_output['Cluster'].unique().tolist()

In [None]:
#sim_cluster_output.to_csv('C:/Users/LIUM3478/OneDrive Corp/OneDrive - Atkins Ltd/Work_Atkins/otu sap wcs 2020 12/nlp/output/cosine_sim_cluster_output_1.csv',
#                         sep = ";", header = True, encoding = "ISO 8859-1")

sim_cluster_output.to_csv('C:/Users/LIUM3478/OneDrive Corp/OneDrive - Atkins Ltd/Work_Atkins/otu sap wcs 2020 12/nlp/output/cosine_sim_cluster_output_2.csv',
                         sep = ";", decimal = ",", header = True, index = False)

In [None]:
# checking what the clusters are
sim_cluster_output.groupby('Cluster')['Cluster'].nunique()

In [None]:
# checking total number of rows in one cluster
pd.Series(sim_cluster_output.Cluster).value_counts()

In [None]:
# Output only last defect per cluster
sim_cluster_output_last = sim_cluster_output.loc[sim_cluster_output.groupby('Cluster').Meddelelsesdato.idxmax()]
sim_cluster_output_last.sort_values(by=['Counts'], ascending=False, inplace=True)
len(sim_cluster_output_last)
sim_cluster_output_last

In [None]:
# listing the elements in one cluster
text_sim_id_dic = dict(zip(similarity_output.Index, similarity_output.Text))
#text_sim_id_dic
keys = similarity_output.SimInd[7135]
for key in keys:
    text_sim_id_dic.get(key)
#text_sim_id_dic[(111, 116)]

## Finding Optimal Clusters
Clustering is an unsupervised operation, and KMeans requires that we specify the number of clusters. 

One simple approach is to plot the SSE for a range of cluster sizes. We look for the "elbow" where the SSE begins to level off. MiniBatchKMeans introduces some noise so I raised the batch and init sizes higher. Unfortunately the regular KMeans implementation is too slow. You'll notice different random states will generate different charts. Here I chose 14 clusters.

In [None]:
def find_optimal_clusters(data, max_k):
    """
    Fit MiniBatchKMeans for every even cluster count from 2 to `max_k` and
    plot the inertia (SSE) of each fit against the cluster count.

    One progress line is printed per fit. Nothing is returned.
    """
    cluster_counts = range(2, max_k+1, 2)

    inertias = []
    for n_clusters in cluster_counts:
        kmeans = MiniBatchKMeans(n_clusters=n_clusters, init_size=1024, batch_size=2048, random_state=20)
        inertias.append(kmeans.fit(data).inertia_)
        print('Fit {} clusters'.format(n_clusters))

    # Elbow plot: one marker per tested cluster count
    _, axis = plt.subplots(1, 1)
    axis.plot(cluster_counts, inertias, marker='o')
    axis.set_title('SSE by Cluster Center Plot')
    axis.set_xlabel('Cluster Centers')
    axis.set_ylabel('SSE')
    axis.set_xticks(cluster_counts)
    axis.set_xticklabels(cluster_counts)
    
find_optimal_clusters(w, 20)

In [None]:
assigned_clusters = MiniBatchKMeans(n_clusters=18, init_size=1024, batch_size=2048, random_state=20).fit_predict(w)

In [None]:
assigned_clusters = ClusterWordVec(w, num_clusts = 10, n_init = 20, random_state = 100)

In [None]:
df_1['Cluster'] = assigned_clusters
df_1.groupby('Cluster').first()

In [None]:
cluster_wordfreq = WordFreqInClusters(df = df_1, num_top = 5)

In [None]:
print(cluster_wordfreq)

In [None]:
test = df_1[df_1['Cluster'].values == 1]
test['System'].unique()
test