# In [None]:
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 21 16:45:19 2019

@author: Khushwant Rai
"""
from sklearn.manifold import TSNE

import numpy as np
from nltk.tokenize import sent_tokenize

import skipthought
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.metrics import pairwise_distances_argmin_min
from keras.layers import Dense, Input
from keras.models import Model
from scipy.sparse.csgraph import laplacian
import random
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from keras.regularizers import l1

class summarizer():
    """Extractive summarizer based on sentence embeddings and clustering.

    Each pipeline splits the text into sentences, obtains one feature row
    per sentence, projects the rows to 2-D with t-SNE, clusters them with
    KMeans and emits, for every cluster, the sentence closest to the
    cluster centre.  The chosen sentences are ordered by the mean sentence
    index of their cluster.

    Three variants are provided:

    * ``summarize``          -- raw skip-thought vectors.
    * ``summarize_autoenc``  -- autoencoder latent codes of the vectors.
    * ``summarize_spectral`` -- autoencoder codes of the normalized graph
      Laplacian of an RBF affinity matrix built on the latent codes.
    """

    # Shared 2-D t-SNE projector; the fixed seed keeps runs reproducible.
    tsne = TSNE(n_components=2, random_state=0)

    def preprocess(self, data):
        """Return ``data`` unchanged (placeholder hook for preprocessing)."""
        return data

    def split_sentences(self, data):
        """Tokenize ``data`` into stripped, non-empty sentences.

        Returns:
            list of sentence strings in their original order.
        """
        stripped = (sentence.strip() for sentence in sent_tokenize(data))
        return [sentence for sentence in stripped if sentence != '']

    def skipthought_encode(self, data):
        """Encode an iterable of sentences with the skip-thought model.

        The pre-trained model is loaded on every call.

        Returns:
            Whatever ``skipthought.Encoder.encode`` returns for the
            sentences (the rest of the class treats it as one row per
            sentence).
        """
        all_sentences = list(data)
        print('Loading pre-trained models...')
        model = skipthought.load_model()
        encoder = skipthought.Encoder(model)
        print('Encoding sentences...')
        enc_sentences = encoder.encode(all_sentences, verbose=0)
        print(enc_sentences)
        return enc_sentences

    def _summarize_from_features(self, split, features, n_clusters):
        """Cluster ``features`` in t-SNE space and assemble the summary.

        Args:
            split: list of sentences; row ``i`` of ``features`` is taken to
                describe ``split[i]``.
            features: 2-D array with one row per sentence.
            n_clusters: number of clusters, i.e. sentences in the summary.

        Returns:
            ``(summary, kmeans, ts_vec)`` -- the joined summary string, the
            fitted KMeans model and the 2-D t-SNE coordinates.
        """
        # NOTE(review): recent scikit-learn versions appear to require more
        # samples than the t-SNE perplexity (default 30) -- confirm for
        # short documents.
        ts_vec = self.tsne.fit_transform(features)
        kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(ts_vec)

        # Mean sentence index per cluster: used to order the summary so it
        # roughly follows the order of the source text.
        mean = [np.mean(np.where(kmeans.labels_ == j)[0])
                for j in range(n_clusters)]

        # Index of the sample nearest to each cluster centre.
        closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,
                                                   ts_vec)
        ordering = sorted(range(n_clusters), key=lambda k: mean[k])
        summary = ' '.join(split[closest[k]] for k in ordering)
        return summary, kmeans, ts_vec

    def summarize(self, data, x):
        """Summarize ``data`` into ``x`` sentences using skip-thought vectors.

        Args:
            data: text to summarize.
            x: number of clusters / summary sentences (converted with
                ``int``).

        Returns:
            ``(summary, kmeans, ts_vec, vectors)`` where ``vectors`` are the
            sentence encodings, reusable by ``summarize_autoenc`` and
            ``summarize_spectral``.
        """
        split = self.split_sentences(self.preprocess(data))
        print('tokeniztion done')
        vectors = self.skipthought_encode(split)
        print('vetorization done')
        n_clusters = int(x)
        summary, kmeans, ts_vec = self._summarize_from_features(
            split, vectors, n_clusters)
        print('summary formed')
        return summary, kmeans, ts_vec, vectors

    def auto_encoder(self, enc, input_dim=4800, epochs=500, batch_size=5):
        """Train a dense autoencoder on ``enc`` and return 10-D latent codes.

        The first 90% of the rows are used for training and the remainder
        for validation, but latent codes are returned for *every* row so
        the result stays aligned with the input sentences.

        Args:
            enc: array reshapeable to ``(-1, input_dim)``.
            input_dim: width of one input row (default 4800).
            epochs: number of training epochs.
            batch_size: training batch size.

        Returns:
            Array of shape ``(n_rows, 10)`` with the encoder output.
        """
        input_img = Input(shape=(input_dim,))
        # l1(10e-6) is an activity penalty of 1e-5 on the first layer.
        encoded = Dense(1000, activation='relu',
                        activity_regularizer=l1(10e-6))(input_img)
        encoded = Dense(200, activation='relu')(encoded)
        encoded = Dense(10, activation='sigmoid')(encoded)

        decoded = Dense(10, activation='relu')(encoded)
        decoded = Dense(200, activation='relu')(decoded)
        decoded = Dense(1000, activation='relu')(decoded)
        decoded = Dense(input_dim)(decoded)

        autoencoder = Model(input_img, decoded)
        encoder = Model(input_img, encoded)
        autoencoder.compile(optimizer='adam', loss='mse')

        all_x = np.asarray(enc).reshape(-1, input_dim).astype('float32')
        # Keep at least one training row even for tiny inputs.
        split_size = max(1, int(all_x.shape[0] * 0.9))
        train_x, val_x = all_x[:split_size], all_x[split_size:]
        # Skip validation when the hold-out slice is empty.
        validation = (val_x, val_x) if len(val_x) else None

        autoencoder.fit(train_x, train_x, epochs=epochs,
                        batch_size=batch_size, validation_data=validation)
        # Encode all rows, not only the training slice; otherwise the last
        # 10% of sentences could never be selected for a summary.
        return encoder.predict(all_x)

    def summarize_autoenc(self, data, vectors, x):
        """Summarize ``data`` using autoencoder codes of ``vectors``.

        Args:
            data: text to summarize.
            vectors: precomputed sentence encodings for ``data`` (for
                example the fourth value returned by ``summarize``).
            x: number of clusters / summary sentences.

        Returns:
            ``(summary, kmeans, ts_vec)``.
        """
        split = self.split_sentences(self.preprocess(data))
        print('tokeniztion done')
        # Encodings are supplied by the caller, so nothing is computed here.
        print('vetorization done')
        n_clusters = int(x)
        latent = self.auto_encoder(vectors)
        summary, kmeans2, ts_vec2 = self._summarize_from_features(
            split, latent, n_clusters)
        print('summary formed')
        return summary, kmeans2, ts_vec2

    def auto_encoder_spectral(self, lp, epochs=200, batch_size=5):
        """Train a dense autoencoder on the rows of ``lp``.

        Args:
            lp: 2-D array (here a graph Laplacian); each row is one sample.
            epochs: number of training epochs.
            batch_size: training batch size.

        Returns:
            Array of shape ``(n_rows, 10)`` with the encoder output.
        """
        n_features = lp.shape[1]
        input_img = Input(shape=(n_features,))

        # "encoded" is the compressed representation of the input.
        encoded = Dense(200, activation='relu')(input_img)
        encoded = Dense(100, activation='relu')(encoded)
        encoded = Dense(10, activation='sigmoid')(encoded)

        # "decoded" is the lossy reconstruction of the input.
        decoded = Dense(10, activation='relu')(encoded)
        decoded = Dense(100, activation='relu')(decoded)
        decoded = Dense(200, activation='relu')(decoded)
        decoded = Dense(n_features)(decoded)

        autoencoder = Model(input_img, decoded)
        encoder = Model(input_img, encoded)
        autoencoder.compile(optimizer='adam', loss='mse')
        autoencoder.fit(lp, lp, epochs=epochs, batch_size=batch_size)
        return encoder.predict(lp)

    def summarize_spectral(self, data, vectors, x):
        """Summarize ``data`` via a spectral embedding of ``vectors``.

        The sentence encodings are compressed with ``auto_encoder``, an RBF
        affinity matrix is obtained by fitting ``SpectralClustering`` on the
        codes, its normalized Laplacian is compressed again with
        ``auto_encoder_spectral`` and the result is clustered as in the
        other pipelines.

        Args:
            data: text to summarize.
            vectors: precomputed sentence encodings for ``data``.
            x: number of clusters / summary sentences.

        Returns:
            ``(summary, kmeans, ts_vec)``.
        """
        split = self.split_sentences(self.preprocess(data))
        print('tokeniztion done')
        # Encodings are supplied by the caller, so nothing is computed here.
        print('vetorization done')
        n_clusters = int(x)
        latent = self.auto_encoder(vectors)

        # Only the affinity matrix of the fitted model is used below.
        spectral = SpectralClustering(affinity='rbf', coef0=1, degree=3,
                                      eigen_solver=None, eigen_tol=0.0,
                                      gamma=1.0, n_clusters=n_clusters,
                                      random_state=0)
        spectral = spectral.fit(latent)
        lp = laplacian(spectral.affinity_matrix_, normed=True)
        lp_latent = self.auto_encoder_spectral(lp)

        summary3, kmeans3, ts_vec3 = self._summarize_from_features(
            split, lp_latent, n_clusters)
        print('summary formed')
        return summary3, kmeans3, ts_vec3

    def plot(self, ts_vec, kmeans):
        """Scatter-plot the 2-D points coloured by cluster.

        Every point is annotated with its cluster label and the cluster
        centres are drawn in black.  Each cluster gets its own evenly
        spaced rainbow colour, so the colouring is deterministic.

        Args:
            ts_vec: array of shape ``(n_points, 2)``.
            kmeans: fitted KMeans model providing ``labels_`` and
                ``cluster_centers_``.
        """
        labels = np.asarray(kmeans.labels_)
        centers = kmeans.cluster_centers_
        x_coords = ts_vec[:, 0]
        y_coords = ts_vec[:, 1]

        # One colour per cluster, looked up by label for every point.
        palette = cm.rainbow(np.linspace(0, 1, len(centers)))
        plt.scatter(x_coords, y_coords, c=palette[labels])

        for label, x, y in zip(labels, x_coords, y_coords):
            plt.annotate(str(label), xy=(x, y), xytext=(0, 0),
                         textcoords='offset points')

        plt.scatter(centers[:, 0], centers[:, 1], c='black')


def split_sentences(data):
    """Split ``data`` into sentences and drop the blank ones.

    The text is tokenized with ``sent_tokenize``; each sentence is stripped
    of surrounding whitespace and only non-empty results are returned, in
    their original order.
    """
    trimmed = (piece.strip() for piece in sent_tokenize(data))
    return [piece for piece in trimmed if piece != '']

