In [None]:
# Mount Google Drive inside the Colab runtime so project files can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd drive/MyDrive/Colab\ Notebooks/Minor\ Project\ Sem\ 6/

/content/drive/MyDrive/Colab Notebooks/Minor Project Sem 6


# Import packages

In [None]:
import pandas as pd
import numpy as np

from ast import literal_eval # to convert array string to array
from IPython.display import clear_output # to clear the large outputs

In [None]:
# Install sentence-transformers into the runtime. The TensorFlow installs
# below are left disabled.
# !pip install tensorflow
# !pip install tensorflow-hub
!pip install sentence-transformers
# Hide the long pip log.
clear_output()

In [None]:
from sentence_transformers import SentenceTransformer
import tensorflow as tf

# Load features csv and articles data

In [None]:
# Read the dataset and discard its final row in one step.
articles_df = pd.read_csv('duc2002finaldataset_0.csv').iloc[:-1, :]
# `articles` keeps every column except the reference summaries.
articles = articles_df.drop(columns='Summary')
articles_df.head()

Unnamed: 0,Article,Summary
0,"['On the day of the Big Event, Ladbroke, the l...","['Penelope Lively won the 1987 Booker Prize.',..."
1,"[""Australian novelist Peter Carey was awarded ...","[""The coveted Booker Prize for the year's best..."
2,"[""Six novels have been nominated for the Booke...","[""The winner of the 1989 Booker Prize, Britain..."
3,"[""Japanese writer Kazuo Ishiguro won the 1989 ...",['It was announced Thursday that Kazuo Ishigur...
4,"[""The Booker Prize is Britain's literary event...",['The Booker Prize has become internationally ...


In [None]:
# Load the precomputed per-sentence feature table and preview its first rows.
features_df = pd.read_csv('features/features_for_all_articles.csv')
features_df.head()

Unnamed: 0,File Number,Sentence Number,Sentence length,Sentence Position,Numeric Data,Named Entity,Special Charecters,Upper Case,Entropy,Incorrect Word,POS Tags,Term Weight,Cosine Similarity,Busy Path,Text Rank
0,F0,S0,0.2,1.0,0.0,0.1,0.06,0.0,0.29,0.09,0.18,0.59,0.79,25,0.795211
1,F0,S1,0.28,0.98,0.35,0.28,0.21,0.0,0.26,0.27,0.29,0.68,0.47,42,1.264775
2,F0,S2,0.18,0.96,0.13,0.14,0.06,0.0,0.21,0.09,0.13,0.58,0.49,41,1.238773
3,F0,S3,0.23,0.94,0.0,0.0,0.05,0.0,0.24,0.0,0.18,0.65,0.6,33,1.010622
4,F0,S4,0.2,0.92,0.06,0.07,0.02,0.0,0.21,0.0,0.16,0.62,0.6,30,0.930889


# Word2Vec: save embeddings for each word and each sentence in a separate file

## Create vocabulary

In [None]:
# Download NLTK's stop-word corpus and build the English stop-word set.
import nltk
import string
nltk.download('stopwords')

from nltk.corpus import stopwords
# Stored as a set; remove_stopwords tests membership once per token.
stop_words = set(stopwords.words('english'))
# Hide the download log.
clear_output()

In [None]:
def remove_stopwords(text):
    """Lower-case ``text`` and drop every token that appears in ``stop_words``.

    Tokens are split on whitespace; the surviving lower-cased tokens are
    returned joined by single spaces.
    """
    kept_tokens = []
    for token in text.split():
        lowered = token.lower()
        if lowered not in stop_words:
            kept_tokens.append(lowered)
    return ' '.join(kept_tokens)

In [None]:
# Build one token list per sentence across all articles, with stop words and
# punctuation removed. Each word is kept once per sentence, in order of first
# appearance: dict.fromkeys de-duplicates without the arbitrary, run-dependent
# ordering of a set, so the lists later fed to Word2Vec are reproducible.
punctuation_table = str.maketrans('', '', string.punctuation)  # built once, reused for every sentence

vocabulary_for_sentances = []
for article in articles['Article']:
    # Each cell holds the string form of a list of sentences.
    for sentence in literal_eval(article):
        sentence = remove_stopwords(sentence).translate(punctuation_table)
        vocabulary_for_sentances.append(list(dict.fromkeys(sentence.split())))

In [None]:
# `vocabulary` holds one token list per sentence, so its length is the number
# of sentences, not the number of distinct words.
vocabulary = list(vocabulary_for_sentances)
# Show only a few entries; printing every token list floods the notebook output.
print(vocabulary[:5])
len(vocabulary)



15672

## Generate embedding for all words using word2vec

In [None]:
# Generate embedding for each word in the vocabulary using word2vec
from gensim.models import Word2Vec
# Each entry of `vocabulary` is one sentence's token list; vectors are 100-dimensional.
# NOTE(review): min_count=1 presumably keeps every word so later model.wv[word] lookups succeed — confirm against gensim docs.
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 renamed it `vector_size`) — confirm the installed version.
model = Word2Vec(sentences=vocabulary, size=100, window=5, min_count=1, workers=4)
# NOTE(review): assumes the 'model/' directory already exists — confirm.
model.save('model/word2vec.model')

In [None]:
# Spot-check: display the trained vector for a single word.
model.wv['beaches']

array([-4.0138725e-02,  5.3139959e-02,  5.3239394e-02,  5.8236115e-02,
        3.6620263e-02,  5.0354172e-03,  4.1067336e-02, -7.5291884e-03,
       -4.0675092e-02, -1.3706766e-02, -2.2975223e-02, -3.6411785e-02,
       -1.4581326e-04,  7.1514938e-03,  1.7468752e-02,  9.8996731e-03,
        7.8315698e-02, -1.0805235e-05,  5.7081789e-02, -5.1528723e-03,
       -8.0025665e-02, -2.9411117e-02,  7.5863086e-02, -9.3993261e-02,
       -1.5363521e-02,  9.5239058e-03, -1.7320925e-02,  7.7854089e-02,
        1.1783149e-02,  1.5897101e-02,  8.4008003e-04, -1.1675897e-02,
        2.1399591e-02,  2.2255492e-03, -1.9204168e-02,  1.6867755e-02,
       -6.6198550e-02,  9.6405158e-03, -1.1812949e-03,  8.9865640e-02,
       -4.3952810e-03, -1.7851185e-02,  6.3310273e-02, -1.7654520e-03,
        1.7199812e-02,  2.6563497e-02, -4.0780447e-02, -6.3883558e-02,
       -1.3723057e-02, -1.0169312e-02, -2.2983680e-02,  2.4809042e-02,
        8.3065722e-03, -9.7472211e-03, -7.6985122e-03,  1.4522134e-02,
      

## Generate sentence embedding 

In [None]:
# Generate embedding for each sentence in the article
def generate_sentence_embedding(sentence):
    """Return the sum of the word2vec vectors of the words in ``sentence``.

    The sentence is first passed through ``remove_stopwords`` and then has
    punctuation stripped, before each remaining whitespace-separated word is
    looked up in ``model.wv``.

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    numpy.ndarray
        1-D array whose length is the model's vector size. Words absent from
        the model are skipped, so a sentence with no known words yields the
        zero vector instead of raising ``KeyError``.
    """
    sentence = remove_stopwords(sentence)
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    word_vectors = model.wv
    # Take the dimensionality from the model so it cannot drift from the training setting.
    sentence_embedding = np.zeros(word_vectors.vector_size)
    for word in sentence.split():
        if word in word_vectors:
            sentence_embedding += word_vectors[word]
    return sentence_embedding

In [None]:
def generate_article_embedding(article):
  """Return a list with one embedding per sentence of ``article``.

  ``article`` is an iterable of raw sentence strings. Cleaning is left to
  ``generate_sentence_embedding``, which already removes stop words and
  punctuation; cleaning here as well ran the stop-word filter twice and gave
  vectors that differ from a direct ``generate_sentence_embedding`` call on the
  same sentence.

  A sentence whose embedding fails is reported together with its index and
  left out, so the returned list can be shorter than ``article``.
  """
  embeddings = []
  for index, sentence in enumerate(article):
    try:
      embeddings.append(generate_sentence_embedding(sentence))
    except Exception as e:
      # Best effort: keep going, but say which sentence was dropped.
      print(f'Skipping sentence {index}: {e!r}')
  return embeddings

In [None]:
# article = literal_eval(articles['Article'][0])
# generate_article_embedding(article)

# Create a file of embedding values as features for each sentence

In [None]:
# Give a number to each sentence in the article
def sentence_num(story):
    """Return the labels ``"S0"``, ``"S1"``, ... — one for each element of ``story``."""
    return ["S" + str(position) for position in range(len(story))]

In [None]:
# Build the per-sentence feature table: file id, sentence id, then the
# components of the sentence embedding.
embedding_dim = 100  # length of the vectors returned by generate_sentence_embedding
columns = ['File Number', 'Sentence Number'] + [f'Feature {k}' for k in range(embedding_dim)]

# Collect every row first and build the DataFrame once at the end; growing a
# frame with pd.concat inside the loop re-copies all earlier rows each time.
rows = []
for i, raw_article in enumerate(articles['Article']):
    print(i)
    # Each cell holds the string form of a list of sentences.
    story = literal_eval(raw_article)
    for j, sentence in enumerate(story):
        rows.append([f'F{i}', f'S{j}'] + list(generate_sentence_embedding(sentence)))

    # Periodically wipe the progress output so it does not pile up.
    if i%20 == 0:
        clear_output()

all_data = pd.DataFrame(rows, columns=columns)

561
562
563
564
565
566


In [None]:
# Preview the first rows of the per-sentence embedding table.
all_data.head()
# Alternative checks, left disabled:
# all_data.shape
# all_data.tail()

Unnamed: 0,File Number,Sentence Number,Feature 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,...,Feature 90,Feature 91,Feature 92,Feature 93,Feature 94,Feature 95,Feature 96,Feature 97,Feature 98,Feature 99
0,F0,S0,-3.028201,4.83232,4.231953,4.661569,2.680747,0.542359,3.489277,-0.854364,...,-1.748158,-0.008537,-2.965145,0.0372,-0.725874,1.379129,5.838361,-5.495393,-1.578068,0.58238
1,F0,S1,-1.067481,1.704129,1.477471,1.638834,0.951621,0.210895,1.223655,-0.292578,...,-0.620279,0.001532,-1.055982,-0.019182,-0.261395,0.500029,2.046245,-1.950601,-0.557596,0.229147
2,F0,S2,-1.027387,1.618295,1.44424,1.572356,0.9324,0.185312,1.179554,-0.292644,...,-0.595019,0.003834,-1.017639,0.01466,-0.255877,0.492618,2.004057,-1.868744,-0.532951,0.182463
3,F0,S3,-0.830946,1.347649,1.17164,1.297845,0.752328,0.15207,0.952804,-0.236401,...,-0.482574,-0.001243,-0.802313,-0.003544,-0.182841,0.353769,1.607262,-1.524786,-0.439784,0.153994
4,F0,S4,-3.489311,5.500731,4.853541,5.359961,3.109786,0.625275,3.97232,-0.976728,...,-2.001764,0.033193,-3.420069,0.036575,-0.854733,1.610474,6.721723,-6.319908,-1.767494,0.665776


In [None]:
# Save the embeddings without the index column.
# NOTE: a later cell writes to this same path again after the
# 'Similarity with Summary' column has been added.
all_data.to_csv('features/embeddings_using_word2vec.csv', index=False)

## Add similarity of each sentence with the corresponding summary

In [None]:
# Load the feature table that also carries the 'Similarity with Summary' column.
similarity_with_summary_df = pd.read_csv('features/features_with_summary_similarity.csv')
similarity_with_summary_df.head()

Unnamed: 0,File Number,Sentence Number,Sentence length,Sentence Position,Numeric Data,Named Entity,Special Charecters,Upper Case,Entropy,Incorrect Word,POS Tags,Term Weight,Cosine Similarity,Busy Path,Text Rank,Similarity with Summary
0,F0,S0,0.2,1.0,0.0,0.1,0.06,0.0,0.29,0.09,0.18,0.59,0.79,25,0.795211,0.62
1,F0,S1,0.28,0.98,0.35,0.28,0.21,0.0,0.26,0.27,0.29,0.68,0.47,42,1.264775,0.56
2,F0,S2,0.18,0.96,0.13,0.14,0.06,0.0,0.21,0.09,0.13,0.58,0.49,41,1.238773,0.5
3,F0,S3,0.23,0.94,0.0,0.0,0.05,0.0,0.24,0.0,0.18,0.65,0.6,33,1.010622,0.31
4,F0,S4,0.2,0.92,0.06,0.07,0.02,0.0,0.21,0.0,0.16,0.62,0.6,30,0.930889,0.63


In [None]:
# Double brackets select a one-column DataFrame (not a Series), keeping the column name.
similarity_with_summary = similarity_with_summary_df[['Similarity with Summary']]
similarity_with_summary.head()

Unnamed: 0,Similarity with Summary
0,0.62
1,0.56
2,0.5
3,0.31
4,0.63


In [None]:
# Append each sentence's similarity score as the last column.
# concat(axis=1) aligns on the row index and silently fills NaN where one frame
# is shorter, so fail loudly if the two tables do not have the same length.
# NOTE(review): this only checks length; it presumes both tables list the
# sentences in the same order — confirm.
assert len(all_data) == len(similarity_with_summary), 'embedding and similarity tables differ in length'
all_data_with_similarity_scores = pd.concat([all_data, similarity_with_summary], axis=1)
all_data_with_similarity_scores.head()

Unnamed: 0,File Number,Sentence Number,Feature 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,...,Feature 91,Feature 92,Feature 93,Feature 94,Feature 95,Feature 96,Feature 97,Feature 98,Feature 99,Similarity with Summary
0,F0,S0,-3.028201,4.83232,4.231953,4.661569,2.680747,0.542359,3.489277,-0.854364,...,-0.008537,-2.965145,0.0372,-0.725874,1.379129,5.838361,-5.495393,-1.578068,0.58238,0.62
1,F0,S1,-1.067481,1.704129,1.477471,1.638834,0.951621,0.210895,1.223655,-0.292578,...,0.001532,-1.055982,-0.019182,-0.261395,0.500029,2.046245,-1.950601,-0.557596,0.229147,0.56
2,F0,S2,-1.027387,1.618295,1.44424,1.572356,0.9324,0.185312,1.179554,-0.292644,...,0.003834,-1.017639,0.01466,-0.255877,0.492618,2.004057,-1.868744,-0.532951,0.182463,0.5
3,F0,S3,-0.830946,1.347649,1.17164,1.297845,0.752328,0.15207,0.952804,-0.236401,...,-0.001243,-0.802313,-0.003544,-0.182841,0.353769,1.607262,-1.524786,-0.439784,0.153994,0.31
4,F0,S4,-3.489311,5.500731,4.853541,5.359961,3.109786,0.625275,3.97232,-0.976728,...,0.033193,-3.420069,0.036575,-0.854733,1.610474,6.721723,-6.319908,-1.767494,0.665776,0.63


In [None]:
# Inspect the last rows as well, to see that scores are present through the end of the table.
all_data_with_similarity_scores.tail()

Unnamed: 0,File Number,Sentence Number,Feature 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,...,Feature 91,Feature 92,Feature 93,Feature 94,Feature 95,Feature 96,Feature 97,Feature 98,Feature 99,Similarity with Summary
15667,F566,S45,-4.151008,6.573165,5.814666,6.359786,3.725112,0.712817,4.793703,-1.130743,...,0.027273,-4.073062,0.098641,-1.068984,1.913758,8.06708,-7.576908,-2.082473,0.772833,0.59
15668,F566,S46,-2.585428,4.096621,3.607662,3.95506,2.307509,0.458613,2.982831,-0.704126,...,0.012664,-2.536108,0.040401,-0.631453,1.17835,5.01263,-4.697132,-1.322335,0.49943,0.39
15669,F566,S47,-5.030781,7.943702,7.032576,7.708867,4.500396,0.920679,5.801354,-1.412885,...,0.02383,-4.966253,0.093652,-1.223843,2.308005,9.725066,-9.13899,-2.577048,0.986001,0.7
15670,F566,S48,-2.132449,3.359413,2.987901,3.258861,1.916658,0.361006,2.447541,-0.576761,...,0.024723,-2.08688,0.058537,-0.526186,0.999295,4.118636,-3.877846,-1.077045,0.396321,0.26
15671,F566,S49,-2.787782,4.45905,3.912321,4.319159,2.51217,0.524069,3.229208,-0.799691,...,0.004632,-2.770906,0.054361,-0.694326,1.29147,5.423788,-5.111857,-1.418233,0.549291,0.58


In [None]:
# Overwrites the embeddings file written earlier, now including the
# 'Similarity with Summary' column.
all_data_with_similarity_scores.to_csv('features/embeddings_using_word2vec.csv', index=False)

# Use autoencoder to reduce number of features from 100 to 50

In [None]:
# use autoencoder to reduce the dimensionality of the sentence embeddings to 50
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

input_dim = 100     # width of the sentence embeddings
encoding_dim = 50   # width of the bottleneck (the reduced representation)

input_layer = Input(shape=(input_dim, ))
# Encoder: 100 -> 75 -> encoding_dim. `encoder` ends up bound to the bottleneck tensor.
encoder = Dense(75, activation="sigmoid")(input_layer)
encoder = Dense(encoding_dim, activation="relu")(encoder)
# Decoder: back to input_dim. The output is linear because the embeddings being
# reconstructed contain negative values, which a ReLU output can never produce.
decoder = Dense(input_dim, activation='linear')(encoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)

In [None]:
# Reconstruction is a regression task, so track mean absolute error next to the
# MSE loss; classification accuracy says nothing about continuous targets.
autoencoder.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
autoencoder.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 100)]             0         
                                                                 
 dense_3 (Dense)             (None, 75)                7575      
                                                                 
 dense_4 (Dense)             (None, 50)                3800      
                                                                 
 dense_5 (Dense)             (None, 100)               5100      
                                                                 
Total params: 16,475
Trainable params: 16,475
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the autoencoder to reproduce its own input: every column after the two
# id columns is used as both input and target, with 20% of the rows held out
# for validation.
autoencoder.fit(all_data.iloc[:, 2:], all_data.iloc[:, 2:], epochs=50, batch_size=256, shuffle=True, validation_split=0.20)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fbd099a51c0>

In [None]:
# Reduce the dimensionality of the sentence embeddings to 50
# Take the bottleneck (second-to-last layer) from the trained autoencoder
# itself. Reading it from the `encoder` variable breaks when this cell is run a
# second time, because `encoder` has by then been rebound to a Model.
encoder = Model(inputs=autoencoder.input, outputs=autoencoder.layers[-2].output)
# Encode every sentence: all columns after the two id columns are features.
encoded_data = pd.DataFrame(encoder.predict(all_data.iloc[:, 2:]))
encoded_data.head()



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,1.516613,0.0,2.122803,1.724144,0.0,3.763345,0.0,2.287347,0.0,1.896117,...,0.0,0.0,1.570699,0.0,0.0,2.332792,0.0,0.0,0.0,0.0
1,0.526142,0.0,2.598977,2.032654,0.0,1.871567,0.0,0.89688,0.0,0.859863,...,0.0,0.0,0.511037,0.0,0.0,0.927382,0.0,0.0,0.0,0.0
2,0.512045,0.0,2.599236,2.03824,0.0,1.84156,0.0,0.880053,0.0,0.843693,...,0.0,0.0,0.50168,0.0,0.0,0.909808,0.0,0.0,0.0,0.0
3,0.427256,0.0,2.656162,2.060126,0.0,1.666738,0.0,0.740195,0.0,0.747069,...,0.0,0.0,0.401794,0.0,0.0,0.775449,0.0,0.0,0.0,0.0
4,1.756654,0.0,2.001099,1.638156,0.0,4.209085,0.0,2.596275,0.0,2.136415,...,0.0,0.0,1.823758,0.0,0.0,2.639998,0.0,0.0,0.0,0.0


In [None]:
# Assemble the final table column-wise: the two id columns, the encoded
# features, then the similarity score. The frames are aligned on row index.
encoded_data_with_similarity_scores = pd.concat([all_data[['File Number', 'Sentence Number']], encoded_data, similarity_with_summary], axis=1)
encoded_data_with_similarity_scores.head()

Unnamed: 0,File Number,Sentence Number,0,1,2,3,4,5,6,7,...,41,42,43,44,45,46,47,48,49,Similarity with Summary
0,F0,S0,1.516613,0.0,2.122803,1.724144,0.0,3.763345,0.0,2.287347,...,0.0,1.570699,0.0,0.0,2.332792,0.0,0.0,0.0,0.0,0.62
1,F0,S1,0.526142,0.0,2.598977,2.032654,0.0,1.871567,0.0,0.89688,...,0.0,0.511037,0.0,0.0,0.927382,0.0,0.0,0.0,0.0,0.56
2,F0,S2,0.512045,0.0,2.599236,2.03824,0.0,1.84156,0.0,0.880053,...,0.0,0.50168,0.0,0.0,0.909808,0.0,0.0,0.0,0.0,0.5
3,F0,S3,0.427256,0.0,2.656162,2.060126,0.0,1.666738,0.0,0.740195,...,0.0,0.401794,0.0,0.0,0.775449,0.0,0.0,0.0,0.0,0.31
4,F0,S4,1.756654,0.0,2.001099,1.638156,0.0,4.209085,0.0,2.596275,...,0.0,1.823758,0.0,0.0,2.639998,0.0,0.0,0.0,0.0,0.63


In [None]:
# Save the reduced embeddings with their similarity scores, without the index column.
encoded_data_with_similarity_scores.to_csv('features/embeddings_using_word2vec_and_autoencoder.csv', index=False)