In [None]:
# Mount Google Drive at /content/drive (the google.colab module is only available inside Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch into the project folder on the mounted Drive.
# NOTE(review): the path is relative, so it only resolves from the initial working
# directory (/content, judging by the output below); re-running this cell without
# restarting the runtime will likely fail.
%cd drive/MyDrive/Colab\ Notebooks/Minor\ Project\ Sem\ 6/

/content/drive/MyDrive/Colab Notebooks/Minor Project Sem 6


# Import all packages

In [None]:
import pandas as pd
import numpy as np

import nltk
import spacy
from nltk.tokenize import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.corpus import stopwords

from ast import literal_eval # to convert array string to array
from IPython.display import clear_output # to clear the large outputs

import re
import string
import operator
from math import log2



In [None]:
# Download the spaCy model and the NLTK resources (tokenizer, stop-word list, POS tagger),
# then clear the verbose download output.
# NOTE(review): this fetches en_core_web_lg, but NamedEntity() loads en_core_web_sm —
# confirm which model is intended.
!python -m spacy download en_core_web_lg
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
clear_output()

In [None]:
# Constrain the scipy / networkx versions, then clear the installer output.
# NOTE(review): presumably networkx is held below 2.7 because busy_path() calls
# nx.from_scipy_sparse_matrix — confirm the pin is still required.
!pip install 'scipy>=1.8'
!pip install 'networkx<2.7'
clear_output()

In [None]:
# Install Keras-Preprocessing and import the scikit-learn helpers used for TF-IDF
# weighting and cosine similarity.
# NOTE(review): in the visible part of the notebook pad_sequences is only referenced
# from commented-out code — confirm it is still needed.
!pip install Keras-Preprocessing
from sklearn.metrics.pairwise import cosine_similarity
from keras_preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
clear_output()

In [None]:
# Install pyspellchecker: SpellChecker is used by incorrect(), Counter by postags().
!pip install pyspellchecker
from spellchecker import SpellChecker
from collections import Counter
clear_output()

# Read data from csv file

In [None]:
# Load the dataset CSV from the current working directory.
df = pd.read_csv('duc2002finaldataset_0.csv')
# NOTE(review): the last row is dropped unconditionally — the reason is not visible
# here (presumably a malformed trailing row); confirm.
df = df.iloc[:-1, :]
# `articles` is the dataset without its 'Summary' column; the features below are
# computed from the remaining 'Article' column.
articles = df.drop('Summary', axis=1)
articles.head()

Unnamed: 0,Article
0,"['On the day of the Big Event, Ladbroke, the l..."
1,"[""Australian novelist Peter Carey was awarded ..."
2,"[""Six novels have been nominated for the Booke..."
3,"[""Japanese writer Kazuo Ishiguro won the 1989 ..."
4,"[""The Booker Prize is Britain's literary event..."


# Create functions for stop-word removal and feature extraction

In [None]:
sw = stopwords.words('english')
# Set copy of the stop-word list for O(1) membership tests; `sw` itself stays a
# list in case other cells rely on it.
_sw_lookup = frozenset(sw)

def remove_stopwords(article):
  """Remove stop words and non-alphabetic tokens from every sentence.

  Args:
    article: list of sentence strings.

  Returns:
    A list with one string per input sentence: the lower-cased tokens that are
    purely alphabetic and not English stop words, joined by single spaces.
    Token order and repeated words are preserved.
  """
  filtered_article = []
  for sen in article:
    # A list (not a set) keeps duplicates and the original word order, so the
    # result is deterministic and word frequencies stay meaningful.
    kept = [w.lower() for w in word_tokenize(sen)
            if w.isalpha() and w.lower() not in _sw_lookup]
    filtered_article.append(' '.join(kept))
  return filtered_article

In [None]:
# Label every sentence of the article with its running number
def sentence_num(story):
    """Return one label per sentence of ``story``: "S0", "S1", ... in order."""
    return [f"S{idx}" for idx in range(len(story))]

In [None]:
# Positional score of every sentence (earlier sentences score higher)
def sentenceposition(story):
    """Score each sentence by its position in the story.

    For a story of n sentences, the sentence at 0-based index i gets
    (n - i) / n, rounded to two decimals; the first sentence scores 1.0.
    """
    total = len(story)
    return [round((total - idx) / total, 2) for idx in range(total)]

In [None]:
def open_relation(article):
  """Relative count of content words per sentence.

  A content word is a token that is not an English stop word
  (case-insensitive) and is longer than one character.

  Args:
    article: sequence of sentence strings.

  Returns:
    numpy array with one value per sentence: its content-word count divided by
    the largest count in the article. An empty article yields an empty array,
    and an article without any content word yields all zeros (instead of a
    crash / NaN values).
  """
  if len(article) == 0:
    return np.array([])

  # Load the stop-word list once; it was previously re-read for every token.
  stop_set = set(stopwords.words('english'))
  length = np.array([
      sum(1 for word in word_tokenize(sentence)
          if word.lower() not in stop_set and len(word) > 1)
      for sentence in article
  ])

  max_len = length.max()
  if max_len == 0:
    # Nothing to normalise by; avoid 0/0.
    return length.astype(float)
  return length / max_len

In [None]:
#program to compute the relative length (in words) of every sentence of a story
def sentencelength(story):
    """Relative word count of each sentence.

    Words are maximal runs of ``\\w`` characters.

    Args:
        story: list of sentence strings.

    Returns:
        One value per sentence: its word count divided by the largest word
        count in the story, rounded to two decimals. If no sentence contains a
        word (or the story is empty) every score is 0.0 rather than raising
        ZeroDivisionError.
    """
    word_counts = [len(re.findall(r'\w+', sentence)) for sentence in story]
    max_word_count = max(word_counts, default=0)

    if max_word_count == 0:
        # No words anywhere: nothing to normalise by.
        return [0.0] * len(word_counts)

    return [round(count / max_word_count, 2) for count in word_counts]

In [None]:
# Share of numeric tokens in every sentence of the story
def numericdata(story):
    """Ratio of digit runs to words for each sentence.

    For every sentence the number of ``[0-9]+`` matches is divided by the
    number of ``\\w+`` matches and rounded to two decimals. A sentence without
    any word scores 0.
    """
    word_re = re.compile(r'\w+')
    digit_re = re.compile('[0-9]+')

    ratios = []
    for sentence in story:
        n_words = len(word_re.findall(sentence))
        n_digit_runs = len(digit_re.findall(sentence))
        if n_words != 0:
            ratio = 0 + n_digit_runs / n_words
        else:
            ratio = 0
        ratios.append(round(ratio, 2))
    return ratios

In [None]:
# Cache of loaded spaCy pipelines keyed by model name, so the model is read from
# disk once instead of once per article.
_NER_PIPELINES = {}

def _get_ner_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline used for entity recognition, loading it on first use."""
    # NOTE(review): the setup cell downloads en_core_web_lg while this loads
    # en_core_web_sm — confirm which model is intended.
    if model_name not in _NER_PIPELINES:
        _NER_PIPELINES[model_name] = spacy.load(
            model_name,
            disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"],
        )
    return _NER_PIPELINES[model_name]

#program to find number of named entity in each sentence
def NamedEntity(story):
    """Relative number of named entities per sentence.

    Args:
        story: list of sentence strings.

    Returns:
        One value per sentence: the number of entities spaCy finds in it,
        divided by the largest such count in the story and rounded to two
        decimals. If no sentence contains an entity the raw (all-zero) counts
        are returned. An empty story yields an empty list.
    """
    if len(story) == 0:
        return []

    NER = _get_ner_pipeline()
    # number of named entities found in each sentence
    NamedEntity_ = [len(NER(sentence).ents) for sentence in story]

    most = max(NamedEntity_)
    if most != 0:
        NamedEntity_ = [round(count / most, 2) for count in NamedEntity_]

    return NamedEntity_

In [None]:
#Program to count PUNCTUATION MARKS
def specialcharecters(story):
    """Relative number of punctuation characters per sentence.

    Args:
        story: list of sentence strings.

    Returns:
        One value per sentence: the number of characters that appear in
        ``string.punctuation``, divided by the largest such count in the story
        and rounded to two decimals. If no sentence contains punctuation the
        raw (all-zero) counts are returned. An empty story yields an empty list.
    """
    if len(story) == 0:
        return []

    punctuation_chars = set(string.punctuation)
    punctuation = [sum(1 for ch in sentence if ch in punctuation_chars)
                   for sentence in story]

    # Compute the maximum once instead of once per element.
    most = max(punctuation)
    if most != 0:
        punctuation = [round(count / most, 2) for count in punctuation]

    return punctuation

In [None]:
# def thematicwords(story):
#     data = remove_stopwords(story)
#     frequency = {}
#     match_pattern = re.findall(r'\b[a-z]{3,15}\b', str(data).lower())
#     for word in match_pattern:
#         count = frequency.get(word,0)
#         frequency[word] = count + 1
#     length= len(frequency)//4
#     freq_sort=sorted(frequency.items(), key=lambda x: x[1], reverse=True)
#     first_data = list(map(operator.itemgetter(0), freq_sort))
#     thematic_words = first_data[:length+1]
#     tw=[]
#     for i in range(len(data)):
#         count=0
#         tokenizer = nltk.RegexpTokenizer(r"\w+")
#         words = tokenizer.tokenize(data[i])
#         for j in range(len(words)):
#             if(words[j] in thematic_words):
#                 #print(words[j])
#                 count = count + 1
#         tw.append(count)  
#     if(max(tw) != 0):
#         thematic_words = [round(i/max(tw),2) for i in tw]
#     return thematic_words

In [None]:
#count the fully upper-case words of every sentence
def Uppercase(story):
    """Relative number of upper-case words per sentence.

    A token (run of word characters or apostrophes) counts when it is longer
    than one character and ``str.isupper()`` is true for it.

    Args:
        story: list of sentence strings.

    Returns:
        One value per sentence: its count divided by the largest count in the
        story, rounded to two decimals. If no sentence contains such a word the
        raw (all-zero) counts are returned. An empty story yields an empty list.
    """
    if len(story) == 0:
        return []

    UpperCase = []
    for sentence in story:
        # Raw string: "\w" in a plain literal is an invalid escape sequence.
        tokens = regexp_tokenize(sentence, r"[\w']+")
        UpperCase.append(sum(1 for tok in tokens if len(tok) != 1 and tok.isupper()))

    most = max(UpperCase)
    if most != 0:
        UpperCase = [round(count / most, 2) for count in UpperCase]

    return UpperCase

In [None]:
def entropy(story):
    """Relative word-frequency entropy of each sentence.

    Each sentence is lower-cased, stripped of punctuation and passed through
    ``remove_stopwords``. For every distinct remaining word the probability
    ``p = round(freq / sentence_length, 2)`` is formed (a rounded 0 is replaced
    by 0.001) and the sentence entropy is ``-sum(p * log2(p))``, rounded to two
    decimals.

    Args:
        story: list of sentence strings. The list is NOT modified; cleaning is
            done on a copy so later feature functions still see the original
            text.

    Returns:
        One value per sentence: its entropy divided by the largest entropy in
        the story, rounded to two decimals. If every entropy is 0 the
        un-normalised values are returned. An empty story yields an empty list.
    """
    if len(story) == 0:
        return []

    # Clean copies of the sentences instead of overwriting the caller's list.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = [sentence.lower().translate(strip_punctuation) for sentence in story]
    data = remove_stopwords(cleaned)

    entropyTotal = []
    for sentence in data:
        words = sentence.split()
        # Each sentence is normalised by its OWN length (previously the length
        # of the first sentence was used for all of them).
        length = len(words)
        ent = 0
        for freq in Counter(words).values():
            prob = round(freq / length, 2)
            if prob == 0:
                prob = 0.001
            ent += -(prob * log2(prob))
        entropyTotal.append(round(ent, 2))

    highest = max(entropyTotal)
    if highest != 0:
        entropyTotal = [round(value / highest, 2) for value in entropyTotal]

    return entropyTotal

In [None]:
# Count the words of every sentence that the spell checker does not know
# (i.e. words that may be misspelled).

# Holds the single SpellChecker instance so its dictionary is loaded once
# instead of once per article.
_spell_checker_cache = {}

def _get_spell_checker():
    """Return the shared SpellChecker, creating it on first use."""
    if "default" not in _spell_checker_cache:
        _spell_checker_cache["default"] = SpellChecker()
    return _spell_checker_cache["default"]

def incorrect(story):
    """Relative number of unknown (possibly misspelled) words per sentence.

    Punctuation is removed from a copy of each sentence, the sentence is split
    on whitespace and the size of ``SpellChecker.unknown(...)`` is taken as the
    sentence's count.

    Args:
        story: list of sentence strings. The list is NOT modified.

    Returns:
        One value per sentence: its count divided by the largest count in the
        story, rounded to two decimals. If no sentence has unknown words the
        raw (all-zero) counts are returned. An empty story yields an empty list.
    """
    if len(story) == 0:
        return []

    spell = _get_spell_checker()
    strip_punctuation = str.maketrans('', '', string.punctuation)

    incorrectWord = []
    for sentence in story:
        # Strip punctuation on a copy rather than overwriting story[i].
        words = sentence.translate(strip_punctuation).split()
        incorrectWord.append(len(spell.unknown(words)))

    most = max(incorrectWord)
    if most != 0:
        incorrectWord = [round(count / most, 2) for count in incorrectWord]

    return incorrectWord

In [None]:
#Count the nouns and verbs (by POS tag) of every sentence

# Penn Treebank tags that are counted: noun and verb variants.
_NOUN_VERB_TAGS = frozenset({
    "NNP", "NNPS", "NN", "NNS",
    "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
})

def postags(story):
    """Relative number of noun and verb tokens per sentence.

    Each sentence is split into ``\\w+`` tokens, tagged with ``nltk.pos_tag``
    and the tokens whose tag is one of the noun / verb tags are counted.

    Args:
        story: list of sentence strings.

    Returns:
        One value per sentence: its count divided by the largest count in the
        story, rounded to two decimals. If no sentence has a noun or verb the
        raw (all-zero) counts are returned. An empty story yields an empty list.
    """
    if len(story) == 0:
        return []

    # Build the tokenizer once rather than once per sentence.
    tokenizer = nltk.RegexpTokenizer(r"\w+")

    postags_ct = []
    for sentence in story:
        tagged = nltk.pos_tag(tokenizer.tokenize(sentence))
        postags_ct.append(sum(1 for _, tag in tagged if tag in _NOUN_VERB_TAGS))

    most = max(postags_ct)
    if most != 0:
        postags_ct = [round(count / most, 2) for count in postags_ct]
    return postags_ct

In [None]:
# Use this instead of manually finding tf_isf
def tf_isf(story):
    """Relative TF-IDF weight of each sentence.

    Sentences are treated as the documents of a ``TfidfVectorizer``; a
    sentence's score is the sum of its TF-IDF term weights.

    Args:
        story: list of sentence strings.

    Returns:
        One value per sentence: its summed weight divided by the largest sum in
        the story, rounded to two decimals. If every sum is 0 the raw sums are
        returned. An empty story yields an empty list.
    """
    if len(story) == 0:
        return []

    vectors = TfidfVectorizer().fit_transform(story)
    # Sum each row directly on the sparse matrix; no need to materialise a
    # dense sentences-by-vocabulary table.
    scores = np.asarray(vectors.sum(axis=1)).ravel().tolist()

    highest = max(scores)
    if highest != 0:
        scores = [round(score / highest, 2) for score in scores]
    return scores

In [None]:
# def cosine_similarity(tf_isf_mat):
#   # Pad vectors to have the same length
#   padded_vectors = np.array(pad_sequences(tf_isf_mat, padding='post', value=0, dtype='float'))

#   # Compute cosine similarity between each pair of vectors
#   cosine_similarity_mat = cosine_similarity(padded_vectors)

#   # Compute cosine similarity between each pair of vectors

#   cosine_similarity_mat = np.array(cosine_similarity_mat)

#   # Convert similarity values to distances
#   distances = 1 - cosine_similarity_mat
#   distances = np.round(distances.clip(min=0), 2)
#   return distances

In [None]:
# def aggregation_similarity(cosine_similarity_mat):
#   return [sum(i) for i in cosine_similarity_mat]

In [None]:
# Use this instead of finding similarity manually
def sentence_similarity(story):
    """Relative aggregate cosine similarity of each sentence.

    Sentences are vectorised with ``TfidfVectorizer`` and the pairwise cosine
    similarity matrix is computed; a sentence's score is the sum of its row
    (its similarity to every sentence, itself included).

    Args:
        story: list of sentence strings.

    Returns:
        One value per sentence: its row sum divided by the largest row sum,
        rounded to two decimals. If every sum is 0 the raw sums are returned.
        An empty story yields an empty list.
    """
    if len(story) == 0:
        return []

    vector_matrix = TfidfVectorizer().fit_transform(story)
    cosine_similarity_matrix = cosine_similarity(vector_matrix)
    cosines = cosine_similarity_matrix.sum(axis=1).tolist()

    highest = max(cosines)
    if highest != 0:
        cosines = [round(value / highest, 2) for value in cosines]
    return cosines

In [None]:
def title_feature(story,title):
    """Overlap between each sentence and the article title.

    Args:
        story: list of sentence strings.
        title: title string.

    Returns:
        One value per sentence: the number of its tokens that also occur in the
        title (case-sensitive), divided by the number of title tokens. A title
        without tokens yields 0.0 for every sentence instead of raising
        ZeroDivisionError. An empty story yields an empty list.
    """
    if len(story) == 0:
        return []

    title_words = word_tokenize(title)
    length_title = len(title_words)
    if length_title == 0:
        return [0.0] * len(story)

    # Set lookup instead of scanning the title token list for every word.
    title_vocab = set(title_words)
    title_features = []
    for sentence in story:
        score = sum(1 for word in word_tokenize(sentence) if word in title_vocab)
        title_features.append(score / length_title)
    return title_features

In [None]:
from scipy.sparse import coo_matrix
import networkx as nx
import matplotlib.pyplot as plt

def busy_path(article, threshold=0.95):
  """Bushy-path score: number of graph edges attached to each sentence.

  Sentences are vectorised with TF-IDF, the cosine distance (1 - similarity,
  clipped at 0 and rounded to two decimals) is computed for every pair, and a
  graph is built with an edge wherever the distance is at least ``threshold``.
  The score of a sentence is the degree of its node.

  NOTE(review): edges are kept for distances >= threshold, i.e. between highly
  dissimilar sentences — confirm that this, rather than high similarity, is
  the intended criterion.

  Args:
    article: list of sentence strings.
    threshold: minimum distance for an edge (default 0.95, the value that was
      previously hard-coded).

  Returns:
    List with one integer degree per sentence; empty list for an empty article.
  """
  if len(article) == 0:
    return []

  Tfidf_vect = TfidfVectorizer()
  # Use the function argument; the body previously read the global `story`.
  vector_matrix = Tfidf_vect.fit_transform(article)
  cosine_similarity_matrix = cosine_similarity(vector_matrix)
  distances = 1 - cosine_similarity_matrix
  distances = np.round(distances.clip(min=0), 2)

  # Zero out every entry below the threshold so it produces no edge.
  m = distances.copy()
  m[m < threshold] = 0

  # Only the remaining non-zero entries are stored and become graph edges.
  sparse_matrix = coo_matrix(m)
  # Create graph
  G = nx.from_scipy_sparse_matrix(sparse_matrix)
  bushy_path_mat = [degree for _, degree in G.degree]
  return bushy_path_mat

In [None]:
def text_rank(article, threshold=0.95, damping=0.85, iterations=100):
  """PageRank-style score for each sentence.

  Sentences are vectorised with TF-IDF and the cosine distance
  (1 - similarity, clipped at 0 and rounded to two decimals) is computed for
  every pair. Sentence j is a neighbour of sentence i when their distance is
  at least ``threshold``. Ranks start at 1 and are updated in place for
  ``iterations`` sweeps with

      rank[i] = (1 - damping) + damping * sum(rank[x] / len(neighbours[x]))

  over the neighbours x of i (values updated earlier in a sweep are used
  immediately).

  Args:
    article: list of sentence strings.
    threshold: minimum distance for two sentences to be neighbours
      (default 0.95, previously hard-coded).
    damping: damping factor (default 0.85, previously hard-coded).
    iterations: number of sweeps (default 100, previously hard-coded).

  Returns:
    List with one rank per sentence; empty list for an empty article.
  """
  if len(article) == 0:
    return []

  Tfidf_vect = TfidfVectorizer()
  # Use the function argument; the body previously read the global `story`.
  vector_matrix = Tfidf_vect.fit_transform(article)
  cosine_similarity_matrix = cosine_similarity(vector_matrix)
  distances = 1 - cosine_similarity_matrix
  distances = np.round(distances.clip(min=0), 2)

  # Adjacency list: for every sentence, the indices of its neighbours.
  m = [[j for j, dist in enumerate(row) if dist >= threshold] for row in distances]

  n = len(m)
  rank = [1 for _ in range(n)]

  # Calculate rank for each iteration
  for _ in range(iterations):
    for i in range(n):
      rank[i] = (1 - damping) + damping * sum(rank[x] / len(m[x]) for x in m[i])

  return rank

# Compute the features for every article and save them all to a single CSV file

In [None]:
# Build one feature table per article, then combine them in a single concat.
# (Concatenating inside the loop re-copies all previous rows on every iteration.)
feature_frames = []

# for i in range(2):
for i in range(len(articles['Article'])):
  print(i)
  # Each 'Article' cell holds the string form of a Python list of sentences.
  story = literal_eval(articles['Article'][i])
  # NOTE(review): the 'File Number ' key has a trailing space — kept as-is
  # because other code may read the column by that exact name; confirm.
  feature_frames.append(pd.DataFrame({
          'File Number ': "F" + str(i),
          'Sentence Number': sentence_num(story),
          'Sentence length': sentencelength(story),
          'Sentence Position': sentenceposition(story),
          'Numeric Data': numericdata(story),
          'Named Entity': NamedEntity(story),
          'Special Charecters': specialcharecters(story),
          # 'Thematic Words': thematicwords(story),
          'Upper Case': Uppercase(story),
          'Entropy': entropy(story),
          'Incorrect Word': incorrect(story),
          'POS Tags': postags(story),
          'Term Weight': tf_isf(story),
          'Cosine Similarity': sentence_similarity(story), # aggregation similarity
          # 'Title Feature': title_feature(story, title), # what is title here
          'Bushy Path': busy_path(story),
          'Text Rank': text_rank(story)
      }))

  # Periodically wipe the progress output so the cell does not grow unbounded.
  if i%20 == 0:
    clear_output()

# One row per sentence across all articles; stays None when there are no articles.
all_data = pd.concat(feature_frames, ignore_index=True) if feature_frames else None
In [None]:
# Preview the first rows of the combined feature table.
all_data.head()
# all_data.shape
# all_data.tail()

Unnamed: 0,File Number,Sentence Number,Sentence length,Sentence Position,Numeric Data,Named Entity,Special Charecters,Upper Case,Entropy,Incorrect Word,POS Tags,Term Weight,Cosine Similarity,Busy Path,Text Rank
0,F0,S0,0.2,1.0,0.0,0.1,0.06,0.0,0.29,0.09,0.18,0.59,0.79,25,0.795211
1,F0,S1,0.28,0.98,0.35,0.28,0.21,0.0,0.26,0.27,0.29,0.68,0.47,42,1.264775
2,F0,S2,0.18,0.96,0.13,0.14,0.06,0.0,0.21,0.09,0.13,0.58,0.49,41,1.238773
3,F0,S3,0.23,0.94,0.0,0.0,0.05,0.0,0.24,0.0,0.18,0.65,0.6,33,1.010622
4,F0,S4,0.2,0.92,0.06,0.07,0.02,0.0,0.21,0.0,0.16,0.62,0.6,30,0.930889


In [None]:
# (rows, columns) of the combined feature table.
all_data.shape

(15672, 15)

In [None]:
# Write the combined feature table to CSV; index=False leaves the row index out of the file.
# NOTE(review): the `features/` directory is not created anywhere in the visible
# cells — presumably it already exists in the project folder; verify.
all_data.to_csv('features/features_for_all_articles.csv', index=False)