<a href="https://colab.research.google.com/github/harikrishnangit/Automatic-Text-Summarizer/blob/master/HARI_TEXT_SUMM2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import re
import string
import numpy as np
import matplotlib.pyplot as plt
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ, ADV
from nltk.corpus import brown, stopwords
from nltk.cluster.util import cosine_distance
from operator import itemgetter
%matplotlib

Using matplotlib backend: agg


In [13]:
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'ignore', 'invalid': 'ignore', 'over': 'warn', 'under': 'ignore'}

In [15]:
import nltk
nltk.download('brown')
sentences = brown.sents('ca04')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [16]:
len(sentences)

88

In [17]:
sentlist=[' '.join(sent) for sent in sentences]

In [18]:
wholesent=""
for i in sentlist:
    wholesent+=i;
print(wholesent)



In [19]:
stop_words = set(stopwords.words("english"))
punctuations = set(string.punctuation)
pos_tags = {
            NOUN: ['NN', 'NNS', 'NNP', 'NNPS', 'PRP', 'PRP$', 'WP', 'WP$'],
            VERB: ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
            ADJ: ['JJ', 'JJR', 'JJS'],
            ADV: ['RB', 'RBR', 'RBS', 'WRB']
}

In [20]:
def remove_stop_words(words):
        return [w for w in words if w not in stop_words]

In [21]:
def remove_regex(sent):
        sent = " ".join([w.lower() for w in sent])
        sent = re.sub(r"i'm", "i am", sent)
        sent = re.sub(r"he's", "he is", sent)
        sent = re.sub(r"can't", "cannot", sent)
        sent = re.sub(r"don't", "do not", sent)
        sent = re.sub(r"that's", "that is", sent)
        sent = re.sub(r"\'ve", " have", sent)
        sent = re.sub(r"\'ll", " will", sent)
        sent = re.sub(r"what's", "what is", sent)
        sent = re.sub(r"where's", "where is", sent)
        sent = re.sub(r"\'re", " are", sent)
        sent = re.sub(r"\'d", " would", sent)
        sent = re.sub(r"she's", "she is", sent)
        sent = re.sub(r"won't", "will not", sent)
        patterns = re.finditer("#[\w]*", sent)
        for pattern in patterns:
            sent = re.sub(pattern.group().strip(), "", sent)
        sent = "".join(ch for ch in sent if ch not in punctuations)
        return sent

In [22]:
def posTagging(words):
        tagged_words = pos_tag(words)
        pos_words = []
        for word in tagged_words:
            flag = False
            for key, value in pos_tags.items():
                if word[1] in value:
                    pos_words.append((word[0], key))
                    flag = True
                    break
            if not flag:
                pos_words.append((word[0], NOUN))
        return pos_words

In [23]:
def preprocessData(sentence):
    sentence= remove_regex(sentence)
    words = word_tokenize(sentence)
    cleanedWords = remove_stop_words(words)
    lem = WordNetLemmatizer()
    pos_words = posTagging(words)
    cleanedWords = [lem.lemmatize(w, pos=p) for w, p in pos_words]
    return cleanedWords

In [24]:
def findSentenceSimilarity(s1, s2):
    s1 = preprocessData(s1)
    s2 = preprocessData(s2)
    allWords = list(set(s1 + s2))
    vectorForS1 = [0] * len(allWords)
    vectorForS2 = [0] * len(allWords)
    for word in s1:
        vectorForS1[allWords.index(word)] += 1
    for word in s2:
        vectorForS2[allWords.index(word)] += 1
    return 1 - cosine_distance(vectorForS1, vectorForS2)

In [25]:
def createSimilarityMatrix(sentences):
    matrix = np.zeros((len(sentences), len(sentences)))
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i == j:
                continue
            else:
                matrix[i][j] = findSentenceSimilarity(sentences[i], sentences[j])
    for i in range(len(matrix)):
        matrix[i] /= matrix[i].sum()
    return matrix

In [30]:
import nltk
nltk.download('averaged_perceptron_tagger')
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [31]:
SimilarityMatrix = createSimilarityMatrix(sentences)

Ranking sentences using PageRank Algorithm

In [32]:
def pagerank(matrix, eps=1.0e-8, d=0.85):
    N = matrix.shape[1]
    v = np.random.rand(N, 1)
    v = v / np.linalg.norm(v, 1)
    last_v = np.ones((N, 1), dtype=np.float32) * np.inf
    M_hat = (d * matrix) + (((1 - d) / N) * np.ones((N, N), dtype=np.float32))
    
    while np.linalg.norm(v - last_v, 2) > eps:
        last_v = v
        v = np.matmul(M_hat, v)
    return v

In [33]:
ranks = pagerank(SimilarityMatrix)

In [34]:
ranks

array([[0.00388809],
       [0.00537445],
       [0.00368312],
       [0.00220138],
       [0.00160533],
       [0.0116334 ],
       [0.01353148],
       [0.00466342],
       [0.00745492],
       [0.02006356],
       [0.02205361],
       [0.0030325 ],
       [0.0111093 ],
       [0.00806791],
       [0.00502207],
       [0.0149853 ],
       [0.01631649],
       [0.00886955],
       [0.02362489],
       [0.00125545],
       [0.0033375 ],
       [0.00161526],
       [0.00982682],
       [0.02179603],
       [0.00953158],
       [0.01985539],
       [0.01907658],
       [0.00737893],
       [0.01823641],
       [0.00244146],
       [0.0080199 ],
       [0.0039383 ],
       [0.01742187],
       [0.00165717],
       [0.00566927],
       [0.01358015],
       [0.0114422 ],
       [0.0234917 ],
       [0.00436433],
       [0.0171251 ],
       [0.00529819],
       [0.0164968 ],
       [0.01936035],
       [0.0173704 ],
       [0.01846608],
       [0.01756296],
       [0.00232095],
       [0.001

In [35]:
sortRankWithindexes = [item[0] for item in sorted(enumerate(ranks), key=lambda item: -item[1])]

In [36]:
sortRankWithindexes

[18,
 37,
 58,
 10,
 23,
 70,
 73,
 79,
 81,
 68,
 84,
 9,
 25,
 63,
 42,
 71,
 50,
 26,
 87,
 72,
 44,
 28,
 86,
 83,
 45,
 32,
 43,
 39,
 74,
 41,
 16,
 85,
 54,
 15,
 65,
 35,
 6,
 80,
 48,
 51,
 5,
 52,
 36,
 12,
 64,
 22,
 24,
 17,
 61,
 13,
 30,
 66,
 77,
 55,
 8,
 27,
 49,
 78,
 59,
 34,
 62,
 60,
 75,
 1,
 40,
 14,
 7,
 57,
 38,
 82,
 31,
 0,
 2,
 67,
 20,
 11,
 76,
 69,
 53,
 56,
 29,
 46,
 3,
 47,
 33,
 21,
 4,
 19]

In [37]:
SummaryLines = 5

Selecting top sentences for summary

In [38]:
selectedSentences = sorted(sortRankWithindexes[:SummaryLines])

In [39]:
summary = itemgetter(*selectedSentences)(sentences)

In [40]:
finalSummary=""
for sentence in summary:
    finalSummary+=' '.join(sentence)

Final Summary

In [41]:
finalSummary

'Canada alone has been somewhat out of step with the Oslo attempt to get all the allied cars back on the track behind the NATO locomotive .The Secretary of State himself , in his first speech , gave some idea of the tremendous march of events inside and outside the United States that has preoccupied the new administration in the past four months .The annual spring meeting has given an impetus in three main directions : more , deeper , and more timely political consultation within the alliance , the use of the Organization for Economic Cooperation and Development ( when ratified ) as a method of coordinating aid to the underdeveloped countries , and the need for strengthening conventional forces as well as the maintenance of the nuclear deterrent .Its spokesmen insist that there has not been time enough to institute reforms in military and economic aid policies in the critical areas .Very early in his administration he informed the Kremlin through diplomatic channels , a high official s

BLEU

In [53]:
#two references for one document
from nltk.translate.bleu_score import corpus_bleu
sentences = [["OsloThe most positive element to emerge from the Oslo meeting of North Atlantic Treaty Organization Foreign Ministers has been the freer , franker , and wider discussions , animated by much better mutual understanding than in past meetings .This has been a working session of an organization that , by its very nature , can only proceed along its route step by step and without dramatic changes .In Oslo , the ministers have met in a climate of candor , and made a genuine attempt to get information and understanding one another's problems .This atmosphere of understanding has been particularly noticeable where relations are concerned between the `` colonialist '' powers and those who have never , or not for a long time , had such problems .The nightmare of a clash between those in trouble in Africa , exacerbated by the difficulties , changes , and tragedies facing them , and other allies who intellectually and emotionally disapprove of the circumstances that have brought these troubles about , has been conspicuous by its absence .Explosion avoidedIn the case of Portugal , which a few weeks ago was rumored ready to walk out of the NATO Council should critics of its Angola policy prove harsh , there has been a noticeable relaxation of tension .The general , remarkably courteous , explanation has left basic positions unchanged , but there has been no explosion in the council .There should even be no more bitter surprises in the UN General Assembly as to NATO members' votes , since a new ad hoc NATO committee has been set up so that in the future such topics as Angola will be discussed in advance .Canada alone has been somewhat out of step with the Oslo attempt to get all the allied cars back on the track behind the NATO locomotive .Even Norway , despite daily but limited manifestations against atomic arms in the heart of this northernmost capital of the alliance , is today closer to the NATO line .On the negative side of the balance sheet must be set some disappointment that the United States leadership has not been as much in evidence as hoped for .One diplomat described the tenor of Secretary of State Dean Rusk's speeches as `` inconclusive '' .But he hastened to add that , if United States policies were not always clear , despite Mr. Rusk's analysis of the various global danger points and setbacks for the West , this may merely mean the new administration has not yet firmly fixed its policy .Exploratory moodA certain vagueness may also be caused by tactical appreciation of the fact that the present council meeting is a semipublic affair , with no fewer than six Soviet correspondents accredited .The impression has nevertheless been given during these three days , despite Mr. Rusk's personal popularity , that the United States delegation came to Oslo in a somewhat tentative and exploratory frame of mind , more ready to listen and learn than to enunciate firm policy on a global scale with detailed application to individual danger spots .The Secretary of State himself , in his first speech , gave some idea of the tremendous march of events inside and outside the United States that has preoccupied the new administration in the past four months .But where the core of NATO is concerned , the Secretary of State has not only reiterated the United States' profound attachment to the alliance , `` cornerstone '' of its foreign policy , but has announced that five nuclear submarines will eventually be at NATO's disposal in European waters .The Secretary of State has also solemnly repeated a warning to the Soviet Union that the United States will not stand for another setback in Berlin , an affirmation once again taken up by the council as a whole .Conflict surveyedThe secretary's greatest achievement is perhaps the rekindling of NATO realization that East-West friction , wherever it take place around the globe , is in essence the general conflict between two entirely different societies , and must be treated as such without regard to geographical distance or lack of apparent connection .The annual spring meeting has given an impetus in three main directions : more , deeper , and more timely political consultation within the alliance , the use of the Organization for Economic Cooperation and Development ( when ratified ) as a method of coordinating aid to the underdeveloped countries , and the need for strengthening conventional forces as well as the maintenance of the nuclear deterrent .This increase in the `` threshold '' , as the conventional forces strengthening is called , will prove one of the alliance's most difficult problems in the months to come .Each ally will have to carry out obligations long since laid down , but never completely fulfilled .WashingtonThe Kennedy administration moves haltingly toward a Geneva conference on Laos just as serious debate over its foreign policy erupts for the first time .There is little optimism here that the Communists will be any more docile at the conference table than they were in military actions on the ground in Laos .The United States , State Department officials explain , now is mainly interested in setting up an international inspection system which will prevent Laos from being used as a base for Communist attacks on neighboring Thailand and South Viet Nam .They count on the aid of the neutral countries attending the Geneva conference to achieve this .The United States hopes that any future Lao Cabinet would not become Communist dominated .But it is apparent that no acceptable formula has been found to prevent such a possibility .Policies modifiedThe inclination here is to accept a de facto cease-fire in Laos , rather than continue to insist on a verification of the cease-fire by the international control commission before participating in the Geneva conference .This is another of the modifications of policy on Laos that the Kennedy administration has felt compelled to make .It excuses these actions as being the chain reaction to basic errors made in the previous administration .Its spokesmen insist that there has not been time enough to institute reforms in military and economic aid policies in the critical areas .But with the months moving on -- and the immediate confrontations with the Communists showing no gain for the free world -- the question arises :How effective have Kennedy administration first foreign policy decisions been in dealing with Communist aggression ? ?Former Vice-President Richard M. Nixon in Detroit called for a firmer and tougher policy toward the Soviet Union .He was critical of what he feels is President Kennedy's tendency to be too conciliatory .GOP restrainedIt does not take a Gallup poll to find out that most Republicans in Congress feel this understates the situation as Republicans see it .They can hardly restrain themselves from raising the question of whether Republicans , if they had been in power , would have made `` amateurish and monumental blunders '' in Cuba .One Republican senator told this correspondent that he was constantly being asked why he didn't attack the Kennedy administration on this score .His reply , he said , was that he agreed to the need for unity in the country now .But he further said that it was better politics to let others question the wisdom of administration policies first .The Republicans some weeks ago served notice through Senator Thruston B. Morton ( R ) of Kentucky , chairman of the Republican National Committee , that the Kennedy administration would be held responsible if the outcome in Laos was a coalition government susceptible of Communist domination .Kennedy administration policies also have been assailed now from another direction by 70 Harvard , Boston University , Brandeis , and Massachusetts Institute of Technology educators .Detente urgedThis group pleads with the administration to `` give no further support for the invasion of Cuba by exile groups '' .It recommends that the United States `` seek instead to detach the Castro regime from the Communist bloc by working for a diplomatic detente and a resumption of trade relations ; ;and concentrate its constructive efforts on eliminating in other parts of Latin America the social conditions on which totalitarian nationalism feeds '' .Mr. Nixon , for his part , would oppose intervention in Cuba without specific provocation .But he did recommend that President Kennedy state clearly that if Communist countries shipped any further arms to Cuba that it would not be tolerated .Until the Cuban fiasco and the Communist military victories in Laos , almost any observer would have said that President Kennedy had blended a program that respected , generally , the opinions voiced both by Mr. Nixon and the professors .Aid plans revampedVery early in his administration he informed the Kremlin through diplomatic channels , a high official source disclosed , that the new administration would react even tougher than the Eisenhower administration would during the formative period of the administration .Strenuous efforts were made to remove pin pricking from administration statements .Policies on nuclear test ban negotiations were reviewed and changed .But thus far there has been no response in kind .Foreign aid programs were revamped to give greater emphasis to economic aid and to encourage political reform in recipient nations .In Laos , the administration looked at the Eisenhower administration efforts to show determination by sailing a naval fleet into Southeast Asian waters as a useless gesture .Again and again it asked the Communists to `` freeze '' the military situation in Laos .But the Communists aided the Pathet Lao at an even faster rate .And after several correspondents went into Pathet Lao territory and exposed the huge build-up , administration spokesmen acclaimed them for performing a `` great service '' and laid the matter before the Southeast Asia Treaty Organization .SEATO was steamed up and prepared contingency plans for coping with the military losses in Laos .But the Communists never gave sufficient provocation at any one time for the United States to want to risk a limited or an all-out war over Laos .( Some SEATO nations disagreed , however .)There was the further complication that the administration had very early concluded that Laos was ill suited to be an ally , unlike its more determined neighbors , Thailand and South Viet Nam .The administration declared itself in favor of a neutralized Laos .The pro-Western government , which the United States had helped in a revolt against the Souvanna Phouma `` neutralist '' government , never did appear to spark much fighting spirit in the Royal Lao Army .There certainly was not any more energy displayed after it was clear the United States would not back the pro-Western government to the hilt .If the administration ever had any ideas that it could find an acceptable alternative to Prince Souvanna Phouma , whom it felt was too trusting of Communists , it gradually had to relinquish them .One factor was the statement of Senator J. W. Fulbright ( D ) of Arkansas , chairman of the Senate Foreign Relations Committee .He declared on March 25 that the United States had erred a year and a half ago by `` encouraging the removal '' of Prince Souvanna .WashingtonThe White House is taking extraordinary steps to check the rapid growth of juvenile delinquency in the United States .The President is deeply concerned over this problem and its effect upon the `` vitality of the nation '' .In an important assertion of national leadership in this field , he has issued an executive order establishing the President's committee on Juvenile Delinquency and Crime , to be supported and assisted by a Citizens Advisory Council of recognized authorities on juvenile problems .The President asks the support and cooperation of Congress in his efforts through the enactment of legislation to provide federal grants to states for specified efforts in combating this disturbing crime trend .Offenses multiplyThe President has also called upon the Attorney General , the Secretary of Health , Education and Welfare , and the Secretary of Labor to coordinate their efforts `` in the development of a program of federal leadership to assist states and local communities in their efforts to cope with the problem .Simultaneously the President announced Thursday the appointment of David L. Hackett , a special assistant to the Attorney General , as executive director of the new Committee on Juvenile Delinquency and Youth Crime .His sense of urgency in this matter stems from the fact that court cases and juvenile arrests have more than doubled since 1948 , each year showing an increase in offenders .Among arrests reported by the Federal Bureau of Investigation in 1959 , about half for burglary and larceny involved persons under 18 years of age."]]

finalSummary = [["Canada alone has been somewhat out of step with the Oslo attempt to get all the allied cars back on the track behind the NATO locomotive .The Secretary of State himself , in his first speech , gave some idea of the tremendous march of events inside and outside the United States that has preoccupied the new administration in the past four months .The annual spring meeting has given an impetus in three main directions : more , deeper , and more timely political consultation within the alliance , the use of the Organization for Economic Cooperation and Development ( when ratified ) as a method of coordinating aid to the underdeveloped countries , and the need for strengthening conventional forces as well as the maintenance of the nuclear deterrent .Its spokesmen insist that there has not been time enough to institute reforms in military and economic aid policies in the critical areas .Very early in his administration he informed the Kremlin through diplomatic channels , a high official source disclosed , that the new administration would react even tougher than the Eisenhower administration would during the formative period of the administration ."]]
score = corpus_bleu(sentences, finalSummary)
print(score)

0
