#Project: Text Summarization
 ## statistical NLP approach

In [9]:
!pip install nltk
!pip install scikit-learn



In [38]:
pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m74.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.3


In [42]:
import fitz  # PyMuPDF for PDF text extraction
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Download necessary NLTK data
nltk.download("punkt")
nltk.download("stopwords")

# Provide a local PDF path (NOT a Google Drive link)
pdf_path = "/content/drive/MyDrive/Mitacs’ Terms of Use.pdf"  # Change this to the actual file path

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        doc = fitz.open(pdf_path)  # Open the PDF
        for page in doc:
            text += page.get_text("text")  # Extract text from each page
        if not text:
            print("Warning: No text extracted. Check the PDF format.")
    except Exception as e:
        print(f"Error: {e}")
    return text

# Extract text from PDF
article_text = extract_text_from_pdf(pdf_path)

# Tokenize into sentences
if article_text:
    sentences = sent_tokenize(article_text)
    print("\nSentences in this article:")
    for i, sentence in enumerate(sentences[:5]):  # Display first 5 sentences
        print(f"{i+1}. {sentence}")
else:
    print("No text found in the PDF.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Sentences in this article:
1.  
 
  
 
Mitacs’s Terms of Use 
 
 
By selecting "Yes" to this section, I: 
 
• 
Agree to provide Mitacs with accurate, complete and detailed information only.
2. • 
Agree to update any data that is out-of-date, comply with all timelines requiring the submission of 
information where it is reasonable for me to do so, and input changes to information as I receive it, in 
a timely way.
3. • 
Recognize that Mitacs uses the information provided to arrange supports and services for my internship 
and to dispense my award payments.
4. I will not hold Mitacs liable should I fail to provide Mitacs with 
accurate, up-to-date, complete information in a timely fashion, resulting in support services not being 
arranged or my payments being delayed.
5. • 
Acknowledge that I am 18 years of age and, therefore, of the legal age in Canada required to provide 
the consent contained herein.


In [43]:

stop_words = set(stopwords.words('english'))

def preprocess(text):
  words = word_tokenize(text.lower())
  words = [word for word in words if word.isalpha() and word not in stop_words]
  return " ".join(words)



processed_sentences = [preprocess(sentence) for sentence in sentences ]

print("Processed Sentences :", processed_sentences)



Processed Sentences : ['mitacs terms use selecting yes section agree provide mitacs accurate complete detailed information', 'agree update data comply timelines requiring submission information reasonable input changes information receive timely way', 'recognize mitacs uses information provided arrange supports services internship dispense award payments', 'hold mitacs liable fail provide mitacs accurate complete information timely fashion resulting support services arranged payments delayed', 'acknowledge years age therefore legal age canada required provide consent contained herein']


In [44]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_sentences)
print (tfidf_matrix)


  (0, 26)	0.4131049253363838
  (0, 42)	0.3084199912964104
  (0, 47)	0.3084199912964104
  (0, 37)	0.3084199912964104
  (0, 51)	0.3084199912964104
  (0, 36)	0.3084199912964104
  (0, 3)	0.24883130700776374
  (0, 28)	0.2065524626681919
  (0, 0)	0.24883130700776374
  (0, 9)	0.24883130700776374
  (0, 15)	0.3084199912964104
  (0, 21)	0.1737584432460257
  (1, 3)	0.21900279936620143
  (1, 21)	0.30585850262969977
  (1, 46)	0.2714483249179972
  (1, 13)	0.2714483249179972
  (1, 10)	0.2714483249179972
  (1, 44)	0.2714483249179972
  (1, 34)	0.2714483249179972
  (1, 39)	0.2714483249179972
  (1, 30)	0.2714483249179972
  (1, 22)	0.2714483249179972
  (1, 8)	0.2714483249179972
  (1, 31)	0.2714483249179972
  (1, 45)	0.21900279936620143
  :	:
  (3, 0)	0.21706717170530437
  (3, 9)	0.21706717170530437
  (3, 21)	0.1515776060853735
  (3, 45)	0.21706717170530437
  (3, 38)	0.21706717170530437
  (3, 27)	0.21706717170530437
  (3, 20)	0.2690491643239955
  (3, 25)	0.2690491643239955
  (3, 17)	0.2690491643239955
  (3

In [45]:
sentence_importance = tfidf_matrix.sum(axis=1).A1
sentence_scores = list(zip(sentences,sentence_importance ))

sorted_sentence_scores = sorted(sentence_scores , key=lambda x: x[1],reverse= True)


print("sorted sentences by Importance :")
for sentence ,score in sorted_sentence_scores:
  print(f"{sentence} (Score: {score})")


sorted sentences by Importance :
I will not hold Mitacs liable should I fail to provide Mitacs with 
accurate, up-to-date, complete information in a timely fashion, resulting in support services not being 
arranged or my payments being delayed. (Score: 3.9298628621900455)
• 
Agree to update any data that is out-of-date, comply with all timelines requiring the submission of 
information where it is reasonable for me to do so, and input changes to information as I receive it, in 
a timely way. (Score: 3.729795675460072)
• 
Recognize that Mitacs uses the information provided to arrange supports and services for my internship 
and to dispense my award payments. (Score: 3.4184621252653455)
 
 
  
 
Mitacs’s Terms of Use 
 
 
By selecting "Yes" to this section, I: 
 
• 
Agree to provide Mitacs with accurate, complete and detailed information only. (Score: 3.390429700052355)
• 
Acknowledge that I am 18 years of age and, therefore, of the legal age in Canada required to provide 
the consent co

In [46]:
top_n = 2
summary_sentences = [sentence for sentence ,score in sorted_sentence_scores[:top_n]]

summary = " ".join(summary_sentences)
print("\n Summary:")
print(summary)

print ( len(summary))


 Summary:
I will not hold Mitacs liable should I fail to provide Mitacs with 
accurate, up-to-date, complete information in a timely fashion, resulting in support services not being 
arranged or my payments being delayed. • 
Agree to update any data that is out-of-date, comply with all timelines requiring the submission of 
information where it is reasonable for me to do so, and input changes to information as I receive it, in 
a timely way.
435


### we can verify the result by using "ROUGE"

In [47]:
pip install rouge-score



In [49]:
from rouge_score import rouge_scorer
reference_summary =article_text

system_summary = summary

scorer =rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'],use_stemmer=True)

scores =scorer.score(reference_summary,system_summary)

print("\n ROUGE Scores:")
print(f"ROUGE-1: {scores ['rouge1']}")
print(f"ROUGE-2: {scores ['rouge1']}")
print(f"ROUGE-L: {scores ['rouge1']}")




 ROUGE Scores:
ROUGE-1: Score(precision=1.0, recall=0.5314685314685315, fmeasure=0.6940639269406392)
ROUGE-2: Score(precision=1.0, recall=0.5314685314685315, fmeasure=0.6940639269406392)
ROUGE-L: Score(precision=1.0, recall=0.5314685314685315, fmeasure=0.6940639269406392)


ROUGE-1: Measures overlap of individual words.                 
ROUGE-2: Measures overlap of two-word sequences (bigrams).               
ROUGE-L: Measures longest common subsequence (LCS) overlap.          



**Precision =1.0 which is good :means the generetad text contains only words that exist in the original text .        


**Recal=0.53 which means that the summary is missing  of important information .It contains only 34% of the key words/phrases

**F1-Score=0.69: the summary misses details

#we increase recall by selection More sentences

In [50]:
top_n = 4
summary_sentences = [sentence for sentence ,score in sorted_sentence_scores[:top_n]]

summary = " ".join(summary_sentences)
print("\n Summary:")
print(summary)
print ( len(summary))


 Summary:
I will not hold Mitacs liable should I fail to provide Mitacs with 
accurate, up-to-date, complete information in a timely fashion, resulting in support services not being 
arranged or my payments being delayed. • 
Agree to update any data that is out-of-date, comply with all timelines requiring the submission of 
information where it is reasonable for me to do so, and input changes to information as I receive it, in 
a timely way. • 
Recognize that Mitacs uses the information provided to arrange supports and services for my internship 
and to dispense my award payments.  
 
  
 
Mitacs’s Terms of Use 
 
 
By selecting "Yes" to this section, I: 
 
• 
Agree to provide Mitacs with accurate, complete and detailed information only.
737


In [51]:
from rouge_score import rouge_scorer
reference_summary =article_text

system_summary = summary

scorer =rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'],use_stemmer=True)

scores =scorer.score(reference_summary,system_summary)

print("\n ROUGE Scores:")
print(f"ROUGE-1: {scores ['rouge1']}")
print(f"ROUGE-2: {scores ['rouge1']}")
print(f"ROUGE-L: {scores ['rouge1']}")




 ROUGE Scores:
ROUGE-1: Score(precision=1.0, recall=0.8391608391608392, fmeasure=0.9125475285171103)
ROUGE-2: Score(precision=1.0, recall=0.8391608391608392, fmeasure=0.9125475285171103)
ROUGE-L: Score(precision=1.0, recall=0.8391608391608392, fmeasure=0.9125475285171103)


##the recall =0.83 📈