In [93]:
import nltk
import re
import numpy as np
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk import WordNetLemmatizer
from PyPDF2 import PdfFileWriter

In [94]:
with open('cal_thres.txt', 'r') as file:
    doc = file.read()

#### Text preprocessing

1. Removal of special characters

In [95]:
doc = doc.replace('.', '. ')
tokenize_sent = sent_tokenize(doc)

In [96]:
def regex(doc):
    regex = r'[^a-zA-Z0-9\s]'
    text_ = [re.sub(regex, '', i) for i in tokenize_sent]
    return text_

In [97]:
def text_preprocessing(text):
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    tokens = [word.lower() for word in tokens]
    new_tokens = [i for i in tokens if i not in stopwords.words('english')]
    tokens = [word for word in new_tokens if len(word) >= 3]
    stemmer = PorterStemmer()

    tokens = [stemmer.stem(word) for word in tokens]
    
    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    lemmatizer = WordNetLemmatizer()
    
    def prac_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')
    lemmatized_text = ' '.join([prac_lemmatize(token, tag) for token, tag in tagged_corpus])
    
    return lemmatized_text 

In [98]:
def summarize(text_):    
    final_text = np.array([text_preprocessing(i) for i in text_])
    vectorizer = TfidfVectorizer(strip_accents = 'unicode', norm = 'l2')

    matrix = vectorizer.fit_transform(final_text).todense()

    sent_score = matrix.sum(axis=1)

    sent_score_total = sent_score.sum()

    average = sent_score_total/sent_score.shape[0]

    summary = []

    for i in range(sent_score.shape[0]):
        if sent_score[i] >= average:
            summary.append(tokenize_sent[i])
    
    return ''.join(summary)

In [100]:
###Using the functions

In [101]:
text = regex(doc)

In [102]:
processed_text = [text_preprocessing(i) for i in text]

In [90]:
summarized_text = summarize(processed_text)

In [91]:
len(summarized_text)

2054

In [92]:
summarized_text

"Needful to say, i can be very pragmatic in my thinking, introvert, inspiring and convincing\n\n#thinking style: this result shows that i'm more pragmatic in my thinking, offering logical and approachable way to solve issues and view situations beyond theories.one of my approach to issues that defines me as an Advocate is concrete steps to implementing solutions, which as an optimizer, I adopt in my thinking,finding ways to be more productive, efficient and organized.#team role test: For every time I get the chance to participate in a team work, I tend to discover I am able to express a good interpersonal skills and some basic soft skills, yet learning.It also show that I can commuunicate effectively and resolve conflict with team mates\n\n#team role pie chart: this chart shows that I'm more of an executive and innovator characterised by my eagerness to get the work organized and done.Also as the creative generator of a team, strong imagination and desire to be original defines me as a

In [80]:
with open('sum2.txt', 'w') as file:
    file.write(summarized_text)