In [None]:
import os
import struct
from string import punctuation
from collections import defaultdict
from heapq import nlargest
import operator
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fetch the NLTK resources this notebook relies on: the Punkt tokenizer models
# (sent_tokenize / word_tokenize), the WordNet data behind WordNetLemmatizer,
# and the English stop-word list.
# Recent NLTK releases load Punkt from the separate 'punkt_tab' resource, so it
# is requested as well; on older releases this should just be a harmless extra
# download (NOTE(review): confirm against the pinned NLTK version).
for resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Tokens that carry no weight when ranking words: NLTK's English stop words
# plus every ASCII punctuation character.
STOP_WORDS = set(stopwords.words('english')).union(punctuation)

def tokenize(text):
    """Split ``text`` into lower-cased, lemmatized tokens.

    Tokens are lower-cased *before* lemmatization: the WordNet lemmatizer only
    matches lower-case entries, so a capitalised form such as "Problems" would
    otherwise be left untouched and end up different from "problems".

    Args:
        text: The string to tokenize.

    Returns:
        A list of strings, one per token produced by ``nltk.word_tokenize``
        (punctuation tokens included), each lower-cased, lemmatized with the
        lemmatizer's default part of speech, and stripped of surrounding
        whitespace.
    """
    # One lemmatizer for the whole call instead of a new instance per token.
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token.lower()).strip()
        for token in nltk.word_tokenize(text)
    ]

def get_frequencies(text):
    """Return the TF-IDF weight of every non-stop-word term in ``text``.

    The text is treated as a single-document corpus and tokenized with
    ``tokenize``; tokens found in ``STOP_WORDS`` are ignored. With only one
    document every term gets the same IDF, so the weights are effectively
    normalised term counts.

    Args:
        text: The document to analyse.

    Returns:
        A dict mapping each term to its weight. Empty when ``text`` is empty
        or contains only whitespace.

    Raises:
        ValueError: Propagated from scikit-learn when no term survives
            stop-word removal (empty vocabulary).
    """
    if not text or not text.strip():
        return {}

    # Stop words are compared against the tokenizer's output, so also register
    # the tokenized form of each stop word (word_tokenize splits contractions
    # such as "don't", and lemmatization can alter a word).
    stop_words = set(STOP_WORDS)
    for word in STOP_WORDS:
        stop_words.update(tokenize(word))

    # These settings must be passed by keyword: positionally they would land
    # on the vectorizer's ``input`` and ``encoding`` parameters instead.
    tfidf = TfidfVectorizer(
        tokenizer=tokenize,
        token_pattern=None,  # not used once a tokenizer is supplied
        stop_words=sorted(stop_words),
    )
    tfs = tfidf.fit_transform([text])

    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn releases; support both.
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()

    return {feature_names[col]: tfs[0, col] for col in tfs.nonzero()[1]}


def generate_summary(text, top_n_sentences, summary_ratio=0.2):
    """Build an extractive summary from the highest-scoring sentences.

    Each sentence is scored by summing the ``get_frequencies`` weights of its
    tokens. The best sentences are then returned in their original order.

    Args:
        text: The document to summarise.
        top_n_sentences: Upper bound on the number of sentences returned.
            ``None`` or a value <= 0 leaves only the ratio-based limit in
            force.
        summary_ratio: Fraction of the document's sentences that may appear in
            the summary (default 0.2). At least one sentence is kept whenever
            the text contains any.

    Returns:
        A tuple ``(summarised_text, sorted_freqs)``: the selected sentences in
        document order, and the ``(term, weight)`` pairs sorted by descending
        weight.
    """
    freqs = get_frequencies(text)
    sentences = sent_tokenize(text)

    # Tokenize sentences the same way the weights were keyed (lower-case, then
    # ``tokenize``) so that lemmatized terms actually match.
    sentence_scores = defaultdict(int)
    for i, sentence in enumerate(sentences):
        for token in tokenize(sentence.lower()):
            if token in freqs:
                sentence_scores[i] += freqs[token]

    # Sentence budget: a share of the text, never zero for a non-empty text,
    # and never more than the caller asked for.
    number_sentences = int(len(sentences) * summary_ratio)
    if sentences:
        number_sentences = max(1, number_sentences)
    if top_n_sentences is not None and top_n_sentences > 0:
        number_sentences = min(number_sentences, top_n_sentences)

    # Apply the cap while ranking by score, so it keeps the best sentences
    # rather than whichever of them happen to come first in the text.
    index_important_sentences = nlargest(number_sentences,
                                         sentence_scores,
                                         key=sentence_scores.get)

    sorted_freqs = sorted(freqs.items(), key=operator.itemgetter(1), reverse=True)

    # Restore document order so the summary reads naturally.
    summarised_text = [sentences[i] for i in sorted(index_important_sentences)]

    return summarised_text, sorted_freqs

In [None]:
# Sample e-mail 1: a customer-support reply that ends with a sign-off, the
# sender's name and role, and a postal address.
email_1 = '''
Hi Nicole,

Thank you for keeping me updated on this issue. I'm happy to hear that the issue got resolved after all. 
You can now use the app in its full functionality again. 
Also many thanks for your suggestions. We hope to improve this feature in the future. 

In case you experience any further problems with the app, please don't hesitate to contact me again.

Best regards,

John D.
Customer Support

0000 Sunshine Parkway
Mountain View, CA
United States'''
# Sample e-mail 2: a longer informal message made up largely of questions;
# note that it uses typographic apostrophes (’) rather than ASCII ones.
email_2 = '''
Dear Jennifer,

Thanks for the advertisement you sent me about the Fashion Show. I think it will be a fantastic event and I want to be there, but before I come to Paris I need to know a little more about train timetables and the show. Could you give me more information?
I’m trying to decide which train to take. I think the best option is to take the last one from Berlin, but it doesn’t arrive in Paris until about ten o’clock. Will that be OK, or is it too late for you?
I’ve never been to a fashion show before. I’ve never been to Paris, either, so I need your advice. What kind of clothes do you think I should wear? And what’s the weather like at the moment? Is it warm or rather cold? Do I need to bring some warm clothes? What about rain? What are the weather forecasts? Do they say it is going to rain within the next 2 days or not? I don’t know if I am able to pack into my bag, it isn’t too big. Maybe I will take 2 of them.
By the way, what shall we do on Sunday? How about going for a walk in the park, or going on a river cruise? Or maybe you’ve got some other, better ideas? Tell me if something comes to your mind.
Anyway, I can’t wait to see you. I’m looking forward to hearing from you. Answer me as soon as it’s possible.

Cheers,
Meghan''' 

# Sample e-mail 3: a short informal reply signed with initials only.
email_3 = '''
Hello Michael,

Thank you for your e mail I received a week ago. I’m sorry that I didn’t reply to your letter sooner, but I’m spending my time now in a summer house where Internet connection is very weak.
Anyway, it’s a great place to relax and take a break after a hard year in our school.
I don’t watch TV here nor do I use a mobile phone or a computer, so I have much more time than on my hands than I do usually. I feel healthier and I’m tanned. I swim a lot in a local lake, ride a bike and play football with local people. I’ve made many new friends.
I wish you were here. Would you like to come along the next year? I bet you’ll be satisfied.
Answer me.

Thanks, 
P. G.'''

In [None]:
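# The signature-stripping step below needs the third-party `talon` package;
# uncomment the following line to install it if it is missing.
#!pip3 install talon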

In [None]:
from talon.signature.bruteforce import extract_signature

def preprocess(email):
    """Reduce a raw e-mail to its body text.

    The signature is removed with talon's ``extract_signature``, every line is
    stripped of surrounding whitespace, empty lines are dropped, and the
    remaining lines are re-joined with newlines before being handed to
    ``extract_salutation``.

    Args:
        email: The raw e-mail text.

    Returns:
        Whatever ``extract_salutation`` returns for the cleaned text.
    """
    # The second element (the detected signature) is not needed here.
    body, _signature = extract_signature(email)

    stripped_lines = [line.strip() for line in body.split('\n')]
    kept_lines = [line for line in stripped_lines if line != '']

    # NOTE(review): extract_salutation is neither defined nor imported in the
    # visible part of this notebook - confirm where it comes from, otherwise
    # this raises NameError on a fresh kernel.
    return extract_salutation('\n'.join(kept_lines))



In [None]:
# Clean each sample e-mail, then summarise all three as one document.
input1 = preprocess(email_1)
input2 = preprocess(email_2)
input3 = preprocess(email_3)

# Join with a newline: plain concatenation glued the last sentence of one
# e-mail to the first sentence of the next ("...again.Thanks for..."), so the
# sentence tokenizer saw the two as a single sentence.
# The result is named combined_emails rather than input so that the builtin
# input() is not shadowed for the rest of the session.
combined_emails = '\n'.join([input1, input2, input3])

summarised_text, sorted_freqs = generate_summary(combined_emails, 5)
summarised_text

["In case you experience any further problems with the app, please don't hesitate to contact me again.Thanks for the advertisement you sent me about the Fashion Show.",
 'I think it will be a fantastic event and I want to be there, but before I come to Paris I need to know a little more about train timetables and the show.',
 'I think the best option is to take the last one from Berlin, but it doesn’t arrive in Paris until about ten o’clock.',
 'Do they say it is going to rain within the next 2 days or not?',
 'I’m sorry that I didn’t reply to your letter sooner, but I’m spending my time now in a summer house where Internet connection is very weak.']