In [26]:
import numpy as np
import pandas as pd
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

from gensim.summarization.summarizer import summarize
from gensim.summarization.summarizer import summarize_corpus

import re
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# function to remove stopwords
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

In [7]:
# extract word embeddings from GLOVE dataset
word_embeddings = {}
f = open('glove.6B/glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

## Import Data

In [8]:
df = pd.read_csv("chat_summarization.csv")


## Tokenize Sentences

In [9]:
sentences = []
for s in df['article_text']:
    sentences.append(sent_tokenize(s))

sentences = [y for x in sentences for y in x] # flatten list

In [10]:
sentences 

['honestly, things aren’t going that great',
 'i have no idea how i’m doing so far.',
 'we’ve turned in four assignments so far, but i haven’t received feedback on ANY of them.',
 'it’s super frustrating.',
 'i might be doing great, but i might be failing.',
 'i’d ask for more time from the TAs, or try to bring  on some additional TAs so they can turn around assignments faster',
 'the way that they encourage lots of class discussion is really nice',
 "i'd ask for more time from the TAs, or try to bring on some additional TAs so they can turn around assignments faster",
 "this week's class was really a drag",
 'live session was a huge waste of time',
 "the TA didn't spend enough time on the topics we need for the problem set.",
 "I have NO idea what's going on.",
 "Daniel's live sessions are engaging.",
 'i liked talking to my classmates in the breakout rooms, though they can sometimes be awkward...',
 'the TAs seem to really be overwhelmed',
 'office hours are really helpful and the TA

## Clean Text

In [11]:
# remove punctuations, numbers and special characters
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")

# make alphabets lowercase
clean_sentences = [s.lower() for s in clean_sentences]

In [12]:
stop_words = stopwords.words('english')

In [13]:
# remove stopwords from the sentences
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

## Embeddings

In [14]:
sentence_vectors = []
for i in clean_sentences:
    if len(i) != 0:
        v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
    else:
        v = np.zeros((100,))
    sentence_vectors.append(v)

In [15]:
# similarity matrix
sim_mat = np.zeros([len(sentences), len(sentences)])

In [16]:
for i in range(len(sentences)):
    for j in range(len(sentences)):
        if i != j:
            sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]

## Generate TextRank Summary

In [17]:
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [18]:
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
# Extract top 5 sentences as the summary
for i in range(5):
    print(ranked_sentences[i][1])

I just don't like how long it takes to actually GET that feedback.
office hours are really helpful and the TAs really seem like they understand the material.
the live session was just a repeat of the async material, and didn't feel like it was worth the time.
the TA didn't spend enough time on the topics we need for the problem set.
i liked the way he applied last week's material to projects he had worked on in his day job.


## Generate Gensim Summary

In [22]:
s = ". "
text = s.join(sentences)
text

"honestly, things aren’t going that great. i have no idea how i’m doing so far.. we’ve turned in four assignments so far, but i haven’t received feedback on ANY of them.. it’s super frustrating.. i might be doing great, but i might be failing.. i’d ask for more time from the TAs, or try to bring  on some additional TAs so they can turn around assignments faster. the way that they encourage lots of class discussion is really nice. i'd ask for more time from the TAs, or try to bring on some additional TAs so they can turn around assignments faster. this week's class was really a drag. live session was a huge waste of time. the TA didn't spend enough time on the topics we need for the problem set.. I have NO idea what's going on.. Daniel's live sessions are engaging.. i liked talking to my classmates in the breakout rooms, though they can sometimes be awkward.... the TAs seem to really be overwhelmed. office hours are really helpful and the TAs really seem like they understand the materia

In [24]:
print(summarize(text))

i’d ask for more time from the TAs, or try to bring  on some additional TAs so they can turn around assignments faster.
i'd ask for more time from the TAs, or try to bring on some additional TAs so they can turn around assignments faster.
live session was a huge waste of time.
the live session was just a repeat of the async material, and didn't feel like it was worth the time..


In [28]:
print(summarize_corpus(text, .2))

[['s'], [' '], ['i'], ['s'], [' '], [' '], ['i'], [' '], [' '], [' '], ['i'], [' '], [' '], [' '], ['i'], [' '], [' '], ['i'], [' '], ['i'], [' '], ['s'], [' '], [' '], [' '], [' '], ['i'], [' '], [' '], ['s'], ['s'], ['i'], ['s'], [' '], ['s'], [' '], [' '], [' '], ['i'], [' '], [' '], ['i'], [' '], [' '], [' '], [' '], [' '], [' '], ['i'], ['s'], [' '], ['s'], [' '], ['s'], ['i'], [' '], ['i'], [' '], ['i'], [' '], [' '], ['i'], [' '], [' '], [' '], ['i'], [' '], ['i'], [' '], [' '], ['i'], ['i'], [' '], ['i'], [' '], ['s'], [' '], [' '], [' '], ['i'], [' '], [' '], [' '], ['s'], [' '], [' '], [' '], [' '], ['i'], [' '], [' '], [' '], ['s'], [' '], ['i'], ['i'], [' '], ['s'], [' '], ['s'], [' '], [' '], [' '], [' '], [' '], ['s'], ['s'], ['i'], ['s'], [' '], ['s'], [' '], [' '], [' '], [' '], [' '], [' '], ['s'], [' '], [' '], ['s'], ['s'], [' '], ['i'], ['s'], ['s'], ['s'], ['i'], [' '], ['i'], ['s'], [' '], [' '], ['i'], [' '], ['i'], [' '], ['s'], [' '], [' '], [' '], ['i'], [' ']