# 실습 [15-1]<br>
**실습명: TextRank를 이용한 추출 요약**<br>
: 추출 요약 기법인 TextRank 알고리즘으로 문장별 스코어를 계산하고, 스코어 순위가 높은 문장을 요약 문장으로 추출

In [None]:
# Import the TextRank-based extractive summarizer from Gensim.
# NOTE(review): the gensim.summarization package was removed in Gensim 4.0,
# so this import needs gensim<4 -- confirm the notebook environment.
from gensim.summarization.summarizer import summarize

# Sample document to summarize (a poem with a heavily repeated refrain).
# NOTE(review): every line after the first starts with "... ", which looks
# like a pasted interpreter continuation prompt -- confirm it is intended.
text = '''Rice Pudding - Poem by Alan Alexander Milne
... What is the matter with Mary Jane?
... She's crying with all her might and main,
... And she won't eat her dinner - rice pudding again -
... What is the matter with Mary Jane?
... What is the matter with Mary Jane?
... I've promised her dolls and a daisy-chain,
... And a book about animals - all in vain -
... What is the matter with Mary Jane?
... What is the matter with Mary Jane?
... She's perfectly well, and she hasn't a pain;
... But, look at her, now she's beginning again! -
... What is the matter with Mary Jane?
... What is the matter with Mary Jane?
... I've promised her sweets and a ride in the train,
... And I've begged her to stop for a bit and explain -
... What is the matter with Mary Jane?
... What is the matter with Mary Jane?
... She's perfectly well and she hasn't a pain,
... And it's lovely rice pudding for dinner again!
... What is the matter with Mary Jane?'''

In [None]:
# Extractive summary: sentences are ranked by a statistical score and the most
# salient ones are returned as-is (no new sentences are generated).
print(summarize(text))

And she won't eat her dinner - rice pudding again -
I've promised her dolls and a daisy-chain,
I've promised her sweets and a ride in the train,
And it's lovely rice pudding for dinner again!


# 실습 [15-2]<br>
**실습명: 비지도 학습을 이용한 음식점 리뷰 추출 요약**<br>
: Kaggle의 [아마존 음식 리뷰 데이터](https://www.kaggle.com/snap/amazon-fine-food-reviews)를 활용해 추출 요약 진행 (기계학습 접근법 사용)<br>

1. K-means 클러스터링을 이용해 비지도 학습을 통해 클러스터 구성
2. 사전 학습된 워드 임베딩(GloVe)을 이용해 클러스터에 속해있는 문장들의 평균값 수치화
3. 요약문장과의 비교를 통해 어느 클러스터가 요약문장과 유사한지 계산

In [None]:
# Import the libraries used in this exercise.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import re  # regular expressions

from nltk.tokenize import sent_tokenize  # sentence tokenization
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import gensim.models.keyedvectors as word2vec
import gc
import string
import nltk
# Download the 'punkt' tokenizer package from NLTK.
nltk.download('punkt')

# Open the Colab upload widget; `uploaded` is later indexed by file name
# (uploaded['Reviews.csv']) to read the review data.
from google.colab import files
uploaded = files.upload()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Saving Reviews.csv to Reviews.csv


In [None]:
import io

# Wrap the uploaded bytes in an in-memory buffer so pandas can parse them as
# a CSV file, then preview the first three rows.
reviews_buffer = io.BytesIO(uploaded['Reviews.csv'])
df = pd.read_csv(reviews_buffer)
df.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


In [None]:
# Split every review into a list of sentences.
def split_sentences(reviews):
    """Replace each review in ``reviews`` with its list of sentences.

    The list is modified in place: entry ``i`` (a review string) becomes the
    list of its sentences as produced by ``sent_tokenize``, each stripped of
    surrounding whitespace, with empty sentences dropped.

    Args:
        reviews: mutable list of review texts. Entries that are not strings
            (for example NaN from an empty CSV cell) become an empty list.

    Returns:
        None. The result is written back into ``reviews``.
    """
    for i, review in enumerate(reviews):
        if not isinstance(review, str):
            # Nothing to tokenize; keep the entry type consistent (a list).
            reviews[i] = []
            continue
        stripped = (sent.strip() for sent in sent_tokenize(review))
        # Write back to the review's own slot, keeping non-empty sentences.
        reviews[i] = [sent for sent in stripped if sent]

# Split every review into sentences; split_sentences rewrites rev_list in
# place, and the result is stored in the 'sent_tokens' column.
rev_list = df['Text'].tolist()
split_sentences(rev_list)
df['sent_tokens'] = rev_list

# Record the length of each 'sent_tokens' entry and keep only the rows whose
# length is greater than choice_length.
choice_length = 5
df['length_of_rv'] = df['sent_tokens'].map(len)
is_long_enough = df['length_of_rv'] > choice_length
df = df[is_long_enough]
df.shape

(568453, 12)

In [None]:
# Tokenizer settings: num_words limits the vocabulary to 5000 entries and
# every sequence is padded or truncated to maxlen tokens.
max_features = 5000
maxlen = 200

# Fit the tokenizer on the review texts, convert each review to a sequence of
# word indices, then pad the sequences to a common length.
list_sentences_train = df['Text']
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)

- GloVe: 단어의 의미를 벡터 공간상의 벡터로 표현하는 워드 임베딩 방법
- fastText: 단어를 더 작은 단위(문자 n-gram)로 쪼개서 표현하는 워드 임베딩 방법

In [None]:
def loadEmbeddingMatrix(typeToLoad):
    """Load pre-trained word vectors from a whitespace-separated text file.

    Each non-blank line of the file is split on whitespace: the first token
    is the word and the remaining tokens are its vector components.

    Args:
        typeToLoad: "glove" (reads 'glove.twitter.27B.25d.txt', 25-d vectors)
            or "fasttext" (reads 'wiki.simple.vec/wiki.simple.vec',
            300-d vectors).

    Returns:
        dict mapping each word to a float32 numpy array.

    Raises:
        ValueError: if ``typeToLoad`` is neither "glove" nor "fasttext".
    """
    if typeToLoad == "glove":
        # Ask for the GloVe file through the Colab upload widget. The file is
        # then opened by name below, so the returned dict is not needed.
        # NOTE(review): relies on the widget saving the upload into the
        # working directory, as the earlier "Saving Reviews.csv to
        # Reviews.csv" output suggests -- confirm.
        files.upload()
        EMBEDDING_FILE = 'glove.twitter.27B.25d.txt'
    elif typeToLoad == "fasttext":
        EMBEDDING_FILE = 'wiki.simple.vec/wiki.simple.vec'
    else:
        raise ValueError(
            "Unsupported embedding type %r; expected 'glove' or 'fasttext'."
            % (typeToLoad,)
        )

    embeddings_index = {}
    # The context manager closes the file even if a line fails to parse.
    with open(EMBEDDING_FILE, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: there is no word to index.
                continue
            # First token is the word, the rest are its coefficients.
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    print('Loaded %s word vectors.' % len(embeddings_index))

    gc.collect()
    return embeddings_index

In [None]:
# Load the GloVe vectors; emb_index is the word -> vector lookup returned by
# loadEmbeddingMatrix.
emb_index = loadEmbeddingMatrix('glove')

In [None]:
# A sentence embedding is the mean of the embeddings of the sentence's words.

def calculate_sentence_embedding(wordList):
    """Average the pre-trained vectors of the words in ``wordList``.

    Words that are missing from the module-level ``emb_index`` lookup, and
    vectors whose length is not 25, are ignored.

    Args:
        wordList: iterable of word strings.

    Returns:
        The result of ``np.mean(..., axis=0)`` over the kept vectors. When no
        word is kept, the mean is taken over an empty array.
    """
    known_vectors = []
    for word in wordList:
        vector = emb_index.get(word)
        if vector is None or len(vector) != 25:
            continue
        known_vectors.append(list(vector))
    return np.mean(np.array(known_vectors), axis=0)

In [None]:
# Pre-processing step: turn each sentence into a vector using
# calculate_sentence_embedding.
def get_sent_embedding(mylist):
    """Compute one embedding per usable sentence in ``mylist``.

    Each sentence is lower-cased, every non-word character is replaced by a
    space, and the result is split into words. A sentence is skipped when:

    * it contains no words at all (a "Sentence Removed" message is printed),
    * two or fewer words remain after dropping punctuation-only tokens, or
    * none of its words yields an embedding.

    Because sentences can be skipped, row ``k`` of the result does not
    necessarily correspond to ``mylist[k]``.

    Args:
        mylist: iterable of sentence strings.

    Returns:
        numpy array with one row per kept sentence; an empty array when no
        sentence is kept.
    """
    sent_emb = []
    for sentence in mylist:
        sentence = sentence.lower()
        words = re.sub(r"[^\w]", " ", sentence).split()
        if not words:
            print("Sentence Removed: ", sentence)
            continue
        # Build a new list rather than calling remove() on the list being
        # iterated, which skips the token following each removed one.
        words = [word for word in words if word not in string.punctuation]
        if len(words) <= 2:
            continue
        embedding = calculate_sentence_embedding(words)
        # Averaging zero vectors does not produce a vector (the result is
        # 0-dimensional), so there is nothing to add for this sentence.
        if np.ndim(embedding) == 0:
            continue
        sent_emb.append(list(embedding))
    return np.array(sent_emb)

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

# Build an extractive summary for (at most) the first 5000 reviews: cluster
# the sentence embeddings of a review with K-means, take the sentence closest
# to each cluster centre, and join those sentences ordered by the mean
# position of each cluster's members within the review.

how_many_summaries = 5000
# Never index past the end of df when fewer reviews are available.
n_reviews = min(how_many_summaries, len(df))
summary = [None] * n_reviews
sent_tokens = df['sent_tokens']
for rv in range(n_reviews):
    review = sent_tokens.iloc[rv]

    # get_sent_embedding drops sentences it cannot embed, so embedding the
    # whole review at once would break the sentence <-> row correspondence.
    # Embed one sentence at a time and remember which sentences were kept.
    kept_sentences = []
    kept_embeddings = []
    for sent in review:
        emb = get_sent_embedding([sent])
        if len(emb) > 0:
            kept_sentences.append(sent)
            kept_embeddings.append(emb[0])

    if not kept_sentences:
        print("This is not a valid review")
        continue

    enc_email = np.array(kept_embeddings)
    # Number of clusters: square root of the number of embedded sentences.
    n_clusters = int(np.ceil(len(enc_email) ** 0.5))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(enc_email)

    # Mean sentence position of each cluster's members, used for ordering.
    avg = [np.mean(np.where(kmeans.labels_ == j)[0]) for j in range(n_clusters)]
    # For each cluster centre, the index of the nearest sentence embedding.
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,
                                               enc_email)
    ordering = sorted(range(n_clusters), key=lambda k: avg[k])
    summary[rv] = ' '.join(kept_sentences[closest[idx]] for idx in ordering)
    print("Done for review # = ", rv)

In [None]:
# Take an independent copy of the summarized rows so that adding a column
# does not operate on a slice of df, and size it from `summary` so the two
# always have the same length.
df_5000 = df.iloc[:len(summary)].copy()
df_5000['PredictedSummary'] = summary
# Save the original text next to its predicted summary.
df_5000[['Text', 'PredictedSummary']].to_csv('top_5000_summary.csv')

In [None]:
# Reload the saved summaries from disk and preview the first 10 rows.
df_result = pd.read_csv('top_5000_summary.csv')
df_result.head(10)