In [2]:
import numpy as np
from sklearn.preprocessing import normalize

def pagerank(x, df=0.85, max_iter=30, bias=None):
    """
    Compute PageRank scores with a fixed number of power iterations.

    Arguments
    ---------
    x : scipy.sparse.csr_matrix
        Adjacency / weight matrix, shape = (n vertex, n vertex).
        Each column is L1-normalized before iterating.
    df : float
        Damping factor, 0 < df < 1
    max_iter : int
        Number of iterations to run (there is no convergence check)
    bias : numpy.ndarray, array-like or None
        Per-vertex bias of length n vertex.
        If None, every vertex gets the same bias (1 - df).
        Otherwise the bias is rescaled so that it sums to n vertex and is
        then multiplied by (1 - df).

    Returns
    -------
    R : numpy.ndarray
        PageRank vector. shape = (n vertex, 1)
    """

    assert 0 < df < 1

    # initialize: column-normalized transition matrix and a uniform start vector
    A = normalize(x, axis=0, norm='l1')
    n_nodes = A.shape[0]
    R = np.ones(n_nodes).reshape(-1, 1)

    # check bias
    if bias is None:
        bias = (1 - df) * np.ones(n_nodes).reshape(-1, 1)
    else:
        bias = np.asarray(bias).reshape(-1, 1)
        # Validate the length before the bias is used in any arithmetic.
        assert bias.shape[0] == n_nodes
        bias = n_nodes * bias / bias.sum()
        bias = (1 - df) * bias

    # iteration
    for _ in range(max_iter):
        # `@` is always a matrix product; `*` is only a matrix product for
        # scipy.sparse *matrix* classes and would broadcast element-wise for
        # dense arrays or sparse arrays.
        R = df * (A @ R) + bias

    return R
    

In [3]:
from konlpy.tag import Komoran

komoran = Komoran()
def komoran_tokenizer(sent):
    """
    Tokenize a sentence with Komoran and keep selected part-of-speech tags.

    Arguments
    ---------
    sent : str
        Raw sentence

    Returns
    -------
    words : list of str
        Items returned by ``komoran.pos(sent, join=True)`` that contain one of
        the substrings '/NN', '/XR', '/VA' or '/VV'.
    """
    kept_tags = ('/NN', '/XR', '/VA', '/VV')
    return [
        token for token in komoran.pos(sent, join=True)
        if any(tag in token for tag in kept_tags)
    ]

In [5]:
from collections import Counter
from scipy.sparse import csr_matrix
import numpy as np


def scan_vocabulary(sents, tokenize=None, min_count=2):
    """
    Build a frequency-ordered vocabulary from a list of sentences.

    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(str) returns list of str
    min_count : int
        Minimum term frequency; rarer terms are dropped

    Returns
    -------
    idx_to_vocab : list of str
        Vocabulary list, most frequent term first
        (ties keep first-seen order)
    vocab_to_idx : dict
        Vocabulary to index mapper.
    """
    term_freq = Counter()
    for sent in sents:
        for term in tokenize(sent):
            term_freq[term] += 1

    # most_common() sorts by descending count and is stable for equal counts.
    idx_to_vocab = [
        term for term, freq in term_freq.most_common() if freq >= min_count
    ]
    vocab_to_idx = dict(zip(idx_to_vocab, range(len(idx_to_vocab))))
    return idx_to_vocab, vocab_to_idx

def tokenize_sents(sents, tokenize):
    """
    Apply a tokenizer to every sentence.

    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(sent) returns list of str (word sequence)

    Returns
    -------
    tokenized sentence list : list of list of str
        One entry per input sentence, in the same order
    """
    return list(map(tokenize, sents))

def vectorize(tokens, vocab_to_idx):
    """
    Turn tokenized sentences into a sparse bag-of-words matrix.

    Arguments
    ---------
    tokens : list of list of str
        Tokenized sentence list
    vocab_to_idx : dict
        Vocabulary to index mapper; terms missing from it are ignored

    Returns
    -------
    sentence bow : scipy.sparse.csr_matrix
        shape = (n_sents, n_terms), entry (i, j) is the count of term j
        in sentence i
    """
    row_ids, col_ids, counts = [], [], []
    for row, sent_tokens in enumerate(tokens):
        for term, freq in Counter(sent_tokens).items():
            col = vocab_to_idx.get(term, -1)
            if col != -1:
                row_ids.append(row)
                col_ids.append(col)
                counts.append(freq)
    shape = (len(tokens), len(vocab_to_idx))
    return csr_matrix((counts, (row_ids, col_ids)), shape=shape)


In [6]:
from collections import defaultdict
from scipy.sparse import csr_matrix

def word_graph(sents, tokenize=None, min_count=2, window=2,
    min_cooccurrence=2, vocab_to_idx=None, verbose=False):
    """
    Build a word co-occurrence graph from raw sentences.

    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(str) returns list of str
    min_count : int
        Minimum term frequency (only used when the vocabulary is scanned here)
    window : int
        Co-occurrence window size
    min_cooccurrence : int
        Minimum cooccurrence frequency
    vocab_to_idx : dict
        Vocabulary to index mapper.
        If None, the vocabulary is scanned from `sents` first.
    verbose : Boolean
        If True, verbose mode on

    Returns
    -------
    co-occurrence word graph : scipy.sparse.csr_matrix
    idx_to_vocab : list of str
        Word list corresponding to the rows and columns
    """
    if vocab_to_idx is not None:
        # Recover the word list by ordering the words by their index.
        idx_to_vocab = sorted(vocab_to_idx, key=vocab_to_idx.get)
    else:
        idx_to_vocab, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)

    tokenized = tokenize_sents(sents, tokenize)
    graph = cooccurrence(tokenized, vocab_to_idx, window, min_cooccurrence, verbose)
    return graph, idx_to_vocab

def cooccurrence(tokens, vocab_to_idx, window=2, min_cooccurrence=2, verbose=False):
    """
    Count word co-occurrences inside a sliding window.

    Arguments
    ---------
    tokens : list of list of str
        Tokenized sentence list
    vocab_to_idx : dict
        Vocabulary to index mapper; out-of-vocabulary tokens are dropped
        before the window is applied
    window : int
        Co-occurrence window size: a word co-occurs with the words at most
        `window` positions before or after it.
        If window <= 0, every pair of words in a sentence co-occurs.
    min_cooccurrence : int
        Minimum cooccurrence count; smaller entries are removed
    verbose : Boolean
        If True, verbose mode on

    Returns
    -------
    co-occurrence matrix : scipy.sparse.csr_matrix
        shape = (n_vocabs, n_vocabs), symmetric.
        Every co-occurring position pair is visited once from each side and
        both directions are incremented on each visit, so one co-occurrence
        contributes 2 to entry (a, b) and 2 to entry (b, a).
        `min_cooccurrence` is compared against these doubled counts.
    """
    counter = defaultdict(int)
    n_sents = 0  # tracked explicitly so the final message also works for empty input
    for s, tokens_i in enumerate(tokens):
        n_sents = s + 1
        if verbose and s % 1000 == 0:
            print('\rword cooccurrence counting {}'.format(s), end='')
        vocabs = [vocab_to_idx[w] for w in tokens_i if w in vocab_to_idx]
        n = len(vocabs)
        for i, v in enumerate(vocabs):
            if window <= 0:
                b, e = 0, n
            else:
                b = max(0, i - window)
                # range() excludes its end, so +1 is needed to reach position
                # i + window and keep the window symmetric around i.
                e = min(i + window + 1, n)
            for j in range(b, e):
                if i == j:
                    continue
                counter[(v, vocabs[j])] += 1
                counter[(vocabs[j], v)] += 1
    counter = {k: v for k, v in counter.items() if v >= min_cooccurrence}
    n_vocabs = len(vocab_to_idx)
    if verbose:
        print('\rword cooccurrence counting from {} sents was done'.format(n_sents))
    return dict_to_mat(counter, n_vocabs, n_vocabs)

def dict_to_mat(d, n_rows, n_cols):
    """
    Convert a {(row, col): value} dictionary into a sparse matrix.

    Arguments
    ---------
    d : dict
        key : (i,j) tuple
        value : float value
    n_rows : int
        Number of rows of the result
    n_cols : int
        Number of columns of the result

    Returns
    -------
    scipy.sparse.csr_matrix
        shape = (n_rows, n_cols)
    """
    rows, cols, data = [], [], []
    for (i, j), v in d.items():
        rows.append(i)
        cols.append(j)
        data.append(v)
    return csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))


In [7]:
from collections import Counter
import math
import numpy as np
import scipy as sp
from scipy.sparse import csr_matrix
from sklearn.metrics import pairwise_distances


def sent_graph(sents, tokenize=None, min_count=2, min_sim=0.3,
    similarity=None, vocab_to_idx=None, verbose=False):
    """
    Build a sentence similarity graph.

    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(sent) return list of str
    min_count : int
        Minimum term frequency (only used when the vocabulary is scanned here)
    min_sim : float
        Minimum similarity between sentences
    similarity : callable or str
        available similarity = [callable, 'cosine', 'textrank']
        - callable : similarity(s1, s2) returns float, s1 and s2 are
          list of str. Every sentence pair is scored once in pure Python
          and the result is mirrored, so the function is assumed to be
          symmetric; self-similarity is not computed.
        - 'cosine' : cosine similarity of bag-of-words vectors
        - anything else (including None) : textrank similarity
    vocab_to_idx : dict
        Vocabulary to index mapper.
        If None, this function scans the vocabulary first.
        Not used when `similarity` is callable.
    verbose : Boolean
        If True, verbose mode on

    Returns
    -------
    sentence similarity graph : scipy.sparse.csr_matrix
        shape = (n sents, n sents)
    """

    # A user-supplied similarity function works on token lists, so it cannot
    # use the vectorized bag-of-words paths below.
    if callable(similarity):
        tokens = [tokenize(sent) for sent in sents]
        upper = graph_with_python_sim(tokens, verbose, similarity, min_sim)
        # graph_with_python_sim only fills entries with i < j; mirror them so
        # the graph is undirected like the vectorized variants.
        return (upper + upper.T).tocsr()

    if vocab_to_idx is None:
        _, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)

    x = vectorize_sents(sents, tokenize, vocab_to_idx)
    if similarity == 'cosine':
        x = numpy_cosine_similarity_matrix(x, min_sim, verbose, batch_size=1000)
    else:
        x = numpy_textrank_similarity_matrix(x, min_sim, verbose, batch_size=1000)
    return x

def vectorize_sents(sents, tokenize, vocab_to_idx):
    """
    Tokenize sentences and build their sparse bag-of-words matrix.

    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(sent) returns list of str
    vocab_to_idx : dict
        Vocabulary to index mapper; tokens missing from it are ignored

    Returns
    -------
    scipy.sparse.csr_matrix
        shape = (n_sents, n_vocabs), entry (i, j) is the count of term j
        in sentence i
    """
    entries = []
    for row, sent in enumerate(sents):
        for token, count in Counter(tokenize(sent)).items():
            col = vocab_to_idx.get(token, -1)
            if col != -1:
                entries.append((row, col, count))

    rows = [entry[0] for entry in entries]
    cols = [entry[1] for entry in entries]
    data = [entry[2] for entry in entries]
    return csr_matrix((data, (rows, cols)), shape=(len(sents), len(vocab_to_idx)))

def numpy_cosine_similarity_matrix(x, min_sim=0.3, verbose=True, batch_size=1000):
    """
    Compute a thresholded cosine similarity matrix between the rows of `x`.

    Arguments
    ---------
    x : scipy.sparse.csr_matrix
        Row vectors (one row per sentence)
    min_sim : float
        Similarities below this value are dropped
    verbose : Boolean
        If True, progress is printed
    batch_size : int
        Number of rows compared against all rows at once

    Returns
    -------
    Sparse matrix of shape (n_rows, n_rows) stacked from the per-batch
    csr_matrix blocks
    """
    n_rows = x.shape[0]
    n_batches = math.ceil(n_rows / batch_size)
    blocks = []
    for batch in range(n_batches):
        start = int(batch * batch_size)
        stop = min(n_rows, int((batch + 1) * batch_size))
        # cosine distance -> cosine similarity for this slice against all rows
        batch_sim = 1 - pairwise_distances(x[start:stop], x, metric='cosine')
        keep_rows, keep_cols = np.where(batch_sim >= min_sim)
        kept = batch_sim[keep_rows, keep_cols]
        blocks.append(
            csr_matrix((kept, (keep_rows, keep_cols)), shape=(stop - start, n_rows))
        )
        if verbose:
            print('\rcalculating cosine sentence similarity {} / {}'.format(start, n_rows), end='')
    stacked = sp.sparse.vstack(blocks)
    if verbose:
        print('\rcalculating cosine sentence similarity was done with {} sents'.format(n_rows))
    return stacked

def numpy_textrank_similarity_matrix(x, min_sim=0.3, verbose=True, min_length=1, batch_size=1000):
    n_rows, n_cols = x.shape

    # Boolean matrix
    rows, cols = x.nonzero()
    data = np.ones(rows.shape[0])
    z = csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))

    # Inverse sentence length
    size = np.asarray(x.sum(axis=1)).reshape(-1)
    size[np.where(size <= min_length)] = 10000
    size = np.log(size)

    mat = []
    for bidx in range(math.ceil(n_rows / batch_size)):

        # slicing
        b = int(bidx * batch_size)
        e = min(n_rows, int((bidx+1) * batch_size))

        # dot product
        inner = z[b:e,:] * z.transpose()

        # sentence len[i,j] = size[i] + size[j]
        norm = size[b:e].reshape(-1,1) + size.reshape(1,-1)
        norm = norm ** (-1)
        norm[np.where(norm == np.inf)] = 0

        # normalize
        sim = inner.multiply(norm).tocsr()
        rows, cols = (sim >= min_sim).nonzero()
        data = np.asarray(sim[rows, cols]).reshape(-1)

        # append
        mat.append(csr_matrix((data, (rows, cols)), shape=(e-b, n_rows)))

        if verbose:
            print('\rcalculating textrank sentence similarity {} / {}'.format(b, n_rows), end='')

    mat = sp.sparse.vstack(mat)
    if verbose:
        print('\rcalculating textrank sentence similarity was done with {} sents'.format(n_rows))

    return mat

def graph_with_python_sim(tokens, verbose, similarity, min_sim):
    """
    Build a sentence graph by scoring every sentence pair in pure Python.

    Arguments
    ---------
    tokens : list of list of str
        Tokenized sentence list
    verbose : Boolean
        If True, progress is printed
    similarity : callable or str
        'cosine' selects cosine_sent_sim, any other callable is used as is,
        everything else selects textrank_sent_sim
    min_sim : float
        Pairs scoring below this value are skipped

    Returns
    -------
    scipy.sparse.csr_matrix
        shape = (n_sents, n_sents); only entries with row < column are filled
    """
    if similarity == 'cosine':
        score_pair = cosine_sent_sim
    elif callable(similarity):
        score_pair = similarity
    else:
        score_pair = textrank_sent_sim

    n_sents = len(tokens)
    sentences = list(tokens)
    rows, cols, data = [], [], []
    for i in range(n_sents):
        if verbose and i % 1000 == 0:
            print('\rconstructing sentence graph {} / {} ...'.format(i, n_sents), end='')
        # only the upper triangle (j > i) is evaluated
        for j in range(i + 1, n_sents):
            score = score_pair(sentences[i], sentences[j])
            if score < min_sim:
                continue
            rows.append(i)
            cols.append(j)
            data.append(score)
    if verbose:
        print('\rconstructing sentence graph was constructed from {} sents'.format(n_sents))
    return csr_matrix((data, (rows, cols)), shape=(n_sents, n_sents))

def textrank_sent_sim(s1, s2):
    """
    TextRank similarity of two tokenized sentences: the number of distinct
    shared tokens divided by log(len(s1)) + log(len(s2)).

    Arguments
    ---------
    s1, s2 : list of str
        Tokenized sentences

    Returns
    -------
    Sentence similarity : float
        Non-negative number; 0 when either sentence has at most one token
    """
    len1, len2 = len(s1), len(s2)
    if min(len1, len2) <= 1:
        return 0
    shared = set(s1) & set(s2)
    return len(shared) / (math.log(len1) + math.log(len2))

def cosine_sent_sim(s1, s2):
    """
    Cosine similarity of the term-count vectors of two tokenized sentences.

    Arguments
    ---------
    s1, s2 : list of str
        Tokenized sentences

    Returns
    -------
    Sentence similarity : float
        Non-negative number; 0 when either sentence is empty
    """
    if not (s1 and s2):
        return 0

    bow1, bow2 = Counter(s1), Counter(s2)
    dot = sum(freq * bow2.get(term, 0) for term, freq in bow1.items())
    norm1 = math.sqrt(sum(freq ** 2 for freq in bow1.values()))
    norm2 = math.sqrt(sum(freq ** 2 for freq in bow2.values()))
    return dot / (norm1 * norm2)


In [8]:
import numpy as np


class KeywordSummarizer:
    """
    Keyword extractor: ranks words by PageRank over a word co-occurrence graph.

    Arguments
    ---------
    sents : list of str
        Sentence list. If given, the model is trained in the constructor.
    tokenize : callable
        Tokenize function: tokenize(str) = list of str
    min_count : int
        Minimum frequency of words that will be used to construct the word graph
    window : int
        Word cooccurrence window size. Default is -1.
        '-1' means there is cooccurrence between two words if the words occur in a sentence
    min_cooccurrence : int
        Minimum cooccurrence frequency of two words
    vocab_to_idx : dict or None
        Vocabulary to index mapper
    df : float
        PageRank damping factor
    max_iter : int
        Number of PageRank iterations
    verbose : Boolean
        If True, it shows training progress
    """
    def __init__(self, sents=None, tokenize=None, min_count=2,
        window=-1, min_cooccurrence=2, vocab_to_idx=None,
        df=0.85, max_iter=30, verbose=False):

        self.tokenize = tokenize
        self.min_count = min_count
        self.window = window
        self.min_cooccurrence = min_cooccurrence
        self.vocab_to_idx = vocab_to_idx
        self.df = df
        self.max_iter = max_iter
        self.verbose = verbose

        if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
        """
        Build the word graph from `sents` and store the PageRank scores
        in `self.R` (1-D array) and the word list in `self.idx_to_vocab`.

        Arguments
        ---------
        sents : list of str
            Sentence list
        bias : None or numpy.ndarray
            PageRank bias term

        Returns
        -------
        None
        """

        g, self.idx_to_vocab = word_graph(sents,
            self.tokenize, self.min_count,self.window,
            self.min_cooccurrence, self.vocab_to_idx, self.verbose)
        self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
        if self.verbose:
            print('trained TextRank. n words = {}'.format(self.R.shape[0]))

    def keywords(self, topk=30):
        """
        Arguments
        ---------
        topk : int
            Number of keywords selected from TextRank

        Returns
        -------
        keywords : list of tuple
            Each tuple stands for (word, rank), highest rank first

        Raises
        ------
        RuntimeError
            If the model has not been trained yet
        """
        if not hasattr(self, 'R'):
            raise RuntimeError('Train textrank first or use summarize function')
        # Reverse first, then slice: `argsort()[-topk:]` would return *all*
        # words for topk=0 because -0 == 0.
        idxs = self.R.argsort()[::-1][:topk]
        keywords = [(self.idx_to_vocab[idx], self.R[idx]) for idx in idxs]
        return keywords

    def summarize(self, sents, topk=30, bias=None):
        """
        Train on `sents` and return the top keywords.

        Arguments
        ---------
        sents : list of str
            Sentence list
        topk : int
            Number of keywords selected from TextRank
        bias : None or numpy.ndarray
            PageRank bias term, forwarded to train_textrank

        Returns
        -------
        keywords : list of tuple
            Each tuple stands for (word, rank)
        """
        self.train_textrank(sents, bias)
        return self.keywords(topk)


class KeysentenceSummarizer:
    """
    Key-sentence extractor: ranks sentences by PageRank over a sentence
    similarity graph.

    Arguments
    ---------
    sents : list of str
        Sentence list. If given, the model is trained in the constructor.
    tokenize : callable
        Tokenize function: tokenize(str) = list of str
    min_count : int
        Minimum frequency of words that will be used to construct the sentence graph
    min_sim : float
        Minimum similarity between sentences in sentence graph
    similarity : str
        available similarity = ['cosine', 'textrank']
    vocab_to_idx : dict or None
        Vocabulary to index mapper
    df : float
        PageRank damping factor
    max_iter : int
        Number of PageRank iterations
    verbose : Boolean
        If True, it shows training progress
    """
    def __init__(self, sents=None, tokenize=None, min_count=2,
        min_sim=0.3, similarity=None, vocab_to_idx=None,
        df=0.85, max_iter=30, verbose=False):

        self.tokenize = tokenize
        self.min_count = min_count
        self.min_sim = min_sim
        self.similarity = similarity
        self.vocab_to_idx = vocab_to_idx
        self.df = df
        self.max_iter = max_iter
        self.verbose = verbose

        if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
        """
        Build the sentence graph from `sents` and store the PageRank scores
        in `self.R` (1-D array, one score per sentence).

        Arguments
        ---------
        sents : list of str
            Sentence list
        bias : None or numpy.ndarray
            PageRank bias term
            Shape must be (n_sents,)

        Returns
        -------
        None
        """
        g = sent_graph(sents, self.tokenize, self.min_count,
            self.min_sim, self.similarity, self.vocab_to_idx, self.verbose)
        self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
        if self.verbose:
            print('trained TextRank. n sentences = {}'.format(self.R.shape[0]))

    def summarize(self, sents, topk=30, bias=None):
        """
        Train on `sents` and return the highest ranked sentences.

        Arguments
        ---------
        sents : list of str
            Sentence list
        topk : int
            Number of key-sentences to be selected.
        bias : None or numpy.ndarray
            PageRank bias term
            Shape must be (n_sents,)

        Returns
        -------
        keysents : list of tuple
            Each tuple stands for (sentence index, rank, sentence),
            highest rank first

        Raises
        ------
        ValueError
            If bias is neither None nor a numpy.ndarray of shape (n_sents,)

        Usage
        -----
            >>> from textrank import KeysentenceSummarizer

            >>> summarizer = KeysentenceSummarizer(tokenize = tokenizer, min_sim = 0.5)
            >>> keysents = summarizer.summarize(texts, topk=30)
        """
        n_sents = len(sents)
        if isinstance(bias, np.ndarray):
            if bias.shape != (n_sents,):
                raise ValueError('The shape of bias must be (n_sents,) but {}'.format(bias.shape))
        elif bias is not None:
            raise ValueError('The type of bias must be None or numpy.ndarray but the type is {}'.format(type(bias)))

        self.train_textrank(sents, bias)
        # Reverse first, then slice: `argsort()[-topk:]` would return *all*
        # sentences for topk=0 because -0 == 0.
        idxs = self.R.argsort()[::-1][:topk]
        keysents = [(idx, self.R[idx], sents[idx]) for idx in idxs]
        return keysents

In [10]:
def komoran_tokenize(sent):
    """
    Split an already-tagged string on whitespace and keep the pieces that
    contain one of the substrings '/NN', '/XR', '/VA' or '/VV'.

    Arguments
    ---------
    sent : str
        Whitespace-separated tagged tokens

    Returns
    -------
    words : list of str
    """
    kept_tags = ('/NN', '/XR', '/VA', '/VV')
    return [w for w in sent.split() if any(tag in w for tag in kept_tags)]

In [14]:
import pandas as pd

In [140]:
# Load the sample table; the path is relative to the current working directory.
data = pd.read_csv('sample.csv')
# Replace missing values with empty strings (rebinds `data`, so the raw frame is no longer available).
data = data.fillna('')
# Preview the first three rows.
data.head(3)

Unnamed: 0,mem_no,mem_sex,mate_conts,komoran
0,5,m,반갑습니다. 여행과 운동 그리고 음악을 좋아하는 남자입니다. 취미를 함께하며 평생...,"반갑/VA, 습니다/EF, ./SF, /NA, 여행/NNG, 과/JC, 운동/NNG..."
1,1407510,m,안녕하세요 회사다니며 추말엔 캠핑 다니고 있는 평범한 직장인이에요 처음엔 낯설고 잘...,"안녕하세요/IC, 회사/NNG, 다니/VV, 며/EC, 추/NNG, 말/NNG, 에..."
2,2,m,화려한 돌씽입니다. 맘 맞는분 만나서 남은 반생 같이하고 싶습니다. 여행가는것 좋...,"화려/XR, 하/XSA, ㄴ/ETM, 돌/VV, 씽/NNP, 이/VCP, ㅂ니다/E..."


In [19]:
# Collect the 'komoran' column as a plain Python list.
texts = data['komoran'].tolist()
# NOTE(review): a bare `texts` displays the whole list; consider `texts[:3]` to keep the output short.
texts

['반갑/VA, 습니다/EF, ./SF, /NA, 여행/NNG, 과/JC, 운동/NNG, 그리고/MAJ, 음악/NNG, 을/JKO, 좋아하/VV, 는/ETM, 남자/NNG, 이/VCP, ㅂ니다/EF, ./SF, 취미/NNG, 를/JKO, 함께/MAG, 하/XSV, 며/EC, 평생/NNG, 인생/NNG, 의/JKG, 동반자/NNG, 로/JKB, 즐겁/VA, 게/EC, 살/VV, 고/EC, 싶/VX, 습니다/EF, ./SF, /NA, 좋/VA, 은/ETM, 인연/NNG, 이/JKS, 되/VV, 길/NNP, !/SF',
 '안녕하세요/IC, 회사/NNG, 다니/VV, 며/EC, 추/NNG, 말/NNG, 에/JKB, 는/JX, 캠핑/NNG, 다니/VV, 고/EC, 있/VV, 는/ETM, 평범/XR, 하/XSA, ㄴ/ETM, 직장인/NNG, 이/VCP, 에요/EC, 처음/NNG, 에/JKB, 는/JX, 낯설/VA, 고/EC, 잘/MAG, 모르/VV, 지만/EC, 서로/NNG, 에/JKB, 대하/VV, 아/EC, 알/VV, 아/EC, 가/VX, 면서/EC, 진지/XR, 하/XSA, ㄴ/ETM, 관계/NNG, 로/JKB, 발전/NNG, 하/XSV, 았/EP, 으면/EC, 하/VX, 아요/EC',
 '화려/XR, 하/XSA, ㄴ/ETM, 돌/VV, 씽/NNP, 이/VCP, ㅂ니다/EF, ./SF, 맘/NNG, 맞/VV, 는/ETM, 분/NNB, 만나/VV, 아서/EC, 남/VV, 은/ETM, 반생/NNG, 같이/MAG, 하/XSV, 고/EC, 싶/VX, 습니다/EF, ./SF, /NA, 여행가/NNG, 이/VCP, 는/ETM, 것/NNB, 좋아하/VV, ㅂ니다/EC',
 '반갑/VA, 습니다/EF, ./SF, 새롭/VA, ㄴ/ETM, 인연/NNG, 기다리/VV, ㅂ니다/EF, ./SF, 같/VA, 은/ETM, 생각/NNG, 으로/JKB, 같/VA, 은/ETM, 길/NNG, 을/JKO, 가/VV, 았/EP, 으면/EC, 하/VX, ㅂ니다/EF, ./SF',
 '여보/IC, 이

In [105]:
# Split every entry of `texts` with get_lst; only the result for the last entry stays bound to f_lst.
# NOTE(review): get_lst is defined in a later cell (In [97]), so this cell fails on a fresh top-to-bottom run.
for cde in texts:
    f_lst = get_lst(cde)
    

['반갑/VA', '습니다/EF', './SF', '/NA', '여행/NNG', '과/JC', '운동/NNG', '그리고/MAJ', '음악/NNG', '을/JKO', '좋아하/VV', '는/ETM', '남자/NNG', '이/VCP', 'ㅂ니다/EF', './SF', '취미/NNG', '를/JKO', '함께/MAG', '하/XSV', '며/EC', '평생/NNG', '인생/NNG', '의/JKG', '동반자/NNG', '로/JKB', '즐겁/VA', '게/EC', '살/VV', '고/EC', '싶/VX', '습니다/EF', './SF', '/NA', '좋/VA', '은/ETM', '인연/NNG', '이/JKS', '되/VV', '길/NNP', '!/SF']
['안녕하세요/IC', '회사/NNG', '다니/VV', '며/EC', '추/NNG', '말/NNG', '에/JKB', '는/JX', '캠핑/NNG', '다니/VV', '고/EC', '있/VV', '는/ETM', '평범/XR', '하/XSA', 'ㄴ/ETM', '직장인/NNG', '이/VCP', '에요/EC', '처음/NNG', '에/JKB', '는/JX', '낯설/VA', '고/EC', '잘/MAG', '모르/VV', '지만/EC', '서로/NNG', '에/JKB', '대하/VV', '아/EC', '알/VV', '아/EC', '가/VX', '면서/EC', '진지/XR', '하/XSA', 'ㄴ/ETM', '관계/NNG', '로/JKB', '발전/NNG', '하/XSV', '았/EP', '으면/EC', '하/VX', '아요/EC']
['화려/XR', '하/XSA', 'ㄴ/ETM', '돌/VV', '씽/NNP', '이/VCP', 'ㅂ니다/EF', './SF', '맘/NNG', '맞/VV', '는/ETM', '분/NNB', '만나/VV', '아서/EC', '남/VV', '은/ETM', '반생/NNG', '같이/MAG', '하/XSV', '고/EC', '싶/VX', '습니다/EF', './SF', '/NA', '여행

In [48]:
# Take the second entry of `texts` as a working example and display it.
t1 = texts[1]
t1

'안녕하세요/IC, 회사/NNG, 다니/VV, 며/EC, 추/NNG, 말/NNG, 에/JKB, 는/JX, 캠핑/NNG, 다니/VV, 고/EC, 있/VV, 는/ETM, 평범/XR, 하/XSA, ㄴ/ETM, 직장인/NNG, 이/VCP, 에요/EC, 처음/NNG, 에/JKB, 는/JX, 낯설/VA, 고/EC, 잘/MAG, 모르/VV, 지만/EC, 서로/NNG, 에/JKB, 대하/VV, 아/EC, 알/VV, 아/EC, 가/VX, 면서/EC, 진지/XR, 하/XSA, ㄴ/ETM, 관계/NNG, 로/JKB, 발전/NNG, 하/XSV, 았/EP, 으면/EC, 하/VX, 아요/EC'

In [97]:
def get_lst(t1):
    """
    Split a string on commas and strip surrounding whitespace from each piece.

    Arguments
    ---------
    t1 : str
        Comma-separated text

    Returns
    -------
    list of str
        Stripped pieces in their original order (a string without commas
        yields a one-element list)
    """
    return [piece.strip() for piece in t1.split(',')]

In [99]:
# Split the example string t1 into its comma-separated pieces and display them.
abc = get_lst(t1)
abc

['안녕하세요/IC',
 '회사/NNG',
 '다니/VV',
 '며/EC',
 '추/NNG',
 '말/NNG',
 '에/JKB',
 '는/JX',
 '캠핑/NNG',
 '다니/VV',
 '고/EC',
 '있/VV',
 '는/ETM',
 '평범/XR',
 '하/XSA',
 'ㄴ/ETM',
 '직장인/NNG',
 '이/VCP',
 '에요/EC',
 '처음/NNG',
 '에/JKB',
 '는/JX',
 '낯설/VA',
 '고/EC',
 '잘/MAG',
 '모르/VV',
 '지만/EC',
 '서로/NNG',
 '에/JKB',
 '대하/VV',
 '아/EC',
 '알/VV',
 '아/EC',
 '가/VX',
 '면서/EC',
 '진지/XR',
 '하/XSA',
 'ㄴ/ETM',
 '관계/NNG',
 '로/JKB',
 '발전/NNG',
 '하/XSV',
 '았/EP',
 '으면/EC',
 '하/VX',
 '아요/EC']

In [116]:
# Split each element of `ggg` on commas and print the pieces.
# NOTE(review): `ggg` is not defined in any cell shown here; it must come from a deleted or hidden cell — confirm and define it explicitly.
for text in ggg:
    final_lst = get_lst(text)
    print(final_lst)

['반갑습니다.  여행과 운동 그리고 음악을 좋아하는 남자입니다. 취미를 함께하며 평생 인생의 동반자로 즐겁게 살고싶습니다.  좋은 인연이 되길!']
['안녕하세요 회사다니며 추말엔 캠핑 다니고 있는 평범한 직장인이에요 처음엔 낯설고 잘 모르지만 서로에 대해 알아가면서 진지한 관계로 발전했으면해요']
['화려한 돌씽입니다. 맘 맞는분 만나서 남은 반생 같이하고 싶습니다.  여행가는것 좋아합니다']
['반갑습니다. 새로운 인연 기다립니다. 같은 생각으로 같은 길을 갔으면 합니다.']
['여보야는 2015년에 런칭했으며 2022년도에는 4만쌍의 성혼을 이루도록 더 많은 홍보를 하겠습니다. 많은 관심부탁드립니다.']
['좋은사람 좋은인연 기다립니다.그럼 오늘하루도..즐겁고 행복한 하루 되세요~']
['진지한 만남을 원하며 천천히 한사람을 알아가는 걸 좋아합니다']
['세상의 흔한 만남보다. 세상에서 제일 아름다운 사랑을 하고 싶습니다. 조건따지는 분은 패스']
['']
['평범한 직장여성입니다. 워킹맘으로 앞만 보고 살아왔는데 이제는진정 서로 아끼는 평생의 반려자를 만나 노후는 행복하게 살고 싶습니다. 연하는 싫습니다']
['착하게 만나보고 싶어서 왔어요 착한분 연락되면 좋겠네요 프로필 보시고 쪽지주세요~ 파주운정문산쪽분 계시면 좋아요']
['늦은 나이까지 인연을 못 만나고 있네요^^;; 서로에게 힘이 될 수 있는 좋은 사람을 만나고 싶습니당~ 진지한 만남 원해요']
['매사 긍정적이고 배려심많고 심서미곱고 착한여자입니다']
['편하게 만날수 있는 인연을 찾고 싶어서 가입했어요 진지하게 만날수 있는 사람이면 좋겠어요']
['69~71년생 친구 같은 분', '별거중이고 이혼 소송중이신 분은 떳떳히 밝히시고 같은 처지 정리안되신 여자를 만나세요. 연애', '동거 즐기는 삶보다는 함께 늙어가며 서로를 책임지고 무덤까지 함께 갈수 있는 짝을 만나고 싶습니다.. 커리어우먼 보다는 한남자의 사랑 받는 아내로 살고 싶습니다.말많고 목소리 큰남자', '돌직구', '매사에 트집',

In [112]:
# Take the third entry of `texts` and display it.
t2 = texts[2]
t2

'화려/XR, 하/XSA, ㄴ/ETM, 돌/VV, 씽/NNP, 이/VCP, ㅂ니다/EF, ./SF, 맘/NNG, 맞/VV, 는/ETM, 분/NNB, 만나/VV, 아서/EC, 남/VV, 은/ETM, 반생/NNG, 같이/MAG, 하/XSV, 고/EC, 싶/VX, 습니다/EF, ./SF, /NA, 여행가/NNG, 이/VCP, 는/ETM, 것/NNB, 좋아하/VV, ㅂ니다/EC'

In [None]:
# NOTE(review): get_lst is defined with one required parameter, so this call raises TypeError; the cell is unexecuted (In [None]) and looks like a leftover.
get_lst()

In [119]:
# Take the second element of `ggg` (NOTE(review): `ggg` is not defined in any cell shown here).
g1 = ggg[1]

In [120]:
# Split g1 on commas; a string without commas comes back as a single-element list.
f_lst = get_lst(g1)
f_lst

['안녕하세요 회사다니며 추말엔 캠핑 다니고 있는 평범한 직장인이에요 처음엔 낯설고 잘 모르지만 서로에 대해 알아가면서 진지한 관계로 발전했으면해요']

In [122]:
# Display g1 for comparison with the split result.
g1

'안녕하세요 회사다니며 추말엔 캠핑 다니고 있는 평범한 직장인이에요 처음엔 낯설고 잘 모르지만 서로에 대해 알아가면서 진지한 관계로 발전했으면해요'

In [207]:
komoran = Komoran()
def komoran_tokenize(sent):
    """
    Tokenize a sentence with Komoran and keep only tokens tagged '/NNG'.

    This redefines the whitespace-based komoran_tokenize from an earlier cell.

    Arguments
    ---------
    sent : str
        Raw sentence

    Returns
    -------
    words : list of str
        Items returned by ``komoran.pos(sent, join=True)`` that contain '/NNG'
    """
    # NOTE(review): the previous filter also required
    # `len(w) >= 2 or len(w) < 4`, which is true for every length and so never
    # removed anything; it has been dropped without changing the result.
    # If a word-length filter was intended, it needs `and` and must measure
    # the word without its '/NNG' suffix — confirm the intended bounds.
    return [w for w in komoran.pos(sent, join=True) if '/NNG' in w]

In [208]:
# Tokenize every 'mate_conts' value and store the token lists in a new 'textrank' column (mutates `data` in place).
data['textrank'] = data['mate_conts'].apply(komoran_tokenize)
data.head(3)

Unnamed: 0,mem_no,mem_sex,mate_conts,komoran,textrank
0,5,m,반갑습니다. 여행과 운동 그리고 음악을 좋아하는 남자입니다. 취미를 함께하며 평생...,"반갑/VA, 습니다/EF, ./SF, /NA, 여행/NNG, 과/JC, 운동/NNG...","[여행/NNG, 운동/NNG, 음악/NNG, 남자/NNG, 취미/NNG, 평생/NN..."
1,1407510,m,안녕하세요 회사다니며 추말엔 캠핑 다니고 있는 평범한 직장인이에요 처음엔 낯설고 잘...,"안녕하세요/IC, 회사/NNG, 다니/VV, 며/EC, 추/NNG, 말/NNG, 에...","[회사/NNG, 추/NNG, 말/NNG, 직장인/NNG, 처음/NNG, 서로/NNG..."
2,2,m,화려한 돌씽입니다. 맘 맞는분 만나서 남은 반생 같이하고 싶습니다. 여행가는것 좋...,"화려/XR, 하/XSA, ㄴ/ETM, 돌/VV, 씽/NNP, 이/VCP, ㅂ니다/E...","[맘/NNG, 반생/NNG, 여행/NNG]"


In [209]:
# Display the token-list column.
data['textrank']

0     [여행/NNG, 운동/NNG, 음악/NNG, 남자/NNG, 취미/NNG, 평생/NN...
1     [회사/NNG, 추/NNG, 말/NNG, 직장인/NNG, 처음/NNG, 서로/NNG...
2                               [맘/NNG, 반생/NNG, 여행/NNG]
3                                       [생각/NNG, 길/NNG]
4        [여보/NNG, 런/NNG, 쌍/NNG, 홍보/NNG, 관심/NNG, 부탁/NNG]
5                      [오늘/NNG, 하루/NNG, 행복/NNG, 하루/NNG]
6                                      [만남/NNG, 사람/NNG]
7      [세상/NNG, 만남/NNG, 세상/NNG, 제일/NNG, 사랑/NNG, 조건/NNG]
8                                                    []
9     [직장/NNG, 여성/NNG, 앞/NNG, 이제/NNG, 평생/NNG, 반려자/NN...
10                                    [연락/NNG, 프로필/NNG]
11              [나이/NNG, 인연/NNG, 서로/NNG, 힘/NNG, 사람/NNG]
12                      [매사/NNG, 긍정/NNG, 배려/NNG, 심/NNG]
13                             [인연/NNG, 가입/NNG, 사람/NNG]
14    [친구/NNG, 처지/NNG, 정리/NNG, 안/NNG, 여자/NNG, 삶/NNG,...
15      [오늘/NNG, 가입/NNG, 평생/NNG, 행복/NNG, 삶/NNG, 긍정/NNG]
16                     [거주/NNG, 박사/NNG, 스타/NNG, 부모/NNG]
17                                     [인연/NNG, 

In [149]:
# Rebind t1 to the token list of the first row (t1 previously held a string from `texts`).
t1 = data['textrank'].iloc[0]
t1

['반갑습니다/NNP',
 '여행/NNG',
 '운동/NNG',
 '음악/NNG',
 '좋아하/VV',
 '남자/NNG',
 '취미/NNG',
 '평생/NNG',
 '인생/NNG',
 '동반자/NNP',
 '즐겁/VA',
 '살/VV',
 '좋/VA',
 '인연/NNG',
 '되/VV']

In [166]:
# Configure the keyword extractor; no sentences are passed, so nothing is trained yet.
keyword_extractor = KeywordSummarizer(
    tokenize = komoran_tokenize,
    # window = -1: any two words in the same sentence count as co-occurring
    window = -1,
    verbose = False
)

In [167]:
# Train on t1 and print each keyword with its rank.
# NOTE(review): summarize() documents `sents` as a list of sentences, but t1 is the token list of a
# single row, so every token is tokenized again as if it were a sentence — confirm this is intended
# (passing the list of raw sentences may be what was meant).
keywords = keyword_extractor.summarize(t1, topk=30)
for word, rank in keywords:
    print('{} ({:.3})'.format(word, rank))

VV/NNP (0.15)


In [176]:
# Count how often each token occurs in t1.
counter = Counter(t1)
print(counter)

Counter({'반갑습니다/NNP': 1, '여행/NNG': 1, '운동/NNG': 1, '음악/NNG': 1, '좋아하/VV': 1, '남자/NNG': 1, '취미/NNG': 1, '평생/NNG': 1, '인생/NNG': 1, '동반자/NNP': 1, '즐겁/VA': 1, '살/VV': 1, '좋/VA': 1, '인연/NNG': 1, '되/VV': 1})


In [177]:
# Keep tokens with count >= 1; counts produced by Counter over a list are always >= 1, so nothing is removed.
# Rebinds `counter` from a Counter to a plain dict.
counter = {w:c for w,c in counter.items() if c >=1}
counter

{'반갑습니다/NNP': 1,
 '여행/NNG': 1,
 '운동/NNG': 1,
 '음악/NNG': 1,
 '좋아하/VV': 1,
 '남자/NNG': 1,
 '취미/NNG': 1,
 '평생/NNG': 1,
 '인생/NNG': 1,
 '동반자/NNP': 1,
 '즐겁/VA': 1,
 '살/VV': 1,
 '좋/VA': 1,
 '인연/NNG': 1,
 '되/VV': 1}