In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from itertools import chain
import re
%matplotlib inline


In [2]:
df = pd.read_pickle('df_dataset.pkl')

In [3]:
corpus = " ".join(df.text.tolist())

In [4]:
corpus = re.sub(pattern='\n|\t|ㆍ|\r|\.|,|;|:|<|>|-|_|=|\+|/|\?|!|@|\(|\)|\*|&|！|\^|#|%|\$|~|？|\[|\]|\"|[0-9]', repl=" ", string=corpus)
corpus = re.sub(pattern='( )+', repl=' ', string=corpus)
corpus = re.sub(pattern='(ㅎ)+', repl='ㅎ', string=corpus)
corpus = re.sub(pattern='(ㅋ)+', repl='ㅋ', string=corpus)

In [5]:
corpus[:100]

'또 주문 합니다 잇몸에 자극이 없어서 좋아욤 흡수력은 좋은고 같구요 다른 비타민씨제품이 요즘 약간 속당김이 느껴지는거 같아 바꿔볼까 고민중입니다 이너로 입기에도 충전재가 약하네요 '

In [6]:
from collections import defaultdict
count= defaultdict(lambda: 0)

# for doc in docs:
for word in corpus.split():
    n = len(word)
    for e in range(1, n+1):
        count[word[:e]] += 1


In [7]:
def cohesion(w):
    try:
        c = pow(count[w]/(count[w[0]]), 1/(len(w)-1))
        return c
    except ZeroDivisionError:
        print(w)
        exit()

def ltokenize(w):
    n = len(w)
    if n <= 2: 
        return (w, '') 
    
    tokens = []
    for e in range(2, n+1):
        tokens.append((w[:e], w[e:], cohesion(w[:e])))
    tokens = sorted(tokens, key=lambda x:-x[2])
    return tokens[0][:2]

In [8]:
df = pd.DataFrame.from_dict(count, orient='index')
df = df.reset_index()
df.columns = ['word', 'cnt']
df.word = df.word.astype('str')

In [9]:
df.head()

Unnamed: 0,word,cnt
0,또,2939
1,주,15517
2,주문,9938
3,합,3219
4,합니,3069


In [10]:
# c = pow(count[w]/(count[w[0]]), 1/(len(w)-1))

def cohesion(row):
     if len(row['word']) != 1:
        return 1
        return pow(row['cnt']/(df.loc[df.word == row['word'][0], 'cnt']), 1/(len(row['word'])-1))
     else:
        return None

df['cohesion'] = df.apply(lambda row: cohesion(row), axis=1) 
# {k: cohesion(k) for k in count.keys() if len(k) != 1}

In [11]:
df.head()

Unnamed: 0,word,cnt,cohesion
0,또,2939,
1,주,15517,
2,주문,9938,1.0
3,합,3219,
4,합니,3069,1.0


In [14]:
ltokenize_score = {k: ltokenize(k) for k in cohesion_score.keys()}

In [17]:
df = pd.DataFrame.from_dict(cohesion_score, orient='index')

In [18]:
df.head()

Unnamed: 0,0
주문,0.640459
합니,0.953402
합니다,0.968757
잇몸,0.209945
잇몸에,0.246523


In [36]:
cohesion_score

{'또': ('또', ''),
 '주': ('주', ''),
 '주문': ('주문', ''),
 '합': ('합', ''),
 '합니': ('합니', ''),
 '합니다': ('합니', '다'),
 '잇': ('잇', ''),
 '잇몸': ('잇몸', ''),
 '잇몸에': ('잇몸', '에'),
 '자': ('자', ''),
 '자극': ('자극', ''),
 '자극이': ('자극', '이'),
 '없': ('없', ''),
 '없어': ('없어', ''),
 '없어서': ('없어', '서'),
 '좋': ('좋', ''),
 '좋아': ('좋아', ''),
 '좋아욤': ('좋아', '욤'),
 '흡': ('흡', ''),
 '흡수': ('흡수', ''),
 '흡수력': ('흡수', '력'),
 '흡수력은': ('흡수', '력은'),
 '좋은': ('좋은', ''),
 '좋은고': ('좋은', '고'),
 '같': ('같', ''),
 '같구': ('같구', ''),
 '같구요': ('같구', '요'),
 '다': ('다', ''),
 '다른': ('다른', ''),
 '비': ('비', ''),
 '비타': ('비타', ''),
 '비타민': ('비타', '민'),
 '비타민씨': ('비타', '민씨'),
 '비타민씨제': ('비타', '민씨제'),
 '비타민씨제품': ('비타', '민씨제품'),
 '비타민씨제품이': ('비타', '민씨제품이'),
 '요': ('요', ''),
 '요즘': ('요즘', ''),
 '약': ('약', ''),
 '약간': ('약간', ''),
 '속': ('속', ''),
 '속당': ('속당', ''),
 '속당김': ('속당김', ''),
 '속당김이': ('속당김', '이'),
 '느': ('느', ''),
 '느껴': ('느껴', ''),
 '느껴지': ('느껴', '지'),
 '느껴지는': ('느껴', '지는'),
 '느껴지는거': ('느껴', '지는거'),
 '같아': ('같아', ''),
 '바': ('바', 