In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import statistics

from konlpy.tag import Twitter
from konlpy.utils import pprint
twitter = Twitter()
from collections import Counter

pd.set_option('max_colwidth', 500)

common_front = '../../dataset/go_0715/gogosing_' 
common_back = '.json'

def get_file(file_num):
    """Load one numbered JSON dump into a DataFrame.

    The path is built as ``common_front + str(file_num) + common_back``.

    Args:
        file_num: Number that selects the file to read.

    Returns:
        pandas.DataFrame built from the decoded JSON content.
    """
    file_path = common_front + str(file_num) + common_back
    # The context manager closes the handle even when decoding fails.
    with open(file_path) as json_file:
        data = json.load(json_file)
    return pd.DataFrame(data)


# File 74 raised an error while loading, so it is skipped by default.
def concat_file(file_num, skip=(74,)):
    """Load files 1..file_num with get_file and stack them into one DataFrame.

    File 1 is always loaded first; files 2..file_num follow, except the
    numbers listed in ``skip``.

    Args:
        file_num: Highest file number to load (inclusive).
        skip: File numbers (2 or greater) to leave out. Defaults to ``(74,)``.

    Returns:
        pandas.DataFrame with all rows and a fresh 0..n-1 index.
    """
    frames = [get_file(1)]
    frames.extend(get_file(i) for i in range(2, file_num + 1) if i not in skip)
    # Concatenate once: calling pd.concat inside the loop re-copies all
    # previously loaded rows on every iteration.
    return pd.concat(frames).reset_index(drop=True)

In [13]:
go_data = concat_file(156)

In [14]:
len(go_data)

772829

In [15]:
# Keep 4 decimal places (the value is rounded at the 5th decimal digit)
def short_float(val):
    """Return ``val`` rounded to four decimal places, as a float."""
    return float(format(val, ".4f"))


In [16]:
def print_group(title, groups, used_func):
    """Print one line per group and return the per-group results.

    Group ``k`` (1-based) is labelled with the range that starts at
    ``0.1 * (k - 1)`` and is 0.1 wide.

    Args:
        title: Heading, printed with ``pprint`` before the groups.
        groups: Iterable of groups.
        used_func: Callable applied to each group; its result is printed
            and collected.

    Returns:
        List with ``used_func(group)`` for every group, in order.
    """
    step = 0.1
    results = []

    pprint(title)
    lower = 0.0
    for position, group in enumerate(groups, start=1):
        value = used_func(group)
        results.append(value)
        # Round-trip through "{:.1f}" to hide floating point drift in the bounds.
        lower_txt = str(float("{:.1f}".format(lower)))
        upper_txt = str(float("{:.1f}".format(lower + step)))
        print('group ' + str(position) + ' = ' + lower_txt + ' over ' + upper_txt + ' under : ', value)
        lower += step
    print('\n')

    return results

In [17]:
import pandas as pd
import numpy as np
np.random.seed(0)
from konlpy.tag import Twitter
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

twitter=Twitter()
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [18]:
# tokenizer: extract index terms from a sentence by keeping only tokens whose
# part-of-speech tag is allowed, after normalization and stemming.
def tokenizer(raw, pos=["Noun","Alpha","Verb","Number","Adjective","KoreanParticle",
                        "Punctuation","Determiner", "Adverb", "Conjunction","Exclamation", "Foreign"], stopword=[]):
    """Split ``raw`` into filtered tokens using the module-level ``twitter`` tagger.

    Args:
        raw: Text to tokenize.
        pos: Part-of-speech tags to keep. The default previously contained the
            misspelling "Excalmation", which can never equal the tagger's
            "Exclamation" tag, so exclamation tokens were always dropped.
        stopword: Words to drop even when their tag is allowed.

    Returns:
        List of tokens longer than one character whose tag is in ``pos`` and
        which are not in ``stopword``.
    """
    # Sets make the two membership tests constant-time; the arguments
    # themselves are never mutated.
    allowed_tags = set(pos)
    blocked_words = set(stopword)
    return [
        word
        for word, tag in twitter.pos(
            raw,
            norm=True,   # normalize, e.g. 그랰ㅋㅋ -> 그래ㅋㅋ
            stem=True    # stem, e.g. 바뀌나 -> 바뀌다
        )
        if len(word) > 1 and tag in allowed_tags and word not in blocked_words
    ]



In [19]:
# TF-IDF vectorizer; documents are split into terms by the `tokenizer` function.
vectorize = TfidfVectorizer(
    tokenizer=tokenizer,
    min_df=1,
    
    sublinear_tf=True    # use 1 + log(tf) instead of tf so that term counts cannot grow without bound
)

In [20]:
# 1.
# Collect the `desc` values written by one reviewer.
def make_reviews(cid):
    """Return the ``desc`` values of the ``go_data`` rows whose ``cId`` equals ``cid``.

    Args:
        cid: Reviewer id to look up in the ``cId`` column.

    Returns:
        List of ``desc`` values, in row order.
    """
    written_by_reviewer = go_data['cId'] == cid
    return list(go_data.loc[written_by_reviewer, 'desc'])


In [26]:
# 2. Vectorize the reviews with TF-IDF.
def tfidf(desc):
    """Fit the shared ``vectorize`` on ``desc`` and return the TF-IDF matrix.

    The shape of the matrix (number of reviews, number of features) is
    printed as a side effect.

    Args:
        desc: Iterable of review texts.

    Returns:
        The matrix returned by ``vectorize.fit_transform`` (one row per review).
    """
    X = vectorize.fit_transform(desc)
    print('fit_transform, (No.review {}, feature {})'.format(X.shape[0], X.shape[1]))
    # The previous version also called get_feature_names() and X.toarray()
    # without using the results: the former is no longer available in recent
    # scikit-learn releases and the latter allocates a full dense copy.
    return X

In [22]:
# 3. Pairwise similarity between the reviews of one reviewer.
def similarity(vector_arr, desc):
    """Print similarity statistics for one reviewer and return the maximum.

    Every review ``i`` is re-vectorized with the fitted ``vectorize`` and
    compared (cosine similarity, rounded by ``short_float``) with every later
    review ``j > i`` taken from ``vector_arr``.

    Args:
        vector_arr: TF-IDF rows of the reviews (dense or sparse), one row per
            entry of ``desc``.
        desc: List of review texts, in the same order as ``vector_arr``.

    Returns:
        The largest pairwise similarity.

    Raises:
        ValueError: If ``desc`` holds fewer than two reviews, so that no pair
            exists.
    """
    sm = []  # pairwise similarities of this reviewer's reviews
    for i in range(len(desc) - 1):
        srch_vector = vectorize.transform([desc[i]])
        # Slicing keeps the operand two-dimensional for dense and sparse
        # input alike, and compares review i with all later reviews at once.
        row_scores = cosine_similarity(srch_vector, vector_arr[i + 1:len(desc)]).flatten()
        sm.extend(short_float(float(score)) for score in row_scores)

    if not sm:
        raise ValueError("similarity() needs at least two reviews")

    # list.sort() sorts in place and returns None, so its result must not be
    # assigned back to the list.
    sm.sort()
    _min = sm[0]
    _max = sm[-1]
    avg = short_float(sum(sm) / len(sm))
    cnt = sum(1 for score in sm if score == 1)  # pairs with similarity exactly 1
    max_cnt = 1 if _max >= 0.9 else 0           # 1 when the maximum reaches 0.9

    print ('min : ', _min)
    print('max : ', _max)
    print('avg : ', avg)
    print('num of 1 : ', cnt)
    print('# max_cnt : ', max_cnt)

    return _max


In [27]:
reviews = make_reviews('asdfg18**')
vector_array = tfidf(reviews)
# print() call form: the bare `print x` statement is a SyntaxError on Python 3.
print(vector_array)
#similarity(vector_array, reviews)

fit_transform, (No.review 59, feature 598)
  (0, 393)	0.17352469413085644
  (0, 575)	0.27166635180125503
  (0, 80)	0.2038538983652817
  (0, 504)	0.27166635180125503
  (0, 197)	0.27166635180125503
  (0, 84)	0.19433886207292184
  (0, 212)	0.27166635180125503
  (0, 71)	0.27166635180125503
  (0, 177)	0.21510779767807126
  (0, 455)	0.16106900382984712
  (0, 11)	0.12652640863694853
  (0, 375)	0.27166635180125503
  (0, 571)	0.07549921475897794
  (0, 36)	0.3150888704988498
  (0, 456)	0.22888145726582043
  (0, 107)	0.27166635180125503
  (0, 180)	0.17882633946474294
  (0, 52)	0.09862737594477176
  (0, 86)	0.11334342872585511
  (0, 594)	0.27166635180125503
  (1, 393)	0.2500544070436689
  (1, 52)	0.2982654270173768
  (1, 186)	0.2183939160793078
  (1, 273)	0.355414066116991
  (1, 139)	0.39147954620310094
  :	:
  (58, 591)	0.09757815277382803
  (58, 35)	0.1021098200799417
  (58, 0)	0.1126359692895425
  (58, 268)	0.13432089960736557
  (58, 469)	0.11887084491847022
  (58, 18)	0.16451494284012302
  (58

# condition 1 : reviewer Burstiness (RB) - focused on product

(한 제품에 특정 날짜에 리뷰를 많이 남긴 리뷰어)

In [None]:
def burst_in_product(min_count, dataframe):
    """Find reviewers with many reviews on one product on one date.

    Rows are counted per (``pID``, ``rDate``, ``cId``) combination.

    Args:
        min_count: Smallest count (inclusive) that flags a reviewer.
        dataframe: Frame with ``pID``, ``rDate`` and ``cId`` columns.

    Returns:
        List of distinct ``cId`` values with at least one combination that
        reaches ``min_count`` (order is not defined).
    """
    per_day_counts = (
        dataframe.groupby(["pID", "rDate", "cId"])
        .size()
        .reset_index(name='count')
    )
    reached = per_day_counts['count'] >= min_count
    return remove_duplicate_from_list(list(per_day_counts.loc[reached, 'cId']))


def remove_duplicate_from_list(input_list):
    """Return the distinct elements of ``input_list`` as a list (order is not preserved)."""
    return list(set(input_list))

# condition 2 : reviewer who has a lot of reviews
(최소 리뷰 수(minimum)를 적절히 바꿔가면서 리뷰가 많은 리뷰어 찾기)

In [None]:
def find_reviewer_who_has_many_reviews(min_review, dataframe):
    """List the reviewers that wrote at least ``min_review`` reviews.

    Args:
        min_review: Smallest number of rows (inclusive) a ``cId`` must have.
        dataframe: Frame with a ``cId`` column.

    Returns:
        List of ``cId`` values, in ``value_counts`` order.
    """
    review_counts = dataframe['cId'].value_counts()
    prolific = review_counts[review_counts >= min_review]
    return list(prolific.index)

#.encode('ascii','ignore')

# condition 3 : 상호명(아뜨랑스, 고고싱, 스타일난다(난다))를 직접적으로 언급한 리뷰어

In [None]:
def reviewer_who_directly_write_sitename(min_count, dataframe, sitename=u"고고싱"):
    """List reviewers whose reviews mention the site name often enough.

    Args:
        min_count: Smallest number (inclusive) of mentioning reviews.
        dataframe: Frame with ``cId`` and ``desc`` columns.
        sitename: Text searched for in ``desc`` as a literal substring.
            Defaults to the site name used so far.

    Returns:
        List of ``cId`` values with at least ``min_count`` reviews whose
        ``desc`` contains ``sitename``.
    """
    # na=False: a missing desc counts as "no mention". Without it the mask
    # contains NaN and boolean indexing raises.
    # regex=False: the name is plain text, not a pattern.
    mentions_site = dataframe['desc'].str.contains(sitename, regex=False, na=False)
    mention_counts = dataframe.loc[mentions_site, 'cId'].value_counts()
    return list(mention_counts[mention_counts >= min_count].index)

# condition4 : 평균 rScore가 5에 가까운 리뷰어

In [None]:
def reviewer_who_has_high_rscore(min_rscore, dataframe):
    """List reviewers whose mean ``rScore`` is at least ``min_rscore``.

    Args:
        min_rscore: Smallest mean rating (inclusive) that flags a reviewer.
        dataframe: Frame with ``cId`` and ``rScore`` columns.

    Returns:
        List of ``cId`` values.
    """
    # Select rScore before aggregating: averaging the whole frame also tries
    # to average the text columns, which fails on recent pandas versions and
    # is wasted work on older ones.
    mean_rscore = dataframe.groupby('cId')['rScore'].mean()
    return list(mean_rscore[mean_rscore >= min_rscore].index)

def intersect(lst1, lst2):
    """Return the elements present in both inputs as a list (no duplicates, order not defined)."""
    shared = set(lst1)
    shared &= set(lst2)
    return list(shared)

#spam_reviewers = (intersect(intersect(intersect(condition3, condition4), condition2), condition1))
#pprint(spam_reviewers)

# condition 5 : reviewer who has many reviews in A product over all time

In [None]:
def reviewer_who_has_many_reviews_in_a_product(min_count, dataframe):
    """Find reviewers with many reviews on a single product, over all dates.

    Rows are counted per (``pID``, ``cId``) combination.

    Args:
        min_count: Smallest count (inclusive) that flags a reviewer.
        dataframe: Frame with ``pID`` and ``cId`` columns.

    Returns:
        List of distinct ``cId`` values (order is not defined).
    """
    per_product_counts = (
        dataframe.groupby(["pID", "cId"])
        .size()
        .reset_index(name='count')
    )
    reached = per_product_counts['count'] >= min_count
    return remove_duplicate_from_list(list(per_product_counts.loc[reached, 'cId']))

#reviewer_who_has_many_reviews_in_a_product(7)


# reviewer_who_satisfy_some_condition - intersect ver.

spammer cId리스트 반환

In [None]:
def reviewer_who_satisfy_some_condition(dataframe, cond1_bool, cond2_bool, cond3_bool, cond4_bool, cond5_bool,
                                        cond1, cond2, cond3, cond4, cond5):
    """Flag reviewers that meet ALL enabled conditions and report what is left.

    Each ``condN_bool`` enables condition N when it equals 1; a disabled
    condition matches every reviewer, so it does not narrow the intersection.
    All thresholds are inclusive (``>=``):

    * cond1: reviews on one product on one date (burst_in_product)
    * cond2: total number of reviews (find_reviewer_who_has_many_reviews)
    * cond3: reviews that mention the site name
      (reviewer_who_directly_write_sitename)
    * cond4: mean rScore (reviewer_who_has_high_rscore)
    * cond5: reviews on one product over all dates
      (reviewer_who_has_many_reviews_in_a_product)

    The function prints the flagged reviewers and the rating counts and
    ratios of the remaining reviews, and shows a bar chart of those counts.

    Returns:
        List of flagged ``cId`` values.
    """
    unique_cid = dataframe.cId.unique()

    # (enabled flag, detector, threshold) for conditions 1..5, in order.
    conditions = [
        (cond1_bool, burst_in_product, cond1),
        (cond2_bool, find_reviewer_who_has_many_reviews, cond2),
        (cond3_bool, reviewer_who_directly_write_sitename, cond3),
        (cond4_bool, reviewer_who_has_high_rscore, cond4),
        (cond5_bool, reviewer_who_has_many_reviews_in_a_product, cond5),
    ]
    spam_reviewers = None
    for enabled, detector, threshold in conditions:
        matched = detector(threshold, dataframe) if enabled == 1 else unique_cid
        spam_reviewers = matched if spam_reviewers is None else intersect(spam_reviewers, matched)

    # Exclude the "Naver Pay buyer" id from the flagged reviewers.
    naver_pay_buyer = u'\ub124\uc774\ubc84 \ud398\uc774 \uad6c\ub9e4\uc790'
    if naver_pay_buyer in spam_reviewers:
        spam_reviewers.remove(naver_pay_buyer)

    except_spam_in_gogosing = dataframe[~dataframe.cId.isin(spam_reviewers)]
    print("<intersect condition>")
    print("cond1 =" + str(cond1) + ", cond2 =" + str(cond2) + ", cond3 =" + str(cond3) + ", cond4 =" + str(cond4) + ", cond5 =" + str(cond5))
    print("the number of spam_reviewers : " + str(len(spam_reviewers)))
    print("suspicious spammers : ")
    print(spam_reviewers)
    print("the number of non spam reviewers : " + str(len(except_spam_in_gogosing.cId.unique())))
    rscore = except_spam_in_gogosing['rScore'].value_counts()

    ratings = (5.0, 4.0, 3.0, 2.0, 1.0)
    # A rating without any remaining review gets a placeholder count of 1 so
    # that the ratios below never divide by zero. The row is inserted with
    # .loc: Series.add() aligns both indexes and would turn every count into
    # NaN, because no label is present on both sides.
    for rating in ratings:
        if rating not in rscore.index:
            rscore.loc[rating] = 1

    print("\n")
    print("rscore : ")
    print(rscore)

    for rating in ratings[1:]:
        print("ratio btw 5.0 and " + str(rating) + " : " + str(float(rscore[5.0]) / float(rscore[rating])))

    for rating in ratings:
        print("the number of " + str(rating) + " reviews : " + str(rscore[rating]))

    score = ('5.0', '4.0', '3.0', '2.0', '1.0')
    number_of_reviews = [rscore[rating] for rating in ratings]

    # Set the default size before drawing; assigning it after plt.bar only
    # affected figures created later.
    plt.rcParams["figure.figsize"] = (6,4)
    plt.bar(score, number_of_reviews, color=['red'],
        width=0.3, alpha=0.5)
    plt.xticks(score, fontsize=15)
    plt.yticks(fontsize=15)
    plt.ylabel('# of review', fontsize=15)
    plt.xlabel('rScore', fontsize=15)
    ax = plt.gca()
    ax.set_ylim([0.0, except_spam_in_gogosing.shape[0]+500])
    plt.show()
    print("\n\n")
    return spam_reviewers

#  reviewer_who_satisfy_some_condition - union ver.

non spammer들의 cId리스트 반환

In [None]:
def reviewer_who_satisfy_some_condition_union(dataframe, cond1_bool, cond2_bool, cond3_bool, cond4_bool, cond5_bool,
                                        cond1, cond2, cond3, cond4, cond5):
    """Flag reviewers that meet ANY enabled condition and return the others.

    Each ``condN_bool`` enables condition N when it equals 1; a disabled
    condition flags nobody. ``cond1`` .. ``cond5`` are the thresholds passed
    to burst_in_product, find_reviewer_who_has_many_reviews,
    reviewer_who_directly_write_sitename, reviewer_who_has_high_rscore and
    reviewer_who_has_many_reviews_in_a_product respectively.

    The function prints the number of flagged and remaining reviewers and the
    rating counts and ratios of the remaining reviews, and shows a bar chart
    of those counts.

    Returns:
        Array of the distinct ``cId`` values that were NOT flagged.
    """
    # (enabled flag, detector, threshold) for conditions 1..5, in order.
    conditions = [
        (cond1_bool, burst_in_product, cond1),
        (cond2_bool, find_reviewer_who_has_many_reviews, cond2),
        (cond3_bool, reviewer_who_directly_write_sitename, cond3),
        (cond4_bool, reviewer_who_has_high_rscore, cond4),
        (cond5_bool, reviewer_who_has_many_reviews_in_a_product, cond5),
    ]
    flagged = []
    for enabled, detector, threshold in conditions:
        if enabled == 1:
            flagged = flagged + detector(threshold, dataframe)
    spam_reviewers = remove_duplicate_from_list(flagged)

    # Exclude the "Naver Pay buyer" id from the flagged reviewers for now.
    naver_pay_buyer = u'\ub124\uc774\ubc84 \ud398\uc774 \uad6c\ub9e4\uc790'
    if naver_pay_buyer in spam_reviewers:
        spam_reviewers.remove(naver_pay_buyer)

    except_spam_in_gogosing = dataframe[~dataframe.cId.isin(spam_reviewers)]
    print("total number of suspicious spam reviewers : " + str(len(spam_reviewers)))

    print("total number of expected non-spam reviewers : " + str(len(except_spam_in_gogosing.cId.unique())))

    print("<union condition>")
    print("cond1 =" + str(cond1) + ", cond2 =" + str(cond2) + ", cond3 =" + str(cond3) + ", cond4 =" + str(cond4) + ", cond5 =" + str(cond5))

    rscore = except_spam_in_gogosing['rScore'].value_counts()

    ratings = (5.0, 4.0, 3.0, 2.0, 1.0)
    # A rating without any remaining review gets a placeholder count of 1 so
    # that the ratios below never divide by zero. The row is inserted with
    # .loc: Series.add() aligns both indexes and would turn every count into
    # NaN, because no label is present on both sides.
    for rating in ratings:
        if rating not in rscore.index:
            rscore.loc[rating] = 1

    for rating in ratings[1:]:
        print("ratio btw 5.0 and " + str(rating) + " : " + str(float(rscore[5.0]) / float(rscore[rating])))

    for rating in ratings:
        print("the number of " + str(rating) + " reviews : " + str(rscore[rating]))

    score = ('5.0', '4.0', '3.0', '2.0', '1.0')
    number_of_reviews = [rscore[rating] for rating in ratings]

    # Set the default size before drawing; assigning it after plt.bar only
    # affected figures created later.
    plt.rcParams["figure.figsize"] = (6,4)
    plt.bar(score, number_of_reviews, color=['red'],
        width=0.3, alpha=0.5)
    plt.xticks(score, fontsize=15)
    plt.yticks(fontsize=15)
    plt.ylabel('# of review', fontsize=15)
    plt.xlabel('rScore', fontsize=15)
    ax = plt.gca()
    ax.set_ylim([0.0, except_spam_in_gogosing.shape[0]+1000])
    plt.show()
    print("\n\n")

    return except_spam_in_gogosing.cId.unique()

# intersect

#  union

In [None]:
non_spammer=reviewer_who_satisfy_some_condition_union(go_data, 1, 1, 1, 1, 1, 3, 50, 10, 4.95, 2)

In [None]:
non_spammer2=reviewer_who_satisfy_some_condition_union(go_data, 1, 1, 1, 1, 1, 3, 50, 10, 4.96, 3)

In [None]:
non_spammer3=reviewer_who_satisfy_some_condition_union(go_data, 1, 1, 1, 1, 1, 3, 50, 2, 4.96, 3)

In [None]:
df=pd.DataFrame(li, columns=['val'])
df

In [None]:
df.val.describe()

test np.percentile

In [None]:
li=[1,2,3,4,5,6,7,8,9,10]
li.sort(reverse=True)
li

In [None]:
np.percentile(li,25)

In [None]:
np.percentile(li,50)

In [None]:
np.percentile(li,75)

In [None]:
np.percentile(li,[25,50,75]) #제 1 사 분위수, 중앙값, 제 3사 분위수

# intersect

reviewer_who_satisfy_some_condition(go_data, 1, 1, 1, 1, 1, 3, 50, 10, 4.95, 2)

In [None]:
spammer=[u'asdfg18**', u'ghkal04**', u'rladmswl9708**', u'leeliast**', u'rhdms10**', u'sally70**', u'm0928hy**', u'yeji04**', u'sejin1**', u'wlgml13**', u'yjyim**', u'hooming**', u'sy2004**', u'ik35**', u'sosososo**', u'yeon33**', u'loveheart77**', u'wjswldms0**', u'ch**', u'jy03**', u'seoa40**', u'altnr**', u'acua**', u'wertyuio9**', u'chihn19**', u'dekuu12**']

In [None]:
len(spammer)

In [None]:
# Count the reviewers whose highest pairwise review similarity reaches the
# threshold. 0.9 is the cut-off that similarity() itself applies to the
# maximum; the message below is built from the same constant.
SIMILARITY_THRESHOLD = 0.9
max_count = 0
for i, reviewer in enumerate(spammer):
    print(str(i + 1) + '번째 리뷰어: ' + str(reviewer))
    reviews = make_reviews(reviewer)
    if len(reviews) < 2:
        # A single review has no pair to compare with.
        continue
    vector_array = tfidf(reviews)
    # similarity(), as defined above this cell, returns the maximum similarity.
    max_similarity = similarity(vector_array, reviews)
    if max_similarity >= SIMILARITY_THRESHOLD:
        max_count += 1

print('최대 유사도 ' + str(SIMILARITY_THRESHOLD) + ' 이상인 사람 수: ' + str(max_count))

In [None]:
# max_count is an integer counter, so it is displayed directly (len() of an int raises TypeError).
max_count

In [None]:
pd.DataFrame((go_data.loc[go_data['cId']=='chihn19**'])[['desc','rScore', 'rNo', 'rDate']])

In [None]:
(go_data.loc[go_data['cId']=='chihn19**'])['rScore'].mean()

In [None]:
statistics.median(li)

In [None]:
max = __builtins__.max
max(li)

In [None]:
min(li)

In [None]:
sum(li)

In [None]:
li = [3, 1, 6, 2, 9, 5, 0, 5, 7, 4]
print("print list: ", li)
print("length list: ", len(li))
li.sort(reverse=True)
print("print sort list: ", li)

# 1-based positions, in the descending list, of the top 25% and top 75% marks.
_max25 = int(round(len(li) * 0.25))
_max75 = int(round(len(li) * 0.75))
print("max 25% order : ", _max25)
print("max 25% value of list: ", li[_max25 - 1])
print("max 75% order: ", _max75)
print("max 75% value of list: ", li[_max75 - 1])

# np.percentile is called directly: the print_want_val helper is only defined
# in a later cell, so relying on it here breaks a top-to-bottom run.
result_mid = np.percentile(li, 50)  # median
print("median: ", result_mid)
result_q1 = np.percentile(li, 25)   # 1st quartile
print("1st quartile: ", result_q1)
result_q3 = np.percentile(li, 75)   # 3rd quartile
print("3rd quartile: ", result_q3)


In [None]:
def make_groups(_list):
    """Count how many values fall into each of ten 0.1-wide ranges.

    Range 1 holds values below 0.1, range 2 holds 0.1 <= value < 0.2, and so
    on; range 10 holds every value of 0.9 or more. Values that compare false
    against every bound (such as NaN) are not counted.

    Args:
        _list: Sequence of numbers.

    Returns:
        List of ten counts, lowest range first.
    """
    lower_bounds = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    counts = [0] * 10

    for value in _list:
        if value < 0.1:
            counts[0] += 1
        elif value >= 0.1:
            # The number of lower bounds reached is the 0-based range index.
            reached = sum(1 for bound in lower_bounds if value >= bound)
            counts[reached] += 1

    return counts

In [None]:
def print_want_val(sm_list,used_func):
    """Apply ``used_func`` to ``sm_list`` and return the result."""
    return used_func(sm_list)

In [None]:
# 3. Pairwise similarity, second version: returns the whole similarity list.
def similarity(vector_arr,desc):
    """Compute all pairwise review similarities of one reviewer.

    Every review ``i`` is re-vectorized with the fitted ``vectorize`` and
    compared (cosine similarity, rounded by ``short_float``) with every later
    review ``j > i`` taken from ``vector_arr``. The average and a 0/1 flag
    telling whether the maximum reaches 0.9 are printed.

    Args:
        vector_arr: TF-IDF rows of the reviews (dense or sparse), one row per
            entry of ``desc``.
        desc: List of review texts, in the same order as ``vector_arr``.

    Returns:
        List of pairwise similarities, sorted from highest to lowest.

    Raises:
        ValueError: If ``desc`` holds fewer than two reviews, so that no pair
            exists.
    """
    sm = []  # pairwise similarities of this reviewer's reviews
    for i in range(len(desc) - 1):
        srch_vector = vectorize.transform([desc[i]])
        # Slicing keeps the operand two-dimensional for dense and sparse
        # input alike, and compares review i with all later reviews at once.
        row_scores = cosine_similarity(srch_vector, vector_arr[i + 1:len(desc)]).flatten()
        sm.extend(short_float(float(score)) for score in row_scores)

    if not sm:
        raise ValueError("similarity() needs at least two reviews")

    sm.sort(reverse=True)
    # After the descending sort the maximum is the first element; reading it
    # by position avoids the built-in max(), which another cell of this
    # notebook rebinds to a number.
    _max = sm[0]
    avg = short_float(sum(sm) / len(sm))
    max_cnt = 1 if _max >= 0.9 else 0  # 1 when the maximum reaches 0.9

    print('avg : ', avg)
    print('# similarity over 0.9 : ', max_cnt)

    return sm


유사도 분포도 - 유사도 최대값, 중간값, 상위 25%(q3), 상위 75%(q1)

In [None]:
# Per-reviewer summary statistics of the pairwise review similarities.
max_list = []
mid_list = []
q1_list = []
q3_list = []

for i, reviewer in enumerate(spammer):
    # print() call form: the bare `print x` statement is a SyntaxError on Python 3.
    print(str(i + 1) + '번째 리뷰어: ' + str(reviewer))
    reviews = make_reviews(reviewer)
    if len(reviews) < 2:
        # A single review has no pair to compare with.
        continue
    vector_array = tfidf(reviews)
    sm = similarity(vector_array, reviews)

    # Each lambda works on the list it is given instead of reaching for `sm`.
    result_max = print_want_val(sm, lambda values: np.percentile(values, 100))  # maximum
    print("max: ", result_max)
    max_list.append(result_max)

    result_mid = print_want_val(sm, lambda values: np.percentile(values, 50))   # median
    print("median: ", result_mid)
    mid_list.append(result_mid)

    result_q1 = print_want_val(sm, lambda values: np.percentile(values, 25))    # 1st quartile
    print("1st quartile: ", result_q1)
    q1_list.append(result_q1)

    result_q3 = print_want_val(sm, lambda values: np.percentile(values, 75))    # 3rd quartile
    print("3rd quartile: ", result_q3)
    q3_list.append(result_q3)

    print("\n")

groups_max = make_groups(max_list)
groups_mid = make_groups(mid_list)
groups_q1 = make_groups(q1_list)
groups_q3 = make_groups(q3_list)

In [None]:
x=np.arange(len(max_list))
y=sorted(max_list)

plt.plot(x,y,color='red',marker='o',linestyle='solid')
plt.title('max of similarity')
plt.xlabel("# of reviewer")
plt.ylabel("similarity")
plt.xticks(np.arange(len(x)),x,rotation=45)
plt.show()

In [None]:
x=np.arange(len(q3_list))
y=sorted(q3_list)

plt.plot(x,y,color='red',marker='o',linestyle='solid')
plt.title('3rd quartile of similarity')
plt.xlabel("# of reviewer")
plt.ylabel("similarity")
plt.xticks(np.arange(len(x)),x,rotation=45)
plt.show()

In [None]:
x=np.arange(len(mid_list))
y=sorted(mid_list)

plt.plot(x,y,color='red',marker='o',linestyle='solid')
plt.title('median of similarity')
plt.xlabel("# of reviewer")
plt.ylabel("median similarity")
plt.xticks(np.arange(len(x)),x,rotation=45)
plt.show()

In [None]:
x=np.arange(len(q1_list))
y=sorted(q1_list)

plt.plot(x,y,color='red',marker='o',linestyle='solid')
plt.title('1st quartile of similarity')
plt.xlabel("# of reviewer")
plt.ylabel("similarity")
plt.xticks(np.arange(len(x)),x,rotation=45)
plt.show()

In [None]:
groups_mid

In [None]:
result_len = print_group(u'해당 그룹별 spammer 수', groups_max, lambda x: x)
result_ratio = print_group(u'해당 그룹별 spammer 비율', groups_max, lambda x: float("{:.1f}".format(1.0 * (x) * 100 / len(spammer))))

In [None]:
x=[u'0.1 under',u'0.1~0.2',u'0.2~0.3',u'0.3~0.4',u'0.4~0.5',u'0.5~0.6',u'0.6~0.7',u'0.7~0.8',u'0.8~0.9',u'0.9 over']
y=result_len
labels=y

plt.plot(x,y,color='green',marker='o',linestyle='solid')
plt.title('max of review similarity')
plt.xlabel("similarity")
plt.ylabel("number of reviewer")
plt.xticks(np.arange(len(x)),x,rotation=45)
for label, x_count, y_count in zip(labels, x, y):
    plt.annotate(label,
                 xy=(x_count, y_count), #label을 데이터포인트에 두되
                 xytext=(3,-9), # 약간 떨어져 있게
                 textcoords='offset points')
plt.show()

x=[u'0.1 under',u'0.1~0.2',u'0.2~0.3',u'0.3~0.4',u'0.4~0.5',u'0.5~0.6',u'0.6~0.7',u'0.7~0.8',u'0.8~0.9',u'0.9 over']
y=result_ratio
labels=y
    
plt.plot(x,y,color='blue',marker='o',linestyle='solid')
plt.title('max of review similarity')
plt.xlabel("similarity")
plt.ylabel("number of reviewer (%)")
plt.xticks(np.arange(len(x)),x,rotation=45)
for label, x_count, y_count in zip(labels, x, y):
    plt.annotate(label,
                 xy=(x_count, y_count), #label을 데이터포인트에 두되
                 xytext=(4,-7), # 약간 떨어져 있게
                 textcoords='offset points')
plt.show()

In [None]:
result_len = print_group(u'해당 그룹별 spammer 수', groups_q3, lambda x: x)
result_ratio = print_group(u'해당 그룹별 spammer 비율', groups_q3, lambda x: float("{:.1f}".format(1.0 * (x) * 100 / len(spammer))))

In [None]:
x=[u'0.1 under',u'0.1~0.2',u'0.2~0.3',u'0.3~0.4',u'0.4~0.5',u'0.5~0.6',u'0.6~0.7',u'0.7~0.8',u'0.8~0.9',u'0.9 over']
y=result_len
labels=y

plt.plot(x,y,color='green',marker='o',linestyle='solid')
plt.title('3rd quartile of review similarity')
plt.xlabel("similarity")
plt.ylabel("number of reviewer")
plt.xticks(np.arange(len(x)),x,rotation=45)
for label, x_count, y_count in zip(labels, x, y):
    plt.annotate(label,
                 xy=(x_count, y_count), #label을 데이터포인트에 두되
                 xytext=(3,-9), # 약간 떨어져 있게
                 textcoords='offset points')
plt.show()

x=[u'0.1 under',u'0.1~0.2',u'0.2~0.3',u'0.3~0.4',u'0.4~0.5',u'0.5~0.6',u'0.6~0.7',u'0.7~0.8',u'0.8~0.9',u'0.9 over']
y=result_ratio
labels=y
    
plt.plot(x,y,color='blue',marker='o',linestyle='solid')
plt.title('3rd quartile of review similarity')
plt.xlabel("similarity")
plt.ylabel("number of reviewer (%)")
plt.xticks(np.arange(len(x)),x,rotation=45)
for label, x_count, y_count in zip(labels, x, y):
    plt.annotate(label,
                 xy=(x_count, y_count), #label을 데이터포인트에 두되
                 xytext=(4,-7), # 약간 떨어져 있게
                 textcoords='offset points')
plt.show()

In [None]:
result_len = print_group(u'해당 그룹별 spammer 수', groups_mid, lambda x: x)
result_ratio = print_group(u'해당 그룹별 spammer 비율', groups_mid, lambda x: float("{:.1f}".format(1.0 * (x) * 100 / len(spammer))))

In [None]:
x = [u'0.1 under',u'0.1~0.2',u'0.2~0.3',u'0.3~0.4',u'0.4~0.5',u'0.5~0.6',u'0.6~0.7',u'0.7~0.8',u'0.8~0.9',u'0.9 over']

# One chart per series: (values, line colour, y-axis label, label offset).
# Both charts share one title; the second one used to be misspelled 'meidan'.
for y, line_color, y_label, label_offset in (
        (result_len, 'green', "number of reviewer", (3, -9)),
        (result_ratio, 'blue', "number of reviewer (%)", (4, -7))):
    labels = y

    plt.plot(x, y, color=line_color, marker='o', linestyle='solid')
    plt.title('median of review similarity')
    plt.xlabel("similarity")
    plt.ylabel(y_label)
    plt.xticks(np.arange(len(x)), x, rotation=45)
    for label, x_count, y_count in zip(labels, x, y):
        plt.annotate(label,
                     xy=(x_count, y_count),   # anchor the label on the data point
                     xytext=label_offset,     # but draw it slightly away from it
                     textcoords='offset points')
    plt.show()

In [None]:
result_len = print_group(u'해당 그룹별 spammer 수', groups_q1, lambda x: x)
result_ratio = print_group(u'해당 그룹별 spammer 비율', groups_q1, lambda x: float("{:.1f}".format(1.0 * (x) * 100 / len(spammer))))

In [None]:
x=[u'0.1 under',u'0.1~0.2',u'0.2~0.3',u'0.3~0.4',u'0.4~0.5',u'0.5~0.6',u'0.6~0.7',u'0.7~0.8',u'0.8~0.9',u'0.9 over']
y=result_len
labels=y

plt.plot(x,y,color='green',marker='o',linestyle='solid')
plt.title('1st quartile of review similarity')
plt.xlabel("similarity")
plt.ylabel("number of reviewer")
plt.xticks(np.arange(len(x)),x,rotation=45)
for label, x_count, y_count in zip(labels, x, y):
    plt.annotate(label,
                 xy=(x_count, y_count), #label을 데이터포인트에 두되
                 xytext=(3,-9), # 약간 떨어져 있게
                 textcoords='offset points')
plt.show()

x=[u'0.1 under',u'0.1~0.2',u'0.2~0.3',u'0.3~0.4',u'0.4~0.5',u'0.5~0.6',u'0.6~0.7',u'0.7~0.8',u'0.8~0.9',u'0.9 over']
y=result_ratio
labels=y
    
plt.plot(x,y,color='blue',marker='o',linestyle='solid')
plt.title('1st quartile of review similarity')
plt.xlabel("similarity")
plt.ylabel("number of reviewer (%)")
plt.xticks(np.arange(len(x)),x,rotation=45)
for label, x_count, y_count in zip(labels, x, y):
    plt.annotate(label,
                 xy=(x_count, y_count), #label을 데이터포인트에 두되
                 xytext=(4,-7), # 약간 떨어져 있게
                 textcoords='offset points')
plt.show()

유사도 분포도 - 유사도 최대값

In [None]:
group1 = []
group2 = []
group3 = []
group4 = []
group5 = []
group6 = []
group7 = []
group8 = []
group9 = []
group10 = []
max_list = []
_max = 0

# buckets[k] collects the reviewers whose maximum similarity has reached
# exactly k of the lower bounds: group1 is below 0.1, group10 is 0.9 or more.
buckets = [group1, group2, group3, group4, group5, group6, group7, group8, group9, group10]
lower_bounds = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)

for i, reviewer in enumerate(spammer):
    # print() call form: the bare `print x` statement is a SyntaxError on Python 3.
    print(str(i + 1) + '번째 리뷰어: ' + str(reviewer))
    reviews = make_reviews(reviewer)
    if len(reviews) < 2:
        # A single review has no pair to compare with.
        continue
    vector_array = tfidf(reviews)
    sm = similarity(vector_array, reviews)
    _max = max(sm)
    max_list.append(_max)

    bucket_index = sum(1 for bound in lower_bounds if _max >= bound)
    buckets[bucket_index].append(reviewer)

In [None]:
len(max_list)

In [None]:
x=np.arange(len(max_list))
y=sorted(max_list)

plt.plot(x,y,color='red',marker='o',linestyle='solid')
plt.title('review similarity')
plt.xlabel("# of reviewer")
plt.ylabel("max similarity")
plt.xticks(np.arange(len(x)),x,rotation=45)
plt.show()

In [None]:
groups=[group1, group2, group3, group4, group5, group6,group7,group8,group9,group10 ]
result_len = print_group(u'해당 그룹별 spammer 수', groups, len)
result_ratio = print_group(u'해당 그룹별 spammer 비율', groups, lambda x: float("{:.1f}".format(1.0 * len(x) * 100 / len(spammer))))

In [None]:
x=[u'0.1 under',u'0.1~0.2',u'0.2~0.3',u'0.3~0.4',u'0.4~0.5',u'0.5~0.6',u'0.6~0.7',u'0.7~0.8',u'0.8~0.9',u'0.9 over']
y=result_len
labels=y

plt.plot(x,y,color='green',marker='o',linestyle='solid')
plt.title('group of review similarity')
plt.xlabel("max similarity")
plt.ylabel("number of reviewer")
plt.xticks(np.arange(len(x)),x,rotation=45)
for label, x_count, y_count in zip(labels, x, y):
    plt.annotate(label,
                 xy=(x_count, y_count), #label을 데이터포인트에 두되
                 xytext=(3,-9), # 약간 떨어져 있게
                 textcoords='offset points')
plt.show()

In [None]:
x=[u'0.1 under',u'0.1~0.2',u'0.2~0.3',u'0.3~0.4',u'0.4~0.5',u'0.5~0.6',u'0.6~0.7',u'0.7~0.8',u'0.8~0.9',u'0.9 over']
y=result_ratio
labels=y
    
plt.plot(x,y,color='blue',marker='o',linestyle='solid')
plt.title('ratio of review similarity')
plt.xlabel("max similarity")
plt.ylabel("number of reviewer (%)")
plt.xticks(np.arange(len(x)),x,rotation=45)
for label, x_count, y_count in zip(labels, x, y):
    plt.annotate(label,
                 xy=(x_count, y_count), #label을 데이터포인트에 두되
                 xytext=(4,-7), # 약간 떨어져 있게
                 textcoords='offset points')
plt.show()

In [None]:
group10

In [None]:
pd.DataFrame((go_data.loc[go_data['cId']=='wertyuio9**'])[['desc','rScore', 'pID','rNo', 'rDate']])

In [None]:
pd.DataFrame((go_data.loc[go_data['cId']=='seoa40**'])[['desc','rScore', 'pID','rNo', 'rDate']])

In [None]:
pd.DataFrame((go_data.loc[go_data['cId']=='asdfg18**'])[['desc','rScore', 'pID','rNo', 'rDate']])

In [None]:
(go_data.loc[go_data['cId']=='asdfg18**'])['rScore'].mean()

In [None]:
pd.DataFrame((go_data.loc[go_data['cId']=='ghkal04**'])[['desc','rScore', 'pID','rNo', 'rDate']])

In [None]:
pd.DataFrame((go_data.loc[go_data['cId']=='leeliast**'])[['desc','rScore', 'rNo', 'rDate']]).head(20)

In [None]:
# Show the desc/rScore/pID/rNo/rDate columns of every go_data row whose cId is 'sally70**'.
pd.DataFrame(go_data.loc[go_data['cId'] == 'sally70**', ['desc', 'rScore', 'pID', 'rNo', 'rDate']])

In [None]:
# Show the desc/rScore/rNo/rDate columns of every go_data row whose cId is 'yjyim**'.
pd.DataFrame(go_data.loc[go_data['cId'] == 'yjyim**', ['desc', 'rScore', 'rNo', 'rDate']])

In [None]:
# Show the desc/rScore/rNo/rDate columns of every go_data row whose cId is 'jy03**'.
pd.DataFrame(go_data.loc[go_data['cId'] == 'jy03**', ['desc', 'rScore', 'rNo', 'rDate']])

In [None]:
# Show the desc/rScore/pID/rNo/rDate columns of every go_data row whose cId is 'sejin1**'.
pd.DataFrame(go_data.loc[go_data['cId'] == 'sejin1**', ['desc', 'rScore', 'pID', 'rNo', 'rDate']])

In [None]:
# Show the desc/rScore/pID/rNo/rDate columns of every go_data row whose cId is 'acua**'.
pd.DataFrame(go_data.loc[go_data['cId'] == 'acua**', ['desc', 'rScore', 'pID', 'rNo', 'rDate']])

reviewer_who_satisfy_some_condition(go_data, 1, 1, 1, 1, 1, 3, 50, 10, 4.96, 3)

In [None]:
# Masked reviewer ids (go_data 'cId' values) analysed as the second spammer set.
# NOTE(review): presumably copied from the output of the condition query named in the
# heading above — confirm before reusing.
spammer2 = [
    u'ghkal04**', u'rladmswl9708**', u'leeliast**', u'rhdms10**',
    u'sally70**', u'm0928hy**', u'yeji04**', u'sejin1**',
    u'wlgml13**', u'asdfg18**', u'hooming**', u'sy2004**',
    u'ik35**', u'sosososo**', u'yeon33**', u'loveheart77**',
    u'wjswldms0**', u'ch**', u'jy03**', u'seoa40**',
    u'altnr**', u'acua**', u'chihn19**', u'dekuu12**',
]

In [None]:
# Number of reviewer ids in spammer2.
len(spammer2)

In [None]:
# Bucket every reviewer in spammer2 by the score that similarity() returns for their reviews
# (labelled "max similarity" in the plots). group1 collects scores below 0.1, group2 scores
# in [0.1, 0.2), ... group9 scores in [0.8, 0.9), and group10 scores of 0.9 and above.
bucket_edges = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
similarity_buckets = [[] for _ in range(len(bucket_edges) + 1)]
# Keep the group1..group10 names bound, since other cells refer to them directly.
(group1, group2, group3, group4, group5,
 group6, group7, group8, group9, group10) = similarity_buckets

for position, reviewer in enumerate(spammer2, 1):
    print(str(position) + str('번째 리뷰어: ') + str(reviewer))
    reviews = make_reviews(reviewer)
    if len(reviews) < 2:
        # Reviewers with fewer than two reviews are skipped and end up in no bucket.
        continue
    vector_array = tfidf(reviews)
    # Deliberately not stored in a variable called `max`: rebinding that name at notebook
    # scope hides the builtin max() from every cell that runs afterwards.
    max_similarity = similarity(vector_array, reviews)
    if max_similarity != max_similarity:
        # NaN is unequal to itself and lies in no range, so it stays unbucketed.
        continue
    # The number of edges at or below the score is the index of the bucket it belongs to.
    bucket_index = sum(1 for edge in bucket_edges if max_similarity >= edge)
    similarity_buckets[bucket_index].append(reviewer)

In [None]:
# Gather the ten similarity buckets in ascending order, then report per-bucket member counts
# and each bucket's share of len(spammer2) as a percentage rounded to one decimal.
groups = [
    group1, group2, group3, group4, group5,
    group6, group7, group8, group9, group10,
]
result_len = print_group(u'해당 그룹별 spammer 수', groups, len)
result_ratio = print_group(
    u'해당 그룹별 spammer 비율',
    groups,
    lambda members: float("{:.1f}".format(len(members) * 100.0 / len(spammer2))),
)

In [None]:
# Line chart of result_len (one value per similarity bucket), each point annotated with its value.
x = [
    u'0.1 under', u'0.1~0.2', u'0.2~0.3', u'0.3~0.4', u'0.4~0.5',
    u'0.5~0.6', u'0.6~0.7', u'0.7~0.8', u'0.8~0.9', u'0.9 over',
]
y = result_len
labels = y  # the annotation text of every point is the plotted value itself

plt.plot(x, y, marker='o', linestyle='solid', color='green')
plt.title('group of review similarity')
plt.xlabel("max similarity")
plt.ylabel("number of reviewer")
plt.xticks(np.arange(len(x)), x, rotation=45)
for bucket_label, value in zip(x, y):
    # Anchor the text at the data point, shifted by a small offset (in points) so it stays readable.
    plt.annotate(value,
                 xy=(bucket_label, value),
                 xytext=(3, -9),
                 textcoords='offset points')
plt.show()

In [None]:
# Line chart of result_ratio (percentage per similarity bucket), each point annotated with its value.
x = [
    u'0.1 under', u'0.1~0.2', u'0.2~0.3', u'0.3~0.4', u'0.4~0.5',
    u'0.5~0.6', u'0.6~0.7', u'0.7~0.8', u'0.8~0.9', u'0.9 over',
]
y = result_ratio
labels = y  # the annotation text of every point is the plotted value itself

plt.plot(x, y, marker='o', linestyle='solid', color='blue')
plt.title('ratio of review similarity')
plt.xlabel("max similarity")
plt.ylabel("number of reviewer (%)")
plt.xticks(np.arange(len(x)), x, rotation=45)
for bucket_label, value in zip(x, y):
    # Anchor the text at the data point, shifted by a small offset (in points) so it stays readable.
    plt.annotate(value,
                 xy=(bucket_label, value),
                 xytext=(4, -7),
                 textcoords='offset points')
plt.show()

reviewer_who_satisfy_some_condition(go_data, 1, 1, 1, 1, 1, 3, 50, 2, 4.96, 3)

In [None]:
# Masked reviewer ids (go_data 'cId' values) analysed as the third spammer set.
# NOTE(review): presumably copied from the output of the condition query named in the
# heading above — confirm before reusing.
spammer3 = [
    u'ghkal04**', u'rladmswl9708**', u'wldwldg**', u'leeliast**',
    u'rhdms10**', u'sally70**', u'm0928hy**', u'yeji04**',
    u'gloryn**', u'sestt**', u'dorosy11**', u'wlgml13**',
    u'rhdecyli**', u'phr9101**', u'asdfg18**', u'diqkdldiqk**',
    u'hooming**', u'sy2004**', u'qpqlgi**', u'tmfrl123**',
    u'jhw20**', u'sejin1**', u'ngt**', u'sosososo**',
    u'yeon33**', u'chdms03**', u'kimminji**', u'loveheart77**',
    u'qapl44**', u'wjswldms0**', u'ch**', u'jy03**',
    u'seoa40**', u'altnr**', u'acua**', u'ik35**',
    u'ekdmsdl07**', u'redgirl4**', u'dev**', u'ans56**',
    u'chihn19**', u'syndrome12**', u'sjj44**', u'dekuu12**',
    u'jsh**', u'gmlwls20**',
]

In [None]:
# Number of reviewer ids in spammer3.
len(spammer3)

In [None]:
# Run the TF-IDF + similarity computation once for every reviewer in spammer3.
# The value returned by similarity() is not used in this cell.
for position, reviewer in enumerate(spammer3, 1):
    print(str(position) + str('번째 리뷰어: ') + str(reviewer))
    reviews = make_reviews(reviewer)
    if len(reviews) < 2:
        # Same guard the bucketing loops apply before tfidf()/similarity(): a reviewer with
        # fewer than two reviews has no pair of reviews to compare.
        continue
    vector_array = tfidf(reviews)
    similarity(vector_array, reviews)

In [None]:
# Bucket every reviewer in spammer3 by the score that similarity() returns for their reviews
# (labelled "max similarity" in the plots). group1 collects scores below 0.1, group2 scores
# in [0.1, 0.2), ... group9 scores in [0.8, 0.9), and group10 scores of 0.9 and above.
bucket_edges = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
similarity_buckets = [[] for _ in range(len(bucket_edges) + 1)]
# Keep the group1..group10 names bound, since other cells refer to them directly.
(group1, group2, group3, group4, group5,
 group6, group7, group8, group9, group10) = similarity_buckets

for position, reviewer in enumerate(spammer3, 1):
    print(str(position) + str('번째 리뷰어: ') + str(reviewer))
    reviews = make_reviews(reviewer)
    if len(reviews) < 2:
        # Reviewers with fewer than two reviews are skipped and end up in no bucket.
        continue
    vector_array = tfidf(reviews)
    # Deliberately not stored in a variable called `max`: rebinding that name at notebook
    # scope hides the builtin max() from every cell that runs afterwards.
    max_similarity = similarity(vector_array, reviews)
    if max_similarity != max_similarity:
        # NaN is unequal to itself and lies in no range, so it stays unbucketed.
        continue
    # The number of edges at or below the score is the index of the bucket it belongs to.
    bucket_index = sum(1 for edge in bucket_edges if max_similarity >= edge)
    similarity_buckets[bucket_index].append(reviewer)

In [None]:
def print_group(title, groups, used_func):
    """Print one summary line per bucket and return the summarised values.

    This repeats the print_group helper already defined near the top of the notebook.

    Args:
        title: Heading shown through pprint before the per-bucket lines.
        groups: Sequence of buckets, in ascending order of their range.
        used_func: Callable applied to each bucket; its result is printed and collected.

    Returns:
        A list holding used_func(bucket) for every bucket, in the order of `groups`.
    """
    step = 0.1
    results = []

    pprint(title)
    lower = 0.0
    for position, members in enumerate(groups, 1):
        value = used_func(members)
        results.append(value)
        upper = lower + step
        # The range bounds go through a one-decimal format so accumulated float error
        # (for example 0.30000000000000004) never reaches the printed label.
        prefix = ('group ' + str(position) + ' = '
                  + str(float("{:.1f}".format(lower))) + str(' over ')
                  + str(float("{:.1f}".format(upper))) + str(' under : '))
        print(prefix, value)
        lower = upper
    print('\n')

    return results


# Rebuild `groups` from the current group1..group10 before summarising. Those names were
# rebound to fresh lists for spammer3, so a `groups` list created earlier would still point
# at the buckets of the previous reviewer set and report its numbers instead.
groups = [
    group1, group2, group3, group4, group5,
    group6, group7, group8, group9, group10,
]

# Per-bucket member counts, then each bucket's share of len(spammer3) as a percentage
# rounded to one decimal.
result_len = print_group(u'해당 그룹별 spammer 수', groups, len)
result_ratio = print_group(u'해당 그룹별 spammer 비율', groups, lambda x: float("{:.1f}".format(1.0 * len(x) * 100 / len(spammer3))))

In [None]:
# Line chart of result_len (one value per similarity bucket), each point annotated with its value.
x = [
    u'0.1 under', u'0.1~0.2', u'0.2~0.3', u'0.3~0.4', u'0.4~0.5',
    u'0.5~0.6', u'0.6~0.7', u'0.7~0.8', u'0.8~0.9', u'0.9~1.0',
]
y = result_len
labels = y  # the annotation text of every point is the plotted value itself

plt.plot(x, y, marker='o', linestyle='solid', color='green')
plt.title('group of review similarity')
plt.xlabel("max similarity")
plt.ylabel("number of reviewer")
plt.xticks(np.arange(len(x)), x, rotation=45)
for bucket_label, value in zip(x, y):
    # Anchor the text at the data point, shifted by a small offset (in points) so it stays readable.
    plt.annotate(value,
                 xy=(bucket_label, value),
                 xytext=(4, 2),
                 textcoords='offset points')
plt.show()

In [None]:
# Line chart of result_ratio (percentage per similarity bucket), each point annotated with its value.
x = [
    u'0.1 under', u'0.1~0.2', u'0.2~0.3', u'0.3~0.4', u'0.4~0.5',
    u'0.5~0.6', u'0.6~0.7', u'0.7~0.8', u'0.8~0.9', u'0.9~1.0',
]
y = result_ratio
labels = y  # the annotation text of every point is the plotted value itself

plt.plot(x, y, marker='o', linestyle='solid', color='blue')
plt.title('ratio of review similarity')
plt.xlabel("max similarity")
plt.ylabel("number of reviewer (%)")
plt.xticks(np.arange(len(x)), x, rotation=45)
for bucket_label, value in zip(x, y):
    # Anchor the text at the data point, shifted by a small offset (in points) so it stays readable.
    plt.annotate(value,
                 xy=(bucket_label, value),
                 xytext=(4, 4),
                 textcoords='offset points')
plt.show()

# Union

reviewer_who_satisfy_some_condition_union(go_data, 1, 1, 1, 1, 1, 3, 50, 10, 4.95, 2)

In [None]:
# Number of entries in non_spammer.
len(non_spammer)

In [None]:
# Materialise non_spammer as a list; later cells slice it and call .remove() on it.
non_spammer=list(non_spammer)

In [None]:
# Peek at the first five entries of non_spammer.
non_spammer[:5]

In [None]:
# Exclude the "Naver Pay buyer" entry (the escaped Korean literal below) from non_spammer.
naver_pay_buyer = u'\ub124\uc774\ubc84 \ud398\uc774 \uad6c\ub9e4\uc790'
if naver_pay_buyer in non_spammer:
    non_spammer.remove(naver_pay_buyer)

In [None]:
# Number of entries in non_spammer after the "Naver Pay buyer" entry was excluded.
len(non_spammer)

In [None]:
# Count the non_spammer reviewers whose similarity() score is at least 0.9.
# Reviewers with fewer than two reviews are not scored; they are collected in one_reviewer.
max_count = 0
one_reviewer = []
for position, reviewer in enumerate(non_spammer, 1):
    print(str(position) + str('번째 리뷰어: ') + str(reviewer.encode('utf-8')))

    reviews = make_reviews(reviewer)
    if len(reviews) < 2:
        one_reviewer.append(reviewer)
        continue
    vector_array = tfidf(reviews)
    max_similarity = similarity(vector_array, reviews)
    # Count the reviewer once when the score reaches the threshold. Adding the returned
    # score itself would yield a sum of similarities, not the head-count that the final
    # message reports.
    if max_similarity >= 0.9:
        max_count += 1

print(str('최대 유사도 0.9 이상인 사람 수: ') + str(max_count))

In [None]:
# Number of non_spammer reviewers that were skipped for having fewer than two reviews.
len(one_reviewer)

reviewer_who_satisfy_some_condition_union(go_data, 1, 1, 1, 1, 1, 3, 50, 10, 4.96, 3)

reviewer_who_satisfy_some_condition_union(go_data, 1, 1, 1, 1, 1, 3, 50, 2, 4.96, 3)

# 형태소

In [None]:
# One entry per reviewer in spammer: whatever make_reviews() returns for that reviewer.
spammer_desc = [make_reviews(reviewer) for reviewer in spammer]

In [None]:
# Number of entries in spammer_desc (one per reviewer in spammer at this point).
len(spammer_desc)

In [None]:
# Pretty-print the first five entries of spammer_desc.
pprint (spammer_desc[:5])

In [None]:
# Flatten spammer_desc by one level: the per-reviewer collections become one flat list.
# NOTE(review): the result overwrites spammer_desc, so running this cell twice flattens a
# second level (presumably splitting review strings into characters) — rebuild
# spammer_desc before re-running.
flattened_reviews = []
for reviewer_reviews in spammer_desc:
    flattened_reviews.extend(reviewer_reviews)
spammer_desc = flattened_reviews
pprint(spammer_desc[:6])

In [None]:

# Run the tagger (with normalisation and stemming) over every entry of spammer_desc and
# pool the results; each pooled item is indexed with [1] below, presumably a
# (token, POS tag) pair.
split_n_desc = []
for review_text in spammer_desc:
    split_n_desc.extend(twitter.pos(review_text, norm=True, stem=True))

# POS tagging frequencies for split_n_desc: the distinct tags and how often each occurs.
k_tag_count = []  # never filled in this cell
k_pos = list(set([pair[1] for pair in split_n_desc]))
k_data = dict(Counter(pair[1] for pair in split_n_desc))

# Build a one-row frame (one column per tag), then transpose so every tag becomes a row.
n_table = pd.DataFrame(data=k_data,
                       index=['spammer리뷰 품사'],
                       columns=k_pos).T
n_table

In [None]:
# Show the POS-tag table ordered by count, most frequent tag first.
n_table.sort_values(by='spammer리뷰 품사', ascending=False)