In [None]:
from soynlp.word import WordExtractor

In [None]:
class Sentences:
    """Re-iterable stream of sentences read lazily from a text corpus file.

    Every non-blank line of the file is one document; sentences inside a
    document are separated by two consecutive spaces.

    Parameters
    ----------
    fname : str
        Path of the corpus file.

    Attributes
    ----------
    fname : str
        Path given at construction time.
    length : int
        Cached sentence count; stays 0 until ``len()`` has counted a
        non-empty corpus.
    """

    def __init__(self, fname):
        self.fname = fname
        self.length = 0

    def __iter__(self):
        """Yield each sentence of the corpus in file order, skipping blank lines."""
        # The corpora in this notebook are written with encoding='utf-8-sig'.
        # Decoding with 'utf-8-sig' drops a leading BOM when present and is
        # otherwise identical to 'utf-8', so the first sentence no longer
        # starts with a stray U+FEFF character.
        with open(self.fname, encoding='utf-8-sig') as f:
            for doc in f:
                doc = doc.strip()
                if not doc:
                    continue
                yield from doc.split('  ')

    def __len__(self):
        """Return the number of sentences, counting them on first use."""
        if self.length == 0:
            # Count in one expression and assign once, so an error while
            # reading never leaves a partial count cached in self.length.
            self.length = sum(1 for _ in self)
        return self.length

In [None]:
def read_data(filename):
    """Read a '$'-delimited text file into a list of field lists.

    The file is decoded as 'utf-8-sig'. Each line of the file becomes one
    list holding the pieces obtained by splitting that line on '$'.
    """
    with open(filename, 'r', encoding='utf-8-sig') as handle:
        raw_text = handle.read()

    rows = []
    for line in raw_text.splitlines():
        rows.append(line.split('$'))
    return rows

In [None]:
def word_score(score):
    """Rank a word by forward cohesion times exp(right branching entropy).

    ``score`` must expose ``cohesion_forward`` and
    ``right_branching_entropy`` attributes.
    """
    import math

    cohesion = score.cohesion_forward
    branching = math.exp(score.right_branching_entropy)
    return cohesion * branching

In [None]:
# Instantiate the KoNLPy Twitter tagger once; the `tokenizer` function calls its morphs() method.
# NOTE(review): newer KoNLPy releases reportedly rename this tagger to Okt — confirm against the installed version.
from konlpy.tag import Twitter
twit = Twitter()

In [None]:
def tokenizer(text):
    """Split ``text`` into morphemes using the module-level ``twit`` tagger."""
    morphemes = twit.morphs(text)
    return morphemes

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import numpy as np
import re

In [None]:
def clean_text(text):
    """Strip ASCII letters, punctuation and a few Korean filler jamo from ``text``.

    Two passes are applied:
      1. every ASCII letter (a-z, A-Z) is removed;
      2. the characters { } [ ] / ? . , ; : | ) * ~ ` ! - _ + < > @ # $ % & \\ = ( ' "
         and the jamo ㅋ ㅜ ㅠ ㅎ are removed.

    Digits, whitespace and all other characters are kept.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The text with the characters above deleted (nothing is inserted).
    """
    cleaned_text = re.sub(r'[a-zA-Z]', '', text)
    # Raw string: the previous non-raw literal relied on invalid string escapes
    # such as '\{' or '\/', which Python flags with a warning. The character
    # set itself is unchanged.
    cleaned_text = re.sub(r'[{}\[\]/?.,;:|)*~`!\-_+<>@#$%&\\=(\'"ㅋㅜㅠㅎ]', '',
                          cleaned_text)
    return cleaned_text

In [None]:
import findspark
findspark.init()
import pyspark
# getOrCreate() returns the already-running SparkContext when there is one, so
# re-executing this cell no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("Pi"))

In [None]:
# Load the raw review file as an RDD and split every line into '$'-separated fields.
review_data = sc.textFile("/user/data/data_Review/택시운전사_cleand.txt").map(lambda line: line.split("$")) 

In [None]:
# Collect, for every review row, a one-element list holding its first field (the rating).
rating = list(review_data.map(lambda fields: fields[:1]).toLocalIterator())

In [None]:
# Take fields 2..second-to-last of each row, stringify that list, and clean it.
# clean_text() also strips the brackets, quotes and commas added by str(list).
review = [
    clean_text(str(fields)).strip()
    for fields in review_data.map(lambda fields: fields[2:-1]).toLocalIterator()
]

In [None]:
# Pair each rating with its cleaned review text, dropping reviews that are empty after cleaning.
data = [
    (score[0], review[idx])
    for idx, score in enumerate(rating)
    if len(review[idx]) > 0
]

In [None]:
# Reviews rated 9 or higher are labelled positive (1).
posData = [(text, 1) for score, text in data if int(score) >= 9]

In [None]:
# Reviews rated 4 or lower are labelled negative (0).
negData = [(text, 0) for score, text in data if int(score) <= 4]

In [None]:
# Concatenate the labelled positive and negative samples.
# NOTE(review): this rebinds `data`, which previously held (rating, review) pairs, so the
# posData/negData cells cannot be re-run without first rebuilding `data`.
data = posData + negData

In [None]:
# Review texts of the labelled samples.
x = [text for text, _label in data]

In [None]:
# Labels of the labelled samples, as the strings '1' / '0'.
y = [str(label) for _text, label in data]

In [None]:
# Load the training CSV as an RDD of raw text lines.
train = sc.textFile("/user/data/Test_Train_set/train.csv")

In [None]:
# Drop the header row: take the first line and filter out every line equal to it.
# NOTE(review): `train` is rebound here, so re-running this cell treats the first data
# row as the header and removes it (and any identical rows) as well.
header = train.first() #extract header
train = train.filter(lambda x: x != header)

In [None]:
# Split every raw CSV line by position: everything except the last three
# characters is the text, the final character is the label.
# NOTE(review): presumably each line ends with a separator plus a one-character label — confirm against train.csv.
x_train = list(train.map(lambda line: line[:-3]).toLocalIterator())
y_train = list(train.map(lambda line: line[-1:]).toLocalIterator())

In [None]:
# Fit a TF-IDF vectorizer on the training texts: 1- to 5-gram features over the
# tokens produced by `tokenizer`, with a minimum document frequency of 3.
vect_1_5 = TfidfVectorizer(ngram_range=(1,5), min_df = 3, tokenizer=tokenizer).fit(x_train)

In [None]:
# Vectorize the training texts with the fitted TF-IDF model.
nx = vect_1_5.transform(x_train)

In [None]:
# L2-penalised logistic regression classifier (C=10.0) with a fixed random_state.
lr_grid_5_3 = LogisticRegression(C=10.0, penalty='l2', random_state=10)

In [None]:
# Train the classifier on the TF-IDF training features and the training labels.
lr_grid_5_3.fit(nx,y_train)

In [None]:
# Vectorize the review texts in `x` with the same fitted TF-IDF model.
X = vect_1_5.transform(x)

In [None]:
# Predict a label for every vectorized review.
y_pred = lr_grid_5_3.predict(X)

In [None]:
# Split the review texts by predicted label: '1' goes to the positive list,
# '0' to the negative list; any other label is ignored.
pos_taxi_re = []
neg_taxi_re = []
for text, label in zip(x, y_pred):
    if label == '1':
        pos_taxi_re.append(text)
    elif label == '0':
        neg_taxi_re.append(text)

In [None]:
import pandas as pd
# Write the positively classified reviews to a one-column CSV (no index, no
# header) and expose that file as a sentence stream.
corpus_fname = '/home/vagrant/BigdataProject/data/corpus/pos_taxi_re.csv'
pos_taxi_re_temp = pd.DataFrame(pos_taxi_re)
pos_taxi_re_temp.to_csv(
    corpus_fname,
    index=False,
    header=False,
    encoding='utf-8-sig',
)
pos_sentences = Sentences(corpus_fname)

In [None]:
from soynlp.noun import LRNounExtractor

In [None]:
# Train soynlp's LRNounExtractor (min_count=100) on the positive-review sentences
# and extract a mapping of noun -> score object.
noun_extractor = LRNounExtractor(min_count=100)
noun_extractor.train(pos_sentences)
nouns = noun_extractor.extract()

In [None]:
def noun_score(score):
    """Rank a noun by score * known_r_ratio * ln(frequency).

    ``score`` must expose ``score``, ``known_r_ratio`` and ``frequency``
    attributes.
    """
    import math

    weighted = score.score * score.known_r_ratio
    return weighted * math.log(score.frequency)

# Walk the nouns from highest to lowest noun_score and keep the first five
# that are not generic filler words, as [noun, frequency] pairs.
index = 0
posword = []
ranked_nouns = sorted(nouns.items(), key=lambda item: noun_score(item[1]), reverse=True)
for noun, score in ranked_nouns:
    print()
    if noun in ("진짜", "영화", "정말", "너무", "그냥", "생각"):
        continue
    index += 1
    posword.append([noun, score.frequency])
    print(noun, score)
    if index == 5:
        break

In [None]:
# Convert the positive keyword list to an RDD (rebinds `data`).
data = sc.parallelize(posword)

In [None]:
# Turn every [word, freq] pair into a {'word': ..., 'freq': ...} document.
schema_data = data.map(
  lambda x: {'word': x[0], 'freq': x[1]})

In [None]:
# NOTE(review): activate() is presumably what makes RDD.saveToMongoDB (used by the
# save cells) available — confirm against the pymongo_spark package.
import pymongo_spark
pymongo_spark.activate()

In [None]:
# Save the current keyword documents to MongoDB.
# NOTE(review): the collection is named "Dunkirk_Pos" although the reviews are loaded
# from the 택시운전사 file — confirm this is the intended target.
schema_data.saveToMongoDB('mongodb://localhost:27017/keyword.Dunkirk_Pos')

In [None]:
# Negative keywords: write the negatively classified reviews to a one-column
# CSV (no index, no header) and expose that file as a sentence stream.
corpus_fname = '/home/vagrant/BigdataProject/data/corpus/neg_taxi_re.csv'
neg_taxi_re_temp = pd.DataFrame(neg_taxi_re)
neg_taxi_re_temp.to_csv(
    corpus_fname,
    index=False,
    header=False,
    encoding='utf-8-sig',
)
neg_sentences = Sentences(corpus_fname)

In [None]:
# Train soynlp's WordExtractor on the negative-review sentences and extract a
# mapping of word -> score object, using the thresholds passed below.
word_extractor = WordExtractor(min_count= 60,
                               min_cohesion_forward=0.5, 
                               min_right_branching_entropy=0.3)

word_extractor.train(neg_sentences)
words = word_extractor.extract()


# From the 20 best words by word_score, keep the first five that are not
# generic filler words, as [word, leftside_frequency] pairs.
negword = []
index = 0
top_words = sorted(words.items(), key=lambda item: word_score(item[1]), reverse=True)[:20]
for word, score in top_words:
    if word in ("진짜", "합니다", "영화", "정말", "너무", "그냥", "생각", "봤는데", "평점"):
        continue
    index += 1
    negword.append([word, score.leftside_frequency])
    stats = (
        word,
        score.leftside_frequency,
        score.cohesion_forward,
        score.right_branching_entropy,
    )
    print('%s     (%d, %.3f, %.3f)' % stats)
    if index == 5:
        break

In [None]:
# Convert the negative keyword list to an RDD (rebinds `data`).
data = sc.parallelize(negword)

In [None]:
# Turn every [word, freq] pair into a {'word': ..., 'freq': ...} document.
schema_data = data.map(
  lambda x: {'word': x[0], 'freq': x[1]})

In [None]:
# Save the negative keywords to MongoDB.
# NOTE(review): the collection is named "Dunkirk_Neg" although the reviews are loaded
# from the 택시운전사 file — confirm this is the intended target.
schema_data.saveToMongoDB('mongodb://localhost:27017/keyword.Dunkirk_Neg')

In [None]:
# Sub-keyword extraction function
def ExtractSub(keyword, reviews=None):
    """Extract noun sub-keywords that co-occur with ``keyword``.

    Steps:
      1. keep the reviews that contain ``keyword``;
      2. write them to ``/home/vagrant/BigdataProject/data/temp/<keyword>.csv``
         and train a WordExtractor (no thresholds) on that file;
      3. take the 20 best words by ``word_score``, dropping one-character
         words and "생각";
      4. split each word into nouns with Kkma, keep nouns longer than one
         character, and sum the word frequencies per noun.

    Parameters
    ----------
    keyword : str
        Keyword to look for; also used as the temp file name and as the
        label of every returned row.
    reviews : iterable of str, optional
        Review texts to search. Defaults to the module-level ``pos_taxi_re``,
        which is what the function always used before this parameter existed.
        Pass e.g. ``neg_taxi_re`` to search the negative reviews instead.

    Returns
    -------
    list of [str, int, str]
        ``[noun, summed_frequency, keyword]`` rows, ordered by first
        appearance of the noun. The list is also printed.
    """
    if reviews is None:
        reviews = pos_taxi_re

    matching_reviews = [text for text in reviews if keyword in text]

    corpus_fname = '/home/vagrant/BigdataProject/data/temp/' + keyword + '.csv'
    matching_frame = pd.DataFrame(matching_reviews)
    matching_frame.to_csv(corpus_fname, index=False, header=False, encoding='utf-8-sig')
    sentences = Sentences(corpus_fname)

    word_extractor = WordExtractor(min_count=1,
                                   min_cohesion_forward=0,
                                   min_right_branching_entropy=0.0)
    word_extractor.train(sentences)
    words = word_extractor.extract()

    top_words = sorted(words.items(), key=lambda item: word_score(item[1]), reverse=True)[:20]
    subKeyword = [
        (word, score.leftside_frequency)
        for word, score in top_words
        if len(word) > 1 and word != "생각"
    ]

    from konlpy.tag import Kkma
    kkma = Kkma()

    # Accumulate frequencies per noun in one pass. A dict keeps the nouns in
    # first-seen order (Python 3.7+), so the result no longer depends on the
    # hash-randomised iteration order of a set.
    noun_freq = {}
    for word, freq in subKeyword:
        for noun in kkma.nouns(word):
            if len(noun) > 1:
                noun_freq[noun] = noun_freq.get(noun, 0) + freq

    subKeyword_noun_res = [[noun, freq, keyword] for noun, freq in noun_freq.items()]

    print(subKeyword_noun_res)
    return subKeyword_noun_res

In [None]:
# Positive sub-keywords: run ExtractSub on each positive keyword and collect
# one result list per keyword.
subkeyword = []
for entry in posword:
    subkeyword.append(ExtractSub(entry[0]))

In [None]:
# Save every sub-keyword list currently held in `subkeyword` to MongoDB.
# Iterating the list directly (instead of a fixed range(0, 5)) avoids an
# IndexError when fewer than five keywords were found.
for sub_rows in subkeyword:
    data = sc.parallelize(sub_rows)
    schema_data = data.map(lambda x: {'word': x[0], 'freq': x[1], 'label': x[2]})
    schema_data.saveToMongoDB('mongodb://localhost:27017/keyword.Dunkirk_sub')

In [None]:
# Negative sub-keywords: run ExtractSub on each negative keyword and collect
# one result list per keyword.
# NOTE(review): ExtractSub is called with the keyword only, and by default it searches
# pos_taxi_re — confirm that negative keywords are meant to be looked up in the
# positive reviews.
subkeyword = []
for entry in negword:
    subkeyword.append(ExtractSub(entry[0]))

In [None]:
# Save every sub-keyword list currently held in `subkeyword` to MongoDB.
# Iterating the list directly (instead of a fixed range(0, 5)) avoids an
# IndexError when fewer than five keywords were found.
for sub_rows in subkeyword:
    data = sc.parallelize(sub_rows)
    schema_data = data.map(lambda x: {'word': x[0], 'freq': x[1], 'label': x[2]})
    schema_data.saveToMongoDB('mongodb://localhost:27017/keyword.Dunkirk_sub')