In [12]:
#FilterReview
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn import metrics 
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from konlpy.tag import Twitter
import numpy as np
import pandas as pd
import pickle 
import os.path

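'''
Public methods of FilterReviewSV:
readCSV(filename): reads a CSV file in the fixed format (it must contain a 'sentence' column).
trainModel(filename, ts): trains the model on the given CSV file; ts is the test-split size (e.g. 0.25).
predict(): predicts labels for the data loaded by readCSV, using the trained model.
writePredictionResult(filename): writes the prediction results to a CSV file.
'''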
# Module-level morphological analyzer shared by every tw_tokenizer call.
# NOTE(review): newer KoNLPy releases expose this tagger as `Okt` and mark
# `Twitter` as deprecated - confirm against the installed version before switching.
twitter = Twitter()
def tw_tokenizer(text):
    """Tokenize ``text`` into morphemes.

    Delegates to the module-level ``twitter`` analyzer and returns the result
    of ``twitter.morphs(text)`` unchanged.
    """
    return twitter.morphs(text)

class FilterReviewSV:
    """TF-IDF + logistic-regression review filter with on-disk persistence.

    Two workflows are supported:

    * ``trainModel(filename, ts)`` fits a vectorizer and classifier on a
      labelled CSV and pickles both under ``model/``.
    * ``readCSV(filename)`` -> ``predict()`` -> ``writePredictionResult(filename)``
      labels the ``'sentence'`` column of a CSV with the loaded/trained model.

    Attributes:
        model_lr_clf_name: path of the pickled classifier.
        model_tfidf_vect_name: path of the pickled TF-IDF vectorizer.
        isSetModel: True once a model was loaded from disk or trained.
    """

    def __init__(self):
        """Load the pickled vectorizer and classifier if both files exist."""
        self.model_lr_clf_name = 'model/model_FR_lr_clf.sav'
        self.model_tfidf_vect_name = 'model/model_FR_tfidf_vect.sav'

        self.isSetModel = False
        if os.path.isfile(self.model_lr_clf_name) and os.path.isfile(self.model_tfidf_vect_name):
            print('모델을 불러오는중...')
            # SECURITY: pickle.load can execute arbitrary code; only load model
            # files that were produced by trainModel on a trusted machine.
            with open(self.model_lr_clf_name, 'rb') as clf_file:
                self.model_lr_clf = pickle.load(clf_file)
            with open(self.model_tfidf_vect_name, 'rb') as vect_file:
                self.model_tfidf_vect = pickle.load(vect_file)
            # Must be set on the instance; a bare local would leave the flag False.
            self.isSetModel = True
            print('모델 불러옴')
        else:
            print('모델이 없습니다..')

    def trainModel(self, filename, ts=0.25):
        """Fit the vectorizer and classifier on a labelled CSV and save them.

        Args:
            filename: CSV file with a ``'sentence'`` and a ``'label'`` column.
            ts: value forwarded to ``train_test_split(test_size=...)``;
                defaults to 0.25.

        Prints the accuracy on the held-out split and pickles both fitted
        objects to ``model_lr_clf_name`` / ``model_tfidf_vect_name``.
        """
        data_df = pd.read_csv(filename, engine='python')
        X_train, X_test, y_train, y_test = train_test_split(
            data_df['sentence'], data_df['label'], test_size=ts)

        self.model_tfidf_vect = TfidfVectorizer(
            tokenizer=tw_tokenizer, ngram_range=(1, 2), min_df=3, max_df=0.9)
        # Fit on the training split only so the test split stays unseen.
        self.model_tfidf_vect.fit(X_train)

        X_train_tfidf_vect = self.model_tfidf_vect.transform(X_train)
        X_test_tfidf_vect = self.model_tfidf_vect.transform(X_test)

        self.model_lr_clf = LogisticRegression(random_state=156)
        self.model_lr_clf.fit(X_train_tfidf_vect, y_train)
        pred = self.model_lr_clf.predict(X_test_tfidf_vect)
        print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test ,pred)))

        # Make sure the target directory exists before writing the pickles.
        for model_path in (self.model_lr_clf_name, self.model_tfidf_vect_name):
            model_dir = os.path.dirname(model_path)
            if model_dir:
                os.makedirs(model_dir, exist_ok=True)
        with open(self.model_lr_clf_name, 'wb') as clf_file:
            pickle.dump(self.model_lr_clf, clf_file)
        with open(self.model_tfidf_vect_name, 'wb') as vect_file:
            pickle.dump(self.model_tfidf_vect, vect_file)
        self.isSetModel = True

    def readCSV(self, filename):
        """Read ``filename`` and vectorize its ``'sentence'`` column.

        Stores the frame in ``self.data_df`` and the transformed features in
        ``self.test_df_tfidf_vect``. Requires a loaded or trained vectorizer.
        """
        self.data_df = pd.read_csv(filename, engine='python')
        self.test_df_tfidf_vect = self.model_tfidf_vect.transform(self.data_df['sentence'])

    def predict(self):
        """Predict labels for the data loaded by ``readCSV``.

        Stores the predictions in ``self.y_pred`` and a frame with columns
        ``'pr_label'`` and ``'sentence'`` in ``self.df``, prints the first five
        rows, and returns ``self.y_pred``.
        """
        self.y_pred = self.model_lr_clf.predict(self.test_df_tfidf_vect)
        self.df = pd.DataFrame({'pr_label': self.y_pred, 'sentence': self.data_df['sentence']})
        print('데이터 예측 완료')
        print(self.df[:5])
        return self.y_pred

    def writePredictionResult(self, filename):
        """Write ``self.df`` to ``filename`` as CSV (utf-8-sig, no index column)."""
        self.df.to_csv(filename, encoding='utf-8-sig', index=False)

In [None]:
import glob
import os

# Label every CSV that sits one folder below ./data and write the result to
# 'labeled_<file name>' in the working directory.
folderlist = glob.glob('./data/*')
# Built on first use so the pickled model is loaded once instead of once per
# CSV, and not at all when there is nothing to label.
fr = None
for fol in folderlist:
    if not os.path.isdir(fol):
        continue
    print('folder:',fol)

    filelist = glob.glob(fol+'/*.csv')
    for file in filelist:
        # Normalize Windows separators, then drop the leading './' if present.
        input_file = file.replace('\\','/')
        input_path = input_file[2:] if input_file.startswith('./') else input_file
        print('input_file: ',input_path)

        if fr is None:
            fr = FilterReviewSV()
        fr.readCSV(filename=input_path)
        # Real newlines (the old literal printed a backslash and 'n'), and
        # reported only after the file has actually been read.
        print('\n Readed\n')
        fr.predict()

        # basename instead of a fixed [11:] slice, which only produced the file
        # name when the sub-folder name was exactly five characters long.
        output_file = 'labeled_'+os.path.basename(input_path)
        print('output_file:', output_file)
        fr.writePredictionResult(output_file)
#print(fr.y_pred)

In [10]:
# Build the filter (the constructor loads previously saved model files when
# both exist), then train on the labelled CSV. trainModel prints the accuracy
# on its held-out split and pickles the fitted vectorizer and classifier.
fr = FilterReviewSV()
fr.trainModel(filename='Data_ReviewFilter/TrainData.csv',ts=0.25)

모델을 불러오는중...
TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.987


In [13]:
# Re-create the filter so it picks up the model files saved under model/.
fr = FilterReviewSV()
# Vectorize the 'sentence' column of the test CSV, then predict; predict()
# prints the first five labelled rows and returns the predictions.
fr.readCSV(filename='MovieReview/MR_test.csv')
fr.predict()

# Save the labelled sentences as CSV (utf-8-sig, no index column).
fr.writePredictionResult('sample.csv')
#print(fr.y_pred)

모델을 불러오는중...
모델 불러옴
데이터 예측 완료
   pr_label                                           sentence
0         1                                아 더빙.. 진짜 짜증나네요 목소리
1         1                  흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나
2         1                                  너무재밓었다그래서보는것을추천한다
3         1                      교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정
4         1  사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...
