# 다음 tutorial을 참고해서 IMDB 무비 리뷰 

* https://towardsdatascience.com/sentiment-analysis-with-python-part-2-4f71e7bde59a

## 데이터 읽기

In [6]:
review_trains = []
review_tests = []
for line in open('./movie_data/full_train.txt', 'r'):
    review_trains.append(line.strip())
for line in open('./movie_data/full_test.txt', 'r'):
    review_tests.append(line.strip())
review_trains[0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

## 데이터 전처리

#### clutter token 제거 

In [7]:
import re

REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    return reviews

review_trains_clean = preprocess_reviews(review_trains)    
review_tests_clean = preprocess_reviews(review_tests)
review_trains_clean[0]    

'bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell highs satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled  at  high a classic line inspector im here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isnt'

#### stopwords 제거

In [54]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

stop_words = stopwords.words('english')
print(f'a few example of stop words = {stop_words[10:20]}')

def remove_stop_words(corpus, stop_words):
    results = []
    for review in corpus:
        results.append(' '.join([word for word in review.split() if word not in stop_words]))
    return results

review_trains_no_stopword = remove_stop_words(review_trains_clean, stop_words)
review_tests_no_stopword = remove_stop_words(review_tests_clean, stop_words)
review_trains_no_stopword[0]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/seonghoonjung/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
a few example of stop words = ["you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


'bromwell high cartoon comedy ran time programs school life teachers 35 years teaching profession lead believe bromwell highs satire much closer reality teachers scramble survive financially insightful students see right pathetic teachers pomp pettiness whole situation remind schools knew students saw episode student repeatedly tried burn school immediately recalled high classic line inspector im sack one teachers student welcome bromwell high expect many adults age think bromwell high far fetched pity isnt'

#### Stemming (take times)

In [57]:
from tqdm import *
def get_stemmed(corpus):
    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()
    results = []
    for review in tqdm(corpus):
        results.append(' '.join([stemmer.stem(word) for word in review.split()]))
    return results     

review_trains_stemmed = get_stemmed(review_trains_no_stopword)
review_tests_stemmed = get_stemmed(review_tests_no_stopword)
review_trains_stemmed[0]

100%|██████████| 25000/25000 [01:10<00:00, 355.44it/s]
100%|██████████| 25000/25000 [01:08<00:00, 367.07it/s]


'bromwel high cartoon comedi ran time program school life teacher 35 year teach profess lead believ bromwel high satir much closer realiti teacher scrambl surviv financi insight student see right pathet teacher pomp petti whole situat remind school knew student saw episod student repeatedli tri burn school immedi recal high classic line inspector im sack one teacher student welcom bromwel high expect mani adult age think bromwel high far fetch piti isnt'

#### Lemmatization

* Lemmatization works by identifying the part-of-speech of a given word and then applying more complex rules to transform the word into its true root.

In [59]:
import nltk
nltk.download('wordnet')

def get_lemmatized(corpus):
    
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    
    results = []
    for review in tqdm(corpus):
        results.append(' '.join([lemmatizer.lemmatize(word) for word in review.split()]))
    return results  

review_trains_lemma = get_lemmatized(review_trains_stemmed)
review_tests_lemma = get_lemmatized(review_tests_stemmed)
review_trains_lemma[0]

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/seonghoonjung/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


100%|██████████| 25000/25000 [00:15<00:00, 1654.62it/s]
100%|██████████| 25000/25000 [00:13<00:00, 1874.85it/s]


'bromwel high cartoon comedi ran time program school life teacher 35 year teach profess lead believ bromwel high satir much closer realiti teacher scrambl surviv financi insight student see right pathet teacher pomp petti whole situat remind school knew student saw episod student repeatedli tri burn school immedi recal high classic line inspector im sack one teacher student welcom bromwel high expect mani adult age think bromwel high far fetch piti isnt'

## Case 1: One-hot encoding, 2-gram, Logistic Regression

In [82]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

######### n-gram vectorization
stop_words = ['in', 'of', 'at', 'a', 'the']
ngrame_vectorizer = CountVectorizer(binary=True, # count가 아닌 occurance or not
                                    stop_words=stop_words,
                                    ngram_range=[1,2]) # bi-gram까지 사용
ngrame_vectorizer.fit(review_trains_lemma)
train_features = ngrame_vectorizer.transform(review_trains_lemma)
test_features = ngrame_vectorizer.transform(review_tests_lemma)
print(f'voca size={len(ngrame_vectorizer.get_feature_names())}')
print(f'some voca={ngrame_vectorizer.get_feature_names()[:10]}')
print(train_features.toarray().shape)

########### learn by logistic regression

# label 생성 - 초반 절반은 1, 나머지는 0
labels = [1 if i < 12500 else 0 for i in range(25000)]  

# train/validation set 분리
X_train, X_val, y_train, y_val = train_test_split(train_features, labels, test_size=0.25, shuffle=True)

# Logistic regression으로 학습 (다양한 regularizer, small value mean strong regularization)
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))
    
# 최종 모델은 위에서 C=0.5 일 때를 사용
final_model = LogisticRegression(C=0.5)
final_model.fit(X_train, y_train)
print ("Accuracy = %s" % (accuracy_score(y_val, final_model.predict(X_val))))
final_model.coef_[0].shape


######## XAI
feature_to_weight_map = {
    word: weight for word, weight in zip(ngrame_vectorizer.get_feature_names(), final_model.coef_[0])
}

print('most influential words for positive sentiments')
print(sorted(feature_to_weight_map.items(), key=lambda x:x[1], reverse=True)[:5])
print('most influential words for negative sentiments')
print(sorted(feature_to_weight_map.items(), key=lambda x:x[1], reverse=False)[:5])


voca size=1636674
some voca=['00', '00 10', '00 1991', '00 doc', '00 either', '00 far', '00 howev', '00 keoni', '00 seem', '00 seen']
(25000, 1636674)
Accuracy for C=0.01: 0.87968
Accuracy for C=0.05: 0.88448
Accuracy for C=0.25: 0.8856
Accuracy for C=0.5: 0.88432
Accuracy for C=1: 0.88544
Accuracy = 0.88432
most influential words for positive sentiments
[('excel', 1.3943030663296845), ('perfect', 1.2132720628578177), ('must see', 0.9594619622451207), ('favorit', 0.91086838943874), ('superb', 0.8974273984498831)]
most influential words for negative sentiments
[('worst', -1.8627433646673672), ('aw', -1.5114004350012715), ('wast', -1.3488624224513295), ('bore', -1.3252523670630436), ('disappoint', -1.1635156302010716)]


## Case 2: TF-IDF, Logistic Regression

In [84]:
from sklearn.feature_extraction.text import TfidfVectorizer

######### tf-idf vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(review_trains_lemma)
train_features = tfidf_vectorizer.transform(review_trains_lemma)
test_features = tfidf_vectorizer.transform(review_tests_lemma)
print(f'voca size={len(tfidf_vectorizer.get_feature_names())}')
print(f'some voca={tfidf_vectorizer.get_feature_names()[:10]}')
print(train_features.toarray().shape)

########### learn by logistic regression

# label 생성 - 초반 절반은 1, 나머지는 0
labels = [1 if i < 12500 else 0 for i in range(25000)]  

# train/validation set 분리
X_train, X_val, y_train, y_val = train_test_split(train_features, labels, test_size=0.25, shuffle=True)

# Logistic regression으로 학습 (다양한 regularizer, small value mean strong regularization)
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))
    
# 최종 모델은 위에서 C=1 일 때를 사용
final_model = LogisticRegression(C=1)
final_model.fit(X_train, y_train)
print ("Accuracy = %s" % (accuracy_score(y_val, final_model.predict(X_val))))
final_model.coef_[0].shape


######## XAI
feature_to_weight_map = {
    word: weight for word, weight in zip(tfidf_vectorizer.get_feature_names(), final_model.coef_[0])
}

print('most influential words for positive sentiments')
print(sorted(feature_to_weight_map.items(), key=lambda x:x[1], reverse=True)[:5])
print('most influential words for negative sentiments')
print(sorted(feature_to_weight_map.items(), key=lambda x:x[1], reverse=False)[:5])


voca size=65101
some voca=['00', '000', '0000000000001', '000001', '0001', '00015', '001', '002', '003830', '006']
(25000, 65101)
Accuracy for C=0.01: 0.82864
Accuracy for C=0.05: 0.8528
Accuracy for C=0.25: 0.8776
Accuracy for C=0.5: 0.88288
Accuracy for C=1: 0.88832
Accuracy = 0.88832
most influential words for positive sentiments
[('great', 6.607355574976493), ('excel', 5.778575171737329), ('love', 4.910504257864102), ('enjoy', 4.627837667692939), ('best', 4.587195439977589)]
most influential words for negative sentiments
[('worst', -8.019276948167178), ('bad', -7.305807963722026), ('wast', -6.438358493101818), ('aw', -5.963015244318026), ('bore', -5.760169345750094)]
