In [0]:
!cp ./drive/My\ Drive/Colab\ Notebooks/class-MachineLearning/11/movie_review.zip ./
!unzip ./movie_review.zip
!rm -rf ./movie_review/Icon ./movie_review/pos/Icon ./movie_review/neg/Icon

## 1. Load data from files

In [0]:
from sklearn.datasets import load_files

# Root folder of the extracted corpus (contains the pos/ and neg/ sub-folders).
DATA_PATH = './movie_review'

# load_files reads every file below DATA_PATH and returns a container whose
# .data holds the raw documents and .target the matching integer class labels.
data = load_files(DATA_PATH)
X_raw = data.data
Y_raw = data.target

## 2. Data preprocessing

### 2-1. Text preprocessing

In [0]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# Fetch the NLTK resources requested by this notebook, in the original order.
# 'stopwords' is read when building the vectorizer's stop-word list; the others
# are presumably needed by the lemmatizer, tokenizer and POS tagger imported above.
for resource in ('stopwords', 'wordnet', 'punkt', 'averaged_perceptron_tagger'):
  nltk.download(resource)

# First letter of a Penn Treebank tag (as produced by nltk.pos_tag) mapped to
# the POS code expected by WordNetLemmatizer.lemmatize. Adjective tags are
# JJ/JJR/JJS, i.e. they start with 'J' rather than 'A', so they have to be
# translated to WordNet's 'a' or adjectives would never be lemmatized.
PENN_TO_WORDNET_POS = {'j': 'a', 'r': 'r', 'n': 'n', 'v': 'v'}

docs = []
stemmer = WordNetLemmatizer()  # a lemmatizer, despite the variable name

# text preprocessing: raw bytes -> cleaned, lowercased, lemmatized string
for doc_raw in X_raw:
  # byte to string
  doc = doc_raw.decode('utf-8')

  # replace every non-word character (punctuation, symbols) with a space
  doc = re.sub(r'\W', ' ', doc)

  # remove single characters that stand alone between whitespace
  doc = re.sub(r'\s+[a-zA-Z]\s+', ' ', doc)
  # remove a single character at the very start of the document; the anchor
  # must be an unescaped '^' (an escaped '\^' matches a literal caret, and
  # those were already replaced by the \W substitution above)
  doc = re.sub(r'^[a-zA-Z]\s+', ' ', doc)

  # substitute multiple spaces with single space
  doc = re.sub(r'\s+', ' ', doc)

  # convert to lowercase
  doc = doc.lower()

  # lemmatization: use the POS tag of each word to pick the right lemma
  lemma_word_list = []
  for word, tag in pos_tag(doc.split()):
    wordnet_pos = PENN_TO_WORDNET_POS.get(tag[:1].lower())
    if wordnet_pos is None:
      # no WordNet equivalent for this tag: keep the word unchanged
      lemma_word_list.append(word)
    else:
      lemma_word_list.append(stemmer.lemmatize(word, wordnet_pos))

  doc = ' '.join(lemma_word_list)
  docs.append(doc)

### 2-2. Documents to vectors

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

# Step 1 - bag-of-words counts. The vocabulary is capped at 1600 terms;
# terms appearing in fewer than 1% or more than 70% of the documents,
# as well as NLTK's English stop words, are excluded.
vectorizer = CountVectorizer(
  max_features=1600,
  min_df=0.01,
  max_df=0.7,
  stop_words=stopwords.words('english'),
)
word_counts = vectorizer.fit_transform(docs).toarray()

# Step 2 - re-weight the raw counts with TF-IDF (plain, non-sublinear term
# frequency) and materialize the result as a dense array.
tfidf_transformer = TfidfTransformer(sublinear_tf=False)
X_data = tfidf_transformer.fit_transform(word_counts).toarray()

# Labels are used exactly as loaded.
Y_data = Y_raw

# Step 3 - ordered 70/30 split: with shuffle=False the first 70% of the rows
# become the training set and the remaining 30% the test set.
X_train, X_test, Y_train, Y_test = train_test_split(
  X_data,
  Y_data,
  shuffle=False,
  test_size=0.3,
)