## Part 1

In [None]:
import re
from bs4 import BeautifulSoup
import pandas as pd       
# Load the labeled training set: a tab-separated file whose first row is the
# header. quoting=3 is csv.QUOTE_NONE, so double quotes inside the review
# text are read as ordinary characters.
train = pd.read_csv(
    "labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

In [None]:
train.shape # Show the number of training samples and the number of columns

In [None]:
# Display the loaded training DataFrame (a bare expression at the end of a
# notebook cell is rendered as the cell's output).
train

In [None]:
# Show the raw text of the review stored under row label 0.
print(train.loc[0, "review"])

In [None]:
# Remove HTML markup: parse each review, drop its <br> elements from the
# tree, and keep only the remaining text. The cleaned texts then replace the
# original "review" column.
rows = []
for raw_html in train["review"]:
    parsed = BeautifulSoup(raw_html, "html.parser")
    for br_tag in parsed.select('br'):
        br_tag.extract()
    rows.append(parsed.get_text())
train["review"] = rows

In [None]:
# Take the first review as a worked example for the cleaning steps below.
example1 = train["review"][0]

In [None]:
# Replace every character that is not an ASCII letter (digits, punctuation,
# whitespace, ...) with a single space.
letters_only = re.sub(pattern="[^a-zA-Z]", repl=" ", string=example1)
print(letters_only)

In [None]:
# Normalize case so that e.g. "The" and "the" are treated as the same word.
lower_case = str.lower(letters_only)
print(lower_case)

## Part 2

In [None]:
# Tokenize: split the lowercased text on runs of whitespace.
words = lower_case.split(sep=None)
print(words)

### 불용어 제거

In [None]:
import nltk
# Download the NLTK "stopwords" corpus so that stopwords.words() can load it.
nltk.download('stopwords')  
from nltk.corpus import stopwords

In [None]:
# Display the English stop-word list provided by NLTK.
stopwords.words("english")

In [None]:
# Drop English stop words from the token list. The stop-word list is loaded
# once and stored in a set: calling stopwords.words() inside the
# comprehension would re-evaluate it for every token and search it as a list.
english_stops = set(stopwords.words("english"))
words = [w for w in words if w not in english_stops]
print(words)

In [None]:
def review_to_words( raw_review ):
    """Convert a raw review into a cleaned string of meaningful words.

    The review is stripped of HTML markup, every non-letter character is
    replaced by a space, the text is lowercased and split into words, and
    English stop words are removed.

    Args:
        raw_review: Review text, possibly containing HTML markup.

    Returns:
        The remaining words joined by single spaces.
    """
    # Name the parser explicitly: without it BeautifulSoup emits a warning and
    # picks whichever parser happens to be installed, so results could differ
    # between machines. "html.parser" matches the tag-removal cell above.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    # Keep ASCII letters only; everything else becomes a space.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # Lowercase and tokenize on whitespace.
    words = letters_only.lower().split()
    # A set gives constant-time membership tests in the filter below.
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if w not in stops]
    return " ".join(meaningful_words)

In [None]:
# Sanity check: clean the review stored under row label 0 and show the result.
clean_review = review_to_words(train.loc[0, "review"])
print(clean_review)

In [None]:
# Number of entries in the training set's "review" column.
num_reviews = len(train["review"])
num_reviews

In [None]:
# Clean every training review. Iterating over the column's values avoids
# train["review"][i], which is a label-based lookup and only works while the
# DataFrame index is exactly 0..n-1.
clean_train_reviews = [review_to_words(review) for review in train["review"]]

print(clean_train_reviews)

In [None]:
# Create the bag-of-words vectorizer with scikit-learn.
print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# Word-level features; no custom tokenizer, preprocessor, or stop-word list
# is passed here. max_features limits the vocabulary to 5000 terms.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

In [None]:
# Learn the vocabulary from the cleaned training reviews and encode them as
# term counts in a single step.
train_data_features = vectorizer.fit_transform(clean_train_reviews)
# Expected shape: (number of reviews, number of vocabulary terms).
print (train_data_features.shape)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so it
# is converted to a plain list of str to keep the printed vocabulary complete.
vocab = vectorizer.get_feature_names_out().tolist()
print(vocab)

In [None]:
# train_data_features is the SciPy sparse matrix returned by fit_transform().
# The plain DataFrame constructor does not expand a sparse matrix into one
# column per term, so build a sparse-backed DataFrame instead; this also
# avoids materializing the full dense reviews x terms array in memory.
df = pd.DataFrame.sparse.from_spmatrix(train_data_features)
# df.columns = vocab
# df.to_csv("train_bag_of_words.csv")
df

In [None]:
# Random Forest
print("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

# Create a 100-tree forest and train it on the bag-of-words features, with
# the "sentiment" column as the target. fit() returns the fitted estimator,
# so construction and training can be chained.
forest = RandomForestClassifier(n_estimators=100).fit(
    train_data_features, train["sentiment"]
)

### Test

In [None]:
# Load "Test data"
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", \
                   quoting=3) 

print(test.shape)

# Clean the test reviews with the same function used for the training set.
# The count comes from the test set itself: reusing the training-set size
# would over- or under-run whenever the two sets differ in length.
num_test_reviews = len(test["review"])
clean_test_reviews = []
for i, review in enumerate(test["review"], start=1):
    # Progress message every 1000 reviews.
    if i % 1000 == 0:
        print ("Review %d of %d\n" % (i, num_test_reviews))
    clean_test_reviews.append(review_to_words(review))

In [None]:
# Encode the test reviews with the vocabulary already learned from the
# training data (transform, not fit_transform), then convert the sparse
# result to a dense NumPy array.
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

In [None]:
# Predict test data with the trained random forest model.
result = forest.predict(test_data_features)

In [None]:
# Collect the predictions into an id/sentiment table and write it to a CSV
# file for submission to Kaggle. quoting=3 is csv.QUOTE_NONE, so no quote
# characters are added around fields on output.
output = pd.DataFrame({"id": test["id"], "sentiment": result})
output.to_csv("Bag_of_Words_model.csv", quoting=3, index=False)

In [None]:
# Display the submission table (a bare expression at the end of a notebook
# cell is rendered as the cell's output).
output