In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import re

In [None]:
# Build the stop-word set: every line of english.txt (minus its newline)
# becomes one entry, and duplicates collapse automatically.
with open('english.txt', 'r', encoding='utf8') as f:
    stopwords = {entry.replace('\n', '') for entry in f}

In [None]:
# Build the training frame: positive reviews (label 1) followed by the
# negative ones (label 0). Each line of a file is treated as one review.
data_list = []
for path, label in (('imdb_train_pos.txt', 1), ('imdb_train_neg.txt', 0)):
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            # Strip HTML markup. The parser is named explicitly so the result
            # does not depend on which optional parsers happen to be installed
            # (and so bs4 does not emit its parser-guessing warning).
            text = BeautifulSoup(line, 'html.parser').get_text()
            # Replace every non-letter character with a space, then lowercase.
            data_list.append((re.sub("[^a-zA-Z]", " ", text).lower(), label))

df = pd.DataFrame(data_list, columns=['text', 'label'])

In [None]:
# Tokenise each review, drop stop words, and rebuild the cleaned text.
def _without_stopwords(text):
    """Split ``text`` on whitespace and keep the tokens not in ``stopwords``."""
    return [token for token in text.split() if token not in stopwords]


df['words'] = df['text'].apply(_without_stopwords)
df['processed_text'] = df['words'].apply(' '.join)

In [None]:
# Extract bag-of-words count features from the cleaned reviews.
from sklearn.feature_extraction.text import CountVectorizer

# The vocabulary is capped at 5000 terms. Stop words were already removed when
# 'processed_text' was built, so the vectorizer's own stop-word filter stays off.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)
train_data_features = vectorizer.fit_transform(list(df['processed_text']))
# Convert the sparse count matrix to a dense array.
train_data_features = train_data_features.toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

In [None]:
# Train a random forest on the bag-of-words features.
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the fitted forest, and therefore every metric
# computed from it, reproducible across Restart & Run All.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(train_data_features, df["label"])

In [None]:
# Preprocess the test reviews the same way as the training reviews, then
# predict their labels with the trained forest.
test_data_list = []
for path, label in (('imdb_test_pos.txt', 1), ('imdb_test_neg.txt', 0)):
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            # Strip HTML markup with an explicitly named parser so the result
            # does not depend on which optional parsers are installed.
            text = BeautifulSoup(line, 'html.parser').get_text()
            # Replace every non-letter character with a space, then lowercase.
            test_data_list.append((re.sub("[^a-zA-Z]", " ", text).lower(), label))

test_df = pd.DataFrame(test_data_list, columns=['text', 'label'])
test_df['words'] = test_df['text'].apply(lambda x: [item for item in x.split() if item not in stopwords])
test_df['processed_text'] = test_df['words'].apply(lambda x: ' '.join(x))
# Use transform(), NOT fit_transform(): the test text must be mapped onto the
# vocabulary learned from the training set. Refitting here would build a new
# vocabulary, so the feature columns would no longer line up with the columns
# the forest was trained on.
test_data_features = vectorizer.transform(list(test_df['processed_text']))
test_data_features = test_data_features.toarray()
test_df['predict'] = forest.predict(test_data_features)

In [None]:
# Score the stored test-set predictions against the true labels.
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

# Apply each scorer to the same (true label, predicted label) pair, in the
# order the results are reported below.
scorers = (accuracy_score, precision_score, recall_score, f1_score)
accuracy, precision, recall, f_measure = (
    score(test_df['label'], test_df['predict']) for score in scorers
)
accuracy, precision, recall, f_measure