In [1]:
import os
import re
import shutil

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Folder that holds the downloaded Kaggle archives.
# NOTE(review): machine-specific absolute path; no active statement in this
# cell reads it, which is why the existence check below stays disabled.
data_dir = r'D:\Downloads D Drive\kaggle project 7_bag of words_NLM'
#assert os.path.exists(data_dir)

# Make sure the NLTK stop-word corpus is installed. `force=True` was dropped:
# it re-downloaded and re-unzipped the package on every run, even when an
# up-to-date copy was already present.
nltk.download('stopwords')

# Silence every Python warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
# Load the labelled training reviews from the working directory.
# To extract the file from the downloaded archive first, run:
#   shutil.unpack_archive(
#       os.path.join(data_dir, 'labeledTrainData.tsv.zip'), '.', 'zip')
import csv

# QUOTE_NONE makes the parser treat double quotes as ordinary characters.
train_data = pd.read_csv(
    'labeledTrainData.tsv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
)

print(f'Train size: {len(train_data)}')
print(f'Columns of train data: {train_data.columns.tolist()}')
train_data.head(3)

Train size: 10842
Columns of train data: ['id', 'sentiment', 'review']


Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [9]:
# Source: https://github.com/wendykan/DeepLearningMovies/blob/master/KaggleWord2VecUtility.py

_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def para_to_wordlist(para, remove_stopwords=False):
    """Convert one raw paragraph into a list of lowercase alphabetic words.

    Parameters
    ----------
    para : str
        Raw text; any HTML markup is stripped before tokenising.
    remove_stopwords : bool, default False
        When true, words found in NLTK's English stop-word list are dropped.

    Returns
    -------
    list of str
        Whitespace-separated tokens of the cleaned, lower-cased text.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so results can vary between machines.
    text = BeautifulSoup(para, 'html.parser').get_text()
    # Every character that is not an ASCII letter becomes a separator.
    text = re.sub('[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    if remove_stopwords:
        # Cached set: this function is called once per review, so rebuilding
        # the set from the corpus on each call is wasted work.
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]
    return words

def para_to_sentences(para, tokenizer, remove_stopwords=False):
    """Split a paragraph into sentences, each returned as a list of words.

    Parameters
    ----------
    para : str or bytes
        Paragraph text. Byte strings are decoded as UTF-8 first.
    tokenizer : object
        Anything exposing ``tokenize(text)`` that returns the sentences of
        ``text`` as strings (presumably an NLTK sentence tokenizer -- confirm
        against the caller).
    remove_stopwords : bool, default False
        Forwarded to ``para_to_wordlist`` for every sentence.

    Returns
    -------
    list of list of str
        One word list per non-empty sentence, in order of appearance.
    """
    # Python 3 `str` has no `.decode`; the original unconditional
    # `para.decode('utf8')` raised AttributeError for ordinary text input.
    if isinstance(para, bytes):
        para = para.decode('utf8')
    raw_sentences = tokenizer.tokenize(para.strip())
    # Empty sentences are skipped rather than turned into empty word lists.
    return [
        para_to_wordlist(raw_sentence, remove_stopwords)
        for raw_sentence in raw_sentences
        if len(raw_sentence) > 0
    ]

In [10]:
# Clean every training review: strip markup and non-letters, lower-case,
# and drop English stop words; store the result as one space-joined string.
train_data['clean_review'] = [
    ' '.join(para_to_wordlist(review, True))
    for review in tqdm(train_data.review)
]

# Bag-of-words term counts, capped at 5000 vocabulary entries.
vectorizer = CountVectorizer(analyzer='word',
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)

train_features = vectorizer.fit_transform(
    train_data['clean_review'].tolist()
)
# Convert the sparse count matrix to a dense NumPy array.
train_features = train_features.toarray()

# A fixed seed makes the forest (and therefore the submission) reproducible
# across Restart & Run All; the unseeded original produced a different model
# on every run.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(train_features, train_data.sentiment)


100%|██████████| 10842/10842 [00:12<00:00, 902.44it/s] 


In [11]:
# Read test data from the working directory.
# To extract the file from the downloaded archive first, run:
#   shutil.unpack_archive(
#       os.path.join(data_dir, 'testData.tsv.zip'), '.', 'zip')
test_data = pd.read_csv('testData.tsv', sep='\t')

print('Test size:', len(test_data))
print('Columns of test data:', test_data.columns.tolist())

# Clean each review with stop-word removal enabled and keep the result as a
# single space-joined string per review.
test_data['clean_review'] = [
    ' '.join(para_to_wordlist(review, True))
    for review in tqdm(test_data.review)
]

# Only the final expression of a cell is rendered, so the frame is previewed
# once here; the earlier mid-cell `head(3)` was evaluated and thrown away.
test_data.head(3)

Test size: 25000
Columns of test data: ['id', 'review']


100%|██████████| 25000/25000 [00:16<00:00, 1506.27it/s]


Unnamed: 0,id,review,clean_review
0,12311_10,Naturally in a film who's main themes are of m...,naturally film main themes mortality nostalgia...
1,8348_2,This movie is a disaster within a disaster fil...,movie disaster within disaster film full great...
2,5828_4,"All in all, this is a movie for kids. We saw i...",movie kids saw tonight child loved one point k...


In [12]:
# Encode the cleaned test reviews with the vocabulary already fitted on the
# training data (transform only -- the vectorizer is not refitted here).
test_features = vectorizer.transform(
    test_data['clean_review'].tolist()
)
# Convert the sparse count matrix to a dense NumPy array.
test_features = test_features.toarray()

# The discarded mid-cell `type(...), dtype, shape` tuple was removed: it was
# never displayed because it was not the cell's last expression.
prediction = clf.predict(test_features)

prediction.shape

(25000,)

In [13]:
# Pair every test id with its predicted label and write the result as CSV
# without the index column.
submission = test_data[['id']].assign(sentiment=prediction)
submission.to_csv('submission.csv', index=False)