In [1]:
import shutil
import urllib.request
import os
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup

In [2]:
def html_cleaner(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    text = re.sub(r'https*\S+', ' ', text)
    text = re.sub(r'@\S+', ' ', text)
    text = BeautifulSoup(text,"lxml").get_text()
    
    return text

In [3]:
redownload = False
reextract = False

url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
filename = 'aclImdb_v1.tar.gz'

if(redownload == True and reextract == True):
    urllib.request.urlretrieve(url, filename)
    shutil.unpack_archive(filename, "")

In [4]:
path = "aclImdb/train/pos/"
temp = []
for file in os.listdir(path):
    with open(os.path.join(path + file), "r", encoding='utf-8') as f:
        temp.append(f.readlines())
train_pos = pd.DataFrame({"text": temp, "label": 1})

In [5]:
train_pos.head()

Unnamed: 0,text,label
0,[Bromwell High is a cartoon comedy. It ran at ...,1
1,[Homelessness (or Houselessness as George Carl...,1
2,[Brilliant over-acting by Lesley Ann Warren. B...,1
3,[This is easily the most underrated film inn t...,1
4,[This is not the typical Mel Brooks film. It w...,1


In [6]:
path = "aclImdb/train/neg/"
temp = []
for file in os.listdir(path):
    with open(os.path.join(path + file), "r", encoding='utf-8') as f:
        temp.append(f.readlines())
train_neg = pd.DataFrame({"text": temp, "label": 0})

In [7]:
train_neg.head()

Unnamed: 0,text,label
0,[Story of a man who has unnatural feelings for...,0
1,[Airport '77 starts as a brand new luxury 747 ...,0
2,[This film lacked something I couldn't put my ...,0
3,"[Sorry everyone,,, I know this is supposed to ...",0
4,[When I was little my parents took me along to...,0


In [8]:
path = "aclImdb/train/unsup/"
temp = []
for file in os.listdir(path):
    with open(os.path.join(path + file), "r", encoding='utf-8') as f:
        temp.append(f.readlines())
df_unlabeled = pd.DataFrame({"text": temp})

In [9]:
df_unlabeled.head()

Unnamed: 0,text
0,"[I admit, the great majority of films released..."
1,"[Take a low budget, inexperienced actors doubl..."
2,"[Everybody has seen 'Back To The Future,' righ..."
3,[Doris Day was an icon of beauty in singing an...
4,"[After a series of silly, fun-loving movies, 1..."


In [10]:
path = "aclImdb/test/pos/"
temp = []
for file in os.listdir(path):
    with open(os.path.join(path + file), "r", encoding='utf-8') as f:
        temp.append(f.readlines())
test_pos = pd.DataFrame({"text": temp, "label": 1})

In [11]:
test_pos.head()

Unnamed: 0,text,label
0,[I went and saw this movie last night after be...,1
1,[Actor turned director Bill Paxton follows up ...,1
2,[As a recreational golfer with some knowledge ...,1
3,"[I saw this film in a sneak preview, and it is...",1
4,[Bill Paxton has taken the true story of the 1...,1


In [12]:
path = "aclImdb/test/neg/"
temp = []
for file in os.listdir(path):
    with open(os.path.join(path + file), "r", encoding='utf-8') as f:
        temp.append(f.readlines())
test_neg = pd.DataFrame({"text": temp, "label": 0})

In [13]:
test_neg.head()

Unnamed: 0,text,label
0,[Once again Mr. Costner has dragged out a movi...,0
1,[This is an example of why the majority of act...,0
2,"[First of all I hate those moronic rappers, wh...",0
3,[Not even the Beatles could write songs everyo...,0
4,[Brass pictures (movies is not a fitting word ...,0


In [14]:
df_train = train_pos.append(train_neg)
df_test = test_pos.append(test_neg)

In [15]:
print(df_train.shape)
print(df_test.shape)
print(df_unlabeled.shape)

(25000, 2)
(25000, 2)
(50000, 1)


In [16]:
df_train['text'][:10]

0    [Bromwell High is a cartoon comedy. It ran at ...
1    [Homelessness (or Houselessness as George Carl...
2    [Brilliant over-acting by Lesley Ann Warren. B...
3    [This is easily the most underrated film inn t...
4    [This is not the typical Mel Brooks film. It w...
5    [This isn't the comedic Robin Williams, nor is...
6    [Yes its an art... to successfully make a slow...
7    [In this "critically acclaimed psychological t...
8    [THE NIGHT LISTENER (2006) **1/2 Robin William...
9    [You know, Robin Williams, God bless him, is c...
Name: text, dtype: object

In [17]:
df_test['text'][:10]

0    [I went and saw this movie last night after be...
1    [Actor turned director Bill Paxton follows up ...
2    [As a recreational golfer with some knowledge ...
3    [I saw this film in a sneak preview, and it is...
4    [Bill Paxton has taken the true story of the 1...
5    [I saw this film on September 1st, 2005 in Ind...
6    [Maybe I'm reading into this too much, but I w...
7    [I felt this film did have many good qualities...
8    [This movie is amazing because the fact that t...
9    ["Quitting" may be as much about exiting a pre...
Name: text, dtype: object

In [18]:
df_unlabeled['text'][:10]

0    [I admit, the great majority of films released...
1    [Take a low budget, inexperienced actors doubl...
2    [Everybody has seen 'Back To The Future,' righ...
3    [Doris Day was an icon of beauty in singing an...
4    [After a series of silly, fun-loving movies, 1...
5    [This isn't exactly a musical, but it almost s...
6    [After seven years and seventeen pictures at W...
7    [In the 1950's there were many film boigraphie...
8    [MY RATING- 7.3<br /><br />This one is a curio...
9    [Doris Day and James Cagney are excellent in t...
Name: text, dtype: object

In [19]:
df_train['text'] = df_train['text'].fillna('').apply(str)
df_test['text'] = df_test['text'].fillna('').apply(str)
df_unlabeled['text'] = df_unlabeled['text'].fillna('').apply(str)

df_train['text'] = df_train['text'].apply(lambda text: html_cleaner(text))
df_test['text'] = df_test['text'].apply(lambda text: html_cleaner(text))
df_unlabeled['text'] = df_unlabeled['text'].apply(lambda text: html_cleaner(text))

In [20]:
df_train['text'][:10]

0    bromwell high is a cartoon comedy it ran at th...
1    homelessness or houselessness as george carlin...
2    brilliant over acting by lesley ann warren bes...
3    this is easily the most underrated film inn th...
4    this is not the typical mel brooks film it was...
5    this isn t the comedic robin williams nor is i...
6    yes its an art to successfully make a slow pac...
7    in this critically acclaimed psychological thr...
8    the night listener 2006 1 2 robin williams ton...
9    you know robin williams god bless him is const...
Name: text, dtype: object

In [21]:
df_test['text'][:10]

0    i went and saw this movie last night after bei...
1    actor turned director bill paxton follows up h...
2    as a recreational golfer with some knowledge o...
3    i saw this film in a sneak preview and it is d...
4    bill paxton has taken the true story of the 19...
5    i saw this film on september 1st 2005 in india...
6    maybe i m reading into this too much but i won...
7    i felt this film did have many good qualities ...
8    this movie is amazing because the fact that th...
9    quitting may be as much about exiting a pre or...
Name: text, dtype: object

In [22]:
df_unlabeled['text'][:10]

0    i admit the great majority of films released b...
1    take a low budget inexperienced actors doublin...
2    everybody has seen back to the future right wh...
3    doris day was an icon of beauty in singing and...
4    after a series of silly fun loving movies 1955...
5    this isn t exactly a musical but it almost see...
6    after seven years and seventeen pictures at wa...
7    in the 1950 s there were many film boigraphies...
8    my rating 7 3this one is a curious mov made wh...
9    doris day and james cagney are excellent in th...
Name: text, dtype: object

In [23]:
df_train.to_csv('train.csv', index=False)
df_unlabeled.to_csv('unlabeled.csv', index=False)

df_test.to_csv('test.csv', index=False)

**REFS:**

https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html
https://github.com/sgrvinod/Sentiment-Analysis-on-IMDb-Reviews/blob/master/IMDb-DV.py
https://www.tensorflow.org/hub/tutorials/tf2_text_classification
https://colab.research.google.com/github/tensorflow/datasets/blob/master/docs/overview.ipynb?hl=fr#scrollTo=FKouwN_yVSGQ
https://arxiv.org/ftp/arxiv/papers/1806/1806.06407.pdf
https://nminnie.github.io/pdf/Sentiment-analysis.pdf
https://github.com/AjeetSingh02/Overview_of_NLP/blob/master/sentimentClassification.ipynb
https://towardsdatascience.com/imdb-reviews-or-8143fe57c825

https://www.kaggle.com/onadegibert/sentiment-analysis-with-tfidf-and-random-forest

https://www.kaggle.com/kyen89/1-sentiment-analysis-tf-idf

https://medium.com/dataseries/sentiment-classifier-using-tfidf-3ffce3f1cbd5

https://gist.github.com/ameyavilankar/10347201