In [46]:
import re
import pandas as pd
import nltk
from itertools import islice
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load into environment as Pandas DF

In [47]:
reviews_as_table = pd.read_csv('IMDB Dataset.csv')
reviews_as_table.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Preprocessing

In [48]:
def foo(x):
    x = re.sub(r'[^\x00-\x7f]', r'', x) # remove unwanted ascii
    x = x.lower() # set to lower
    x = "<s> " + x # add start token at start

    x = " ".join(x.split()) # remove consecutive spaces

    END_TOKENS = [".", "!", "?"]
    for char in END_TOKENS:
        x = x.replace(char + " ", char + " <s>") # add start token at the end of every end token
    x = x.replace("<br /><br />", "<br /><br /><s>") # add start token at the end of double line breaks
    x = x.replace("<s><br /><br /><s>", "<br /><br /><s>") # remove start tokens at the start of double line breaks
    x = x.replace("<s> <br /><br /><s>", "<br /><br /><s>") # remove unlikely case just for safety
    x = x.replace("<br />", " LINE_BREAK ").replace("<s>", " START_TOKEN ")

    return x

reviews_as_table["cleaned"] = reviews_as_table["review"].apply(foo)

reviews_as_table["cleaned"][1]

' START_TOKEN  a wonderful little production.  LINE_BREAK  LINE_BREAK  START_TOKEN the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece.  LINE_BREAK  LINE_BREAK  START_TOKEN the actors are extremely well chosen- michael sheen not only "has got all the polari" but he has all the voices down pat too!  START_TOKEN you can truly see the seamless editing guided by the references to williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece.  START_TOKEN a masterful production about one of the great master\'s of comedy and his life.  LINE_BREAK  LINE_BREAK  START_TOKEN the realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears.  START_TOKEN it plays on our knowledge and our senses, particularly with the scenes concerning ort

### Tokenization

In [49]:
reviews_as_table["tokenized"] = reviews_as_table["cleaned"].apply(lambda x: word_tokenize(x))
reviews_as_table["tokenized"][0]

['START_TOKEN',
 'one',
 'of',
 'the',
 'other',
 'reviewers',
 'has',
 'mentioned',
 'that',
 'after',
 'watching',
 'just',
 '1',
 'oz',
 'episode',
 'you',
 "'ll",
 'be',
 'hooked',
 '.',
 'START_TOKEN',
 'they',
 'are',
 'right',
 ',',
 'as',
 'this',
 'is',
 'exactly',
 'what',
 'happened',
 'with',
 'me',
 '.',
 'LINE_BREAK',
 'LINE_BREAK',
 'START_TOKEN',
 'the',
 'first',
 'thing',
 'that',
 'struck',
 'me',
 'about',
 'oz',
 'was',
 'its',
 'brutality',
 'and',
 'unflinching',
 'scenes',
 'of',
 'violence',
 ',',
 'which',
 'set',
 'in',
 'right',
 'from',
 'the',
 'word',
 'go',
 '.',
 'START_TOKEN',
 'trust',
 'me',
 ',',
 'this',
 'is',
 'not',
 'a',
 'show',
 'for',
 'the',
 'faint',
 'hearted',
 'or',
 'timid',
 '.',
 'START_TOKEN',
 'this',
 'show',
 'pulls',
 'no',
 'punches',
 'with',
 'regards',
 'to',
 'drugs',
 ',',
 'sex',
 'or',
 'violence',
 '.',
 'START_TOKEN',
 'its',
 'is',
 'hardcore',
 ',',
 'in',
 'the',
 'classic',
 'use',
 'of',
 'the',
 'word',
 '.',
 'L

### obtaining `max_length`

In [50]:
counts = reviews_as_table["tokenized"].apply(lambda x: len(x))
max_length = counts.max()
max_length

2917

In [51]:
def foo(x):
    for _ in range(max_length - len(x)):
        x.append("NULL_TOKEN")
    return x

reviews_as_table["tokenized"] = reviews_as_table["tokenized"].apply(foo)
reviews_as_table["tokenized"][0]

['START_TOKEN',
 'one',
 'of',
 'the',
 'other',
 'reviewers',
 'has',
 'mentioned',
 'that',
 'after',
 'watching',
 'just',
 '1',
 'oz',
 'episode',
 'you',
 "'ll",
 'be',
 'hooked',
 '.',
 'START_TOKEN',
 'they',
 'are',
 'right',
 ',',
 'as',
 'this',
 'is',
 'exactly',
 'what',
 'happened',
 'with',
 'me',
 '.',
 'LINE_BREAK',
 'LINE_BREAK',
 'START_TOKEN',
 'the',
 'first',
 'thing',
 'that',
 'struck',
 'me',
 'about',
 'oz',
 'was',
 'its',
 'brutality',
 'and',
 'unflinching',
 'scenes',
 'of',
 'violence',
 ',',
 'which',
 'set',
 'in',
 'right',
 'from',
 'the',
 'word',
 'go',
 '.',
 'START_TOKEN',
 'trust',
 'me',
 ',',
 'this',
 'is',
 'not',
 'a',
 'show',
 'for',
 'the',
 'faint',
 'hearted',
 'or',
 'timid',
 '.',
 'START_TOKEN',
 'this',
 'show',
 'pulls',
 'no',
 'punches',
 'with',
 'regards',
 'to',
 'drugs',
 ',',
 'sex',
 'or',
 'violence',
 '.',
 'START_TOKEN',
 'its',
 'is',
 'hardcore',
 ',',
 'in',
 'the',
 'classic',
 'use',
 'of',
 'the',
 'word',
 '.',
 'L

In [52]:
len(reviews_as_table["tokenized"][0])

2917

In [70]:
# curious to see what the longest review is, and if the start tokens were implemented properly

for index, row in reviews_as_table.iterrows():
    if 'NULL_TOKEN' not in row['tokenized']:
        longest = row['tokenized']
longest

['START_TOKEN',
 'match',
 '1',
 ':',
 'tag',
 'team',
 'table',
 'match',
 'bubba',
 'ray',
 'and',
 'spike',
 'dudley',
 'vs',
 'eddie',
 'guerrero',
 'and',
 'chris',
 'benoit',
 'bubba',
 'ray',
 'and',
 'spike',
 'dudley',
 'started',
 'things',
 'off',
 'with',
 'a',
 'tag',
 'team',
 'table',
 'match',
 'against',
 'eddie',
 'guerrero',
 'and',
 'chris',
 'benoit',
 '.',
 'START_TOKEN',
 'according',
 'to',
 'the',
 'rules',
 'of',
 'the',
 'match',
 ',',
 'both',
 'opponents',
 'have',
 'to',
 'go',
 'through',
 'tables',
 'in',
 'order',
 'to',
 'get',
 'the',
 'win',
 '.',
 'START_TOKEN',
 'benoit',
 'and',
 'guerrero',
 'heated',
 'up',
 'early',
 'on',
 'by',
 'taking',
 'turns',
 'hammering',
 'first',
 'spike',
 'and',
 'then',
 'bubba',
 'ray',
 '.',
 'START_TOKEN',
 'a',
 'german',
 'suplex',
 'by',
 'benoit',
 'to',
 'bubba',
 'took',
 'the',
 'wind',
 'out',
 'of',
 'the',
 'dudley',
 'brother',
 '.',
 'START_TOKEN',
 'spike',
 'tried',
 'to',
 'help',
 'his',
 'broth

In [71]:
len(longest)

2917

In [72]:
longest[1000:]

['match',
 '.',
 'START_TOKEN',
 'winner',
 ':',
 'john',
 'cena',
 'LINE_BREAK',
 'LINE_BREAK',
 'START_TOKEN',
 'match',
 '5',
 ':',
 'intercontinental',
 'championship',
 'rvd',
 'vs',
 'brock',
 'lesnar',
 'via',
 'disqualification',
 'the',
 'next',
 'big',
 'thing',
 'and',
 'mr.',
 'START_TOKEN',
 'pay-per-view',
 'tangled',
 'with',
 'the',
 'intercontinental',
 'championship',
 'on',
 'the',
 'line',
 '.',
 'START_TOKEN',
 'brock',
 'grabbed',
 'the',
 'title',
 'from',
 'the',
 'ref',
 'and',
 'draped',
 'it',
 'over',
 'his',
 'shoulder',
 'momentarily',
 'while',
 'glaring',
 'at',
 'rvd',
 '.',
 'START_TOKEN',
 'van',
 'dam',
 "'s",
 'quickness',
 'gave',
 'brock',
 'fits',
 'early',
 'on',
 '.',
 'START_TOKEN',
 'the',
 'big',
 'man',
 'rolled',
 'out',
 'of',
 'the',
 'ring',
 'and',
 'kicked',
 'the',
 'steel',
 'steps',
 'out',
 'of',
 'frustration',
 '.',
 'START_TOKEN',
 'brock',
 'pulled',
 'himself',
 'together',
 'and',
 'began',
 'to',
 'take',
 'charge',
 '.',
 

### Vocab

In [53]:
vocab = set(reviews_as_table['tokenized'].explode().tolist())
for i, val in enumerate(islice(vocab, 10)):
    print(val)
    # do not print vocab in its entirety as it blows up the file size

tungtvannet
foreign/art
luxor
b-flat
one-episode
tura
intend
handpuppet
girdler
retaliates


In [54]:
idx_to_tkn = dict()
tkn_to_idx = dict()

for i, val in enumerate(vocab):
    idx_to_tkn[i] = val
    tkn_to_idx[val] = i

print(list(islice(idx_to_tkn.items(), 10)))
print(list(islice(tkn_to_idx.items(), 10)))

[(0, 'tungtvannet'), (1, 'foreign/art'), (2, 'luxor'), (3, 'b-flat'), (4, 'one-episode'), (5, 'tura'), (6, 'intend'), (7, 'handpuppet'), (8, 'girdler'), (9, 'retaliates')]
[('tungtvannet', 0), ('foreign/art', 1), ('luxor', 2), ('b-flat', 3), ('one-episode', 4), ('tura', 5), ('intend', 6), ('handpuppet', 7), ('girdler', 8), ('retaliates', 9)]


In [74]:
def foo():
    l = []

    def closure(x):
        nonlocal l
        if x['sentiment'] == 'positive':
            t = [1,0]
        else:
            t = [0,1]
        l2 = [tkn_to_idx[tkn] for tkn in x['tokenized']]
        l.append(l2 + t)
    return closure, l

bar, table = foo()
reviews_as_table.apply(bar, axis=1)

df = pd.DataFrame(table, columns=['x' + str(i) for i in range(max_length)] + ['y0', 'y1'])
df.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x2909,x2910,x2911,x2912,x2913,x2914,x2915,x2916,y0,y1
0,113904,143004,138473,123201,74545,91365,88969,85869,107115,137517,...,139991,139991,139991,139991,139991,139991,139991,139991,1,0
1,113904,58954,142003,75382,139878,143058,129539,129539,113904,123201,...,139991,139991,139991,139991,139991,139991,139991,139991,1,0
2,113904,132799,64839,143186,107549,58954,142003,137408,57334,105001,...,139991,139991,139991,139991,139991,139991,139991,139991,1,0
3,113904,146172,113152,32380,58954,41362,39803,58954,75382,93190,...,139991,139991,139991,139991,139991,139991,139991,139991,0,1
4,113904,64985,23773,32380,127530,139504,87301,123201,55390,138473,...,139991,139991,139991,139991,139991,139991,139991,139991,1,0


In [76]:
df.to_csv("dataset.csv")