In [19]:
import re
import pandas as pd
import nltk
from itertools import islice
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load into environment as Pandas DF

In [2]:
reviews_as_table = pd.read_csv('IMDB Dataset.csv')
reviews_as_table.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Preprocessing

In [7]:
def foo(x):
    x = re.sub(r'[^\x00-\x7f]', r'', x)
    x = x.lower()
    x = "<s> " + x
    x = x.replace("<br /><br />"," <br /><br /><s> ").replace(". ", ". <s> ").replace("<s> <br />", "<br />")
    x = x.replace("<br />", " LINE_BREAK ").replace("<s>", " START_TOKEN ")

    return x

reviews_as_table["cleaned"] = reviews_as_table["review"].apply(foo)

reviews_as_table["cleaned"][0]

" START_TOKEN  one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked.  START_TOKEN  they are right, as this is exactly what happened with me.  LINE_BREAK  LINE_BREAK  START_TOKEN  the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go.  START_TOKEN  trust me, this is not a show for the faint hearted or timid.  START_TOKEN  this show pulls no punches with regards to drugs, sex or violence.  START_TOKEN  its is hardcore, in the classic use of the word.  LINE_BREAK  LINE_BREAK  START_TOKEN  it is called oz as that is the nickname given to the oswald maximum security state penitentary.  START_TOKEN  it focuses mainly on emerald city, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda.  START_TOKEN  em city is home to many..aryans, muslims, gangstas, latinos, christians, italians, irish and more..

### Tokenization

In [8]:
reviews_as_table["tokenized"] = reviews_as_table["cleaned"].apply(lambda x: word_tokenize(x))
reviews_as_table["tokenized"][0]

['START_TOKEN',
 'one',
 'of',
 'the',
 'other',
 'reviewers',
 'has',
 'mentioned',
 'that',
 'after',
 'watching',
 'just',
 '1',
 'oz',
 'episode',
 'you',
 "'ll",
 'be',
 'hooked',
 '.',
 'START_TOKEN',
 'they',
 'are',
 'right',
 ',',
 'as',
 'this',
 'is',
 'exactly',
 'what',
 'happened',
 'with',
 'me',
 '.',
 'LINE_BREAK',
 'LINE_BREAK',
 'START_TOKEN',
 'the',
 'first',
 'thing',
 'that',
 'struck',
 'me',
 'about',
 'oz',
 'was',
 'its',
 'brutality',
 'and',
 'unflinching',
 'scenes',
 'of',
 'violence',
 ',',
 'which',
 'set',
 'in',
 'right',
 'from',
 'the',
 'word',
 'go',
 '.',
 'START_TOKEN',
 'trust',
 'me',
 ',',
 'this',
 'is',
 'not',
 'a',
 'show',
 'for',
 'the',
 'faint',
 'hearted',
 'or',
 'timid',
 '.',
 'START_TOKEN',
 'this',
 'show',
 'pulls',
 'no',
 'punches',
 'with',
 'regards',
 'to',
 'drugs',
 ',',
 'sex',
 'or',
 'violence',
 '.',
 'START_TOKEN',
 'its',
 'is',
 'hardcore',
 ',',
 'in',
 'the',
 'classic',
 'use',
 'of',
 'the',
 'word',
 '.',
 'L

### obtaining `max_length`

In [11]:
counts = reviews_as_table["tokenized"].apply(lambda x: len(x))
max_length = counts.max()
max_length

2888

In [32]:
def foo(x):
    for _ in range(max_length - len(x)):
        x.append("NULL_TOKEN")
    return x

reviews_as_table["tokenized"] = reviews_as_table["tokenized"].apply(foo)
reviews_as_table["tokenized"][0]

['START_TOKEN',
 'one',
 'of',
 'the',
 'other',
 'reviewers',
 'has',
 'mentioned',
 'that',
 'after',
 'watching',
 'just',
 '1',
 'oz',
 'episode',
 'you',
 "'ll",
 'be',
 'hooked',
 '.',
 'START_TOKEN',
 'they',
 'are',
 'right',
 ',',
 'as',
 'this',
 'is',
 'exactly',
 'what',
 'happened',
 'with',
 'me',
 '.',
 'LINE_BREAK',
 'LINE_BREAK',
 'START_TOKEN',
 'the',
 'first',
 'thing',
 'that',
 'struck',
 'me',
 'about',
 'oz',
 'was',
 'its',
 'brutality',
 'and',
 'unflinching',
 'scenes',
 'of',
 'violence',
 ',',
 'which',
 'set',
 'in',
 'right',
 'from',
 'the',
 'word',
 'go',
 '.',
 'START_TOKEN',
 'trust',
 'me',
 ',',
 'this',
 'is',
 'not',
 'a',
 'show',
 'for',
 'the',
 'faint',
 'hearted',
 'or',
 'timid',
 '.',
 'START_TOKEN',
 'this',
 'show',
 'pulls',
 'no',
 'punches',
 'with',
 'regards',
 'to',
 'drugs',
 ',',
 'sex',
 'or',
 'violence',
 '.',
 'START_TOKEN',
 'its',
 'is',
 'hardcore',
 ',',
 'in',
 'the',
 'classic',
 'use',
 'of',
 'the',
 'word',
 '.',
 'L

In [33]:
len(reviews_as_table["tokenized"][0])

2888

### Vocab

In [34]:
vocab = set(reviews_as_table['tokenized'].explode().tolist())
for i, val in enumerate(islice(vocab, 10)):
    print(val)
    # do not print vocab in its entirety as it blows up the file size

casino
as'kentucky
schrott
bleepesque
idiom
'yes
best.gugino
biomedical
o'brian
slogan


In [35]:
idx_to_tkn = dict()
tkn_to_idx = dict()

for i, val in enumerate(vocab):
    idx_to_tkn[i] = val
    tkn_to_idx[val] = i

print(list(islice(idx_to_tkn.items(), 10)))
print(list(islice(tkn_to_idx.items(), 10)))

[(0, 'casino'), (1, "as'kentucky"), (2, 'schrott'), (3, 'bleepesque'), (4, 'idiom'), (5, "'yes"), (6, 'best.gugino'), (7, 'biomedical'), (8, "o'brian"), (9, 'slogan')]
[('casino', 0), ("as'kentucky", 1), ('schrott', 2), ('bleepesque', 3), ('idiom', 4), ("'yes", 5), ('best.gugino', 6), ('biomedical', 7), ("o'brian", 8), ('slogan', 9)]


In [None]:
### TODO
# convert to a list of indices
# generate csv from there

# plan
# make a new dataframe with 2888 + 2 columns
# last 2 columns are either 0 1 or 1 0 

# generate csv using pd.df.to_csv()
# how can we give it a header x0,x1,...,x2887,y0,y1