In [None]:
from sklearn.linear_model import LogisticRegression
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import numpy as np
from matplotlib import pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm_notebook
from sklearn.feature_extraction.text import CountVectorizer

# Sentiment Analysis Using IMDb dataset

* Sentiment analysis is the task of classifying the polarity (here: negative or positive) of a given text

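* Run these commands in a Linux bash shell (from inside a `data/` directory, so that the archive extracts to `data/aclImdb/`, the path used below):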

wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

gunzip aclImdb_v1.tar.gz

tar -xvf aclImdb_v1.tar

## Creating Dataset

In [None]:
from pathlib import Path

In [None]:
import os

In [None]:
# Root directory of the extracted IMDb dataset, relative to the notebook.
PATH = Path('data/aclImdb/')
# Class sub-directory names; create_ds_from_file uses a name's position
# in this list as its integer label (neg -> 0, pos -> 1).
names = ['neg', 'pos']

In [None]:
PATH/"val"

WindowsPath('data/aclImdb/val')

### Adding Custom Made Method

In [None]:
def _list_dir(self):
    """Return every entry directly inside this directory as a list of paths."""
    return list(self.iterdir())


# Attach the helper to Path so that any path object can call `.ls()`.
Path.ls = _list_dir

In [None]:
PATH.ls()

[WindowsPath('data/aclImdb/imdb.vocab'),
 WindowsPath('data/aclImdb/imdbEr.txt'),
 WindowsPath('data/aclImdb/README'),
 WindowsPath('data/aclImdb/test'),
 WindowsPath('data/aclImdb/train')]

In [None]:
a = PATH/"train"

In [None]:
a

WindowsPath('data/aclImdb/train')

### scandir()

* `os.scandir()` returns an iterator of `DirEntry` objects, one for each entry (file or directory) in the given path

In [None]:
list(os.scandir('data/aclImdb/train'))

[<DirEntry 'labeledBow.feat'>,
 <DirEntry 'neg'>,
 <DirEntry 'pos'>,
 <DirEntry 'unsup'>,
 <DirEntry 'unsupBow.feat'>,
 <DirEntry 'urls_neg.txt'>,
 <DirEntry 'urls_pos.txt'>,
 <DirEntry 'urls_unsup.txt'>]

In [None]:
list(os.scandir(a))

[<DirEntry 'labeledBow.feat'>,
 <DirEntry 'neg'>,
 <DirEntry 'pos'>,
 <DirEntry 'unsup'>,
 <DirEntry 'unsupBow.feat'>,
 <DirEntry 'urls_neg.txt'>,
 <DirEntry 'urls_pos.txt'>,
 <DirEntry 'urls_unsup.txt'>]

### iterdir()

* `Path.iterdir()` returns a generator that yields a full `Path` object for each entry in the given directory; wrap it in `list()` to see the entries

In [None]:
a.iterdir()

<generator object Path.iterdir at 0x0000029B8402E648>

In [None]:
list(a.iterdir())

[WindowsPath('data/aclImdb/train/labeledBow.feat'),
 WindowsPath('data/aclImdb/train/neg'),
 WindowsPath('data/aclImdb/train/pos'),
 WindowsPath('data/aclImdb/train/unsup'),
 WindowsPath('data/aclImdb/train/unsupBow.feat'),
 WindowsPath('data/aclImdb/train/urls_neg.txt'),
 WindowsPath('data/aclImdb/train/urls_pos.txt'),
 WindowsPath('data/aclImdb/train/urls_unsup.txt')]

### Writing function to get specific file format

In [None]:
def _get_files(p, fs, extensions = None):
    p = Path(p)
    res = [p/f for f in fs if not f.startswith(".") 
           and ((not extensions) or f'.{f.split(".")[-1].lower()}' in extensions)]
    return res

### Creating Dataset from File

In [None]:
def create_ds_from_file(src, names, encoding=None):
    """Read every ``.txt`` file under ``src/<name>`` and label it by class index.

    NOTE: the next cell redefines this function with an explicit default
    encoding; that later definition is the one in effect when the cells are
    run in order.

    Args:
        src: Path of the split directory (e.g. ``PATH/"train"``); it is joined
            with each class name using ``/``.
        names: Sequence of class sub-directory names. The position of a name
            in this sequence is used as the label of its files.
        encoding: Text encoding forwarded to ``open``. ``None`` (the default)
            keeps the platform's default encoding, which can fail on files
            containing bytes that are invalid in that encoding.

    Returns:
        ``(texts, labels)``: a list with the stripped contents of each file
        and a NumPy array of integer labels aligned with ``texts``.
    """
    texts, labels = [], []

    for idx, name in enumerate(names):
        path = src/name
        print(path)
        fnames = [o.name for o in os.scandir(path)]
        files = _get_files(path, fnames, extensions = [".txt"])
        for fpath in files:
            # The context manager closes each handle right away instead of
            # leaving thousands of open files for the garbage collector.
            with open(fpath, encoding=encoding) as fh:
                texts.append(fh.read().strip())
        labels += [idx] * len(files)
    return texts, np.array(labels)

In [None]:
def create_ds_from_file(src, names, encoding="ISO-8859-1"):
    """Read every ``.txt`` file under ``src/<name>`` and label it by class index.

    Args:
        src: Path of the split directory (e.g. ``PATH/"train"``); it is joined
            with each class name using ``/``.
        names: Sequence of class sub-directory names. The position of a name
            in this sequence is used as the label of its files.
        encoding: Text encoding forwarded to ``open``. The default,
            ISO-8859-1, maps every byte to a character and therefore never
            raises a decode error.
            NOTE(review): if the files are actually UTF-8, non-ASCII
            characters will be mis-decoded with this default - confirm.

    Returns:
        ``(texts, labels)``: a list with the stripped contents of each file
        and a NumPy array of integer labels aligned with ``texts``.
    """
    texts, labels = [], []

    for idx, name in enumerate(names):
        path = src/name
        print(path)
        fnames = [o.name for o in os.scandir(path)]
        files = _get_files(path, fnames, extensions = [".txt"])
        for fpath in files:
            # The context manager closes each handle right away instead of
            # leaving thousands of open files for the garbage collector.
            with open(fpath, encoding=encoding) as fh:
                texts.append(fh.read().strip())
        labels += [idx] * len(files)
    return texts, np.array(labels)

### Creating our train and validation set

In [None]:
trn_x, trn_y = create_ds_from_file(PATH/"train",names)


data\aclImdb\train\neg
data\aclImdb\train\pos


In [None]:
val_x,val_y = create_ds_from_file(PATH/"test",names)

data\aclImdb\test\neg
data\aclImdb\test\pos


* Here is the text of the first review in train set

In [None]:
trn_x[0]

"Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly."

* Label of the first review of training set

In [None]:
trn_y[0]

0

In [None]:
val_x[0]

"Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrific sea rescue sequences, of which there are very few I just did not care about any of the characters. Most of us have ghosts in the closet, and Costner's character are realized early on, and then forgotten until much later, by which time I did not care. The character we should really care about is a very cocky, overconfident Ashton Kutcher. The problem is he comes off as kid who thinks he's better than anyone else around him and shows no signs of a cluttered closet. His only obstacle appears to be winning over Costner. Finally when we are well past the half way point of this stinker, Costner tells us all about Kutcher's ghosts. We are told why Kutcher is driven to be the best with no prior inkling or foreshadowing. No magic here, it was all I could do to keep from turning it off an hour in."

## Tokenizing

In NLP we have to turn our text into a list of tokens (words and punctuation marks); that process is called **tokenization**

* But this is not a trivial task

* For example, take the sentence "This movie isn't fun."

* Ideally it would be split like this: This movie is n't fun .

[`CountVectorizer`](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) converts a collection of text documents to a matrix of token counts (part of `sklearn.feature_extraction.text`).

### Creating Our Tokenizer

In [None]:
import re
import string

In [None]:
# Matches a single punctuation character (ASCII punctuation plus a few
# typographic symbols) and captures it so it can be padded with spaces.
# re.escape keeps regex-special characters literal inside the character class:
# without it the sequence `\]` from string.punctuation is parsed as an escaped
# `]`, so the backslash itself is never treated as punctuation.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s):
    """Split `s` into tokens, making every punctuation character its own token.

    Each punctuation character is surrounded with spaces and the result is
    split on whitespace, so "fun." becomes ["fun", "."].

    Args:
        s: The text to tokenize.

    Returns:
        A list of token strings; an empty list for empty or
        whitespace-only input.
    """
    return re_tok.sub(r' \1 ', s).split()

In [None]:
vectorizer = CountVectorizer(tokenizer=tokenize)

* `fit_transform(trn)` creates the vocabulary from words in training set and it transforms the training set into a term-document matrix. 

In [None]:
trn_term_doc = vectorizer.fit_transform(trn_x)

* We have to apply the same transformation to our validation set

* This is just using vectorizer that is fitted to our training set

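* If the validation set contains a word that was not seen during fitting, that word is not in the vocabulary and `transform` simply ignores it (`CountVectorizer` has no "unknown" column)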

In [None]:
val_term_doc = vectorizer.transform(val_x)

### Sparse Matrix

In [None]:
trn_term_doc

<25000x75780 sparse matrix of type '<class 'numpy.int64'>'
	with 3750614 stored elements in Compressed Sparse Row format>

* 25000x75780 : There are 25000 movie reviews, and there are 75780 unique words

* Most documents won't contain most of the 75780 words, so it would be very wasteful to store a dense 25000x75780 matrix in memory.

* We will store it as **sparse matrix**

* There are different ways of storing sparse matrix but one method is like:

* (1,4) -> 4 : document number 1, term number 4 appears 4 times

* (3,14) -> 11 : document number 3, term number 14 appears 11 times etc..

* It is more efficient to store it that way

### Viewing Words from our Vocabulary

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older versions.
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, "get_feature_names_out")
                 else vectorizer.get_feature_names())
# Keep `vocab` a plain list of Python strings whichever API produced it.
vocab = [str(term) for term in feature_names]
vocab[5000:5005]

['augers', 'auggie', 'augh', 'aughties', 'augie']

### Splitting our document and Creating Unique set

* Our tokenizer does not split the text this way, but if we simply split the first review on spaces (punctuation stays attached to the words) we get:

In [None]:
# Naive tokenization for comparison: lower-case every piece of the first
# training review after splitting it on single spaces only.
w0 = {piece.lower() for piece in trn_x[0].split(' ')}
w0

{'a',
 'absurd',
 'an',
 'and',
 'audience',
 'be',
 'better',
 'briefly.',
 'by',
 'can',
 'chantings',
 'cinematography',
 'comedy.',
 'crazy',
 'cryptic',
 'dialogue',
 'easy',
 'era',
 'even',
 'eventually',
 'example',
 'feelings',
 'for',
 'formal',
 'forrest',
 'frederic',
 'from',
 'future',
 'general',
 'good',
 'grader.',
 'great',
 'has',
 'insane,',
 'into',
 'is',
 'it',
 "it's",
 'just',
 'kirkland',
 'level',
 'make',
 'making',
 'man',
 'might',
 'mob',
 'narrative',
 'no',
 'of',
 'off',
 'off.',
 'on',
 'opening',
 'orchestra',
 'out',
 'pig.',
 'putting.',
 'sally',
 'scene',
 'seem',
 'seen',
 'shakespeare',
 'should',
 'singers.',
 'some',
 'stars',
 'starts',
 'stays',
 'story',
 'technical',
 'terrific',
 'than',
 'that',
 'the',
 'think',
 'third',
 'those',
 'time',
 'to',
 'too',
 'turned',
 'unfortunately',
 'unnatural',
 'vilmos',
 'violent',
 'who',
 'whole',
 'with',
 'would',
 'you',
 'zsigmond.'}

### Viewing vectorized document 1

* Only 93 of the 75780 elements were used in this document

In [None]:
trn_term_doc[0]

<1x75780 sparse matrix of type '<class 'numpy.int64'>'
	with 93 stored elements in Compressed Sparse Row format>

In [None]:
# index of "absurd"
vectorizer.vocabulary_['absurd']

1311

In [None]:
# Count of "absurd" in the first review. The column is looked up by word
# instead of hard-coding 1311, which is only valid for one particular fitted
# vocabulary and would silently point at another word if the vocabulary changed.
trn_term_doc[0, vectorizer.vocabulary_['absurd']]

2

In [None]:
trn_term_doc[0,5000]

0