## Sentiment Classification with Naive Bayes, Logistic Regression, and N-grams

In [2]:
from collections import Counter

import scipy.sparse
import sklearn.feature_extraction.text as sklearn_text
from fastai import *
from fastai.text import *

In [3]:
path = untar_data(URLs.IMDB_SAMPLE)

Downloading http://files.fast.ai/data/examples/imdb_sample


In [5]:
import pandas as pd
df = pd.read_csv(path/'texts.csv')

In [6]:
df.head()

Unnamed: 0,label,text,is_valid
0,negative,Un-bleeping-believable! Meg Ryan doesn't even ...,False
1,positive,This is a extremely well-made film. The acting...,False
2,negative,Every once in a long while a movie will come a...,False
3,positive,Name just says it all. I watched this movie wi...,False
4,negative,This movie succeeds at being one of the most u...,False


In [8]:
# Build the sample dataset with fastai's data-block API:
#   1. read the documents from the `text` column of texts.csv,
#   2. split train/validation using column index 2
#      (presumably the `is_valid` flag shown in the preview above -- confirm),
#   3. take the labels from column index 0
#      (presumably the `label` column -- confirm).
movie_reviews = (TextList.from_csv(path, 'texts.csv', cols='text')
                         .split_from_df(col=2)
                         .label_from_df(cols=0))

In [9]:
movie_reviews.valid.x[0], movie_reviews.valid.y[0]

(Text xxbos xxmaj this very funny xxmaj british comedy shows what might happen if a section of xxmaj london , in this case xxmaj xxunk , were to xxunk itself independent from the rest of the xxup uk and its laws , xxunk & post - war xxunk . xxmaj merry xxunk is what would happen . 
  
   xxmaj the explosion of a wartime bomb leads to the xxunk of ancient xxunk which show that xxmaj xxunk was xxunk to the xxmaj xxunk of xxmaj xxunk xxunk ago , a small historical xxunk long since forgotten . xxmaj to the new xxmaj xxunk , however , this is an unexpected opportunity to live as they please , free from any xxunk from xxmaj xxunk . 
  
   xxmaj stanley xxmaj xxunk is excellent as the minor city xxunk who suddenly finds himself leading one of the world 's xxunk xxunk . xxmaj xxunk xxmaj margaret xxmaj xxunk is a delight as the history professor who sides with xxmaj xxunk . xxmaj others in the stand - out cast include xxmaj xxunk xxmaj xxunk , xxmaj paul xxmaj xxunk , xxmaj xxunk xxmaj xxunk ,

In [10]:
len(movie_reviews.train.x), len(movie_reviews.valid.x)

(800, 200)

In [13]:
movie_reviews.vocab.stoi['intelligence']

1098

In [15]:
movie_reviews.vocab.itos[1098]

'intelligence'

In [16]:
t = movie_reviews.train[0][0]

In [17]:
t.data[:20]

array([   2,    5, 4619,   25,    0,   25,  867,   52,    5, 3776,    5, 1800,   95,   37,   85,  192,   64,  935,
          0, 2738])

In [18]:
movie_reviews.y.classes

['negative', 'positive']

### Full Data Set

In [19]:
# Fetch the full IMDB dataset. NOTE: this rebinds `path`, which previously
# pointed at the IMDB sample; from here on `path` refers to the full dataset.
path = untar_data(URLs.IMDB)

Downloading https://s3.amazonaws.com/fast-ai-nlp/imdb


In [21]:
(path/'train').ls()

[PosixPath('/Users/ujjawalpathak/.fastai/data/imdb/train/neg'),
 PosixPath('/Users/ujjawalpathak/.fastai/data/imdb/train/pos'),
 PosixPath('/Users/ujjawalpathak/.fastai/data/imdb/train/unsupBow.feat'),
 PosixPath('/Users/ujjawalpathak/.fastai/data/imdb/train/labeledBow.feat')]

In [23]:
# Assemble the full dataset with the data-block API; each comment below
# describes the call on the line directly above it.
reviews_full = (TextList.from_folder(path)
             # Collect all the text files found under `path`.
             .split_by_folder(valid='test')
             # Split by folder, using 'test' as the validation set. Only the
             # 'train' and 'test' folders are kept, so no extra filtering is needed.
             .label_from_folder(classes=['neg', 'pos']))
             # Label every item with the name of its folder, restricted to
             # the 'neg' and 'pos' classes.

In [24]:
len(reviews_full.train), len(reviews_full.valid)

(25000, 25000)

In [25]:
v = reviews_full.vocab

In [29]:
v.itos[200:220]

['ca',
 'down',
 'got',
 'want',
 "'re",
 'things',
 'pretty',
 'young',
 'around',
 'seems',
 'horror',
 '&',
 'however',
 'fact',
 'take',
 'big',
 'long',
 'enough',
 'thought',
 'series']

In [30]:
def get_term_doc_matrix(label_list, vocab_len):
    """Build a sparse term-document count matrix from tokenized documents.

    Args:
        label_list: Iterable of documents. Each document must expose a
            ``.data`` attribute holding its sequence of integer token ids.
        vocab_len: Number of columns of the result (the vocabulary size).
            Every token id must lie in ``[0, vocab_len)``.

    Returns:
        A ``scipy.sparse.csr_matrix`` of shape ``(n_docs, vocab_len)`` with
        an integer dtype, where entry ``(i, j)`` is the number of times
        token id ``j`` occurs in document ``i``.
    """
    # The three arrays of the CSR format: `counts[k]` is stored at column
    # `col_indices[k]`, and row i owns the slice row_ptr[i]:row_ptr[i + 1].
    col_indices = []
    counts = []
    row_ptr = [0]

    for doc in label_list:
        token_counts = Counter(doc.data)
        col_indices.extend(token_counts.keys())
        counts.extend(token_counts.values())
        row_ptr.append(len(col_indices))

    term_doc = scipy.sparse.csr_matrix((counts, col_indices, row_ptr),
                                       shape=(len(row_ptr) - 1, vocab_len),
                                       dtype=int)
    # Counter yields token ids in first-seen order, so the column indices
    # inside each row are unsorted. Sort them in place to get canonical CSR;
    # this changes only the internal layout, not the matrix values.
    term_doc.sort_indices()
    return term_doc

In [31]:
# Term-document count matrix for the training reviews: one row per review,
# one column per entry of the full-dataset vocabulary.
trn_term_doc = get_term_doc_matrix(reviews_full.train.x, len(reviews_full.vocab.itos))

In [32]:
# Same construction for the validation reviews, using the same vocabulary
# length so the columns line up with the training matrix.
val_term_doc = get_term_doc_matrix(reviews_full.valid.x, len(reviews_full.vocab.itos))

### Save Data

In [33]:
import scipy

In [36]:
# Write both term-document matrices to .npz files (paths are relative, so
# they land in the current working directory) so they can be reloaded later
# without rebuilding them.
scipy.sparse.save_npz('val_term_doc.npz',val_term_doc)
scipy.sparse.save_npz('trn_term_doc.npz', trn_term_doc)

In [37]:
# Reload the matrices from the .npz files, rebinding the in-memory variables.
trn_term_doc = scipy.sparse.load_npz('trn_term_doc.npz')
val_term_doc = scipy.sparse.load_npz('val_term_doc.npz')

## Naive Bayes

In [38]:
# Training features: the training term-document count matrix.
x = trn_term_doc
# Training labels, kept as the label-list object rather than its `.items`,
# because later cells use both `y.c2i` and `y.items`.
y = reviews_full.train.y

# Validation labels, taken as the `.items` attribute
# (presumably the integer-encoded labels -- confirm).
val_y = reviews_full.valid.y.items

In [39]:
# Look up the codes of the two classes in the label list's `c2i` mapping
# (presumably class name -> integer index -- confirm).
positive = y.c2i['pos']
negative = y.c2i['neg']

In [40]:
import numpy as np

In [None]:
# Per-token occurrence totals for each class: select the rows of `x` whose
# label code matches the class, sum them along axis 0, and flatten the
# result to a 1-D array.
# NOTE(review): `p0` holds the *positive* class totals, as originally written,
# and `p1` the *negative* class totals. The original `p1` line was the
# unfinished `np.squeeze(np.asarray)`, which squeezed the function object
# itself; it is completed here by symmetry with `p0`. Confirm the intended
# p0/p1 naming before using these downstream.
p0 = np.squeeze(np.asarray(x[y.items==positive].sum(0)))
p1 = np.squeeze(np.asarray(x[y.items==negative].sum(0)))