In [1]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
'movie_data.csv' in os.listdir()

True

In [3]:
df = pd.read_csv("movie_data.csv", encoding="utf-8")

<b>
type(df) <br>
df.index, type(df.index) <br>
df.columns, type(df.columns) <br>
df.shape <br>
df.head(), df.tail() <br> 
df.describe() <br>
df.info() <br>
df.count(), df.dtypes, df.isnull().sum()
</b>

In [4]:
type(df) # pandas.core.frame.DataFrame
df.index # RangeIndex(start=0, stop=50000, step=1)
type(df.index) # pandas.core.indexes.range.RangeIndex
type(df.columns) # pandas.core.indexes.base.Index

pandas.core.indexes.base.Index

In [5]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [6]:
df.shape

(50000, 2)

In [7]:
df.head() # sentiment column already shuffled

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [8]:
df.tail(n=3)

Unnamed: 0,review,sentiment
49997,I don't even know where to begin on this one. ...,0
49998,Richard Tyler is a little boy who is scared of...,0
49999,I waited long to watch this movie. Also becaus...,1


In [9]:
df.describe() # describe only gives statistics of numeric columns

Unnamed: 0,sentiment
count,50000.0
mean,0.5
std,0.500005
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
review       50000 non-null object
sentiment    50000 non-null int64
dtypes: int64(1), object(1)
memory usage: 781.3+ KB


##### How to convert sentiment to categorical variable??

In [11]:
df.count()

review       50000
sentiment    50000
dtype: int64

In [12]:
df.dtypes

review       object
sentiment     int64
dtype: object

In [13]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [None]:
#np.random.seed(0)

<b> if you don't set the random seed, each time you run the code, below will give different set of random numbers.
If you set the seed, every time same set of random numbers will be generated. Three printout give different random numbers.
<b>

In [14]:
print(np.random.permutation(10))
print(np.random.permutation(10))
print(np.random.permutation(10))

[8 6 0 5 1 9 4 2 3 7]
[8 4 0 5 7 6 9 2 1 3]
[2 6 5 1 0 8 7 9 4 3]


In [15]:
np.arange(0, 20, 2)

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

In [16]:
np.random.permutation(df.index)

array([26674, 25552, 35942, ..., 43188, 40018, 27455], dtype=int64)

<b> bag-of-words model <br>
transforming documents into feature vectors
</b>

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()

In [18]:
count

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [19]:
#docs = np.array([
#        'The sun is shining',
#        'The weather is sweet',
#        'The sun is shining, the weather is sweet, and one and one is two'])

In [28]:
docs = ['The sun is shining',
        'The weather is sweet',
        'The sun is shining, the weather is sweet, and one and one is two']

In [29]:
docs

['The sun is shining',
 'The weather is sweet',
 'The sun is shining, the weather is sweet, and one and one is two']

##### the docs can be list or array

In [30]:
bag = count.fit_transform(docs)

In [31]:
type(count) # sklearn.feature_extraction.text.CountVectorizer

sklearn.feature_extraction.text.CountVectorizer

In [32]:
type(bag) # scipy.sparse.csr.csr_matrix

scipy.sparse.csr.csr_matrix

In [33]:
bag

<3x9 sparse matrix of type '<class 'numpy.int64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [34]:
bag.shape # 3 documents, 9 tokens/ unique words

(3, 9)

In [35]:
#dir(count)
count.vocabulary_ # token and its index

{'and': 0,
 'is': 1,
 'one': 2,
 'shining': 3,
 'sun': 4,
 'sweet': 5,
 'the': 6,
 'two': 7,
 'weather': 8}

In [36]:
for i in docs:
    print(i)

The sun is shining
The weather is sweet
The sun is shining, the weather is sweet, and one and one is two


#### feature vector, term frequency matrix

In [37]:
bag.toarray() # this is the feature vector

array([[0, 1, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 1, 1, 0, 1],
       [2, 3, 2, 1, 1, 1, 2, 1, 1]], dtype=int64)

<b>
term frequency : tf(t, d) - number of times a term t occurs in a document d. <br>
document frequency : df(d, t)- number of documents d that contains the term t. <br>
total number of documents : n_d <br>
inverse document frequency : idf(t, d) 
$$
idf(t, d) = log\frac{n_d}{1+df(d,t)}
$$
term frequency-inverse document frequency : 
$$
tf-idf(t, d) = tf(t,d) \times idf(t, d)
$$
</b>

#### term-frequency inverse-ducument frequency

In [42]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf=True, norm="l2", smooth_idf=True)
np.set_printoptions(precision=2)
tfidf

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [43]:
tfidf_array = tfidf.fit_transform(bag.toarray())
type(tfidf_array)

scipy.sparse.csr.csr_matrix

In [44]:
tfidf_array.shape

(3, 9)

In [45]:
tfidf_array.toarray()

array([[0.  , 0.43, 0.  , 0.56, 0.56, 0.  , 0.43, 0.  , 0.  ],
       [0.  , 0.43, 0.  , 0.  , 0.  , 0.56, 0.43, 0.  , 0.56],
       [0.5 , 0.45, 0.5 , 0.19, 0.19, 0.19, 0.3 , 0.25, 0.19]])

In [46]:
bag.toarray()

array([[0, 1, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 1, 1, 0, 1],
       [2, 3, 2, 1, 1, 1, 2, 1, 1]], dtype=int64)

In [None]:
#np.set_printoptions(precision=2)
#print(tfidf.fit_transform(count.fit_transform(docs).toarray()))

In [47]:
tfidf

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [48]:
tfidf_raw = TfidfTransformer(use_idf=True, norm=None, smooth_idf=True)
tfidf_raw_data = tfidf_raw.fit_transform(bag.toarray())[-1]
tfidf_raw_data.toarray()

array([[3.39, 3.  , 3.39, 1.29, 1.29, 1.29, 2.  , 1.69, 1.29]])

In [49]:
l2_tfidf = tfidf_raw_data.toarray()/np.sqrt(np.sum(tfidf_raw_data.toarray()**2))
l2_tfidf

array([[0.5 , 0.45, 0.5 , 0.19, 0.19, 0.19, 0.3 , 0.25, 0.19]])

### Cleaning text data

In [51]:
df.loc[0, "review"][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [54]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

In [55]:
preprocessor(df.loc[0, "review"][-50:])

'is seven title brazil not available'

In [56]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [57]:
preprocessor(' would be money well spent.<br /><br />8 out of 10')

' would be money well spent 8 out of 10'

In [58]:
df["review"] = df["review"].apply(preprocessor)

In [61]:
#df.loc[1, "review"]

#### Processing documents into tokens

In [62]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer(text):
    return text.split()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [63]:
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [64]:
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [65]:
#import nltk
#nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hisahoo.ISC\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [67]:
from nltk.corpus import stopwords
stop = stopwords.words("english")

In [70]:
len(stop) # 179 stopwords
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [72]:
[w for w in tokenizer_porter('a runner likes running and runs a lot') if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [1]:
import time

In [9]:
start_time = time.time()
#start_time1 = time.clock()
print("I am here")
print("--- %s seconds --- " %(time.time() - start_time))
#print("--- %s seconds --- " %(time.time() - start_time1))

I am here
--- 0.0005006790161132812 seconds --- 
