In [1]:
import os
import yaml
import nltk
import string
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locate the credentials file one directory above the current working directory.
credentials_file = os.path.abspath(os.path.join(os.pardir, 'credentials.yaml'))

# safe_load only builds plain Python objects from the YAML document.
with open(credentials_file) as f:
    credentials = yaml.safe_load(f)

# Key/secret pair handed to pandas (via `storage_options`) when reading from S3.
storage_options = dict(
    key=credentials['aws']['access_key'],
    secret=credentials['aws']['secret_access_key'],
)

In [3]:
# Load the first partition of the reviews dataset from S3.
partition_uri = f's3://{credentials["aws"]["bucket"]}/reviews/reviews_partition_1.csv'
partition = pd.read_csv(
    partition_uri,
    compression='gzip',  # explicit: the object is gzipped even though its name ends in .csv
    storage_options=storage_options
)

# Drop the unused `author` column and parse the review dates.
# `astype('datetime64[D]')` is avoided on purpose: day resolution is not a
# supported pandas datetime unit (pandas 1.x silently fell back to nanoseconds,
# pandas >= 2.0 raises TypeError). `to_datetime` + `normalize` yields
# midnight-aligned datetime64 values on both.
partition = (
    partition
    .drop(columns='author')
    .assign(review_date=lambda df: pd.to_datetime(df['review_date']).dt.normalize())
)

# Binary sentiment label: a rating strictly above 5 counts as positive.
partition['positive_sentiment'] = partition['rating'] > 5

# Deep memory footprint (includes the string payloads), converted from bytes.
mem = partition.memory_usage(deep=True).sum()/1024/1024
print('Number of reviews:', len(partition))
print(f'Memory usage: {mem:.1f} Mb')
partition.head()

Number of reviews: 242940
Memory usage: 319.9 Mb


Unnamed: 0,text,rating,title,movie_id,upvotes,total_votes,review_date,positive_sentiment
0,"Chucky (the murderous doll from ""Child's Play""...",9.0,Silly but fun,/title/tt0144120/,33,40,2006-07-06,True
1,"They obviously made ""Bride of Chucky"" with the...",10.0,glass ceiling has a new meaning,/title/tt0144120/,17,20,2009-03-21,True
2,Well my opinion has changed for this one becau...,10.0,Who The (Beep) Is Martha Stewart?(**** Out Of...,/title/tt0144120/,19,22,2012-12-13,True
3,Clever is the word that comes to mind when I t...,6.0,A rough ride to Hackensack for Chucky and his...,/title/tt0144120/,11,15,2008-10-29,True
4,Realizing he needs to turn back into human for...,10.0,The best of the series,/title/tt0144120/,11,15,2012-08-23,True


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# Bag-of-words vectorizer created with its default settings.
count_vect = CountVectorizer()

# Two-sentence toy corpus (both rows labelled 1) used to inspect the vectorizer's output.
test_data = pd.DataFrame(
    dict(
        text=['I see a cat', 'This cat is very small'],
        sentiment=[1, 1],
    )
)
test_data

Unnamed: 0,text,sentiment
0,I see a cat,1
1,This cat is very small,1


In [9]:
# IPython introspection (not plain Python): the trailing `?` displays the
# signature and docstring of CountVectorizer.
CountVectorizer?

[0;31mInit signature:[0m
[0mCountVectorizer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minput[0m[0;34m=[0m[0;34m'content'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mencoding[0m[0;34m=[0m[0;34m'utf-8'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdecode_error[0m[0;34m=[0m[0;34m'strict'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstrip_accents[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlowercase[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpreprocessor[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtokenizer[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstop_words[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtoken_pattern[0m[0;34m=[0m[0;34m'(?u)\\b\\w\\w+\\b'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mngram_range[0m[0;34m=[0m[0;34m([0m[0;36m1[0m[0;34m,[0m [0;36m1

In [10]:
# Learn the vocabulary from the toy sentences and build their count matrix in one pass.
tokens = count_vect.fit_transform(raw_documents=test_data['text'])
tokens

<2x6 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [12]:
# Inspect the fitted vocabulary: a dict mapping each learned token to its feature index.
# Tokens are lowercased, and the default token_pattern only matches words of two or
# more characters, which is why 'I' and 'a' from the toy sentences are absent.
count_vect.vocabulary_

{'see': 2, 'cat': 0, 'this': 4, 'is': 1, 'very': 5, 'small': 3}