In [39]:
import pyprind
import pandas as pd
import os
import numpy as np
import re

In [47]:
from pathlib import Path

def build_review_df(data_dir='/home/irvind/dev/ml/raschka/nlp/aclImdb'):
    """Read the raw review files into a single DataFrame.

    Walks ``<data_dir>/{train,test}/{neg,pos}/`` and reads every file found
    there, visiting the files of each directory in sorted name order.

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Root directory containing the ``train`` and ``test`` sub-directories.
        Defaults to the location this notebook was originally written against.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``review`` (the file content) and ``sentiment``
        (0 for files under ``neg``, 1 for files under ``pos``). The frame is
        empty, but still has both columns, when no files are found.

    Raises
    ------
    OSError
        If one of the expected directories is missing or a file cannot be read.
    """
    label_mapping = {'neg': 0, 'pos': 1}
    # The bar is sized for 50000 updates, i.e. one per review file.
    # NOTE(review): assumes the dataset holds exactly 50000 files - confirm.
    progress_bar = pyprind.ProgBar(50000)

    # Collect plain tuples and build the frame once at the end: growing a
    # DataFrame row by row copies it on every iteration (quadratic), and
    # DataFrame.append no longer exists in pandas >= 2.0.
    records = []
    for data_set in ('train', 'test'):
        for label in ('neg', 'pos'):
            dir_path = os.path.join(data_dir, data_set, label)
            for filename in sorted(os.listdir(dir_path)):
                filename_full_path = os.path.join(dir_path, filename)
                # Explicit encoding so reading does not depend on the locale;
                # the cached CSV is written as UTF-8 as well.
                with open(filename_full_path, 'r', encoding='utf-8') as f:
                    records.append((f.read(), label_mapping[label]))
                progress_bar.update()

    return pd.DataFrame(records, columns=['review', 'sentiment'])

def preprocessor(text):
    """Strip markup and punctuation from a review, keeping its emoticons.

    Steps:
      1. Remove anything that looks like an HTML tag (``<...>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` from the
         tag-free text.
      3. Collapse every run of non-word characters into a single space.
      4. Append a space followed by the collected emoticons (space-joined,
         with the ``-`` "nose" removed).

    Letter case is left untouched. The separating space is appended even when
    no emoticon was found, so the result then ends with a space.
    """
    without_tags = re.sub(r'<[^>]*>', '', text)
    found_emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', without_tags)
    words_only = re.sub(r'[\W]+', ' ', without_tags)
    emoticon_suffix = ' '.join(found_emoticons).replace('-', '')
    return f'{words_only} {emoticon_suffix}'

# Reuse the cached, already preprocessed reviews when the CSV exists;
# otherwise build the frame from the raw files and write the cache.
if Path('movie_data.csv').exists():
    print('getting df from file')
    df = pd.read_csv('movie_data.csv')
else:
    print('building df')
    # The frame has to be created here: without this call a fresh kernel has
    # no `df` at all and the next line fails with NameError.
    df = build_review_df()
    df['review'] = df['review'].apply(preprocessor)
    # Fixed seed so the shuffled row order is reproducible.
    np.random.seed(0)
    df = df.reindex(np.random.permutation(df.index))
    df.to_csv('movie_data.csv', index=False, encoding='utf-8')

df

getting df from file


Unnamed: 0,review,sentiment
0,I do miss the company Vestron they sure had th...,1
1,This is a nice little movie with a nice story ...,1
2,The MTV sci fi animated series Æon Flux is bro...,0
3,My only question is Why did they make this mov...,0
4,I saw the 7 5 IMDb rating on this movie and on...,0
...,...,...
49995,I have never seen a show as good as Full House...,1
49996,I usually have a difficult time watching a TV ...,1
49997,I rented this film because of my interest in A...,1
49998,When Alfred Hitchcock made STRANGERS ON A TRAI...,1


In [27]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from every review and encode the corpus as a
# document-term count matrix.
count_vectorizer = CountVectorizer()
bag = count_vectorizer.fit_transform(df['review'])
print(f'bag shape: {bag.shape}')

# For a quick experiment, fit on a small slice of the corpus instead:
# bag = count_vectorizer.fit_transform(df.iloc[range(100)].review)

# Do NOT call bag.toarray() on the full matrix: 50000 * 101895 int64 elements
# is roughly 38 GiB that would have to be materialised in RAM.
# The matrix is kept in memory in sparse form because it is mostly zeros.

bag shape: (50000, 101895)


MemoryError: Unable to allocate 38.0 GiB for an array with shape (50000, 101895) and data type int64

In [None]:
# Peek at a few (term -> column index) pairs of the fitted vocabulary.
for word, word_idx in list(count_vectorizer.vocabulary_.items())[:10]:
    print(f"word: '{word}' idx: {word_idx}")

# Only the last expression of a cell is displayed, so the vocabulary size goes
# last. The full mapping is deliberately not echoed: the sample above already
# shows its structure and the whole dict would flood the output.
len(count_vectorizer.vocabulary_)

In [20]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts with smoothed inverse document frequency and
# scale every document vector to unit L2 norm.
tfid_transformer = TfidfTransformer(
    use_idf=True,
    norm='l2',
    smooth_idf=True
)
tfid_bag = tfid_transformer.fit_transform(bag)
print('tfid_bag shape', tfid_bag.shape)

# Densify only the first few rows for inspection. Calling .toarray() on the
# whole matrix would need the same tens of GiB as the count matrix and fail
# with MemoryError when `bag` was fitted on the full corpus.
tfid_bag[:5].toarray()

tfid_bag shape (5, 342)


array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.07928972, 0.07928972, 0.07928972, ..., 0.        , 0.06397045,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.10364658,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.09525021],
       [0.        , 0.        , 0.        , ..., 0.0670471 , 0.        ,
        0.        ]])