In [None]:
import tarfile
with tarfile.open('aclImdb_v1.tar.gz', 'r:gz') as tar:
    tar.extractall()


In [3]:
pip install pyprind




In [6]:
import pyprind
import pandas as pd
import os
# change the 'basepath' to the directory of the
# unzipped movie dataset
basepath = 'aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame ()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open (os.path.join (path, file),
                        'r', encoding='utf-8') as infile:
                txt = infile.read ()
            df = df.append([[txt, labels [l]]],
                            ignore_index=True)
            pbar.update()
df. columns = ['review','sentiment']


  df = df.append([[txt, labels [l]]],
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:11:18


In [7]:
import numpy as np

np.random. seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv ('movie_data.csv', index=False, encoding='utf-8')

In [8]:
df = pd.read_csv('movie_data.csv', encoding="utf-8")
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array([
                'The sun is shining',
                'The weather is sweet',
                'The sun is shining and the weather is sweet'])
bag = count.fit_transform(docs)

In [10]:
print(count.vocabulary_)

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}


In [11]:
print(bag.toarray())

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [12]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf=True,
                        norm='l2',
                        smooth_idf=True)
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs))
        .toarray ())


[[0.   0.43 0.56 0.56 0.   0.43 0.  ]
 [0.   0.43 0.   0.   0.56 0.43 0.56]
 [0.4  0.48 0.31 0.31 0.31 0.48 0.31]]


In [13]:
df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [14]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                            text)
    text =(re.sub ('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace ('-', ''))
    return text


In [15]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [16]:
df['review'] = df['review'].apply(preprocessor)

In [17]:
def tokenizer(text):
    return text.split()
tokenizer('runners like ruunning and thus they run')

['runners', 'like', 'ruunning', 'and', 'thus', 'they', 'run']

In [2]:
import nltk

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jada\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
tokenizer_porter('runner like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [21]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [22]:
X_train = df.loc [:25000, 'review'].values
y_train = df.loc [:25000, 'sentiment'].values
X_test = df.loc [25000:, 'review'].values
y_test = df.loc [25000:, 'sentiment'].values

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,
                          lowercase=False, 
                         preprocessor=None)
param_grid = [{'vect__ngram_range': [(1,1)],
              'vect__stop_words': [stop, None],
              'vect__tokenizer': [tokenizer,
                                 tokenizer_porter],
              'clf__penalty': ['l1', 'l2'],
              'clf__C': [1.0, 10.0, 100.0]},
                        {'vect__ngram_range': [(1,1)],
                         'vect__stop_words': [stop, None],
                         'vect__tokenizer': [tokenizer,
                                            tokenizer_porter],
                         'vect__use_idf': [False],
                         'vect__norm': [None],
                         'clf__penalty': ['l1', 'l2'],
                         'clf__C': [1.0, 10.0, 100.0]}
             ]
lr_tfidf = Pipeline([('vect', tfidf),
                        ('clf',
                         LogisticRegression(random_state=0))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5, verbose=1,
                           n_jobs=1)
gs_lr_tfidf.fit(X_train, y_train)


In [None]:
print('Best parameter set: is ' % gs_lr_tfidf.best_params_)
Best parameter set: {'clf__C': 10.0, 'vect__stop_words': None,
'clf__penalty': 'l2', 'vect_tokenizer': <function tokenizer at
0x7f6c704948c8>, , 'vect__ngram _range': (1, 1)}


In [None]:
print ('CV Accuracy: S.3f'
       % gs_lr_tfidf.best_score_)


In [None]:
clf = gs_lr_tfidf.best_estimator_
print ('Test Accuracy: %.3£'
       % clf.score(X_test, y_test))

In [None]:
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                            text.lower())
    text =(re.sub ('[\W]+', ' ', text.lower()) \
            + ' '.join(emoticons).replace ('-', ''))
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [None]:
import numpy as np
import re
from nltk.corpus import stopwords
stop = stopwords('english')
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv) # skip header
        next line in csv:
            text, label = line[:-3], int(line[-2])
            yeild text, label

In [None]:
next(stram_docs(path = movie_data.csv))


In [None]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
        except StopIteration:
            return None, None
        return docs, y

In [None]:
from sklearn.feature_extraction.text import Hashing Vectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVector(decode_error='ignore',
                    n_features=2**21,
                    preprocessor=None,
                    tokenizer=tokenizer)
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='movie_data.csv')

In [None]:
import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size = 1000)
    if not X_train:
        break
    X_train - vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()
    

In [None]:
X_test, y_test = getminibatch(doc_stream, size=5000)
X_text = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test)) 

In [None]:
clf = clf.prtial_fit(X_test, y_test)

In [None]:
import pandas as pd
df = pd.read_csv('movie_data.csv', encoding='utf-8'

In [None]:
from sklearn.feature_etraction.text import CountVectorizer
count = CountVectorizer(stop_words='english',
                       max_df=.1,
                       max_features=5000)
X = count.fit_transform(df['review'].values)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_topics=10,
                               random_state = 123,
                               learning_method='batch')
X_topics = lda.fit_transform(X)

In [None]:
lda.components_.shape

In [None]:
n_top_words = 5
feature_names = count.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx + 1))
    print(" ".join([feature_names[i]
                   for i in topic.argsort()\
                       [:-n_top_words - 1:-1]]))

In [None]:
horror = X_topics[:, 5].argsort()[::-1]
for iter_idx, movie_idx in enumerate(horror[:3]):
    print('\nHorror movie #%d:' % (iter_idx + 1))
    print (df['review'][movie_idx][:300], '...')

In [None]:
import pickle
import os
dest = os.path.join('movieclassifier', 'pk1 objects')
if not os.path.exists(dest):
    os.makedirs (dest)
pickle.dump (stop,
             open (os.path.join (dest, 'stopwords.pkl'), 'wb'),
             protocol=4)
pickle.dump (clf,
             open (os.path.join (dest, 'classifier.pkl'), 'wb'),
             protocol=4)


In [5]:
from sklearn.feature_extraction.text 
import HashingVectorizer 
import re import os import pickle
cur dir = os.path.dirname (
file
)
stop = pickle.load (open (
os.path.join(cur_dir,
'pk1 objects'
'stopwords.pkl'), 'rb'))
def tokenizer (text):
text = re.sub ('<[^>]*>',
"', text)
emoticons = re.findall (' (?::I;|=) (?:-)?(?:111(IDIP)',
text.lower ())
text = re.sub('WIts,
"', text.lower ()) I
+ " ".join (emoticons). replace ('-', '")
tokenized = [w for w in text.split () if w not in stop]
return tokenized
vect = HashingVectorizer (decode error='ignore',
n_features=2**21, preprocessor=None, tokenizer=tokenizer)


SyntaxError: invalid syntax (616843549.py, line 1)