In [1]:
import pandas as pd 
import os
import numpy as np 
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [2]:
# Root directory of the review dataset. The loading cell reads files from
# <basepath>/{test,train}/{pos,neg}/, so this path is resolved relative to
# the notebook's working directory.
basepath = 'aclImdb'

In [3]:
# Start from an empty frame for the review data.
df = pd.DataFrame()

# Numeric sentiment class for each label sub-directory name.
labels = dict(pos=1, neg=0)


In [5]:
# Read every review file under <basepath>/{test,train}/{pos,neg}/ and build a
# two-column frame: the raw review text and its numeric sentiment label.
#
# Rows are collected in a plain list and the DataFrame is built once at the
# end. Calling pd.concat inside the loop copies the whole frame for every
# file, which is quadratic in the number of reviews. Building from a fresh
# list also makes the cell safe to re-run: it no longer appends onto whatever
# `df` already holds from a previous execution.
records = []
for split in ('test', 'train'):
    for label_dir in ('pos', 'neg'):
        path = os.path.join(basepath, split, label_dir)
        for filename in os.listdir(path):
            with open(os.path.join(path, filename),
                      'r', encoding='utf-8') as infile:
                records.append({'review': infile.read(),
                                'sentiment': labels[label_dir]})

# Explicit columns keep the schema stable even if no files were found.
df = pd.DataFrame(records, columns=['review', 'sentiment'])

In [6]:
# Shuffle the rows with a fixed seed so the row order is repeatable for a
# given input order, then persist the shuffled frame.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)

# No index=False is passed, so the shuffled index is written to the CSV as well.
df.to_csv('movie_data.csv', encoding='utf-8')

df.head(3)

Unnamed: 0,review,sentiment
39178,My Take: A tired formula Christmas comedy. The...,0
14283,"Michael Myers, the deranged, not-so-young-anym...",0
9945,I must tell you the truth. The only reason I w...,1


# Bag of words model

### Transforming text to feature vectors 

In [None]:
# Toy corpus: three short documents, the third combining the first two
# plus a few extra words.
docs = np.array(['The sun is shining',
                 'The weather is sweet',
                 'The sun is shining, the weather is sweet, and one and one is two'])

# Learn the vocabulary from the corpus and turn each document into a
# vector of raw term counts in a single step.
count = CountVectorizer()
bag = count.fit_transform(docs)

In [11]:
# Show the token -> column-index mapping learned by the vectorizer.
learned_vocabulary = count.vocabulary_
print(learned_vocabulary)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [12]:
# Convert the count matrix to a dense array so every per-document term
# count is printed (one row per document, one column per vocabulary entry).
dense_counts = bag.toarray()
print(dense_counts)

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


# Term frequency-inverse document frequency (tf-idf)

### TF-IDF measures how important a word is to a document by weighing its frequency in that document against how many documents in the corpus contain it



In [15]:
# Print floats with two decimals so the weight matrix stays readable.
# This changes NumPy's global print options for the rest of the session.
np.set_printoptions(precision=2)

# tf-idf with smoothed idf and L2-normalised rows.
tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)

# Re-fit the vectorizer on the toy corpus to get raw counts, then
# re-weight them and show the result as a dense array.
term_counts = count.fit_transform(docs)
weighted = tfidf.fit_transform(term_counts)
print(weighted.toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]
