# Logistic Regression for Sentiment Analysis

A logistic regression model to classify movie reviews from the 50k IMDb review dataset

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import time 
import pandas

nltk.download('stopwords')
nltk.download('opinion_lexicon')
nltk.download('wordnet')

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, opinion_lexicon
from sklearn.preprocessing import normalize

from sklearn.feature_extraction.text import HashingVectorizer

np.random.seed(0) 

[nltk_data] Downloading package stopwords to /home/ghost/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /home/ghost/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ghost/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Preprocessing Text Data

The last five reviews from the 50k IMBb review dataset

In [2]:
df = pd.read_csv('shuffled_movie_data.csv')
df.tail()

Unnamed: 0,review,sentiment
49995,"OK, lets start with the best. the building. al...",0
49996,The British 'heritage film' industry is out of...,0
49997,I don't even know where to begin on this one. ...,0
49998,Richard Tyler is a little boy who is scared of...,0
49999,I waited long to watch this movie. Also becaus...,1


Builing a dictionary with positive and negative opinions with a position on each value.

In [3]:
neg_words = opinion_lexicon.negative()
pos_words = opinion_lexicon.positive()

pos_dict = {}
neg_dict = {}

npos = len(pos_words)
nneg = len(neg_words)

for ipos, pos_word in enumerate(pos_words): pos_dict[pos_word] = ipos + 29
for ineg, neg_word in enumerate(neg_words): neg_dict[neg_word] = ineg + npos

Creating a list with pronouns

In [4]:
stop     = stopwords.words('english')
pronouns = stop[:35]
not_pr   = stop[35:]

pronouns1 = pronouns[:8]
pronouns1.append('us')
pronouns2 = pronouns[8:17]
pronouns3 = pronouns[17:]

not_pr.remove('no')
not_pr.remove('not')
not_pr.remove('nor')

porter = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

We define a tokenizer to split the text into word tokens. Also we remove HTML tags, emoticons and all words that not belong to the pronouns set. Finally with apply the Porter stemming and WordNet Lemmatizer algorithm to convert the words into their root form.

In [5]:
def tokenizer(text):
    text      = re.sub('<[^>]*>', '', text)
    #emoticons = re.findall('(?::|;|=|x|8|\()(?:-|y)?(?:\)|s|\(|\)|d|p|c|3|\[|\]|\||\\\\|\\/)', text.lower())
    text      = re.sub(    '(?::|;|=|x|8|\()(?:-|y)?(?:\)|s|\(|\)|d|p|c|3|\[|\]|\||\\\\|\\/)', '',text.lower())
    text      = re.sub('[\s\?\[\]\,\;\.\:\-\\_\(\)\"]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    text      = [w for w in text.split() if w not in not_pr]
    tokenized = []
    for w in text:
        if w in neg_dict or w in pos_dict:
            tokenized.append(w)
        else:
            w_s = porter.stem(w)
            if w_s in neg_dict or w in pos_dict:
                tokenized.append(w_s)
            else:
                w_l = wordnet_lemmatizer.lemmatize(w)
                tokenized.append(w_l)
    return tokenized

## Exercise 1

We create a DocToVec class to convert all documents to a feature vector using a simple tokenizer.

In [6]:
# Excercise 1: define new features according to https://web.stanford.edu/~jurafsky/slp3/5.pdf

class DocToVec:
    """
        Convert a list of words to feature vectors:
        
        Arguments:
            
            n_features: number of features per vector, by default 6818.
            fn        : tokenizer function
    """
    
    def __init__(self, n_features, fn):
        if n_features is None:
            self.n_features = 29
            self.fn         = tokenizer
        self.n_features = n_features
        self.fn         = fn
    def transform(self, DocList):
    """
        Transform an array of list of words to a matrix:
        
        Arguments:
            
            DocList: Array of list of Words tokenized
            
        Returns:
            
            Matrix with feature vectors of each review.
            
        feature0: Word Length
        feature1: Expression ! count
        feature2: Positive Words count
        feature3: Negative Words count
        feature4: Pronouns of 1st person count 
        feature5: Pronouns of 2nd person count 
        feature6: Pronouns of 3rd person count 
        feature7: Words like 'no' count
        feature8: Words like 'not' count
        feature9: Words like 'nor' or 'neither' count
        
        feature10 - feature28: Combinations of some feature that have a good correlation
        feature29 - end      : All Positive and Negative words 
        
    """
    
        n   = len(DocList)
        vec = np.zeros(shape=(n, self.n_features), dtype = np.float32)
        mt  = 0.0
        for idx, strn in enumerate(DocList):
            st = time.time()
            strnList = self.fn(strn)
            vec[idx][0] = np.log(len(strnList))
            for strn in strnList:
                exp      = re.findall('!', strn)
                
                if len(exp) != 0:
                    strn         = re.sub('!','',strn)
                    vec[idx][1] += len(exp) 
                    
                if strn in pos_dict:
                    vec[idx][2] += 1 
                    vec[idx][pos_dict[strn]] +=1
                    
                elif strn in neg_dict:
                    vec[idx][3] += 1 
                    vec[idx][neg_dict[strn]] +=1

                elif strn in pronouns1:
                    vec[idx][4] += 1  

                elif strn in pronouns2:
                    vec[idx][5] += 1 
                
                elif strn in pronouns3:
                    vec[idx][6] += 1 
                
                elif strn == 'no':
                    vec[idx][7] += 1
                
                elif strn == 'not':
                    vec[idx][8] += 1
                    
                elif strn in ['nor','neither']:
                    vec[idx][9] += 1
                    
                vec[idx][10] = vec[idx][2] * vec[idx][3]

                vec[idx][11] = vec[idx][2] * vec[idx][4]
                vec[idx][12] = vec[idx][2] * vec[idx][5]
                vec[idx][13] = vec[idx][2] * vec[idx][6]
                vec[idx][14] = vec[idx][2] * vec[idx][7]
                vec[idx][15] = vec[idx][2] * vec[idx][8]  
                vec[idx][16] = vec[idx][2] * vec[idx][9]  
                
                vec[idx][17] = vec[idx][3] * vec[idx][4]
                vec[idx][18] = vec[idx][3] * vec[idx][5]
                vec[idx][19] = vec[idx][3] * vec[idx][6]
                vec[idx][20] = vec[idx][3] * vec[idx][7]
                vec[idx][21] = vec[idx][3] * vec[idx][8]  
                vec[idx][22] = vec[idx][3] * vec[idx][9]  
                
            vec[idx][23] = vec[idx][2]**2
            vec[idx][24] = vec[idx][3]**2
            vec[idx][25] = vec[idx][2]**3
            vec[idx][26] = vec[idx][3]**3
            vec[idx][27] = np.sqrt(vec[idx][2])
            vec[idx][28] = np.sqrt(vec[idx][3])
            dt = time.time() - st
            mt+= dt
            ln = 36*(idx + 1)
            
            if ((idx+1) % 500 == 0):
                print('percent = {:4.1f}%, time estimated : {:3.1f} min'.format(100*(idx+1)/n, mt*n/ln))
        return vec        

These functions below were not used here

In [7]:
def stream_docs(path):
    with open(path, 'r') as csv:
        next(csv) # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label    

In [8]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
            
    except StopIteration:
        return None, None
    
    return docs, y

## Exercise 2

In [9]:
## Exercise 2: implement a Logistic Regression classifier, using regularization

FEATURES = 29 + npos + nneg
vect = DocToVec(n_features = FEATURES, fn = tokenizer)

vect.transform([df.head().values[0][0]]).shape

(1, 6818)

In [10]:
stream = stream_docs(path='shuffled_movie_data.csv')

X_train, y_train = get_minibatch(stream, size=50000)
y_train = np.asarray(y_train).reshape((-1, 1))

X_train = vect.transform(X_train)

np.save('vec_data_X', X_train)
np.save('vec_data_y', y_train)

percent =  2.0%, time estimated : 5.9 min
percent =  4.0%, time estimated : 5.8 min
percent =  6.0%, time estimated : 5.8 min
percent =  8.0%, time estimated : 5.8 min
percent = 10.0%, time estimated : 5.8 min
percent = 12.0%, time estimated : 5.8 min
percent = 14.0%, time estimated : 5.8 min
percent = 16.0%, time estimated : 5.8 min
percent = 18.0%, time estimated : 5.8 min
percent = 20.0%, time estimated : 5.8 min
percent = 22.0%, time estimated : 5.8 min
percent = 24.0%, time estimated : 5.8 min
percent = 26.0%, time estimated : 5.8 min
percent = 28.0%, time estimated : 5.8 min
percent = 30.0%, time estimated : 5.8 min
percent = 32.0%, time estimated : 5.8 min
percent = 34.0%, time estimated : 5.8 min
percent = 36.0%, time estimated : 5.8 min
percent = 38.0%, time estimated : 5.8 min
percent = 40.0%, time estimated : 5.8 min
percent = 42.0%, time estimated : 5.8 min
percent = 44.0%, time estimated : 5.8 min
percent = 46.0%, time estimated : 5.8 min
percent = 48.0%, time estimated : 

## Exercise 2: implement a Logistic Regression classifier, using regularization

In [11]:
X_data_r = normalize(np.load('vec_data_X.npy'), axis=0)
y_data_r = np.load('vec_data_y.npy')

permut   = np.random.permutation(X_data_r.shape[0])

X_data_r = X_data_r[permut]
y_data_r = y_data_r[permut]

print(X_data_r.shape)
print(y_data_r.shape)

(50000, 6818)
(50000, 1)


## Correlation Matrix

This matrix was used to analyze the depence of the fist 29 features

In [13]:
df = pd.DataFrame({'Sentiment': y_data_r[:,0]})

df['wordCount_log' ]    = X_data_r[:,0 ]
df['count_!'       ]    = X_data_r[:,1 ]
df['positive_words']    = X_data_r[:,2 ]
df['negative_words']    = X_data_r[:,3 ]
df['pronouns1st'   ]    = X_data_r[:,4 ]
df['pronouns2nd'   ]    = X_data_r[:,5 ]
df['pronouns3rd'   ]    = X_data_r[:,6 ]
df['no_count'      ]    = X_data_r[:,7 ]
df['not_count'     ]    = X_data_r[:,8 ]
df['nor_neither'   ]    = X_data_r[:,9 ]

df['pos_neg'       ]    = X_data_r[:,10]

df['pos_pronoun1st']    = X_data_r[:,11]
df['pos_pronoun2nd']    = X_data_r[:,12]
df['pos_pronoun3rd']    = X_data_r[:,13]
df['pos_no'        ]    = X_data_r[:,14]
df['pos_not'       ]    = X_data_r[:,15]
df['pos_nor_neither']   = X_data_r[:,16]

df['neg_pronoun1st']    = X_data_r[:,17]
df['neg_pronoun2nd']    = X_data_r[:,18]
df['neg_pronoun3rd']    = X_data_r[:,19]
df['neg_no'        ]    = X_data_r[:,20]
df['neg_not'       ]    = X_data_r[:,21]
df['neg_nor_neither']   = X_data_r[:,22]

df['positive_words_2' ] = X_data_r[:,23]
df['negative_words_2' ] = X_data_r[:,24]
df['positive_words_3' ] = X_data_r[:,25]
df['negative_words_3' ] = X_data_r[:,26]
df['positive_words_sq'] = X_data_r[:,27]
df['negative_words_sq'] = X_data_r[:,28]

abs(df.corr()['Sentiment']).sort_values(ascending=False)

Sentiment            1.000000
negative_words_sq    0.261996
positive_words_sq    0.240795
positive_words       0.215529
negative_words       0.202699
positive_words_2     0.154977
no_count             0.145020
neg_no               0.115677
neg_pronoun1st       0.114881
negative_words_2     0.111458
positive_words_3     0.104311
pos_pronoun3rd       0.097813
neg_pronoun2nd       0.096465
neg_not              0.072669
negative_words_3     0.059022
pos_pronoun1st       0.057093
not_count            0.053531
pos_not              0.048238
nor_neither          0.045743
pronouns3rd          0.044133
neg_nor_neither      0.041784
neg_pronoun3rd       0.039384
pos_pronoun2nd       0.033470
pronouns2nd          0.032456
pos_no               0.031448
pronouns1st          0.030964
count_!              0.013676
pos_nor_neither      0.012478
pos_neg              0.005169
wordCount_log        0.001010
Name: Sentiment, dtype: float64

## Data Set Split

In [14]:
X_data_train = X_data_r[:40000]
y_data_train = y_data_r[:40000]

X_data_valid = X_data_r[40000:45000]
y_data_valid = y_data_r[40000:45000]

X_data_test  = X_data_r[45000:]
y_data_test  = y_data_r[45000:]

print(X_data_train.shape, y_data_train.shape)
print(X_data_valid.shape, y_data_valid.shape)
print(X_data_test.shape, y_data_test.shape)

(40000, 6818) (40000, 1)
(5000, 6818) (5000, 1)
(5000, 6818) (5000, 1)


We create a simple class to train the logistic regression with L1 and L2 regularization

In [15]:
class log_regr:
    def __init__ (self, W, b, alpha):
        self.W       = W
        self.b       = b
        self.alpha   = alpha
        self.l2_coef = 5e-8
        self.l1_coef = 5e-8
    def sigmoid(self,z):
        return 1 / (1 + np.exp(-z))
    
    def pred(self, X):
        return self.sigmoid(np.matmul(X, self.W) + self.b)
    
    def acc(self, X, y):
        pred_lim = self.pred(X) > 0.5
        return np.mean(pred_lim == y)
    
    def cross_entropy(self, X, y):
        return - np.mean(np.multiply(y  , np.log(self.pred(X)     + 1e-8))
                       + np.multiply(1-y, np.log(1 - self.pred(X) + 1e-8))) + \
                    self.l2_coef * np.mean(np.square(self.W)) / 2 + \
                    self.l1_coef * np.mean(np.abs(self.W))
    
    def fit(self, X, y):
        self.W = self.W - self.alpha * np.matmul(np.transpose(X), self.pred(X) - y) / X.shape[0] - \
                          self.alpha * self.l2_coef * self.W - \
                          self.alpha * self.l1_coef * np.multiply(self.W, 1 / np.abs(self.W))
        self.b = self.b - self.alpha * np.mean(self.pred(X) - y)  

## Training

In [17]:
W_ = np.random.normal(0, 0.01, (FEATURES,1))
b_ = 0.0
lr = 0.5

regr  = log_regr(W_, b_, lr)

BATCH = 10
STEP  = 1
fmt   = '{:6d} epoch , {:9.2f} docs/s, loss = {:7.6f}, loss_val = {:7.6f}, acc = {:4.3f}'
for i in range(100):
    loss_t  = 0.0
    n_batch = X_data_train.shape[0]/BATCH
    st = time.time()
    for i_batch in range(0,X_data_train.shape[0], BATCH):
        X_batch = X_data_train[i_batch:i_batch+BATCH]
        y_batch = y_data_train[i_batch:i_batch+BATCH]
        regr.fit(X_batch , y_batch)
        loss_t  += regr.cross_entropy(X_batch, y_batch)
    dt   = time.time() - st
        
    if (i+1)%STEP == 0:
        print(fmt.format((i+1), (X_data_train.shape[0])/dt,
                                      loss_t/n_batch, 
                                      regr.cross_entropy(X_data_valid, y_data_valid),
                                      regr.acc(X_data_valid, y_data_valid)))


     1 epoch ,  19956.20 docs/s, loss = 0.673469, loss_val = 0.675154, acc = 0.560
     2 epoch ,  19481.36 docs/s, loss = 0.655189, loss_val = 0.658764, acc = 0.665
     3 epoch ,  18150.50 docs/s, loss = 0.638594, loss_val = 0.643865, acc = 0.731
     4 epoch ,  13625.10 docs/s, loss = 0.623479, loss_val = 0.630275, acc = 0.773
     5 epoch ,  13197.06 docs/s, loss = 0.609659, loss_val = 0.617837, acc = 0.794
     6 epoch ,  18857.97 docs/s, loss = 0.596974, loss_val = 0.606410, acc = 0.807
     7 epoch ,  17514.24 docs/s, loss = 0.585286, loss_val = 0.595876, acc = 0.816
     8 epoch ,  19866.07 docs/s, loss = 0.574477, loss_val = 0.586133, acc = 0.823
     9 epoch ,  20145.87 docs/s, loss = 0.564448, loss_val = 0.577092, acc = 0.827
    10 epoch ,  20189.90 docs/s, loss = 0.555113, loss_val = 0.568678, acc = 0.832
    11 epoch ,  20142.92 docs/s, loss = 0.546397, loss_val = 0.560826, acc = 0.837
    12 epoch ,  20048.99 docs/s, loss = 0.538238, loss_val = 0.553480, acc = 0.840
    

   100 epoch ,  12661.78 docs/s, loss = 0.340877, loss_val = 0.385846, acc = 0.860


In [18]:
print('acc = {:6.5f}'.format(regr.acc(X_data_test, y_data_test)))

acc = 0.85860
