In [1]:
import numpy as np
import pandas as pd
# from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier

from utils import rmse, rmse_train_cv

In [2]:
!ls {subjective_file}

ls: {subjective_file}: No such file or directory


# Read the data

In [3]:
path = '../../datasets/objective_subjective/'
subjective_file = 'quote.tok.gt9.5000'
f = open(path + subjective_file, 'r', encoding='utf-8', errors='ignore')
subjective_lines = []
for line in f:
    subjective_lines.append(line)
f.close()

print('Subjective:',len(subjective_lines))
subjective_lines[0]

Subjective: 5000


'smart and alert , thirteen conversations about one thing is a small gem . \n'

In [4]:
objective_file = 'plot.tok.gt9.5000'
f = open(path + objective_file, 'r', encoding='utf-8', errors='ignore')
objective_lines = []
for line in f:
    objective_lines.append(line.rstrip())
f.close()

print('Objective:',len(objective_lines))

Objective: 5000


In [5]:
objective_lines[0]

'the movie begins in the past where a young boy named sam attempts to save celebi from a hunter'

## Create tf_idf = f(sentence,term)

In [6]:
all_lines = objective_lines + subjective_lines
print(len(all_lines))
all_lines[0]

10000


'the movie begins in the past where a young boy named sam attempts to save celebi from a hunter'

In [7]:
tfidf = TfidfVectorizer(max_df=0.99) # at 0.99, same result as 1: 
sentence_tfidf = tfidf.fit_transform(all_lines)
vocab = tfidf.vocabulary_
tfidf_mat = sentence_tfidf.todense()
print(type(tfidf_mat))
print(len(vocab))

<class 'numpy.matrixlib.defmatrix.matrix'>
20893


In [8]:
import pickle
pickle.dump(tfidf, open('pickles/Obj-Subj_tfidf.pkl', 'wb'))

In [9]:
sorted(vocab)[258:268]

['93',
 '94',
 '95',
 '996',
 '_boogie',
 'aaa',
 'aaliyah',
 'abandon',
 'abandone',
 'abandoned']

In [10]:
tfidf_mat[:2,270:300]

matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

## Create Train and CV sets

In [12]:
labels = np.concatenate((np.zeros((5000,)), np.ones((5000,))))
print(labels.shape)
labels[4990:5010]

(10000,)


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1.])

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_cv, y_train, y_cv = train_test_split(
    tfidf_mat, labels, test_size=0.2, random_state=0)

In [14]:
print(X_train.shape, X_cv.shape, y_train.shape, y_cv.shape)

(8000, 20893) (2000, 20893) (8000,) (2000,)


## Gradient Boosting

In [15]:
N_TREES = 100
LEARN_RATE = 0.1
MIN_IN_LEAF = 10

In [19]:
if True:
    pickle_in = open('pickles/GBC_'+ str(N_TREES) +'_' + str(LEARN_RATE) 
                            +'_' + str(MIN_IN_LEAF) + '_20min.pkl', 'rb')
    gbc = pickle.load(pickle_in)
else:
    gbc = GradientBoostingClassifier(learning_rate=LEARN_RATE, 
                                   n_estimators=N_TREES, 
                                   min_samples_leaf=MIN_IN_LEAF)
    gbc.fit(X_train, y_train)

In [20]:
# loss: deviance: logistic log likelihood
print('Train score:',gbc.score(X_train, y_train))
print('CV score:',gbc.score(X_cv, y_cv))

Train score: 0.848375
CV score: 0.8285


In [21]:
%reload_ext autoreload
rmse_train_cv(gbc, X_train, X_cv, y_train, y_cv)

### RMSE for GradientBoostingClassifier :

#### Training: 0.389

#### Test:     0.414

In [None]:
# save the model
import pickle
# pickle.dump(model, open('GBC_'+ str(N_TREES) +'_' + str(LEARN_RATE) 
#                         +'_' + str(MIN_IN_LEAF) + '_20min.pkl', 'wb'))

## TODO
### TFIDF training on training set only
### Add stopwords
### Use emotions vectors as input to model
### Use emotions to inspect input and re-label if needed
### See if remove highest frequency words?