In [1]:
import numpy as np
import pandas as pd
# from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV

# Read the data

In [2]:
path = '../../../datasets/objective_subjective/'
subjective_file = 'quote.tok.gt9.5000'

In [3]:
f = open(path + subjective_file, 'r', encoding='utf-8', errors='ignore')
subjective_lines = []
for line in f:
    subjective_lines.append(line)
f.close()

print('Subjective:',len(subjective_lines))
subjective_lines[0]

Subjective: 5000


'smart and alert , thirteen conversations about one thing is a small gem . \n'

In [4]:
objective_file = 'plot.tok.gt9.5000'
f = open(path + objective_file, 'r', encoding='utf-8', errors='ignore')
objective_lines = []
for line in f:
    objective_lines.append(line.rstrip())
f.close()

print('Objective:',len(objective_lines))

Objective: 5000


In [5]:
objective_lines[0]

'the movie begins in the past where a young boy named sam attempts to save celebi from a hunter'

In [6]:
all_lines = objective_lines + subjective_lines
print(len(all_lines))
all_lines[0]

10000


'the movie begins in the past where a young boy named sam attempts to save celebi from a hunter'

## Create tf_idf = f(sentence,term)

In [7]:
tfidf = TfidfVectorizer(max_df=0.99) # at 0.99, same result as 1: 
sentence_tfidf = tfidf.fit_transform(all_lines)
vocab = tfidf.vocabulary_
if False:
    tfidf_mat = sentence_tfidf
else:
    tfidf_mat = sentence_tfidf.todense()
print(type(tfidf_mat))
print(len(vocab))

<class 'numpy.matrix'>
20893


In [18]:
pickle.dump(tfidf, open('fit_tfidf_vectorizer_for_obj_subj_sentences_classification.pkl', 'wb'))

In [8]:
sorted(vocab)[258:268]

['93',
 '94',
 '95',
 '996',
 '_boogie',
 'aaa',
 'aaliyah',
 'abandon',
 'abandone',
 'abandoned']

In [9]:
tfidf_mat[:2,270:300]

matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

## Create Train and CV sets

In [10]:
labels = np.concatenate((np.zeros((5000,), dtype=int), np.ones((5000,), dtype=int)))
print(labels.shape)
labels[4990:5010]

(10000,)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_cv, y_train, y_cv = train_test_split(
    tfidf_mat, labels, test_size=0.2, random_state=0)

In [12]:
print(X_train.shape, X_cv.shape, y_train.shape, y_cv.shape)

(8000, 20893) (2000, 20893) (8000,) (2000,)


## Gradient Boosting

In [13]:
# N_TREES = 300
# LEARN_RATE = 0.2
# MIN_IN_LEAF = 10
# CV score: 0.8285

# 107 trees
# LEARN_RATE = 0.2
# MAX_DEPTH = 8
# MIN_IN_LEAF = 5 #7
# MAX_FEATURES = 'sqrt'
# CV score: 0.8415

# Gradient Boosting Classifier parameters
# N_TREES = int(round(np.sqrt(X_train.shape[0]) * 1.2))
# CV score should be 0.8815
N_TREES = 300
LEARN_RATE = 0.5
MAX_DEPTH = 16
MIN_IN_LEAF = 5 #7
MAX_FEATURES = 'sqrt'
N_TREES

300

In [14]:
print(y_train[0])
print(X_train[0])


1
[[0. 0. 0. ... 0. 0. 0.]]


In [15]:
from sklearn.ensemble import GradientBoostingClassifier

# loss: deviance: logistic log likelihood
gbc = GradientBoostingClassifier(learning_rate=LEARN_RATE, 
                                   n_estimators=N_TREES, 
                                   min_samples_leaf=MIN_IN_LEAF,
                                   max_features=MAX_FEATURES)
gbc.fit(X_train, y_train)
print('Train score:', gbc.score(X_train, y_train))
print('CV score:', gbc.score(X_cv, y_cv))

Train score: 0.96025
CV score: 0.884


In [None]:
if True:
    grid = {
        'learning_rate': [.01, .05],
        'max_depth': [8, 16],
        'min_samples_leaf': [5],
        'max_features': ['sqrt'],
        'n_estimators': [300],
        'random_state': [0]
    }
else:  # TEST
    grid = {
    'learning_rate': [1],
    'max_depth': [2], 
    'min_samples_leaf': [2],
#     'max_features': ['sqrt', None],
    'n_estimators': [2],
    'random_state': [0]
}
    
# confusion_score = make_scorer(confusion_rmse, greater_is_better=False)

gbc_grid_cv = GridSearchCV(
    GradientBoostingClassifier(), 
    grid,
    cv=4,  # number of folds
    return_train_score=True,
    verbose=1, 
    n_jobs=-1)
gbc_grid_cv.fit(X_train, y_train)

In [None]:
y_pred = gbc_grid_cv.predict(X_cv)

print(gbc_grid_cv.best_params_)
print(gbc_grid_cv.best_score_)
res_df = pd.DataFrame(gbc_grid_cv.cv_results_)
res_df

In [16]:
# save the model
import pickle
pickle.dump(gbc, open('GBC_'+ str(N_TREES) +'_' + str(LEARN_RATE) 
                        +'_' + str(MIN_IN_LEAF) + '_20min.pkl', 'wb'))


# LSTM

In [19]:
print(len(all_lines))
all_lines[0]

10000


'the movie begins in the past where a young boy named sam attempts to save celebi from a hunter'

### Find the max length of comments

In [21]:
import sys
sys.path.append('..')

In [22]:
from NLP import WordBag

In [None]:
%reload_ext autoreload
word_bag = WordBag()

# Split input into real sentences
Drop the idea, it's already OK the way it is!

In [30]:
from nltk.tokenize import sent_tokenize

In [36]:
# sentence_lists = np.array([sent_tokenize(line) for line in all_lines])
# sentences = sentence_lists.flatten()
# len(sentences)
# sentences.shape

(10000,)

In [40]:
# sentence_lists[0]

['the movie begins in the past where a young boy named sam attempts to save celebi from a hunter']

In [37]:
# sentences[0]

['the movie begins in the past where a young boy named sam attempts to save celebi from a hunter']

# Finding
Obj subj sentences not clean, include multiple sentences

In [44]:
for i, text in enumerate(all_lines):
    if len(text.split()) > 100:
        print(i, text)

357 boromir ( sean bean ) is dead , frodo baggins ( elijah wood ) and samwise gamgee ( sean astin ) have gone to mordor alone to destroy the one ring , merry ( dominic monaghan ) and pippin ( billy boyd ) have been captured by the uruk-hai , and aragorn ( viggo mortensen ) , legolas ( orlando bloom ) , and gimli ( john rhys-davies ) have made friends of the rohan , a race of humans that are in the path of the upcoming war , led by its aging king , th&#233 ; oden ( bernard hill ) .
4109 based on the gruesome novella clive barker wrote especially for todd mcfarlane's disturbing action figure line , tortured souls is the story of six cenobite-like creatures : agonistes , a transformer of people into monsters who may be a creation of god ; the scyther-meister , an assassin created by agonistes to bring a corrupt empire to its knees ; lucidique , the daughter of a slain senator who is strangely falling in love with the scythe-meister ; talisac , a twisted doctor who has impregnated and atta

In [41]:
for i, text in enumerate(sentences):
    if len(text[0].split()) > 60:
        print(i, text)

81 ["with his own group , the urban cyber breakers , he has got a dream : to win the dance battle that will bring his team to the world final in new york before reaching usa and fame , he has to overcome the rival group , the ld , to deal with his mother's excessive love and to live out his passion for the beautiful samia ."]
169 ['the film captures intense footage of several art events as well as interviews with burning man organizers larry harvey ( also co-founder ) , crimson rose and will roger who explain how , what started in 1986 as an impromptu summer solstice celebration , put on by a few friends at a beach in san francisco , has turned into a full-blown exercise in city planning and management with a hefty price-tag .']
243 [' " bums in the mist " will expose you to a world where alcohol is king and sloth is a virtue ; buddy george will discover , bums who are able to find love and professional success all while maintaining their " mondern frontiersman " lifestyle ; bums who\'

In [25]:
max([len(text.split()) for text in all_lines])

120

### Define & run LSTM

In [46]:
WORD_WINDOW = 50

import tensorflow as tf
keras = tf.keras
lstm = keras.Sequential()
lstm.add(keras.layers.LSTM(32, input_shape=(WORD_WINDOW, 1), return_sequences=True))
lstm.add(keras.layers.LSTM(32, return_sequences=False))
lstm.add(keras.layers.Dense(1, activation='linear'))
lstm.compile(optimizer='rmsprop',
              loss='mse')

In [None]:
X_train, X_cv, y_train, y_cv = train_test_split(
    all_lines, labels, test_size=0.2, random_state=0)

In [None]:
model.fit(x_train, y_train, batch_size=32, epochs=10)

## TODO
### Use emotions vectors as input to model
### Use emotions to inspect input and re-label if needed
### See if remove highest frequency words?