In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
import nltk
from nltk.stem import *
from nltk.tag import *
from nltk import *
from nltk.tokenize import *
%pylab inline

pd.set_option('display.max_columns', 999)
pd.set_option('display.column_space', 50)
pd.set_option('display.max_rows',1000)

set_matplotlib_formats('retina')
plt.style.use('ggplot')


# gensim modules# gensi 
import gensim
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

# numpy
import numpy

# classifier
from sklearn.linear_model import LogisticRegression

# random
import random


Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [20]:
df = pd.read_table('train.tsv')
test = pd.read_table('test.tsv')

df.columns = df.columns.str.lower()
test.columns = test.columns.str.lower()

# Manual Feature Engineering

In [21]:

wanted_tags = ['NNP', 'NNPS'] #NNP stands for Proper Nouns


## Create NNP based on pos tags. NNP = proper nouns
df['NNP'] = df.phrase.apply(lambda x: ' '.join(['NNP' if pos in wanted_tags else word for word, pos in pos_tag(word_tokenize(x))]))
'-----------------------------------------'
test['NNP'] = test.phrase.apply(lambda x: ' '.join(['NNP' if pos in wanted_tags else word for word, pos in pos_tag(word_tokenize(x))]))


##filter out duplicate NNP
df['NNP'] = df.NNP.str.replace(r'\b(\w+)(\s+\1)+\b', r'\1')
'-----------------------------------------'
test['NNP'] = test.NNP.str.replace(r'\b(\w+)(\s+\1)+\b', r'\1')


## Turn NNPs into NNP
df.NNP = df.NNP.str.replace('NNPs','NNP')
'-------------------------------------------------------------'
test.NNP = test.NNP.str.replace('NNPs','NNP')



In [22]:

## replace certain words and get rid of unnecessary words or spaces 
df.NNP = df.NNP.str.replace(" '", "") 
df.NNP = df.NNP.str.replace(" n't", "n't")
df.NNP = df.NNP.str.replace("-LRB-", "")
df.NNP = df.NNP.str.replace("-RRB-", "")
df.NNP = df.NNP.str.replace(".", "")
'-----------------------------------------'
test.NNP = test.NNP.str.replace(" '", "") 
test.NNP = test.NNP.str.replace(" n't", "n't")
test.NNP = test.NNP.str.replace("-LRB-", "")
test.NNP = test.NNP.str.replace("-RRB-", "")
test.NNP = test.NNP.str.replace(".", "")



# turn `` into "" because there are movie names like `` Analyze That '' which in order for regex to find Analyze That (name of movie), you need to turn `` into ""
df.NNP = df.NNP.str.replace("``", '"')
'-----------------------------------------'
test.NNP = test.NNP.str.replace("``", '"')




# Turn ' Analyze That (movie name) ' into NNP
df['NNP'] = df.NNP.str.replace(r'"(.*?)"', 'NNP')
'-----------------------------------------'
test['NNP'] = test.NNP.str.replace(r'"(.*?)"', 'NNP')


# Get rid of duplicate NNPs again
df['NNP'] = df.NNP.str.replace(r'\b(\w+)(\s+\1)+\b', r'\1')
'-----------------------------------------'
test['NNP'] = test.NNP.str.replace(r'\b(\w+)(\s+\1)+\b', r'\1')




### Pos Tag didn't take filter all of the proper nouns. The code below is me manually getting all the named entity through this following condition: if first word is capitalized and the next word is capitalized, and so is the next on and so on, all these words become Named Entity Recognition 

In [23]:


namedent = set()

for i in df.index:
    namedent_words = []
    sentence = df.NNP[i].split()

    for index, word in enumerate(sentence):
        next_index = index + 1
        if word.istitle():
            if index < len(sentence) -1:
                if sentence[next_index].istitle():
                    if sentence[index] not in namedent_words:
                        namedent_words.append(sentence[index])
                    if sentence[next_index] not in namedent_words:
                        namedent_words.append(sentence[next_index])
                        
    if namedent_words:
        namedent_joined = ' '.join(namedent_words)
        namedent.add(namedent_joined)

'------------------------------------------------------------------------------------'

test_namedent = set()

for i in test.index:
    test_namedent_words = []
    sentence = test.NNP[i].split()

    for index, word in enumerate(sentence):
        next_index = index + 1
        if word.istitle():
            if index < len(sentence) -1:
                if sentence[next_index].istitle():
                    if sentence[index] not in test_namedent_words:
                        test_namedent_words.append(sentence[index])
                    if sentence[next_index] not in test_namedent_words:
                        test_namedent_words.append(sentence[next_index])
                        
    if test_namedent_words:
        test_namedent_joined = ' '.join(test_namedent_words)
        test_namedent.add(test_namedent_joined)
        
        
                        

In [24]:
# Change leftover Proper nouns into NNP
df.NNP = df.NNP.apply(lambda x: 'NNP' if x in namedent else x)
'-------------------------------------------------------------'
test.NNP = test.NNP.apply(lambda x: 'NNP' if x in test_namedent else x)



In [25]:
# Get rid of punctuations

df.NNP = df.NNP.apply(lambda x: re.sub(r'\s([?.!,''"](?:\s|$))', r'\1', x))
'----------------------------------------------------------------------------------'
test.NNP = test.NNP.apply(lambda x: re.sub(r'\s([?.!,''"](?:\s|$))', r'\1', x))


df.NNP = df.NNP.str.replace(",", "")
'-----------------------------------------'
test.NNP = test.NNP.str.replace(",", "")

In [26]:
# Create a new column for tokenized NNP-ed words for faster processing (such as for Stanford's NER taging)

df['NNP_tokens'] = df.NNP.apply(lambda x: word_tokenize(x.lower()))
df['NNP'] = df.NNP.apply(lambda x: x.lower())



In [27]:
df.NNP.to_csv('rt_review.txt', header=None, index=None, sep=' ', mode='a')
test.NNP.to_csv('test_rt_review.txt', header=None, index=None, sep=' ', mode='a')

In [28]:
df.to_csv('df_train.csv')
test.to_csv('df_test.csv')

# Word2Vec / Doc2Vec

In [7]:

class LabeledLineSentence (object):
    def __init__(self, sources):
        self.sources = sources
        
        flipped = {}
        
        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')
    
    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
    
    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences
    
    def sentences_perm(self):
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

In [3]:
sources = {'rt_review.txt':'review', 'test_rt_review.txt':'test_review'}

sentences = LabeledLineSentence(sources)

In [6]:
model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)

model.build_vocab(sentences.to_array())



TypeError: unhashable type: 'list'

In [13]:
model.train(sentences.sentences_perm(), total_examples=model.corpus_count, epochs= 10)

In [14]:
model.save('doc2vecmodel')

In [46]:
gensim_model = gensim.models.Doc2Vec.load('doc2vecmodel')

In [47]:
gensim_model.most_similar('nice')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('good', 0.7179684042930603),
 ('neat', 0.6981481313705444),
 ('cool', 0.6815207004547119),
 ('great', 0.6599352359771729),
 ('interesting', 0.5582364201545715),
 ('little', 0.5564978718757629),
 ('fine', 0.546832263469696),
 ('decent', 0.5333112478256226),
 ('also', 0.5299546718597412),
 ('plus', 0.5282900929450989)]

In [28]:
train_arrays = numpy.zeros((25000, 100))
train_labels = numpy.zeros(25000)

In [48]:
gensim_model['TRAIN_NEG_0'].shape

(100,)

In [30]:
for i in range(12500):
    prefix_train_pos = 'TRAIN_POS_' + str(i)
    prefix_train_neg = 'TRAIN_NEG_' + str(i)
    train_arrays[i] = model[prefix_train_pos]
    train_arrays[12500 + i] = model[prefix_train_neg]
    train_labels[i] = 1
    train_labels[12500 + i] = 0

In [40]:
test_arrays = numpy.zeros((25000, 100))
test_labels = numpy.zeros(25000)

for i in range(12500):
    prefix_test_pos = 'TEST_POS_' + str(i)
    prefix_test_neg = 'TEST_NEG_' + str(i)
    test_arrays[i] = model[prefix_test_pos]
    test_arrays[12500 + i] = model[prefix_test_neg]
    test_labels[i] = 1
    test_labels[12500 + i] = 0

In [41]:
classifier = LogisticRegression()
classifier.fit(train_arrays, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [42]:
classifier.score(test_arrays, test_labels)

0.84696

# Gensim with RT 

In [3]:
sources = {'rt_review.txt':'reviews'}

sentences = LabeledLineSentence(sources)

In [4]:
sentences

<__main__.LabeledLineSentence at 0x10f0586d8>

In [34]:
model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)

model.build_vocab(sentences.to_array())



In [35]:
model.train(sentences.sentences_perm(), total_examples=model.corpus_count, epochs= 10)

In [36]:
model.save('rt_doc2vecmodel')

In [5]:
model = Doc2Vec.load('rt_doc2vecmodel')

In [6]:
model.most_similar('good')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('intentions', 0.5827995538711548),
 ('very', 0.5787571668624878),
 ('"good', 0.5731457471847534),
 ('"does', 0.560712456703186),
 ('bad', 0.5394617319107056),
 ('great', 0.537720799446106),
 ('for', 0.5320193767547607),
 ('ye', 0.5192350745201111),
 ('but', 0.5140767097473145),
 ('still', 0.5119826793670654)]

In [9]:
df = pd.read_csv('df_polarity.csv')

df.NNP.shape

(156060,)

In [10]:
train_arrays = np.zeros((df.NNP.shape[0], 100))
train_labels = np.array(df.sentiment)

In [11]:
for i in range(df.NNP.shape[0]):
    train_arrays[i] = model['reviews_' + str(i)]    

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.ensemble import *

x_train, x_test, y_train, y_test = train_test_split(train_arrays, train_labels, test_size = 0.2)


In [13]:
classifier = LogisticRegression(solver = 'newton-cg', multi_class= 'multinomial')
classifier.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

In [14]:
y_pred = classifier.predict(x_test)

In [15]:
accuracy_score(y_test, y_pred)

0.5363321799307958

In [91]:
rf = RandomForestClassifier()

rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)

In [92]:
accuracy_score(y_test, y_pred)

0.48811354607202356

In [111]:
import gensim
import numpy as np 
import pandas as pd
from nltk.corpus import stopwords
# from src.word_embedding_utils_v2 import clean_corpus, build_w2id_dict, tokenize_text_data
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence,text
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model, Sequential
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from gensim.models import Doc2Vec

from keras.layers import Dense,Dropout,Embedding,LSTM,Conv1D,GlobalMaxPooling1D,Flatten,MaxPooling1D,GRU,SpatialDropout1D,Bidirectional
from keras.optimizers import Adam

In [9]:
dev_set_ids = np.loadtxt('word2vec_rtreview.txt', dtype=float)

In [10]:
pd.DataFrame(dev_set_ids).to_pickle('rtreview_word2vec.pkl')

In [14]:
w2vec_model = Doc2Vec.load('rt_doc2vecmodel')

In [15]:
vocabulary = w2vec_model.wv.vocab

In [16]:
# data=pad_sequences(dev_set_ids, maxlen= 100, padding='post', truncating='post', value=0)
df = pd.read_csv('df_polarity.csv')
labels = pd.get_dummies(df.sentiment)

In [17]:
x_train, x_test, y_train, y_test = train_test_split(dev_set_ids, labels, 
                                                                    test_size=0.2)

In [1]:
def get_wv_matrix(genism_model):
    #build the np matrix
    embedding_shape = (len(genism_model.wv.vocab), genism_model.trainables.layer1_size)
    embedding_matrix = np.zeros(embedding_shape)

    #insert the data from model:
    for index in range(len(genism_model.wv.vocab)):
        embedding_vector = genism_model.wv[genism_model.wv.index2word[index]]
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
    return(embedding_matrix)


embedding_matrix = get_wv_matrix(w2vec_model)

embeddings = Embedding(input_dim=embedding_matrix.shape[0], output_dim=embedding_matrix.shape[1],
                    weights=[embedding_matrix], input_length = 100)



#great now lets run the CNN
cnn_test = Sequential()

#first conv layer + max pool
cnn_test.add(embeddings)
cnn_test.add(Conv1D(filters=100, kernel_size = 5, activation = 'relu', strides = 1))
cnn_test.add(MaxPooling1D(5))

#2nd conv layer + max pooling
cnn_test.add(Conv1D(filters =100, kernel_size = 2, activation='relu'))
cnn_test.add(MaxPooling1D(5))

#flatten and then connect
cnn_test.add(Flatten())
cnn_test.add(Dense(256, activation = 'relu'))

#output layer with sigmoid activation
cnn_test.add(Dense(y_train.shape[1], activation = 'sigmoid'))

# Compile settings
print('\tcompiler settings complete!')
cnn_test.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

cnn_test.fit(x_train, y_train, validation_data=(x_validate, y_validate), epochs = 5, batch_size= 1000, verbose=2)
cnn_test.save('cnn_model.h5')

In [2]:
from keras.models import load_model
model = load_model('cnn_model.h5')


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [34]:
pred = model.predict(x_test, batch_size = 1000, verbose = 1)



In [31]:
predictions = np.argmax(pred, axis=1).astype(int)

# LSTM

In [132]:
df_train = pd.read_csv('df_train.csv')
df_test = pd.read_csv('df_test.csv')

# df_train = df_train[df_train.NNP.notnull()]
# df_test = df_test[df_test.NNP.notnull()]

df_train = df_train.fillna('.')
df_test = df_test.fillna('.')

train_text = df_train.NNP.values
test_text = df_test.NNP.values

In [133]:
target=df_train.sentiment.values
y=to_categorical(target)

In [134]:
X_train_text,X_val_text,y_train,y_val=train_test_split(train_text,y,test_size=0.2,stratify=y)
print(X_train_text.shape,y_train.shape)
print(X_val_text.shape,y_val.shape)

(124848,) (124848, 5)
(31212,) (31212, 5)


In [135]:
all_words=' '.join(X_train_text)
all_words=word_tokenize(all_words)
dist=FreqDist(all_words)
num_unique_word=len(dist)
num_unique_word

15950

In [136]:
r_len=[]
for text in X_train_text:

    word=word_tokenize(text)
    l=len(word)
    r_len.append(l)
    
MAX_REVIEW_LEN=np.max(r_len)
MAX_REVIEW_LEN

49

In [137]:
max_features = num_unique_word
max_words = MAX_REVIEW_LEN
batch_size = 128
epochs = 3
num_classes=5

In [138]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train_text))
X_train = tokenizer.texts_to_sequences(X_train_text)
X_val = tokenizer.texts_to_sequences(X_val_text)
X_test = tokenizer.texts_to_sequences(test_text)

In [107]:
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_val = sequence.pad_sequences(X_val, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)
print(X_train.shape,X_val.shape,X_test.shape)

(124843, 48) (31211, 48) (66286, 48)


In [112]:
model1=Sequential()
model1.add(Embedding(max_features,100,mask_zero=True))
model1.add(LSTM(64,dropout=0.4, recurrent_dropout=0.4,return_sequences=True))
model1.add(LSTM(32,dropout=0.5, recurrent_dropout=0.5,return_sequences=False))
model1.add(Dense(num_classes,activation='softmax'))
model1.compile(loss='categorical_crossentropy',optimizer=Adam(lr=0.001),metrics=['accuracy'])
model1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 100)         1593700   
_________________________________________________________________
lstm_3 (LSTM)                (None, None, 64)          42240     
_________________________________________________________________
lstm_4 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 165       
Total params: 1,648,521
Trainable params: 1,648,521
Non-trainable params: 0
_________________________________________________________________


In [114]:
%%time
history1=model1.fit(X_train, y_train, validation_data=(X_val, y_val),epochs=1, batch_size=batch_size, verbose=1)

Train on 124843 samples, validate on 31211 samples
Epoch 1/1
CPU times: user 8min 18s, sys: 59.7 s, total: 9min 18s
Wall time: 3min 30s


In [122]:
y_pred =model1.predict_classes(X_test,verbose=1)




In [123]:
model1.save('lstm_model.h5')

In [125]:
y_pred.shape

(66286,)

In [126]:
submission.shape

(66292, 2)

In [148]:

# submission = pd.read_csv('sampleSubmission.csv')

# submission.Sentiment=y_pred
# submission.to_csv('submission.csv',index=False)
# x

In [141]:
sub = pd.read_csv('submission.csv')