In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.stem import SnowballStemmer,WordNetLemmatizer

In [5]:
import datetime
import lightgbm as lgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

In [6]:
# Shared NLP helpers used by the text-cleaning code in this notebook.
stemmer = SnowballStemmer('english')
lemma = WordNetLemmatizer()

# Widen the column display so long phrases are not truncated in rendered frames.
# The fully qualified option name is used on purpose: abbreviated names are
# resolved by pattern matching and fail if they ever become ambiguous.
pd.set_option('display.max_colwidth', 400)

In [31]:
# Read the tab-separated train and test files of the movie-review dataset.
train_raw, test_raw = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        './movie-review-sentiment-analysis-kernels-only/train.tsv',
        './movie-review-sentiment-analysis-kernels-only/test.tsv',
    )
)

In [32]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns
# of the raw training frame.
train_raw.describe()

Unnamed: 0,PhraseId,SentenceId,Sentiment
count,156060.0,156060.0,156060.0
mean,78030.5,4079.732744,2.063578
std,45050.785842,2502.764394,0.893832
min,1.0,1.0,0.0
25%,39015.75,1861.75,2.0
50%,78030.5,4017.0,2.0
75%,117045.25,6244.0,3.0
max,156060.0,8544.0,4.0


In [12]:
def clean_review(review_col):
    """Lower-case, tokenize and lemmatize every review in ``review_col``.

    Parameters
    ----------
    review_col : iterable
        The reviews to clean (list, NumPy array, pandas Series, ...).
        Each item is converted with ``str()`` before processing, so
        non-string entries are cleaned as their string form.

    Returns
    -------
    list of str
        One cleaned string per input item, in input order. Tokens come from
        ``word_tokenize``, are lemmatized with the module-level ``lemma``
        and are re-joined with single spaces.
    """
    # Iterate over the items themselves instead of positions 0..len-1:
    # ``review_col[i]`` is a *label* lookup on a pandas Series, which raises
    # or returns rows in the wrong order when the index is not 0..n-1.
    return [
        ' '.join(
            lemma.lemmatize(token)
            for token in word_tokenize(str(review).lower())
        )
        for review in review_col
    ]

In [17]:
# NOTE: these are plain aliases of the raw frames, not copies, so the new
# 'text' column is also added to train_raw / test_raw.
train, test = train_raw, test_raw

# Store the lemmatized, lower-cased version of every phrase.
train['text'] = clean_review(train['Phrase'].values)
test['text'] = clean_review(test['Phrase'].values)

In [18]:
# Target labels: the Sentiment column of the training frame.
y = train['Sentiment']

In [19]:
# Hold out 10% of the labelled phrases for validation, stratified on the
# sentiment label; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train['text'].values,
    y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [20]:
# Word-level TF-IDF over unigrams and bigrams that occur in at least 3 phrases.
# The boolean options are passed as real booleans: recent scikit-learn releases
# validate parameter types and reject integers such as 1 for boolean parameters.
vectorizer = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# NOTE(review): the vocabulary and IDF weights are learned from *all* labelled
# phrases, including the rows held out for validation -- confirm this is intended.
full_text = list(train['text'].values)
vectorizer.fit(full_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=1,
        stop_words=None, strip_accents='unicode', sublinear_tf=1,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=1,
        vocabulary=None)

In [21]:
# Project the training split, the validation split and the competition test
# phrases into the fitted TF-IDF space.
X_train_tfv, X_test_tfv, test_tfv = (
    vectorizer.transform(documents)
    for documents in (X_train, X_test, test['text'].values)
)

In [22]:
# Baseline 1: logistic regression on the TF-IDF features, scored on the
# held-out validation split.
lr = LogisticRegression(C=1.0).fit(X_train_tfv, y_train)
predictions1 = lr.predict(X_test_tfv)
print("accuracy_score", accuracy_score(y_test, predictions1))



accuracy_score 0.6438549275919518


In [23]:
# Baseline 2: linear SVM on the TF-IDF features, scored on the held-out
# validation split.
svc = LinearSVC()
svc.fit(X_train_tfv, y_train)
predictions2 = svc.predict(X_test_tfv)
print("accuracy_score",accuracy_score(y_test, predictions2))

# Predict the competition test phrases.
pred2 = svc.predict(test_tfv)

# Build the submission frame positionally. Assigning ``pd.DataFrame(pred2)`` to
# a column aligns on the index and silently yields NaN whenever ``test`` does
# not carry a default 0..n-1 index; ``assign`` with the raw array avoids that.
pred_res2 = test[['PhraseId']].assign(Sentiment=pred2)

pred_res2.to_csv('pred_res2.csv',index = False)

accuracy_score 0.6638472382417019


In [24]:
# Preview only the first rows of the submission instead of rendering the
# whole frame.
pred_res2.head(10)

Unnamed: 0,PhraseId,Sentiment
0,156061,3
1,156062,3
2,156063,3
3,156064,2
4,156065,2
5,156066,3
6,156067,3
7,156068,2
8,156069,3
9,156070,2


## LSTM

In [28]:
import tensorflow as tf
from keras import backend

In [29]:
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [27]:
# Seed NumPy and TensorFlow so repeated runs start from the same random state.
np.random.seed(1)
tf.set_random_seed(1)

# Run the Keras backend in a session limited to one intra-op and one
# inter-op thread.
session_conf = tf.ConfigProto(
    intra_op_parallelism_threads=1,
    inter_op_parallelism_threads=1,
)
backend.set_session(
    tf.Session(graph=tf.get_default_graph(), config=session_conf)
)

In [33]:
NUM_FOLDS = 5

# Assign each row to a fold from its SentenceId (SentenceId modulo NUM_FOLDS),
# so rows sharing a SentenceId always share a fold.
train_raw["fold_id"] = train_raw["SentenceId"] % NUM_FOLDS

In [34]:
vocab_size = 20000  # based on words in the entire corpus
max_len = 60        # based on word count in phrases

# Raw (uncleaned) phrases of both splits, plus their concatenation.
train_phrases = list(train_raw['Phrase'].values)
test_phrases = list(test_raw['Phrase'].values)
all_corpus = train_phrases + test_phrases

# One indicator column per sentiment class.
X_train_target_binary = pd.get_dummies(train_raw['Sentiment'])

In [35]:
# Vocabulary-index the train and test phrases. `filters` is restricted to
# newline and tab so that punctuation characters are not stripped.
tokenizer = Tokenizer(num_words=vocab_size, lower=True, filters='\n\t')
tokenizer.fit_on_texts(all_corpus)

encoded_train_phrases, encoded_test_phrases = (
    tokenizer.texts_to_sequences(phrases)
    for phrases in (train_phrases, test_phrases)
)

In [36]:
# Bring every encoded phrase to max_len entries. Note padding='post' is used
# here, as opposed to the default 'pre' padding.
X_train_words, X_test_words = (
    sequence.pad_sequences(encoded, maxlen=max_len, padding='post')
    for encoded in (encoded_train_phrases, encoded_test_phrases)
)

# Report the shapes of the model inputs and of the one-hot targets.
print(
    X_train_words.shape,
    X_test_words.shape,
    X_train_target_binary.shape,
    sep='\n',
)

print('Done Tokenizing and indexing phrases based on the vocabulary learned from the entire Train and Test corpus')

(156060, 60)
(66292, 60)
(156060, 5)
Done Tokenizing and indexing phrases based on the vocabulary learned from the entire Train and Test corpus


In [37]:
# Word -> integer index mapping held by the fitted Keras tokenizer.
word_index = tokenizer.word_index
# Starts empty; presumably filled with pretrained word vectors by code that
# is not visible in this part of the notebook -- TODO confirm.
embeddings_index = {}
# Presumably the dimensionality of the embedding vectors -- TODO confirm.
embedding_size = 300