In [19]:
import nltk
import pandas as pd
import re
import numpy
import sklearn
import gensim
import scipy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.test.utils import common_texts
from gensim.models import Word2Vec, KeyedVectors
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from scipy.sparse import csr_matrix

stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")


# read csv files
test = pd.read_csv('test.csv');
train = pd.read_csv('train.csv');


In [20]:
# get sentiment distribution
sentiment_distribution = train['Sentiment'].describe();
sentiment_distribution

count    1.048575e+06
mean     2.370598e-01
std      4.252795e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: Sentiment, dtype: float64

In [21]:
# language preprocessing step
corpus = []
labels = []
index = 0
for review in train['Text']:
    sentences = sent_tokenize(review)
    for sentence in sentences:
        # lower casing
        sentence = sentence.lower()
        # remove numbers
        sentence = re.sub('[0-9]', "", sentence)
        # expand contractions
        sentence = re.sub(r"won\'t", " will not", sentence)
        sentence = re.sub(r"can\'t", " can not", sentence)
        sentence = re.sub(r"n\'t", " not", sentence)
        sentence = re.sub(r"\'re", "are", sentence)
        sentence = re.sub(r"\'s", " is", sentence)
        sentence = re.sub(r"\'d", " would", sentence)
        sentence = re.sub(r"\'ll", " will", sentence)
        sentence = re.sub(r"\'t", " not", sentence)
        sentence = re.sub(r"\'ve", " have", sentence)
        sentence = re.sub(r"\'m", " am", sentence)
        # remove special characters and puncuation
        sentence = re.sub("[^a-z0-9<>]", ' ', sentence)
        # remove stop words
        sentence = word_tokenize(sentence)
        # stemming
        temp_sentence = []
        for word in sentence:
            stemmed_word = stemmer.stem(word)
            temp_sentence.append(stemmed_word)
        corpus.append(temp_sentence)
        labels.append(train['Sentiment'][index])
    index += 1

In [None]:
# linguistic feature extraction

# bag of words
word2count = {}
for sentence in corpus:
    for word in sentence:
        if word not in word2count.keys():
            word2count[word] = 1
        else:
            word2count[word] += 1

unique_words = list(word2count.keys())

bag_of_words = []

for sentence in corpus:
    bag_vector = numpy.zeros(len(unique_words))
    for word in sentence:
        for j, w in enumerate(unique_words):
            if w == word:
                bag_vector[j] += 1
    bag_of_words.append(bag_vector)
    
bag_of_words_str = []
for bag_vector in bag_of_words:
    bag_of_words_str.append(" ".join([str(int(count)) for count in bag_vector]))

# TF*IDF
corpus_str = [' '.join(sentence) for sentence in corpus]
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=2, use_idf=True)
tfidf_vectorizer.fit(corpus_str)
tfidf_matrix = tfidf_vectorizer.transform(corpus_str)

# bag of words vectorizer
bag_vectorizer = CountVectorizer()
bag_matrix = bag_vectorizer.fit_transform(corpus_str)

In [None]:
# Word 2 Vec
model = Word2Vec(corpus, vector_size=100, window=5, min_count=1, workers=4, epochs=1)
model.save("word2vec.model")

word_vectors = model.wv
word_vectors.save("word2vec.wordvectors")

wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')

In [18]:
# model training

from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier


# bag of words
X_train, X_test, y_train, y_test = train_test_split(bag_matrix, labels, test_size=0.2, random_state=42)

dense_X_train = csr_matrix(X_train).toarray()
dense_X_test = csr_matrix(X_test).toarray()

lr_model = LogisticRegression(max_iter=1000)
svc_model = SVC(probability=True)
nbc_model = GaussianNB()
rfc_model = RandomForestClassifier()

lr_model.fit(dense_X_train, y_train)
svc_model.fit(dense_X_train, y_train)
nbc_model.fit(dense_X_train, y_train)
rfc_model.fit(dense_X_train, y_train)

accuracy = lr_model.score(dense_X_test, y_test)
print("Logistical Regression Accuracy:", accuracy)
accuracy = svc_model.score(dense_X_test, y_test)
print("SVC Accuracy:", accuracy)
accuracy = nbc_model.score(dense_X_test, y_test)
print("Naive Bayes Accuracy:", accuracy)
accuracy = rfc_model.score(dense_X_test, y_test)
print("Random Forest Classifier Accuracy:", accuracy)
print('\n')

# TF IDF
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, labels, test_size=0.2, random_state=42)

dense_X_train = csr_matrix(X_train).toarray()
dense_X_test = csr_matrix(X_test).toarray()

lr_model = LogisticRegression(max_iter=1000)
svc_model = SVC(probability=True)
nbc_model = GaussianNB()
rfc_model = RandomForestClassifier()

lr_model.fit(dense_X_train, y_train)
svc_model.fit(dense_X_train, y_train)
nbc_model.fit(dense_X_train, y_train)
rfc_model.fit(dense_X_train, y_train)

accuracy = lr_model.score(dense_X_test, y_test)
print("Logistical Regression Accuracy:", accuracy)
accuracy = svc_model.score(dense_X_test, y_test)
print("SVC Accuracy:", accuracy)
accuracy = nbc_model.score(dense_X_test, y_test)
print("Naive Bayes Accuracy:", accuracy)
accuracy = rfc_model.score(dense_X_test, y_test)
print("Random Forest Classifier Accuracy:", accuracy)
print('\n')

# Word 2 vector

dense_X = numpy.zeros((len(corpus), 100))
for i, sentence in enumerate(corpus):
    vector_sum = numpy.zeros(100)
    for word in sentence:
        if word in model.wv:
            vector_sum += model.wv[word]
    dense_X[i] = vector_sum
    
X_train, X_test, y_train, y_test = train_test_split(dense_X, labels, test_size=0.2, random_state=42)

dense_X_train = csr_matrix(X_train).toarray()
dense_X_test = csr_matrix(X_test).toarray()

lr_model = LogisticRegression(max_iter=1000)
svc_model = SVC(probability=True)
nbc_model = GaussianNB()
rfc_model = RandomForestClassifier()

lr_model.fit(dense_X_train, y_train)
svc_model.fit(dense_X_train, y_train)
nbc_model.fit(dense_X_train, y_train)
rfc_model.fit(dense_X_train, y_train)

accuracy = lr_model.score(dense_X_test, y_test)
print("Logistical Regression Accuracy:", accuracy)
accuracy = svc_model.score(dense_X_test, y_test)
print("SVC Accuracy:", accuracy)
accuracy = nbc_model.score(dense_X_test, y_test)
print("Naive Bayes Accuracy:", accuracy)
accuracy = rfc_model.score(dense_X_test, y_test)
print("Random Forest Classifier Accuracy:", accuracy)
print('\n')


Logistical Regression Accuracy: 0.6853146853146853
SVC Accuracy: 0.6923076923076923
Naive Bayes Accuracy: 0.7062937062937062
Random Forest Classifier Accuracy: 0.6573426573426573


Logistical Regression Accuracy: 0.7202797202797203
SVC Accuracy: 0.6783216783216783
Naive Bayes Accuracy: 0.6643356643356644
Random Forest Classifier Accuracy: 0.6993006993006993


Logistical Regression Accuracy: 0.48951048951048953
SVC Accuracy: 0.5734265734265734
Naive Bayes Accuracy: 0.5034965034965035
Random Forest Classifier Accuracy: 0.5034965034965035


