## Modules

In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from modules import *

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\QYH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\QYH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\QYH\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\QYH\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\QYH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\QYH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\QYH\AppD

In [2]:
X,y=load_data('./data/labeledTrainData.tsv',colname=['review','sentiment'])

************** Loading Data ************


Summary of data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB
No of Rows: 25000
No of Columns: 3

Data View: Last 3 Instances

              id  sentiment                                             review
24997  "10905_3"          0  "Guy is a loser. Can't get girls, needs to bui...
24998  "10194_3"          0  "This 30 minute documentary Buñuel made in the...
24999   "8478_8"          1  "I saw this movie as a child and it broke my h...

Class Counts(label, row): Total
1    12500
0    12500
Name: sentiment, dtype: int64

Data View: First 5 Instances

         id  sentiment                                             review
0  "5814_8"   

In [3]:
## get the list of sentences
def review_to_sentences(review, tokenizer):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(raw_sentence)
    return sentences

sentences = []

# A tokenizer for dividing sentences
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
for review in X['review']:
    sentences+=review_to_sentences(review, tokenizer)
print(len(X))
print(len(sentences))

25000
266756


In [4]:
#preprocess
sentences = list(map(preprocess_data, sentences))
sentences

[['stuff',
  'go',
  'moment',
  'mj',
  'ive',
  'start',
  'listen',
  'music',
  'watch',
  'odd',
  'documentari',
  'watch',
  'wiz',
  'watch',
  'moonwalk'],
 ['mayb',
  'want',
  'get',
  'certain',
  'insight',
  'guy',
  'thought',
  'realli',
  'cool',
  'eighti',
  'mayb',
  'make',
  'mind',
  'whether',
  'guilti',
  'innoc'],
 ['moonwalk',
  'part',
  'biographi',
  'part',
  'featur',
  'film',
  'rememb',
  'go',
  'see',
  'cinema',
  'origin',
  'releas'],
 ['subtl',
  'messag',
  'mj',
  'feel',
  'toward',
  'press',
  'also',
  'obviou',
  'messag',
  'drug',
  'bad',
  'mkayvisu',
  'impress',
  'cours',
  'michael',
  'jackson',
  'unless',
  'remot',
  'like',
  'mj',
  'anyway',
  'go',
  'hate',
  'find',
  'bore'],
 ['may',
  'call',
  'mj',
  'egotist',
  'consent',
  'make',
  'movi',
  'mj',
  'fan',
  'would',
  'say',
  'made',
  'fan',
  'true',
  'realli',
  'nice',
  'himth',
  'actual',
  'featur',
  'film',
  'bit',
  'final',
  'start',
  'minut',

## word2vec

In [5]:
import time
from gensim.models import Word2Vec


# 训练模型
print("train word2vec...")
model = Word2Vec(sentences, size=250, window=5, min_count=5, workers=12, iter=10, sg=1)

train word2vec...


In [None]:
#save model
model_name = "./model/word2vec_train_data.model"
model.save(model_name)

## Get the vector of X_data

In [7]:
#get review vector
def get_review_vector(review):
    global word_vec
    word_vec = np.zeros((1,250))
    for word in review:
        if word in model:
            word_vec = np.concatenate((word_vec, np.array([model[word]])), axis=0)
    return pd.Series(word_vec.mean(axis = 0))

In [8]:
#preprocess
X_train, X_test, y_train,  y_test=split_data(X,y)
X_train=X_train.iloc[:, -1].apply(preprocess_data)
X_test=X_test.iloc[:, -1].apply(preprocess_data)


************** Spliting Data **************


************** Data After Splitting **************

Train Data: (20000, 1)
Test Data: (5000, 1)

Class Counts(label, row): Train
1    10018
0     9982
Name: sentiment, dtype: int64

Class Counts(label, row): Test
0    2518
1    2482
Name: sentiment, dtype: int64

First 5 Instance: Train
                                                  review
6167   "Some unsuspecting films carry a message that ...
3101   "Even the first 10 minutes of this movie were ...
17307  "To me this was more a wake up call, and reali...
3950   "Shower keeps within itself in so many ways. A...
893    "Brian Yuzna is often frowned upon as a direct...

First 5 Instance: Test
                                                  review
7799   "\"Girlfight\" is much more of a coming-of-age...
4427   "This movie will go down down in history as on...
14941  "I have to agree with Cal-37 it's a great movi...
11644  "Most of the Atomic Age monster movies I saw o...
15548  "I saw 

In [9]:
X_train_features = X_train.apply(get_review_vector)
X_train_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
0,0.225553,-0.044139,-0.02904,-0.086288,-0.020724,0.142908,0.010598,-0.050429,-0.052924,-0.086983,...,0.060707,0.019001,0.098438,0.235849,0.13113,0.152965,0.008439,0.053059,0.031083,0.031538
1,0.181758,-0.053684,-0.040484,-0.046043,-0.052555,0.117074,-0.052912,0.017348,0.034722,-0.102567,...,-0.003774,-0.051402,0.11879,0.200897,0.138513,0.143907,-0.0261,0.011829,0.071819,0.040848
2,0.271134,-0.063985,-0.038517,-0.097661,-0.073003,0.184059,0.044314,-0.064365,0.008723,-0.107354,...,0.011509,-0.040736,0.161662,0.204974,0.118762,0.117445,-0.00768,0.055788,0.102189,0.038435
3,0.245057,-0.020891,-0.026249,-0.151869,-0.106442,0.222537,0.011958,-0.043413,-0.058953,-0.028918,...,-0.013953,0.046868,0.037483,0.198855,0.064747,0.081032,-0.017029,0.042984,0.033856,0.081393
4,0.238914,-0.007153,-0.04859,-0.102492,-0.03088,0.085013,-0.028527,-0.070833,-0.021468,-0.082967,...,-0.011694,0.034723,0.093686,0.17643,0.084122,0.098101,0.028917,0.043652,0.008579,0.079708


In [10]:
X_test_features = X_test.apply(get_review_vector)

## Logistic regression

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
# fit
LR_model = LogisticRegression()
LR_model = LR_model.fit(X_train_features, y_train)
y_predict = LR_model.predict(X_test_features)
y_test_np = y_test.values

# evaluate
accuracy = accuracy_score(y_test_np, y_predict)
print(f"accuracy: {accuracy}")

recall = recall_score(y_test_np, y_predict)
print(f"recall: {recall}")

precision = precision_score(y_test_np, y_predict)
print(f"precision: {precision}")

f1 = f1_score(y_test_np, y_predict)
print(f"f1: {f1}")

accuracy: 0.8686
recall: 0.8694601128122482
precision: 0.8663187474909675
f1: 0.8678865875728937


In [12]:
# use cross-validation to fit and evaluate evaluate the model
from sklearn.model_selection import cross_val_score
cross_val_model_LR = LogisticRegression()
scores = cross_val_score(cross_val_model_LR, pd.concat([X_train_features, X_test_features], axis=0), y_train.tolist()+y_test.tolist(), cv=10)
print("10 fold cross validation score of LogisticRegression:", np.mean(scores))

10 fold cross validation score of LogisticRegression: 0.86908


In [13]:
## test
test_data=pd.read_csv("data/testData.tsv",sep = "\t")
X_test=test_data['review'].apply(preprocess_data)
X_test_features = X_test.apply(get_review_vector)

ValueError: setting an array element with a sequence.

In [14]:
test_predicted = np.array(LR_model.predict(X_test_features))

lr_output = pd.DataFrame(data=test_predicted, columns=['sentiment'])
lr_output['id'] = test_data['id']
lr_output = lr_output[['id', 'sentiment']]
lr_output.to_csv('out/word2vec.csv', index=False)