In [1]:
import os
import re

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [2]:
# Where the cleaned training CSV lives.
DATA_IN_PATH = './data_in/'
TRAIN_CLEAN_DATA = 'train_clean.csv'

train_data = pd.read_csv(os.path.join(DATA_IN_PATH, TRAIN_CLEAN_DATA))

# Pull the text and label columns out as plain Python lists.
reviews = list(train_data['review'])
sentiments = list(train_data['sentiment'])

# Word2Vec is fed one token list per document, so split each review on
# whitespace.
sentences = [review.split() for review in reviews]

In [3]:
# Word2Vec hyperparameters. Each one is handed to the Word2Vec constructor
# under the keyword named in its comment.
num_workers = 4         # workers: number of training threads
num_features = 300      # vector_size: dimensionality of each word vector
min_word_count = 40     # min_count: words rarer than this are dropped from the vocabulary
context = 10            # window: context window size around each word
downsampling = 0.001    # sample: downsampling threshold for very frequent words

In [4]:
import logging

# Show INFO-level log records, each prefixed with a timestamp and its level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [6]:
from gensim.models import word2vec

# Train word embeddings on the tokenised reviews with the hyperparameters
# defined earlier in the notebook.
model = word2vec.Word2Vec(
    sentences,
    vector_size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

2023-03-29 13:02:20,467 : INFO : collecting all words and their counts
2023-03-29 13:02:20,473 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-03-29 13:02:20,877 : INFO : PROGRESS: at sentence #10000, processed 1205223 words, keeping 51374 word types
2023-03-29 13:02:21,328 : INFO : PROGRESS: at sentence #20000, processed 2396605 words, keeping 67660 word types
2023-03-29 13:02:21,531 : INFO : collected 74065 word types from a corpus of 2988089 raw words and 25000 sentences
2023-03-29 13:02:21,532 : INFO : Creating a fresh vocabulary
2023-03-29 13:02:21,629 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=40 retains 8160 unique words (11.02% of original 74065, drops 65905)', 'datetime': '2023-03-29T13:02:21.629958', 'gensim': '4.3.1', 'python': '3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'prepare_vocab'}
2023-03-29 13:02:21,632 : INFO : Word2Vec lifecycle event {'ms

In [7]:
# Write the trained model to disk under this file name (relative to the
# current working directory).
model_name = '300features_40minwords_10contest'
model.save(model_name)

2023-03-29 13:03:00,300 : INFO : Word2Vec lifecycle event {'fname_or_handle': '300features_40minwords_10contest', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-03-29T13:03:00.300930', 'gensim': '4.3.1', 'python': '3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'saving'}
2023-03-29 13:03:00,302 : INFO : not storing attribute cum_table
2023-03-29 13:03:00,342 : INFO : saved 300features_40minwords_10contest


In [9]:
def get_features(words, model, num_features):
    """Return the mean word vector of the in-vocabulary tokens in ``words``.

    Args:
        words: Iterable of token strings making up one document.
        model: Trained Word2Vec model; only ``model.wv`` is used.
        num_features: Length of the returned vector. Must match the length
            of the vectors stored in ``model.wv``.

    Returns:
        A 1-D ``float32`` array of length ``num_features`` holding the
        element-wise average of the vectors of every token found in the
        model vocabulary. When no token is in the vocabulary (including an
        empty ``words``) the zero vector is returned instead of NaNs.
    """
    feature_vector = np.zeros(num_features, dtype=np.float32)

    # key_to_index is the vocabulary's own dict, so membership tests are O(1)
    # without rebuilding a set of the whole vocabulary on every call.
    vocabulary = model.wv.key_to_index

    num_words = 0
    for w in words:
        if w in vocabulary:
            num_words += 1
            feature_vector += model.wv[w]

    # Guard against 0/0: a document with no known words would otherwise
    # yield an all-NaN row (plus a RuntimeWarning) and poison downstream
    # estimators.
    if num_words == 0:
        return feature_vector

    return np.divide(feature_vector, num_words)

In [12]:
def get_dataset(reviews, model, num_features):
    """Build a feature matrix with one row per review.

    Args:
        reviews: Iterable of documents, each an iterable of token strings.
        model: Trained Word2Vec model, forwarded to ``get_features``.
        num_features: Length of each feature vector (number of columns).

    Returns:
        A 2-D array of shape ``(len(reviews), num_features)`` whose i-th row
        is ``get_features(reviews[i], model, num_features)``. An empty
        ``reviews`` yields an empty ``(0, num_features)`` ``float32`` array.
    """
    feature_rows = [get_features(tokens, model, num_features) for tokens in reviews]

    # np.stack raises ValueError on an empty sequence; return a well-shaped
    # empty matrix so callers can treat "no reviews" like any other input.
    if not feature_rows:
        return np.zeros((0, num_features), dtype=np.float32)

    return np.stack(feature_rows)

In [14]:
# One averaged word-vector row per review in `sentences`.
# NOTE(review): despite the name, these features come from the training CSV.
test_data_vecs = get_dataset(
    reviews=sentences,
    model=model,
    num_features=num_features,
)

In [16]:
from sklearn.model_selection import train_test_split

# Features and labels as arrays.
X = test_data_vecs
y = np.array(sentiments)

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

ModuleNotFoundError: No module named 'sklearn'

In [None]:
pip install sk