In [1]:
import pandas as pd
from sklearn.svm import LinearSVC

# Importing custom utility functions
from utilities.data_loader import load_modeling_data, load_testing_data, prepare_kaggle_submission
from utilities.text_cleaner import advanced_data_cleaning

# Importing modeling utilities
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectPercentile
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [2]:
train_data, train_labels = load_modeling_data()

In [42]:
train_sentences = list(train_data['text'].apply(str.split).values)

In [32]:
model = Word2Vec(sentences=train_sentences, 
                 sg=1, 
                 workers=4)

In [16]:
list(model.wv.index_to_key)


['multistop',
 'i',
 'exclamationMark',
 'to',
 'the',
 'a',
 'is',
 'it',
 'my',
 'you',
 'and',
 'not',
 'in',
 'for',
 'questionMark',
 'am',
 'of',
 'have',
 'on',
 'me',
 'that',
 'so',
 'but',
 'just',
 'do',
 'with',
 'be',
 'are',
 'at',
 'wa',
 'day',
 'will',
 'this',
 'now',
 'can',
 'good',
 'up',
 'going',
 'get',
 'no',
 'all',
 'out',
 'like',
 'go',
 'u',
 'http',
 'work',
 'today',
 'love',
 'time',
 'too',
 'want',
 'your',
 'we',
 'got',
 'what',
 'lol',
 'know',
 'one',
 'back',
 'from',
 'com',
 'im',
 'about',
 'really',
 'night',
 'had',
 'there',
 'see',
 'did',
 'some',
 'andamp',
 'how',
 'andquot',
 'if',
 'they',
 'think',
 'still',
 'well',
 'new',
 'would',
 'need',
 'ha',
 'thanks',
 'home',
 'he',
 'oh',
 'when',
 'miss',
 'here',
 'more',
 'much',
 'off',
 'last',
 'morning',
 'an',
 'feel',
 'hope',
 'then',
 'make',
 'haha',
 'twitter',
 'tomorrow',
 'been',
 'great',
 'or',
 'her',
 'again',
 'wish',
 'she',
 'sad',
 'come',
 'fun',
 'week',
 'why',


In [34]:
model.wv.vector_size


100

In [36]:
model.wv.get_vector('Phone')

array([ 0.0617004 ,  0.15368834, -0.30466482, -0.09091772, -0.29671198,
       -0.32473722, -0.12110214,  0.5677969 , -0.06950313, -0.04396479,
       -0.30846474, -0.19760323, -0.00111749,  0.18611506,  0.08066562,
       -0.5225079 , -0.42478046, -0.5013711 , -0.0543109 , -0.25763696,
        0.2583714 ,  0.15878998,  0.43007207,  0.25645196,  0.23807997,
        0.24688737, -0.6870859 ,  0.14629991, -0.23371446,  0.07841577,
        0.1480483 , -0.02460455,  0.47732973, -0.37960696, -0.02724978,
        0.41526946, -0.02428965, -0.06622016, -0.21353182, -0.9388725 ,
       -0.01753375, -0.07331342, -0.10368375,  0.6707508 ,  0.02422952,
       -0.6419385 ,  0.31703216,  0.02765011,  0.66191643, -0.08976927,
       -0.03277675, -0.0918693 ,  0.5167764 , -0.12165214,  0.38509688,
        0.34030005, -0.08161294, -0.42406213, -0.17276758,  0.22395438,
       -0.5836187 ,  0.18239118, -0.26240957,  0.3153187 ,  0.1487198 ,
        0.14722373,  0.1810682 ,  0.3247819 ,  0.09473322,  0.04

In [3]:
import numpy as np
import pandas as pd
import gensim
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [4]:
X_train, X_val, y_train, y_val = train_test_split(train_data, train_labels, test_size=0.2, random_state=8)

In [6]:
X_train['text'] = X_train['text'].apply(advanced_data_cleaning)
X_val['text'] = X_val['text'].apply(advanced_data_cleaning)

In [7]:
le = LabelEncoder()
y_train['target'] = le.fit_transform(y_train['target'])
y_val['target'] = le.transform(y_val['target'])

In [8]:
x_tokenized = [[w for w in sentence.split(" ") if w != ""] for sentence in X_train['text']]

In [10]:
model = Word2Vec(x_tokenized,
                 vector_size=100
                 # Size is the length of our vector.
                )


In [15]:
model.wv.most_similar("free")

<gensim.models.keyedvectors.KeyedVectors at 0x290ff49d0>

In [12]:
class Sequencer():
    
    def __init__(self,
                 all_words,
                 max_words,
                 seq_len,
                 embedding_matrix
                ):
        
        self.seq_len = seq_len
        self.embed_matrix = embedding_matrix
        """
        temp_vocab = Vocab which has all the unique words
        self.vocab = Our last vocab which has only most used N words.
    
        """
        temp_vocab = list(set(all_words))
        self.vocab = []
        self.word_cnts = {}
        """
        Now we'll create a hash map (dict) which includes words and their occurencies
        """
        for word in temp_vocab:
            # 0 does not have a meaning, you can add the word to the list
            # or something different.
            count = len([0 for w in all_words if w == word])
            self.word_cnts[word] = count
            counts = list(self.word_cnts.values())
            indexes = list(range(len(counts)))
        
        # Now we'll sort counts and while sorting them also will sort indexes.
        # We'll use those indexes to find most used N word.
        cnt = 0
        while cnt + 1 != len(counts):
            cnt = 0
            for i in range(len(counts)-1):
                if counts[i] < counts[i+1]:
                    counts[i+1],counts[i] = counts[i],counts[i+1]
                    indexes[i],indexes[i+1] = indexes[i+1],indexes[i]
                else:
                    cnt += 1
        
        for ind in indexes[:max_words]:
            self.vocab.append(temp_vocab[ind])
                    
    def textToVector(self,text):
        # First we need to split the text into its tokens and learn the length
        # If length is shorter than the max len we'll add some spaces (100D vectors which has only zero values)
        # If it's longer than the max len we'll trim from the end.
        tokens = text.split()
        len_v = len(tokens)-1 if len(tokens) < self.seq_len else self.seq_len-1
        vec = []
        for tok in tokens[:len_v]:
            try:
                vec.append(self.embed_matrix[tok])
            except Exception as E:
                pass
        
        last_pieces = self.seq_len - len(vec)
        for i in range(last_pieces):
            vec.append(np.zeros(100,))
        
        return np.asarray(vec).flatten()


In [17]:
sequencer = Sequencer(all_words = [token for seq in x_tokenized for token in seq],
              max_words = 1200,
              seq_len = 15,
              embedding_matrix = model.wv
             )

KeyboardInterrupt: 

In [None]:
test_vec = sequencer.textToVector("i am in love with you")
test_vec

In [None]:
test_vec.shape