In [40]:
import csv
import json
import argparse
import logging
import functools
import numpy as np
import pandas as pd

from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer

class CleanAndVectorize(object):

    def __init__(self, en_kvs_path, **kwargs):
        max_df = kwargs.get('max_df',.9)
        max_features = kwargs.get('max_features', 1000)
        self.tfidf_vectorizer = TfidfVectorizer(
            strip_accents='unicode',
            lowercase=True,
            analyzer='word',
            max_df=max_df,
            max_features=max_features
        )
        self.w2v_vectorizer = KeyedVectors.load(en_kvs_path,mmap='r')
        self.tokenizer = self.tfidf_vectorizer.build_tokenizer()
        self.cols_to_extract = [
            'aft_id',
            'aft_page',
            'aft_page_revision',
            'aft_user',
            'aft_user_text',
            'aft_comment',
            'aft_noaction',
            'aft_inappropriate',
            'aft_helpful',
            'aft_unhelpful',
            'aft_rating'
        ]

    def get_token_vector(self, token):
         if token in self.w2v_vectorizer:
            return self.w2v_vectorizer[token]
         else:
            return None

    def get_sentence_vector(self, token_list):
        vector_list = np.array([self.get_token_vector(x) for x in token_list if self.get_token_vector(x) is not None])
        sentence_vector = np.mean(vector_list,axis=0)
        return sentence_vector

    def get_feature_vector(self,observation,add_rating=False):
        feature_vector = self.get_sentence_vector(observation['tokenized_text'])
        if add_rating:
            feature_vector = np.append(feature_vector,observation['aft_rating'])
        return feature_vector

    def process(self, observations, save_tokens=False, remove_zero=True, debug=False, add_rating=False):
        if debug:
            observations = observations.sample(debug)
        observations = observations[self.cols_to_extract]
        observations['aft_comment'] = observations['aft_comment'].astype(str)
        observations['aft_net_sign_helpful'] = np.sign(
            observations['aft_helpful'] - observations['aft_unhelpful']).astype(int)
        if remove_zero:
            observations = observations.loc[observations['aft_net_sign_helpful'] != 0]
        observations['tokenized_text'] = observations['aft_comment'].apply(self.tokenizer)
        observations['feature_vector'] = observations[['tokenized_text','aft_rating']].apply(
            self.get_feature_vector,
            axis=1,
            add_rating=add_rating)
        if not save_tokens:
            observations.drop(labels='tokenized_text',axis=1,inplace=True)
        return observations

In [41]:
INFILE = '/Users/klogg/research_data/aft/raw/dump_03-24-20.csv'
EN_KVS_PATH = '../word2vec/aft_2021-03-30_learned_vectors.50_cell.10k.kv'
MAX_DF = .9
MAX_FEATURES = 10000

def main():
    cv = CleanAndVectorize(en_kvs_path=EN_KVS_PATH,
                           max_df=MAX_DF,
                           max_features=MAX_FEATURES)

    dtypes = {
        'aft_id': object,
        'aft_helpful':int,
        'aft_unhelpful':int
    }

    df = pd.read_csv(INFILE, escapechar='\\', encoding='latin-1', dtype=dtypes)
    observations = cv.process(df, save_tokens=True, debug=False, add_rating=True)
    return observations

obs = main()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [42]:
obs['feature_vector'].values[0]

array([-1.43821940e-01,  6.97671846e-02, -2.36414969e-01,  5.72709600e-03,
        1.55032529e-02, -3.16005737e-01,  6.57286495e-02,  2.02870175e-01,
       -1.26426026e-01,  2.98555940e-04, -1.14737488e-01, -2.20336363e-01,
        1.60859346e-01,  2.10554555e-01,  2.38807157e-01, -1.91290855e-01,
        2.99929958e-02, -8.76369253e-02, -7.23370984e-02, -2.70107120e-01,
        1.94340423e-01,  1.10746957e-01,  1.49565518e-01,  1.41683578e-01,
       -6.05661124e-02,  1.36302054e-01,  1.49840489e-03, -3.99364494e-02,
       -5.90743423e-02,  3.13654095e-02,  1.06537096e-01,  1.21368235e-02,
        1.50465026e-01, -1.14219040e-01, -3.97400744e-02,  1.88044310e-01,
        5.09557612e-02, -2.81467915e-01, -7.31434152e-02, -2.49296010e-01,
        3.87942907e-03, -9.02001858e-02,  6.95658028e-02,  6.47363588e-02,
        2.53096193e-01, -2.67112255e-02, -2.62890846e-01, -2.78336871e-02,
        1.85242575e-02, -8.21775869e-02,  1.93805590e-01, -2.16914579e-01,
       -1.72123805e-01, -

In [57]:
cv = CleanAndVectorize(en_kvs_path=EN_KVS_PATH,
                       max_df=MAX_DF,
                       max_features=MAX_FEATURES)

obs['tokenized_text'].values[0]

print(obs['tokenized_text'].values[0])

v1 = cv.get_token_vector(obs['tokenized_text'].values[0][0])[0]
v2 = cv.get_token_vector(obs['tokenized_text'].values[0][1])[0]
print((v1+v2)/2)


cv.get_sentence_vector(obs['tokenized_text'].values[0][0:2])

['would', 'like', 'to', 'see', 'slow', 'motion', 'video', 'of', 'the', 'bird', 'in', 'flight']
0.05682149529457092


array([ 0.0568215 , -0.3066395 , -0.28566322, -0.13666338, -0.05532772,
       -0.3608138 ,  0.0075072 ,  0.16905984, -0.13999851,  0.13351338,
        0.13737293, -0.22367471,  0.2647468 ,  0.22383589,  0.44950858,
       -0.2891215 ,  0.1507743 , -0.4213552 ,  0.14426196, -0.47709417,
        0.14366364,  0.2021275 ,  0.03864039,  0.09420528, -0.06705882,
        0.07575061, -0.050212  ,  0.2046325 , -0.01465037, -0.11790362,
        0.24675864, -0.02687174,  0.10321139, -0.33421355,  0.13988695,
       -0.11801925, -0.09370565, -0.2274586 ,  0.18588531,  0.05070408,
        0.19288698,  0.06702584,  0.22366005,  0.16435963,  0.3781227 ,
        0.0137325 , -0.14515842,  0.195574  , -0.19217825, -0.4598347 ,
        0.1323308 , -0.26630846, -0.29761618, -0.2867014 , -0.00855857,
       -0.03021667,  0.35198584, -0.10469019,  0.03362226,  0.00457836,
       -0.15503244, -0.18691689,  0.2599187 ,  0.20181444, -0.25708008,
        0.2788821 , -0.10041464,  0.39921364, -0.217136  ,  0.46