In [18]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import sys

In [2]:
FILEPATH = '/Users/klogg/research_data/aft/raw/dump_03-24-20.csv'

dtypes = {
    'aft_id': object,
    'aft_comment':str,
    'aft_helpful':int,
    'aft_unhelpful':int
}

df = pd.read_csv(
    FILEPATH,
    escapechar='\\',
    encoding='latin-1',
    dtype=dtypes
)

In [4]:
vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    lowercase=True,
    analyzer='word',
    max_df=.9,
    max_features=100000
)
tokenizer = vectorizer.build_tokenizer()

cols_to_extract = [
    'aft_id',
    'aft_page',
    'aft_page_revision',
    'aft_user',
    'aft_user_text',
    'aft_comment',
    'aft_noaction',
    'aft_inappropriate',
    'aft_helpful',
    'aft_unhelpful'
]

def process(observations, save_tokens=False, debug=False):
    if debug:
        observations = observations.sample(debug)
    observations = observations[cols_to_extract]
    observations['aft_comment'] = observations['aft_comment'].astype(str)
    observations['aft_net_sign_helpful'] = np.sign(observations['aft_helpful']-observations['aft_unhelpful']).astype(int)
    if save_tokens:
        observations['tokenized_text'] = observations['aft_comment'].apply(tokenizer)
    observations['feature_vector'] = vectorizer.fit_transform(observations['aft_comment'].values)
    return observations

observations = process(df,save_tokens=True,debug=10)

Unnamed: 0,aft_id,aft_page,aft_page_revision,aft_user,aft_user_text,aft_comment,aft_noaction,aft_inappropriate,aft_helpful,aft_unhelpful,aft_net_sign_helpful,tokenized_text,feature_vector
840362,87475000000000000000000000000000,535865,481753920,0,194.171.252.101,,0,0,0,0,0,[nan],"(0, 31)\t1.0\n (1, 45)\t0.7812778040619008\..."
778775,81576300000000000000000000000000,437943,529789529,0,74.72.147.228,Treatments and possible future treatments,0,0,0,0,0,"[Treatments, and, possible, future, treatments]","(0, 31)\t1.0\n (1, 45)\t0.7812778040619008\..."
644165,68700300000000000000000000000000,264917,524959139,0,206.78.191.85,hansel and gretel\ndoesn't want to get on\nlai...,0,0,0,1,-1,"[hansel, and, gretel, doesn, want, to, get, on...","(0, 31)\t1.0\n (1, 45)\t0.7812778040619008\..."
41867,050b038a3c861027e9fe90b11c278d5a,244932,584632318,0,203.45.240.205,,0,0,0,0,0,[nan],"(0, 31)\t1.0\n (1, 45)\t0.7812778040619008\..."
212377,26415400000000000000000000000000,7066937,504006206,0,117.213.135.57,Wonderful. Thank you it is useful for my proje...,0,0,1,0,1,"[Wonderful, Thank, you, it, is, useful, for, m...","(0, 31)\t1.0\n (1, 45)\t0.7812778040619008\..."
226314,27795200000000000000000000000000,977935,507229271,0,1.23.160.227,,0,0,0,0,0,[nan],"(0, 31)\t1.0\n (1, 45)\t0.7812778040619008\..."
181376,23352300000000000000000000000000,3524766,505623973,0,174.29.175.48,I just bought a new computer. How do I get yo...,1,0,0,1,-1,"[just, bought, new, computer, How, do, get, yo...","(0, 31)\t1.0\n (1, 45)\t0.7812778040619008\..."
396218,44514600000000000000000000000000,511961,511986951,0,66.108.127.79,,0,0,0,0,0,[nan],"(0, 31)\t1.0\n (1, 45)\t0.7812778040619008\..."
780925,81781500000000000000000000000000,16083989,531446887,0,111.68.105.221,give imvic result table for clostridium' strains,0,0,0,0,0,"[give, imvic, result, table, for, clostridium,...","(0, 31)\t1.0\n (1, 45)\t0.7812778040619008\..."
742115,78069100000000000000000000000000,25855957,529612432,0,187.200.75.39,This article could use a diagram for a deeper ...,1,0,0,0,0,"[This, article, could, use, diagram, for, deep...","(0, 31)\t1.0\n (1, 45)\t0.7812778040619008\..."


In [34]:
x = vectorizer.fit_transform(observations['aft_comment'].values)[1]
y = vectorizer.fit_transform(observations['aft_comment'].values)

In [37]:
x

<1x54 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [18]:
type(tokenizer('This is text'))

list

In [23]:
'this is a test'.split(' ')

['this', 'is', 'a', 'test']