In [201]:
from scipy.sparse import load_npz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif

import pandas as pd
import numpy as np
import json

In [173]:
class CleanAndVectorize(object):

    def __init__(self,**kwargs):
        max_df = kwargs.get('max_df',.9)
        max_features = kwargs.get('max_features', 1000)
        self.vectorizer = TfidfVectorizer(
            strip_accents='unicode',
            lowercase=True,
            analyzer='word',
            max_df=max_df,
            max_features=max_features
        )
        self.tokenizer = self.vectorizer.build_tokenizer()
        self.cols_to_extract = [
            'aft_id',
            'aft_page',
            'aft_page_revision',
            'aft_user',
            'aft_user_text',
            'aft_comment',
            'aft_noaction',
            'aft_inappropriate',
            'aft_helpful',
            'aft_unhelpful'
        ]

    def process(self, observations, save_tokens=False, remove_zero=True, debug=False):
        observations = observations[self.cols_to_extract]
        observations['aft_comment'] = observations['aft_comment'].astype(str)
        observations['aft_net_sign_helpful'] = np.sign(
            observations['aft_helpful'] - observations['aft_unhelpful']).astype(int)
        if remove_zero:
            observations = observations.loc[observations['aft_net_sign_helpful'] != 0]
        
        text_list = ['','']
        for _, obs in observations.iterrows():
            if obs['aft_net_sign_helpful'] < 0:
                text_list[0] = '{0} {1}'.format(text_list[0],obs['aft_comment'])
            else:
                text_list[1] = '{0} {1}'.format(text_list[1],obs['aft_comment'])
        feature_vectors = self.vectorizer.fit_transform(text_list)
        return observations, feature_vectors

In [194]:
class CleanAndVectorize(object):

    def __init__(self,**kwargs):
        max_df = kwargs.get('max_df',.9)
        max_features = kwargs.get('max_features', 1000)
        self.vectorizer = TfidfVectorizer(
            strip_accents='unicode',
            lowercase=True,
            analyzer='word',
            max_df=max_df,
            max_features=max_features
        )
        self.tokenizer = self.vectorizer.build_tokenizer()
        self.cols_to_extract = [
            'aft_id',
            'aft_page',
            'aft_page_revision',
            'aft_user',
            'aft_user_text',
            'aft_comment',
            'aft_noaction',
            'aft_inappropriate',
            'aft_helpful',
            'aft_unhelpful'
        ]

    def process(self, observations, save_tokens=False, remove_zero=True, debug=False):
        if debug:
            observations = observations.sample(debug)
        observations = observations[self.cols_to_extract]
        observations['aft_comment'] = observations['aft_comment'].astype(str)
        observations['aft_net_sign_helpful'] = np.sign(
            observations['aft_helpful'] - observations['aft_unhelpful']).astype(int)
        if remove_zero:
            observations = observations.loc[observations['aft_net_sign_helpful'] != 0]
        if save_tokens:
            observations['tokenized_text'] = observations['aft_comment'].apply(self.tokenizer)
        #observations['feature_vector'] = self.vectorizer.fit_transform(observations['aft_comment'].values).toarray().tolist()
        feature_vectors = self.vectorizer.fit_transform(observations['aft_comment'].values)
        return observations, feature_vectors

In [195]:
INFILE = '/Users/klogg/research_data/aft/raw/dump_03-24-20.csv'

cv = CleanAndVectorize(max_df=.9, max_features=100000)

dtypes = {
    'aft_id': object,
    'aft_helpful':int,
    'aft_unhelpful':int
}

df = pd.read_csv(INFILE, escapechar='\\', encoding='latin-1', dtype=dtypes)
observations, feature_vectors = cv.process(df, save_tokens=True, debug=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [205]:
top_n = 100

labels = observations['aft_net_sign_helpful'].to_numpy()
feature_names = cv.vectorizer.get_feature_names()

features_chi2 = chi2(feature_vectors,labels)[0]
indicies = np.argsort(features_chi2)
for i in indicies[-top_n:]:
    print('{0}: {1:.4f}'.format(feature_names[i],features_chi2[i]))

sucks: 19.3708
that: 19.5707
perhaps: 19.8789
adding: 20.3106
informative: 20.3243
mention: 20.3534
everything: 20.6170
needed: 20.6767
useful: 20.9557
other: 21.0528
if: 21.1895
your: 21.6621
their: 21.7924
nope: 21.9522
pronunciation: 22.6815
different: 22.7706
obama: 22.7992
download: 22.8898
were: 22.9416
few: 22.9905
but: 23.0007
helpful: 23.2844
email: 23.3119
phone: 23.4288
family: 23.6001
facebook: 23.9470
section: 25.1392
with: 25.1882
little: 25.4117
address: 25.9737
reception: 26.2511
maybe: 26.3560
it: 26.6384
references: 26.8917
lol: 26.9096
shut: 27.0970
what: 27.5138
background: 27.6367
minecraft: 28.3965
they: 28.8903
or: 29.0671
mom: 29.8210
please: 30.1139
pics: 30.8521
detail: 31.0489
was: 31.2600
info: 31.4525
etc: 31.4572
there: 32.0241
list: 33.0230
should: 33.2103
include: 33.2965
in: 33.9470
nothing: 34.1422
to: 34.4748
links: 34.7059
well: 35.5004
each: 35.8917
her: 37.5981
detailed: 37.6814
sex: 37.7126
personal: 38.0059
dont: 38.1925
hate: 38.9941
summary: 39

In [204]:
top_n = 50

labels = observations['aft_net_sign_helpful'].to_numpy()
feature_names = cv.vectorizer.get_feature_names()

features_chi2 = mutual_info_classif(feature_vectors.toarray(),labels,discrete_features=False)
indicies = np.argsort(features_chi2)
for i in indicies[-top_n:]:
    print('{0}: {1:.4f}'.format(feature_names[i],features_chi2[i]))

KeyboardInterrupt: 

In [191]:
top_n = 50

feature_names = cv.vectorizer.get_feature_names()
feature_vectors_array = feature_vectors.toarray()
indicies = np.argsort(feature_vectors_array[0])
print('neg:')
for i in indicies[-top_n:]:
    print('{0}: {1:.4f}'.format(feature_names[i],feature_vectors_array[0][i]))

print()
print('*'*10)
print()

indicies = np.argsort(feature_vectors_array[1])
print('pos:')
for i in indicies[-top_n:]:
    print('{0}: {1:.4f}'.format(feature_names[i],feature_vectors_array[1][i]))

neg:
transition: 0.0222
applemicrosoft: 0.0222
siswa: 0.0222
kj: 0.0222
quail: 0.0222
jh: 0.0222
init: 0.0222
worley: 0.0222
foursquare: 0.0222
porno: 0.0222
sawas: 0.0222
rhodes: 0.0241
nut: 0.0241
scissorhands: 0.0241
fistulas: 0.0241
romanian: 0.0241
4i: 0.0241
seating: 0.0241
zaki: 0.0259
gillard: 0.0259
dd: 0.0259
buttermilk: 0.0259
giftcode: 0.0259
maa: 0.0259
relay: 0.0259
asdfghjkl: 0.0278
hansen: 0.0278
auvr: 0.0278
lithium: 0.0278
fuk: 0.0296
retarded: 0.0296
curry: 0.0296
wob: 0.0315
startup: 0.0315
20minutes: 0.0315
assigned: 0.0333
kokesh: 0.0370
karate: 0.0389
ora: 0.0426
woah: 0.0426
spfile: 0.0555
ermmmmmmmmmmmmmmmmmmmmmmmmm: 0.1813
ermmmmmmmmmmmmmmmmmmmm: 0.1813
3n27958nv: 0.2202
cmu: 0.2202
zqw: 0.2202
omnomnom: 0.2220
pud: 0.3090
20te: 0.4182
p3n15: 0.4385

**********

pos:
vanniyars: 0.0324
celsius: 0.0324
croats: 0.0324
pringles: 0.0324
ibraahin: 0.0324
presumably: 0.0324
weil: 0.0324
flamingo: 0.0324
sessions: 0.0324
malthusian: 0.0370
lakan: 0.0370
explicit: 0.03

In [107]:
tfidf = {
    1:{},
    -1:{}
}

for obs, feature in zip(feature_vectors.nonzero()[0],feature_vectors.nonzero()[1]):
    obs_class = observations.iloc[obs]['aft_net_sign_helpful']
    token = feature_names[feature]
    if token in tfidf[obs_class]:
        tfidf[obs_class][token] += feature_vectors[obs, feature]
    else:
        tfidf[obs_class][token] = feature_vectors[obs, feature]
        

In [109]:
top_n = 25

for obs_class in tfidf:
    print('class: {0}'.format(obs_class))
    sorted_tfidf = {k: v for k, v in sorted(tfidf[obs_class].items(), key=lambda item: item[1], reverse=True)}
    for i, token in enumerate(sorted_tfidf):
        print('{0}: {1}'.format(token, sorted_tfidf[token]))
        if i == top_n:
            break
    print()
    print('*'*10)
    print()

class: 1
the: 2253.2213637062305
more: 1888.421852323706
of: 1612.594229730487
this: 1226.937470296475
to: 1203.986826582594
it: 1183.1232542671294
and: 1180.7781501767406
needs: 1166.7898404153138
is: 1060.1023408875935
article: 968.7549639196919
pictures: 959.3765120197845
information: 924.9507422343146
picture: 857.4648280967008
in: 836.2215387181319
about: 800.0011081570904
for: 780.0313331280603
was: 693.5819209274359
what: 685.8971570741627
be: 654.1125047147211
on: 592.7590879933218
nan: 567.4522076787077
you: 559.8571431671154
how: 524.9112229432641
info: 524.437593655808
good: 506.8264207536145
need: 462.2928231883462

**********

class: -1
the: 1977.263976925312
more: 1447.8338743330187
nan: 1389.219533171486
it: 1303.4472252788935
of: 1300.591966263876
is: 1298.3339090941276
to: 1289.9813943892116
this: 1206.13471235249
no: 1031.5590669117955
and: 964.2355426730195
needs: 891.6641991657501
for: 867.5181715152638
in: 854.1685924680457
you: 765.1862598559724
article: 724.35973

In [77]:
feature_names

['00',
 '000',
 '0000',
 '000000',
 '0000000000',
 '000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000',
 '00000001',
 '00000456',
 '000013124360187',
 '0002h',
 '0005',
 '000british',
 '000lbs',
 '000th',
 '0010101',
 '00109',
 '00212',
 '003',
 '004',
 '00437',
 '0050',
 '007',
 '0088890i0l',
 '0090',
 '00pm',
 '01',
 '010',
 '0101001110',
 '010101011010',
 '01010110',
 '010101110',
 '0101110',
 '01078',
 '013053003818147',
 '01341',
 '01379945',
 '01401550041',
 '014945400',
 '0199838801',
 '0199838806',
 '02',
 '020',
 '02095',
 '021',
 '0222222',
 '0228',
 '023',
 '025',
 '0271',
 '029',
 '02a',
 '03',
 '03056932896',
 '03073121566',
 '03149107062',
 '03157739991',
 '03330122575',
 '03330122576',
 '03818',
 '03mmi',
 '04',
 '0431',
 '04345635345556422244560758649923490540938',
 '044',
 '044839917',
 '0456',
 '047',
 '04_sj9spykssy0xplmnmz0vm0y_qjzkld4x3txdul8h2vaqaurh_yw',
 '05',
 '053',
 '055'

In [85]:
feature_vectors[0, 67861]

0.2096554749343858

In [113]:
print(feature_vectors)

  (0, 46216)	0.0018502313882163563
  (0, 21194)	0.0018502313882163563
  (0, 37319)	0.0018502313882163563
  (0, 14221)	0.0018502313882163563
  (0, 44126)	0.0037004627764327127
  (0, 51153)	0.01480185110573085
  (0, 19570)	0.0018502313882163563
  (0, 23451)	0.0037004627764327127
  (0, 36509)	0.022202776658596275
  (0, 12548)	0.0018502313882163563
  (0, 12215)	0.0018502313882163563
  (0, 4061)	0.0018502313882163563
  (0, 21761)	0.0037004627764327127
  (0, 27800)	0.0018502313882163563
  (0, 3939)	0.0018502313882163563
  (0, 19537)	0.02590323943502899
  (0, 18276)	0.0018502313882163563
  (0, 28911)	0.005550694164649069
  (0, 8761)	0.0037004627764327127
  (0, 49578)	0.005550694164649069
  (0, 31990)	0.0018502313882163563
  (0, 47144)	0.0018502313882163563
  (0, 19160)	0.0018502313882163563
  (0, 21224)	0.0018502313882163563
  (0, 41800)	0.0018502313882163563
  :	:
  (1, 37076)	0.004621909120737779
  (1, 27133)	0.004621909120737779
  (1, 24264)	0.004621909120737779
  (1, 44428)	0.004621909120

In [179]:
feature_vectors

<2x51634 sparse matrix of type '<class 'numpy.float64'>'
	with 51634 stored elements in Compressed Sparse Row format>