In [1]:
'''
Investigate 2016 presidential election survey
'''

import pandas as pd
import numpy as np
import gensim.models as g
import sys 

#parameters
doc2vec_dir = "../model/enwiki_dbow/doc2vec.bin"

In [2]:
df = pd.read_csv('data/anes_timeseries_2016_rawdata.txt',delimiter='|',low_memory=False)

In [4]:
xls = pd.ExcelFile('data/anes_timeseries_2016_redacted_openends.xlsx')

In [6]:
open_response_dfs = []
sheet_names = xls.sheet_names
for name in sheet_names:
    open_response_df = xls.parse(name)
    open_response_dfs.append(open_response_df)

In [7]:
or_df = pd.concat(open_response_dfs,axis=1,ignore_index=True)

In [8]:
or_df = or_df.iloc[:,[0]+[i for i in range(1,58,2)]]

In [9]:
or_df.columns = ['id']+sheet_names

In [19]:
str_df = or_df.iloc[:,1:].astype(str)

In [20]:
def represents_int(s):
    try: 
        int(s)
        return True
    except ValueError:
        return False

def replace_vals(s):
    if s == 'nan' or represents_int(s):
        return ''
    else:
        return s
    

In [32]:
str_df = str_df.applymap(replace_vals)
str_df = str_df.apply(lambda x: ' '.join(x), axis=1)

In [41]:
master = pd.concat([df,or_df,str_df],
                   axis=1,
                   ignore_index=True)
master.columns = list(df.columns)+list(or_df.columns)+['concat_text']

In [49]:
pretrained_model = g.Doc2Vec.load(doc2vec_dir)

In [50]:
master['weights'] = master['concat_text'].apply(pretrained_model.infer_vector)

In [54]:
weights = master['weights'].apply(pd.Series)

# rename each variable is tags
weights = weights.rename(columns = lambda x : 'weight_' + str(x))

# view the tags dataframe
master = pd.concat([master[:], weights[:]], axis=1)

In [56]:
master.to_csv('data/full_survey.csv')

In [66]:
selected = master[['V160001','V162007','V162008','V162062x','concat_text']+['weight_'+str(i) for i in range(300)]]
selected.columns = ['id','did_party_contact','did_others_contact','turnout','concat_text']+['weight_'+str(i) for i in range(300)]

In [67]:
selected

Unnamed: 0,id,did_party_contact,did_others_contact,turnout,concat_text,weight_0,weight_1,weight_2,weight_3,weight_4,...,weight_290,weight_291,weight_292,weight_293,weight_294,weight_295,weight_296,weight_297,weight_298,weight_299
0,300001,2,2,2,everything truth//his caring for the country ...,-0.058348,-0.109910,-0.161563,0.124331,-0.213583,...,-0.108896,0.457625,-0.208054,0.150968,-0.144023,-0.526571,-0.091394,0.077843,-0.143373,-0.492166
1,300002,2,2,2,seem corptupt anti gun anti 2nd amen pro 2nd ...,0.022091,-0.017459,-0.132043,0.105403,-0.253132,...,0.044367,0.454119,-0.189705,0.230009,-0.145188,-0.459279,-0.031582,0.190012,-0.212053,-0.455249
2,300003,2,2,-1,slitter/ tape inspection shipyard ...,0.080512,-0.051045,-0.235365,0.113168,-0.116118,...,-0.044489,0.244099,-0.051093,0.093490,-0.117737,-0.381794,-0.012404,0.160986,-0.175005,-0.235692
3,300004,2,2,2,whole personal//no he says what he thinks//I ...,-0.022947,0.058390,-0.162191,0.096363,-0.302070,...,0.010478,0.475794,-0.226209,0.346557,-0.111721,-0.711054,-0.102503,0.170056,-0.250038,-0.548502
4,300006,1,1,4,Im not really strong on their policies and th...,-0.155150,0.180996,-0.304161,0.128351,-0.137236,...,-0.108482,0.561908,-0.299957,0.116340,-0.161964,-0.555365,-0.049244,0.156528,-0.187157,-0.442421
5,300007,1,2,2,I would not vote for Hiliary it would take a...,-0.055377,-0.017513,-0.187191,0.124252,-0.275554,...,-0.068392,0.534003,-0.321184,0.185659,-0.094626,-0.705114,-0.096074,0.040634,-0.199515,-0.528671
6,300008,1,2,1,arrogrance plan for minorities sound more f...,-0.008185,-0.007399,-0.174515,0.054145,-0.117110,...,0.019710,0.464543,-0.238775,0.122591,-0.141891,-0.644122,-0.079919,0.203072,-0.148280,-0.326827
7,300012,1,2,2,I've been saying after President Clinton went...,-0.123860,0.083248,-0.181881,0.238248,-0.224421,...,-0.150553,0.507645,-0.162618,0.234308,-0.164529,-0.692146,-0.183336,0.029071,-0.178791,-0.520443
8,300018,2,2,-1,She's a liar. She's a cheat. She did not sh...,-0.004013,0.055501,-0.036210,0.109214,-0.098207,...,-0.131246,0.432858,-0.248013,0.232857,-0.120094,-0.676504,-0.198864,0.022898,-0.357159,-0.616315
9,300020,2,2,3,dental assistant Hospitality Tourisim...,-0.002891,-0.025254,-0.209523,0.149114,-0.235518,...,0.012251,0.360865,-0.214403,0.083883,-0.053006,-0.366364,-0.029930,0.145870,-0.129122,-0.238221


In [68]:
selected.to_csv('data/selected_survey.csv',index=False)