In [69]:
'''
Investigate 2016 presidential election survey
'''

import pandas as pd
import numpy as np
import gensim.models as g
import sys 

#parameters
doc2vec_dir = "../model/enwiki_dbow/doc2vec.bin"

In [70]:
df = pd.read_csv('data/anes_timeseries_2016_rawdata.txt',delimiter='|',low_memory=False)

In [71]:
xls = pd.ExcelFile('data/anes_timeseries_2016_redacted_openends.xlsx')

In [77]:
open_response_dfs = []
sheet_names = ['V161069', 'V161072', 'V161075', 'V161078', 'V161098', 'V161101', 'V161104', 'V161106']
for name in sheet_names:
    open_response_df = xls.parse(name)
    open_response_dfs.append(open_response_df)

In [78]:
or_df = pd.concat(open_response_dfs,axis=1,ignore_index=True)

In [83]:
or_df = or_df.iloc[:,[0]+[i for i in range(1,or_df.shape[1],2)]]

In [84]:
or_df.columns = ['id']+sheet_names

In [86]:
str_df = or_df.iloc[:,1:].astype(str)

In [87]:
def represents_int(s):
    try: 
        int(s)
        return True
    except ValueError:
        return False

def replace_vals(s):
    if s == 'nan' or represents_int(s):
        return ''
    else:
        return s
    

In [88]:
str_df = str_df.applymap(replace_vals)
str_df = str_df.apply(lambda x: ' '.join(x), axis=1)

In [89]:
master = pd.concat([df,or_df,str_df],
                   axis=1,
                   ignore_index=True)
master.columns = list(df.columns)+list(or_df.columns)+['concat_text']

In [90]:
pretrained_model = g.Doc2Vec.load(doc2vec_dir)

In [91]:
master['weights'] = master['concat_text'].apply(pretrained_model.infer_vector)

In [95]:
weights = master['weights'].apply(pd.Series)

# rename each variable is tags
weights = weights.rename(columns = lambda x : 'weight_' + str(x))

# view the tags dataframe
master = pd.concat([master[:], weights[:]], axis=1)

In [97]:
master.to_csv('data/full_survey.csv')

In [98]:
selected = master[['V160001','V162007','V162008','V162062x','V161086','V161087','concat_text']+['weight_'+str(i) for i in range(300)]]
selected.columns = ['id','did_party_contact','did_others_contact','turnout','democrat_feeling','republican_feeling','concat_text']+['weight_'+str(i) for i in range(300)]

In [99]:
selected

Unnamed: 0,id,did_party_contact,did_others_contact,turnout,democrat_feeling,republican_feeling,concat_text,weight_0,weight_1,weight_2,...,weight_290,weight_291,weight_292,weight_293,weight_294,weight_295,weight_296,weight_297,weight_298,weight_299
0,300001,2,2,2,0,85,everything truth//his caring for the country ...,-0.053707,-0.191859,-0.219631,...,-0.086516,0.285336,-0.171051,0.077810,-0.166286,-0.486506,-0.046697,0.186846,-0.136606,-0.364293
1,300002,2,2,2,0,85,seem corptupt anti gun anti 2nd amen pro 2nd ...,0.040343,-0.014408,-0.128700,...,-0.088125,0.371820,-0.105956,0.134573,-0.307824,-0.373700,-0.083725,0.208891,-0.075106,-0.418050
2,300003,2,2,-1,50,60,,0.001394,-0.001186,0.000793,...,-0.000124,-0.000521,0.000672,0.001139,-0.000301,0.001090,0.000927,0.000596,0.001134,0.001321
3,300004,2,2,2,0,60,whole personal//no he says what he thinks//I ...,0.098799,-0.022688,-0.105045,...,-0.080464,0.287939,-0.205737,0.150718,-0.090593,-0.358324,-0.103663,0.285628,-0.164972,-0.460912
4,300006,1,1,4,15,0,Im not really strong on their policies and th...,-0.004944,-0.004936,-0.185560,...,-0.042696,0.449314,-0.323229,0.067368,-0.096637,-0.372834,-0.161243,0.148895,-0.334353,-0.375380
5,300007,1,2,2,0,50,I would not vote for Hiliary it would take a...,-0.075881,0.013713,-0.278348,...,-0.107659,0.483715,-0.269429,0.224164,-0.137351,-0.662970,-0.107697,0.031840,-0.218760,-0.510470
6,300008,1,2,1,85,0,arrogrance plan for minorities sound more f...,0.067281,0.013301,-0.074185,...,0.040503,0.199224,-0.066527,0.068928,-0.170669,-0.307895,-0.094678,0.081109,-0.109395,-0.195961
7,300012,1,2,2,15,85,I've been saying after President Clinton went...,-0.071236,-0.051538,-0.174273,...,-0.097493,0.433247,-0.286675,0.306653,-0.268245,-0.763997,-0.016654,0.024661,-0.224256,-0.544809
8,300018,2,2,-1,15,60,She's a liar. She's a cheat. She did not sh...,-0.054570,0.047149,0.008076,...,-0.082131,0.475507,-0.231428,0.331654,-0.133867,-0.652070,-0.088656,0.013910,-0.294416,-0.562680
9,300020,2,2,3,60,50,,0.001394,-0.001186,0.000793,...,-0.000124,-0.000521,0.000672,0.001139,-0.000301,0.001090,0.000927,0.000596,0.001134,0.001321


In [100]:
selected.to_csv('data/selected_survey.csv',index=False)