# GSS data

The official website of this data:
http://gss.norc.org/About-The-GSS

Preprocessing:

1. The original file is very large, with lots of repeated columns. All repeated columns are removed.
1. Columns in which more than half of the values are missing (NaN) are removed.
1. All categorical features are expanded using one-hot encoding.
1. The observation is defined by the year of the survey.
1. The prediction outcome is the subject's self-reported GENERAL HAPPINESS, binarized as positive vs. not positive. More than 80% of the records are positive, so the labels are imbalanced.

Statistics:

records: 59599

features: 1394 (including subject id, observation id and outcome)

subjects: 4510

observations: 30 (ranging from 1972 - 2014)

density: 44.05%

In [47]:
import pandas as pd
import numpy as np

# Root of the OneDrive folder; the commented lines are the same root on other machines.
# driveLoc = "F:/onedrive"
# driveLoc = "/Users/jul672/Desktop/OneDrive"
driveLoc = "C:/Users/jokit/OneDrive"
# Folder holding the GSS exports, and the full raw CSV inside it.
fileLoc = f"{driveLoc}/phd projects/The General Social Survey (GSS)/"
file = f"{fileLoc}gss.csv"

In [48]:
import re
def getColumnsWithLabel(x):
    """Return the base column names that have a ``*_labels`` companion column.

    A name of the form ``<base>_labels`` yields ``<base>``; a name of the form
    ``<base>_labels.<n>`` (numeric suffix) yields ``<base>.<n>``.  Names that
    do not follow either shape are ignored.

    Parameters
    ----------
    x : iterable of str
        Column names to scan.

    Returns
    -------
    list of str
        One entry per matching name, in input order.
    """
    # Match on the trailing "_labels" marker so that underscores inside the
    # base name are preserved, and so that names which merely contain the word
    # "labels" are skipped instead of raising IndexError.
    pattern = re.compile(r'(.+)_labels(\.\d+)?')
    target = []
    for each in x:
        m = pattern.fullmatch(each)
        if m is None:
            continue
        target.append(m.group(1) + (m.group(2) or ''))
    return target

def dropColumnsByName(x):
    """Return True when column name `x` is not listed in the module-level `dropColumns`."""
    return x not in dropColumns

def dropColumnsByCountingNa(x):
    """Keep-flag for a column based on its share of missing entries.

    An entry counts as missing when it is NaN or equals the string 'IAP'.
    Returns False (drop) when strictly more than half of the entries are
    missing, True (keep) otherwise.
    """
    total = x.size
    n_missing = x.isna().sum() + (x == 'IAP').sum()
    return not (n_missing / total > .5)

def dropColumnsByCountingIAP(x):
    """Keep-flag for a column based on how often 'IAP' appears as a comma-separated token.

    Each entry is converted to str and split on ','; it counts as a hit when
    one of the pieces is exactly 'IAP'.  Returns False (drop) when strictly
    more than half of the entries are hits, True (keep) otherwise.
    """
    total = x.size
    hits = []
    for each in x:
        tokens = str(each).split(',')
        hits.append('IAP' in tokens)
    iap_share = np.sum(hits) / total
    return not (iap_share > .5)

def dropNotApplicable(x):
    """Keep-flag: False when strictly more than half of `x` equals 'Not applicable', else True."""
    total = x.size
    n_not_applicable = (x == 'Not applicable').sum()
    share = n_not_applicable / total
    return False if share > .5 else True

def dropNoISSP(x):
    """Keep-flag based on the share of 'IAP-NO ISSP' / 'NO ISSP' entries.

    Returns False (drop) when strictly more than half of the entries equal
    one of those two strings, True (keep) otherwise.
    """
    total = x.size
    # the two codes are mutually exclusive, so OR-ing the masks counts each entry once
    is_no_issp = (x == 'IAP-NO ISSP') | (x == 'NO ISSP')
    if is_no_issp.sum() / total > .5:
        return False
    return True

def dropUncodeable(x):
    """Keep-flag: False when strictly more than half of `x` equals 'UNCODEABLE & IAP', else True."""
    total = x.size
    n_uncodeable = (x == 'UNCODEABLE & IAP').sum()
    return not (n_uncodeable / total > .5)
    
def renameColumns(x):
    """Return the column name `x` with leading and trailing whitespace removed."""
    cleaned = x.strip()
    return cleaned
# data = data.rename(renameColumns, axis='columns')

In [49]:
# filter the columns
# data = pd.read_csv(file,nrows = 10000,usecols=useCols)
# dropColumns = getColumnsWithLabel(data.columns.values)
# data1 = data.loc[:,list(map(dropColumnsByName,data.columns.values))]
# # useCols = data1.columns.values
# keep1 = data1.apply(dropColumnsByCountingNa,axis=0)
# data2 = data1.loc[:,keep1]
# keep2 = data2.apply(dropColumnsByCountingIAP,axis=0)
# data3 = data2.loc[:,keep2]
# keep3 = data3.apply(dropNotApplicable,axis=0)
# data4 = data3.loc[:,keep3]
# keep4 = data4.apply(dropNoISSP,axis=0)
# data5 = data4.loc[:,keep4]
# keep5 = data5.apply(dropUncodeable,axis=0)
# data6 = data5.loc[:,keep5]
# useCols = data5.columns.values

# process the original dataset and get a much smaller one
# chunkSize = 100000
# data = pd.read_csv(file,nrows = chunkSize,usecols = useCols)
# data = data.rename(renameColumns, axis='columns')
# data.to_csv(fileLoc + 'gss_filter.csv',index = False)

In [50]:
# load the reduced table (gss_filter.csv) from the GSS data folder
data = pd.read_csv(f'{fileLoc}gss_filter.csv')

In [51]:
# number of distinct non-missing values per column (a screen for constant
# columns; nothing is removed in this cell)
cts = data.nunique()

In [52]:
# positional index of the outcome column (general happiness)
label_ind = np.flatnonzero(data.columns.values == 'GENERAL HAPPINESS_labels')[0]

In [53]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler

In [54]:
# Classify each column by the Python type of its first NON-missing value.
# Looking only at row 0 would treat a text column as numeric whenever its
# first entry happens to be NaN (a float).
def _firstValueType(col):
    present = col.dropna()
    # a column with no observed value is treated as numeric
    return type(present.values[0]) if len(present) else float

ftypes = data.apply(_firstValueType, axis=0)
# True for text (categorical) columns; the outcome column is excluded from the features
onehotMask = [x == str for x in ftypes]
onehotMask[label_ind] = False
tmpData = data.loc[:,onehotMask]

## format for LGPR

In [31]:
def categorizeFeature(x):
    """Encode a categorical vector as integer codes in a single column.

    Values are compared by their string form, so every missing value (NaN)
    falls into one shared 'nan' category.  Codes are assigned in sorted order
    of the string forms, which makes the encoding reproducible across runs
    (iteration order of a set of strings is not).

    Parameters
    ----------
    x : iterable
        Raw category values of one feature.

    Returns
    -------
    numpy.ndarray of shape (len(x), 1), or None
        The integer codes, or None when the feature has at most one distinct
        value and therefore carries no information.
    """
    keys = [str(each) for each in x]
    idmap = {k: i for i, k in enumerate(sorted(set(keys)))}
    if len(idmap) > 1:
        return np.array([idmap[k] for k in keys]).reshape(-1, 1)
    return None

In [32]:
# Integer-encode every categorical column of tmpData.  Columns for which
# categorizeFeature returns None (at most one distinct value) are skipped.
colnames = []
tmpData_np = np.array(tmpData.values)
encodedCols = []
for colIdx, colName in enumerate(tmpData.columns.values):
    r = categorizeFeature(tmpData_np[:, colIdx])
    if r is not None:
        encodedCols.append(r)
        colnames.append(colName)
# join once at the end instead of re-copying the growing matrix per column;
# stays None when no column was kept
cateRes = np.concatenate(encodedCols, axis=1) if encodedCols else None
colnames = np.array(colnames)
# positions of the categorical columns (they come first in the design matrix)
cateCols = np.arange(len(colnames))

In [33]:
# enable_iterative_imputer has to be imported before IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# continuous block: every column outside the one-hot mask, minus the subject id and the outcome
contMask = ~np.array(onehotMask)
tmpContData = data.loc[:, contMask].drop(columns=['RESPONDNT ID NUMBER','GENERAL HAPPINESS_labels'])

# standardize first, then impute the missing values in the continuous features
stdModel = StandardScaler()
stdRes = stdModel.fit_transform(np.array(tmpContData))
# NOTE(review): confirm the imputer keeps every column (e.g. all-NaN ones),
# otherwise stdRes and colnames fall out of step
imp = IterativeImputer(max_iter=30, random_state=0)
stdRes = imp.fit_transform(stdRes)

# continuous names follow the categorical ones; contCols are their positions
nCont = len(tmpContData.columns.values)
colnames = np.concatenate([colnames, tmpContData.columns.values])
contCols = np.arange(len(cateCols), len(cateCols) + nCont)

In [34]:
# binary outcome: 1 for 'PRETTY HAPPY' / 'VERY HAPPY', 0 for every other value
# (any other answer code and missing entries included)
labels = data['GENERAL HAPPINESS_labels']
positive = ('PRETTY HAPPY', 'VERY HAPPY')
y = np.array([1 if each in positive else 0 for each in labels])

In [37]:
# design matrix: integer-coded categorical columns first, standardized continuous columns after
X = np.concatenate((cateRes, stdRes), axis=1)
# per-record subject id and observation id (survey year)
oidCol = data['GSS YEAR FOR THIS RESPONDENT'].values
idCol = data['RESPONDNT ID NUMBER'].values

def getLastOfSplit(x):
    """Return the part of `x` after its last underscore (all of `x` if it has none)."""
    return x.rsplit('_', 1)[-1]

def filterLogic(x):
    """Return False for placeholder values, True for everything else.

    A value is a placeholder when it is exactly 'IAP', 'DK' or 'NONE', or
    when it contains the substring 'IAP' or 'ISSP'.
    """
    if x in ('IAP', 'DK', 'NONE'):
        return False
    return not ('IAP' in x or 'ISSP' in x)
# Drop every column whose name suffix (text after the last '_') is rejected by filterLogic.
v = [filterLogic(getLastOfSplit(x)) for x in colnames]
keepCols = np.array(v, dtype=bool)
# cateCols / contCols hold positions in the UNFILTERED matrix; remap them to
# the filtered layout so they still point at the right columns of X.
newPos = np.cumsum(keepCols) - 1  # new position of each surviving column
cateCols = newPos[cateCols[keepCols[cateCols]]]
contCols = newPos[contCols[keepCols[contCols]]]
X = X[:, keepCols]
colnames = colnames[keepCols]

In [40]:
# display the (rows, columns) shape of the LGPR-format matrix X
X.shape

(1500, 212)

In [39]:
# build a small dataset: keep only the records of the `top` most frequent subject ids
from collections import Counter
top = 50
ct = Counter(idCol)
targetIds = {subjectId for subjectId, _ in ct.most_common(top)}
mask = [x in targetIds for x in idCol]
# apply the same row mask to ids, observation ids, features and labels
idCol, oidCol = idCol[mask], oidCol[mask]
X, y = X[mask], y[mask]

## format for other algorithms

In [55]:
# one-hot encode the categorical columns; values are cast to str before encoding
onehotModel = OneHotEncoder()
onehotRes = onehotModel.fit_transform(np.array(tmpData.values).astype(str))
# name every indicator column "<source column>_<category>"
colnames = []
for sourceCol, categories in zip(tmpData.columns.values, onehotModel.categories_):
    colnames.extend(f'{sourceCol}_{x}' for x in categories)

In [56]:
# preview the first rows of the loaded table
data.head()

Unnamed: 0,GSS YEAR FOR THIS RESPONDENT,RESPONDNT ID NUMBER,LABOR FORCE STATUS_labels,R SELF-EMP OR WORKS FOR SOMEBODY_labels,MARITAL STATUS_labels,EVER BEEN DIVORCED OR SEPARATED_labels,SPOUSE LABOR FORCE STATUS_labels,SPOUSE SELF-EMP. OR WORKS FOR SOMEBODY_labels,FATHER SELF-EMP. OR WORKED FOR SOMEBODY_labels,RS HIGHEST DEGREE_labels,...,2ND MENTIONED COUNTRY OF SPOUSES ORIGIN_labels,3RD MENTIONED COUNTRY OF SPOUSES ORIGIN_labels,YEARS IN ARMED FORCES_labels,TAKE ACTIVE PART IN WORLD AFFAIRS_labels,REMAIN IN U.N. OR PULL OUT_labels,FEELINGS ABOUT COMMUNISM_labels,Weight deal with experimental randomization,SAMPLING FRAME AND METHOD_labels,WEIGHTS FOR BLACK OVERSAMPLES,Interviews Conducted in Spanish or English_labels
0,1972.0,1.0,WORKING FULLTIME,SOMEONE ELSE,NEVER MARRIED,IAP,IAP,IAP,SOMEONE ELSE,BACHELOR,...,UNCODEABLE & IAP,UNCODEABLE & IAP,IAP,IAP,IAP,IAP,1.0,1960 BQ,1.0,ENGLISH
1,1972.0,2.0,RETIRED,SOMEONE ELSE,MARRIED,NO,KEEPING HOUSE,IAP,SELF-EMPLOYED,LT HIGH SCHOOL,...,UNCODEABLE & IAP,UNCODEABLE & IAP,IAP,IAP,IAP,IAP,1.0,1960 BQ,1.0,ENGLISH
2,1972.0,3.0,WORKING PARTTIME,SOMEONE ELSE,MARRIED,NO,WORKING FULLTIME,SOMEONE ELSE,SOMEONE ELSE,HIGH SCHOOL,...,UNCODEABLE & IAP,UNCODEABLE & IAP,IAP,IAP,IAP,IAP,1.0,1960 BQ,1.0,ENGLISH
3,1972.0,4.0,WORKING FULLTIME,SOMEONE ELSE,MARRIED,NO,WORKING FULLTIME,SOMEONE ELSE,SOMEONE ELSE,BACHELOR,...,UNCODEABLE & IAP,UNCODEABLE & IAP,IAP,IAP,IAP,IAP,1.0,1960 BQ,1.0,ENGLISH
4,1972.0,5.0,KEEPING HOUSE,SOMEONE ELSE,MARRIED,NO,TEMP NOT WORKING,SOMEONE ELSE,SOMEONE ELSE,HIGH SCHOOL,...,UNCODEABLE & IAP,UNCODEABLE & IAP,IAP,IAP,IAP,IAP,1.0,1960 BQ,1.0,ENGLISH


In [57]:
# enable_iterative_imputer has to be imported before IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# continuous columns = those outside the one-hot mask, without the subject id and the outcome
notOnehot = ~np.array(onehotMask)
tmpContData = data.loc[:, notOnehot].drop(columns=['RESPONDNT ID NUMBER','GENERAL HAPPINESS_labels'])

# scale, then fill the missing values in the continuous features
stdModel = StandardScaler()
stdRes = stdModel.fit_transform(np.array(tmpContData))
# NOTE(review): confirm the imputer keeps every column (e.g. all-NaN ones),
# otherwise stdRes and colnames fall out of step
imp = IterativeImputer(max_iter=30, random_state=0)
stdRes = imp.fit_transform(stdRes)

# append the continuous names after the one-hot indicator names
colnames = np.concatenate([colnames, tmpContData.columns.values])

In [58]:
# binary outcome: 1 when the answer is 'PRETTY HAPPY' or 'VERY HAPPY',
# 0 for anything else (missing entries included)
labels = data['GENERAL HAPPINESS_labels']
happyAnswers = ('PRETTY HAPPY', 'VERY HAPPY')
y = np.array([1 if each in happyAnswers else 0 for each in labels])

In [59]:
# design matrix: dense one-hot indicators first, standardized continuous columns after
X = np.concatenate((onehotRes.toarray(), stdRes), axis=1)
# per-record subject id and observation id (survey year)
oidCol = data['GSS YEAR FOR THIS RESPONDENT'].values
idCol = data['RESPONDNT ID NUMBER'].values

def getLastOfSplit(x):
    """Return the text following the final '_' in `x`, or `x` itself when there is no '_'."""
    _, _, tail = x.rpartition('_')
    return tail

def filterLogic(x):
    """Tell whether `x` is a real value rather than a placeholder.

    Returns False when `x` is exactly 'IAP', 'DK' or 'NONE', or contains
    'IAP' or 'ISSP' as a substring; returns True otherwise.
    """
    isExactPlaceholder = x == 'IAP' or x == 'DK' or x == 'NONE'
    if isExactPlaceholder:
        return False
    containsPlaceholder = 'IAP' in x or 'ISSP' in x
    return not containsPlaceholder
# keep only the columns whose name suffix (text after the last '_') passes filterLogic
v = [filterLogic(getLastOfSplit(x)) for x in colnames]
colnames = colnames[v]
X = X[:,v]

In [61]:
# display the (rows, columns) shape of the one-hot-format matrix X
X.shape

(59599, 1392)

In [14]:
# reduce the data to the records of the `top` subject ids with the most records
from collections import Counter
top = 50
ct = Counter(idCol)
mostFrequent = ct.most_common()[:top]
targetIds = {pair[0] for pair in mostFrequent}
mask = [x in targetIds for x in idCol]
# the same row selection is applied to all four aligned arrays
X, y = X[mask], y[mask]
idCol, oidCol = idCol[mask], oidCol[mask]

In [41]:
from collections import defaultdict

def splitTrainTest(ids, time, prop=.7, seed = 19):
    """Split record positions into train/test with a per-subject time ordering.

    Each record is first assigned to the training side with probability
    `prop`, using the global NumPy RNG re-seeded with `seed`.  For every
    subject that has records on both sides, the latest training record is
    swapped with the earliest test record until no training time is greater
    than a test time.  The number of records per subject on each side is not
    changed by the swaps.

    Parameters
    ----------
    ids : sequence
        Subject id of every record; must be hashable and integer-indexable
        (list or 1-D array).
    time : sequence
        Observation time of every record, aligned with `ids`.
    prop : float
        Probability that a record is initially assigned to training.
    seed : int
        Seed passed to ``np.random.seed`` (this resets the global RNG state).

    Returns
    -------
    (train, test) : tuple of list of int
        Positions into `ids`, grouped by subject.  Both lists are empty for
        empty input.
    """
    np.random.seed(seed)
    # one draw per record, in record order
    mask = np.array([np.random.uniform() < prop for _ in ids], dtype=bool)
    trainDict = defaultdict(list)
    testDict = defaultdict(list)

    # group (time, position) pairs by subject on each side
    for i, m in enumerate(mask):
        side = trainDict if m else testDict
        side[ids[i]].append((time[i], i))

    for k, v in list(trainDict.items()):
        tv = testDict.get(k, None)
        if tv is None:
            # subject only appears in training: nothing to reorder
            continue
        trainV = [pair[0] for pair in v]
        trainVid = [pair[1] for pair in v]
        testV = [pair[0] for pair in tv]
        testVid = [pair[1] for pair in tv]
        maxId = np.argmax(trainV)
        minId = np.argmin(testV)
        # exchange the latest training record with the earliest test record
        # until training no longer post-dates testing
        while trainV[maxId] > testV[minId]:
            trainV[maxId], testV[minId] = testV[minId], trainV[maxId]
            trainVid[maxId], testVid[minId] = testVid[minId], trainVid[maxId]
            maxId = np.argmax(trainV)
            minId = np.argmin(testV)
        trainDict[k] = list(zip(trainV, trainVid))
        testDict[k] = list(zip(testV, testVid))

    train = [pair[1] for v in trainDict.values() for pair in v]
    test = [pair[1] for v in testDict.values() for pair in v]
    return train, test

def getIndvFixFeature(ids,x):
    """Flag the columns of `x` whose value never changes within any subject.

    Parameters
    ----------
    ids : iterable
        Subject id of every row of `x` (hashable values).
    x : numpy.ndarray, 2-D
        Feature matrix, one row per record.

    Returns
    -------
    numpy.ndarray of bool, shape (x.shape[1],)
        True where every subject has one constant value in that column.
        A column containing NaN for a subject counts as changing, since
        NaN never compares equal.
    """
    rowsBySubject = defaultdict(list)
    for i, k in enumerate(ids):
        rowsBySubject[k].append(x[i])
    noChange = np.repeat(True, x.shape[1])
    for rows in rowsBySubject.values():
        block = np.array(rows)
        # compare element-wise against the subject's first row; summing the
        # differences would miss changes that cancel out (e.g. 0, 1, -1)
        noChange &= np.all(block == block[0], axis=0)
    return noChange

from scipy import io
def generate(seed = 19,density = 0.7,name = 'gss'):
    """Split the current module-level dataset and save it as a MATLAB .mat file.

    The split comes from splitTrainTest on the module-level `idCol`, using the
    'GSS YEAR FOR THIS RESPONDENT' column of `X` as the time axis, `density`
    as the training proportion and `seed` as the RNG seed.

    Parameters
    ----------
    seed : int
        Seed for the split; also part of the output file name.
    density : float
        Passed to splitTrainTest as the training proportion.
    name : str
        'gssLGPR' writes ``../gssLGPR_<seed>`` and additionally stores
        `cateCols` and `contCols`; any other value writes ``../gss_<seed>``.
    """
    yearPos = np.where(colnames == 'GSS YEAR FOR THIS RESPONDENT')[0]
    trainIdx, testIdx = splitTrainTest(idCol, X[:,yearPos].reshape(-1),density,seed)
    payload = {
        'trainId': idCol[trainIdx],
        'trainOid': oidCol[trainIdx],
        'trainX': X[trainIdx],
        'trainY': y[trainIdx],
        'testId': idCol[testIdx],
        'testOid': oidCol[testIdx],
        'testX': X[testIdx],
        'testY': y[testIdx],
        'colname': colnames,
    }
    if name == 'gssLGPR':
        # the LGPR format also records which columns are categorical / continuous
        payload['cateCols'] = cateCols
        payload['contCols'] = contCols
        target = f'../gssLGPR_{seed}'
    else:
        target = f'../gss_{seed}'
    io.savemat(target, payload)

In [46]:
# write the LGPR-format train/test split for seed 14 (default density) to ../gssLGPR_14
generate(seed = 14,name='gssLGPR')