# In[36]:
import numpy as np
import pandas as pd
import os 

# In[37]:
# Locate the input/output folder: a 'data' directory directly under the
# current working directory. The resulting path ends with a separator so
# that file names can be appended to it directly.
cwd = os.getcwd()
datadir = os.sep.join((cwd, 'data', ''))

def dummyVariables(data):
    """Flag missing values with indicator columns and mean-impute them.

    For every column of ``data`` that contains at least one missing value,
    a companion column named ``<column>_mv`` is added, holding 1 where the
    value was missing and 0 elsewhere. The missing entries of the column
    are then replaced by the mean of its observed values.

    The frame is modified in place and the same object is returned. A frame
    without any missing value is returned untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to process. Labels of affected columns must support
        ``label + '_mv'`` and the columns must have a computable mean.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with indicator columns appended.
    """
    has_missing = data.isna().any()
    # The labels are fixed before the loop starts, so the '_mv' columns
    # added below are never themselves revisited.
    for col in has_missing.index[has_missing]:
        # isna() is used instead of np.isnan because np.isnan raises
        # TypeError on object-dtype columns (e.g. values mixed with None).
        data[col + '_mv'] = np.where(data[col].isna(), 1, 0)
        col_mean = data[col].mean()
        data[col] = data[col].fillna(col_mean)

    return data

def loadSentimentData(fileName):
    """Read the sentiment CSV from ``datadir`` and prepare it for joining.

    The first CSV column becomes the index, the columns ``arousal_mv``,
    ``valence_mv`` and ``label`` are removed, and the index is cast to
    ``int64``.

    Parameters
    ----------
    fileName : str
        Name of the CSV file, relative to ``datadir``.

    Returns
    -------
    pandas.DataFrame
        The remaining sentiment columns, indexed by int64 values.

    Raises
    ------
    KeyError
        If one of the three columns to remove is absent from the file.
    """
    data = pd.read_csv(datadir + fileName, header=0, index_col=0)
    # `axis` has to be given by keyword: pandas >= 2.0 rejects it as a
    # second positional argument to DataFrame.drop.
    data = data.drop(['arousal_mv', 'valence_mv', 'label'], axis=1)
    data.index = data.index.astype('int64')

    return data

def loadUserData(fileName):
    """Read the account CSV from ``datadir`` and keep the selected columns.

    The file is decoded as cp1252. Only the columns listed below are kept,
    ``id`` becomes the index, and the result is passed through
    ``dummyVariables`` before being returned.

    Parameters
    ----------
    fileName : str
        Name of the CSV file, relative to ``datadir``.

    Returns
    -------
    pandas.DataFrame
        The selected account columns indexed by ``id``, as returned by
        ``dummyVariables``.
    """
    wanted = [
        'id',
        'favourites_count', 'followers_count', 'friends_count',
        'listed_count', 'statuses_count',
        'label',
        'default_profile', 'default_profile_image', 'verified',
        'reputation', 'taste',
    ]
    raw = pd.read_csv(datadir + fileName, header=0, encoding="cp1252")
    accounts = raw[wanted].set_index('id')

    return dummyVariables(accounts)

def loadTimingData(fileName):
    """Read the timing CSV from ``datadir``, indexed by ``user_id``.

    The result is passed through ``dummyVariables`` before being returned.

    Parameters
    ----------
    fileName : str
        Name of the CSV file, relative to ``datadir``.

    Returns
    -------
    pandas.DataFrame
        The timing columns indexed by ``user_id``, as returned by
        ``dummyVariables``.
    """
    timing = pd.read_csv(datadir + fileName, header=0).set_index('user_id')

    return dummyVariables(timing)

def loadData(fileNames):
    """Load the three source tables and merge them into one frame.

    The account table is the base; the sentiment and timing tables are
    left-joined onto it by index. Rows that still contain a missing value
    after the joins are dropped, then columns whose every value is zero
    are dropped.

    Parameters
    ----------
    fileNames : sequence of str
        File names in the order: sentiment, account, timing.

    Returns
    -------
    pandas.DataFrame
        The merged, filtered frame.
    """
    sentiment_frame = loadSentimentData(fileNames[0])
    account_frame = loadUserData(fileNames[1])
    timing_frame = loadTimingData(fileNames[2])

    merged = (
        account_frame
        .join(sentiment_frame, how='left')
        .join(timing_frame, how='left')
        .dropna()
    )

    # Keep only the columns that have at least one non-zero entry.
    has_nonzero = merged.ne(0).any(axis=0)
    return merged.loc[:, has_nonzero]
    
# Build the merged table from the sentiment, account and timing files
# (in that order) and write it to 'complete.csv' in the data directory.
data = loadData([
    'sentiment_dist_varol_dump.csv',
    'merged.csv',
    'timing.csv',
])

dataout = datadir + "complete.csv"
data.to_csv(dataout)

