In [None]:
import math
import os
import sys
import time

import pandas as pd
from scipy.stats import uniform

from sklearn import cross_validation
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.svm import SVR

In [None]:
#Change to work with your setup
RESULTS_FILE_NAME = "data/prediction_results.csv" #evaluation metrics are read from / written to this CSV
GROUND_TRUTH_FILE = "data/data_analytics_cleaned.csv" #ground truth personality and gender in here
#Folder (including the trailing slash) that holds the features_<name>.csv image feature files.
#The image loading cell reads this name, which was not defined anywhere before.
#NOTE(review): placeholder path chosen to sit next to data/text_features/ -- confirm for your setup
IMAGE_FEATURE_FOLDER = "data/image_features/"

In [None]:
#RUN IF YOU DON'T HAVE RESULTS YET
#Empty results table in long format: write_results adds one row per metric
#('r2', 'rmse', 'absError') for every (label, classifier, trait) combination
results = pd.DataFrame(columns=['label','classifier','trait','metric','result'])

In [None]:
#LOAD EXISTING RESULTS
#Only read the file when it exists, so that a fresh setup (nothing written yet) starts from
#an empty table instead of failing when the notebook is run top to bottom.
if os.path.isfile(RESULTS_FILE_NAME):
    results = pd.read_csv(RESULTS_FILE_NAME)
    #write_results saves the index as an unnamed first column; drop it when it is there
    if "Unnamed: 0" in results.columns:
        results = results.drop("Unnamed: 0",axis=1)
else:
    print("No results file at",RESULTS_FILE_NAME,"- starting with an empty results table")
    results = pd.DataFrame(columns=['label','classifier','trait','metric','result'])

In [None]:
def predict(df, df_label, groundTruth, results, labels, linearRegression = True, svr = True, randomForest = True,
           randomForest_pca = True, svr_pca = True, linearRegression_pca = True):
    """Evaluate every enabled classifier on each of the five bfi_* traits for one feature set.

    The `labels` columns plus 'userid' are taken from `df`, merged with the id/trait columns
    of `groundTruth` (pandas joins on the shared column names) and rows containing any NaN
    are dropped. predict_one is then called once per enabled classifier index and trait.

    Classifier indices handed to predict_one:
        0 linear regression, 1 SVR, 2 random forest,
        3 random forest + PCA, 4 SVR + PCA, 5 linear regression + PCA.

    Args:
        df: feature DataFrame containing the `labels` columns and 'userid'.
        df_label: name of the feature set, stored with every result.
        groundTruth: DataFrame with 'userid' and the five bfi_* columns.
        results: results table that is passed through predict_one.
        labels: list of feature column names to use.
        linearRegression, svr, randomForest, randomForest_pca, svr_pca,
        linearRegression_pca: switch the corresponding classifier index on or off.

    Returns:
        Whatever the last predict_one call returned (`results` itself if nothing was enabled).
    """
    full_start = time.time()
    id_and_traits = ['userid','bfi_o','bfi_e','bfi_n','bfi_a','bfi_c']

    #Selected features joined with the ground truth, without rows that have missing values
    merged = df[labels+['userid']].merge(groundTruth[id_and_traits]).dropna(how='any')

    #(classifier index, enabled?, text printed before each run) for all six configurations
    configurations = [
        (0, linearRegression, "Linear regression, no PCA"),
        (1, svr, "SVR, no PCA"),
        (2, randomForest, "Random Forest, no PCA"),
        (3, randomForest_pca, "Random Forest, PCA"),
        (4, svr_pca, "SVR, PCA"),
        (5, linearRegression_pca, "Linear regression, PCA"),
    ]

    for clfIndex, enabled, description in configurations:
        if not enabled:
            continue

        for trait in id_and_traits[1:]:
            print(description,trait)

            start = time.time()
            results = predict_one(merged,df_label,clfIndex,trait,results,labels)
            print("One prediction took",time.time()-start,"seconds")

    print("Full prediction routine took",time.time()-full_start,"seconds")
    return results

In [None]:
def predict_one(df, df_label, clfIndex, trait, results,labels):
    """Cross-validate one classifier configuration on one trait and record the metrics.

    Args:
        df: merged DataFrame holding the feature columns plus 'userid' and the five bfi_* columns.
        df_label: name of the feature set, forwarded to write_results.
        clfIndex: 0/5 linear regression, 1/4 SVR, 2/3 random forest; 3, 4 and 5 additionally
            reduce the features with PCA.
        trait: name of the bfi_* column to predict.
        results: results table handed to write_results.
        labels: feature column names, forwarded to the tuning helpers.

    Returns:
        The value returned by write_results, or `results` unchanged when fitting or
        predicting raised an ordinary exception (the error is printed).

    NOTE(review): the value stored as 'absError' is the absolute error summed over all test
    samples and divided by the number of folds, not a per-sample mean. Kept as is so that
    new rows stay comparable with previously saved results -- confirm this is intended.
    """
    dropped = ['userid','bfi_o','bfi_e','bfi_n','bfi_a','bfi_c']
    n_folds = 5
    uses_svr = clfIndex == 1 or clfIndex == 4
    uses_pca = clfIndex == 3 or clfIndex == 4 or clfIndex == 5

    if clfIndex == 0 or clfIndex == 5:
        clf = LinearRegression(n_jobs=-1)
    elif uses_svr:
        #The SVR is created inside every fold because its hyperparameters are tuned per fold
        clf = None
    else: #clfIndex == 2 or clfIndex == 3
        clf = RandomForestRegressor(max_features=0.33, n_jobs=-1, random_state=42, n_estimators=100)

    total_r2 = 0
    total_rmse = 0
    total_absError = 0

    #Divide data into folds
    kf = cross_validation.KFold(len(df), n_folds=n_folds, shuffle=True, random_state=42)
    for train, test in kf: #cross validation to estimate accuracy
        #Train data = all other folds
        trainMat = df.iloc[train]
        trainMat_text = trainMat.drop(dropped,axis=1)
        trainMat_target = trainMat[trait]

        #Test data = selected fold
        testMat = df.iloc[test]
        testMat_text = testMat.drop(dropped,axis=1)
        testMat_target = testMat[trait]

        #Fine-tune the hyperparameters of SVR on the training part only
        if uses_svr:
            C, epsilon, gamma = calculate_svr_params(trainMat,labels,trait)
            clf = SVR(kernel='rbf', C=C, epsilon=epsilon, gamma=gamma)

        #PCA: number of components is chosen on the training part only; 0 or -1 means no PCA
        if uses_pca:
            pca_components = calculate_num_pca(trainMat,clfIndex,labels,trait)
            print("pca_components",pca_components)
            if pca_components > 0:
                pca = PCA(n_components=pca_components)
                trainMat_text = pca.fit_transform(trainMat_text)
                testMat_text = pca.transform(testMat_text)

        try:
            #Train classifier on training data
            clf.fit(trainMat_text,trainMat_target)

            #Run classifier on test data
            predicted = clf.predict(testMat_text)
        except Exception as error:
            #Best effort: skip this configuration, but let KeyboardInterrupt/SystemExit through
            print("Unexpected error while training classifier or predicting:", type(error), error)
            return results

        #Evaluate how well classifier performed, save evaluation
        fold_r2 = r2_score(testMat_target,predicted)
        total_r2 += fold_r2
        total_rmse += math.sqrt(mean_squared_error(testMat_target,predicted))

        print('r2',fold_r2)

        #Pairwise iteration instead of rebuilding list(testMat_target) for every sample
        for actual, estimate in zip(testMat_target, predicted):
            total_absError += abs(actual-estimate)

    #Report average performance of the classifier (across the folds)
    return write_results(results, df_label, clfIndex, trait, total_r2 / n_folds, total_rmse / n_folds,
                         total_absError / n_folds)

In [None]:
def write_results(results, df_label, clfIndex, trait, r2, rmse, absError):
    """Append the three metric rows of one evaluation to `results` and save the table.

    Args:
        results: DataFrame with the columns label, classifier, trait, metric, result.
        df_label: name of the feature set.
        clfIndex: index of the classifier configuration.
        trait: name of the predicted trait column.
        r2, rmse, absError: metric values, stored under 'r2', 'rmse' and 'absError'.

    Returns:
        A new DataFrame (fresh 0..n-1 index) with the three rows added. The same table is
        written to RESULTS_FILE_NAME, including its index column.
    """
    new_rows = pd.DataFrame(
        [{'label':df_label,'classifier':clfIndex,'trait':trait,'metric':metric,'result':value}
         for metric, value in (('r2',r2),('rmse',rmse),('absError',absError))],
        columns=['label','classifier','trait','metric','result'])

    #DataFrame.append no longer exists in pandas 2.0; a single concat works on old and new
    #pandas alike and copies the table once instead of three times
    results = pd.concat([results, new_rows], ignore_index=True)
    results.to_csv(RESULTS_FILE_NAME)

    return results

In [None]:
def calculate_svr_params(trainMat,labels,trait):
    """Pick C, epsilon and gamma for an RBF SVR by randomized search on the training data.

    Args:
        trainMat: training DataFrame with the feature columns plus 'userid' and the bfi_* columns.
        labels: not used by this function.
        trait: name of the bfi_* column used as regression target.

    Returns:
        Tuple (C, epsilon, gamma) taken from the best parameters found by the search.
    """
    #http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7280664
    id_and_traits = ['userid','bfi_o','bfi_e','bfi_n','bfi_a','bfi_c']
    features = trainMat.drop(id_and_traits,axis=1)
    target = trainMat[trait]

    #Sampling intervals: C in [2^-2, 2^15], gamma in [2^-15, 2^3], epsilon in [0, 2^2]
    c_low, c_high = 2 ** -2, 2 ** 15
    gamma_low, gamma_high = 2 ** -15, 2 ** 3
    search_space = {
        'C': uniform(loc=c_low,scale=c_high-c_low),
        'gamma': uniform(loc=gamma_low,scale=gamma_high-gamma_low),
        'epsilon': uniform(loc=0,scale=2 ** 2),
    }

    folds = cross_validation.KFold(len(trainMat), n_folds=5, shuffle=True, random_state=42)
    search = RandomizedSearchCV(SVR(kernel='rbf'), param_distributions=search_space,
                                n_iter=40, cv=folds, random_state=42, verbose=False)
    search.fit(features,target)

    best = search.best_params_
    print(best)

    return (best['C'],best['epsilon'],best['gamma'])

In [None]:
def calculate_num_pca(trainMat,clfIndex,labels,trait):
    """Choose the number of PCA components by 5-fold cross-validation on the training data.

    Candidates are 0, 1000, 2000, ... below len(labels); 0 stands for "no PCA". For every
    candidate the r2 scores of the five dev folds are summed and the candidate with the
    highest sum wins (the first one on ties).

    Args:
        trainMat: training DataFrame with the feature columns plus 'userid' and the bfi_* columns.
        clfIndex: 0/5 linear regression, 1/4 SVR (default parameters), otherwise random forest.
        labels: feature names; only their count is used, to bound the candidate list.
        trait: name of the bfi_* column used as regression target.

    Returns:
        The best candidate, or -1 when there is no candidate (empty `labels`) or when fitting
        or predicting raised an ordinary exception (the error is printed).
    """
    dropped = ['userid','bfi_o','bfi_e','bfi_n','bfi_a','bfi_c']
    if clfIndex == 0 or clfIndex == 5:
        clf = LinearRegression(n_jobs=-1)
    elif clfIndex == 1 or clfIndex == 4:
        clf = SVR()
    else: #clfIndex == 2 or clfIndex == 3
        clf = RandomForestRegressor(max_features=0.33, n_jobs=-1, random_state=42, n_estimators=100)

    #None means "no candidate evaluated yet". r2 can be negative, so a negative number must
    #not double as that marker: otherwise a later, worse candidate replaces a negative best.
    best_r2 = None
    best_pca = -1

    #For each potential PCA
    for pca_components in range(0,len(labels),1000):
        summed_r2 = 0

        #Divide training data in 5 folds
        kf = cross_validation.KFold(len(trainMat), n_folds=5, shuffle=True, random_state=42)
        for sub_train, dev in kf:
            #Sub training data = other four folds
            subTrainMat = trainMat.iloc[sub_train]
            subTrainMat_text = subTrainMat.drop(dropped,axis=1)
            subTrainMat_target = subTrainMat[trait]

            #Dev data = selected fold
            devMat = trainMat.iloc[dev]
            devMat_text = devMat.drop(dropped,axis=1)
            devMat_target = devMat[trait]

            if pca_components > 0:
                pca = PCA(n_components=pca_components)
                subTrainMat_text = pca.fit_transform(subTrainMat_text)
                devMat_text = pca.transform(devMat_text)

            try:
                #Train classifier on training data
                clf.fit(subTrainMat_text,subTrainMat_target)

                #Run classifier on test data
                predicted = clf.predict(devMat_text)
            except Exception as error:
                #Best effort: give up on the search, but let KeyboardInterrupt/SystemExit through
                print("Unexpected error while training classifier or predicting:", type(error), error)
                return -1

            #Evaluate how well classifier performed, save evaluation
            summed_r2 += r2_score(devMat_target,predicted)

        #The sums are not divided by 5: only their order matters for the comparison
        if best_r2 is None or summed_r2 > best_r2:
            best_r2 = summed_r2
            best_pca = pca_components

    return best_pca

In [None]:
def calculate_num_features(trainMat,labels,trait):
    """Choose how many top-ranked features to keep, by 5-fold cross-validation with an SVR.

    Inside every fold the features are ranked with calculate_chi on the sub training data
    only. Candidates are "all features" and 500, 1000, ... below len(labels); the r2 scores
    of a candidate are summed over the folds and the candidate with the highest sum wins.

    NOTE(review): calculate_chi is not defined in the visible part of this notebook; it is
    used as if it returned a DataFrame indexed by feature name with a column named after
    `trait` -- confirm before calling this function.

    Args:
        trainMat: training DataFrame with the feature columns and the `trait` column.
        labels: list of feature names.
        trait: name of the column used as regression target.

    Returns:
        The best number of features, or -1 when there is no candidate or when fitting or
        predicting raised an ordinary exception (the error is printed).
    """
    clf = SVR()

    r2_values = {}

    #Divide training data in 5 folds
    kf = cross_validation.KFold(len(trainMat), n_folds=5, shuffle=True, random_state=42)
    for sub_train, dev in kf:
        #Sub training data = other four folds
        subTrainMat = trainMat.iloc[sub_train]
        subTrainMat_target = subTrainMat[trait]

        #Dev data = selected fold
        devMat = trainMat.iloc[dev]
        devMat_target = devMat[trait]

        #Using sub training data only, rank the features by importance
        subTrainMat_chi = calculate_chi(subTrainMat,labels,trait)
        subTrainMat_chi['feature'] = subTrainMat_chi.index

        chi_sorted = list(subTrainMat_chi.sort_values(trait,ascending=False)['feature'])

        #For each potential value of x
        for x in range(0,len(labels),500):
            #Choose the top x features; x == 0 stands for "use all of them"
            num_features = x if x > 0 else len(labels)

            #Set membership keeps the column filter linear in the number of columns
            keep = set(chi_sorted[:num_features])
            kept_columns = [i for i in subTrainMat.columns.values if str(i) in keep]
            subTrainMat_text = subTrainMat[kept_columns]
            devMat_text = devMat[kept_columns]

            try:
                #Train classifier on training data
                clf.fit(subTrainMat_text,subTrainMat_target)

                #Run classifier on test data
                predicted = clf.predict(devMat_text)
            except Exception as error:
                #Best effort: give up on the search, but let KeyboardInterrupt/SystemExit through
                print("Unexpected error while training classifier or predicting:", type(error), error)
                return -1

            #Accumulate the score of this candidate across the 5 folds
            r2_values[num_features] = r2_values.get(num_features, 0) + r2_score(devMat_target,predicted)

    #The sums are not divided by 5: only their order matters for the comparison

    #Choose best x
    best_x = -1
    best_r2 = -1
    for x,summed_r2 in r2_values.items():
        if best_x == -1 or summed_r2 > best_r2:
            best_x = x
            best_r2 = summed_r2

    return best_x

In [None]:
def predict_baseline(df, groundTruth, results, labels):
    """Evaluate the mean-value baseline on each of the five bfi_* traits.

    Uses the same rows as predict would for this feature set: the `labels` columns plus
    'userid' are merged with the id/trait columns of `groundTruth` and rows containing any
    NaN are dropped. predict_one_baseline is then called once per trait.

    Args:
        df: feature DataFrame containing the `labels` columns and 'userid'.
        groundTruth: DataFrame with 'userid' and the five bfi_* columns.
        results: results table that is passed through predict_one_baseline.
        labels: list of feature column names.

    Returns:
        Whatever the last predict_one_baseline call returned.
    """
    full_start = time.time()
    id_and_traits = ['userid','bfi_o','bfi_e','bfi_n','bfi_a','bfi_c']

    #Selected features joined with the ground truth, without rows that have missing values
    merged = df[labels+['userid']].merge(groundTruth[id_and_traits]).dropna(how='any')

    for trait in id_and_traits[1:]:
        print(trait)

        start = time.time()
        results = predict_one_baseline(merged,trait,results)
        print("One prediction took",time.time()-start,"seconds")

    print("Full prediction routine took",time.time()-full_start,"seconds")
    return results

In [None]:
def predict_one_baseline(df,trait,results):
    """Cross-validate the "always predict the training mean" baseline for one trait.

    Args:
        df: DataFrame containing the `trait` column.
        trait: name of the column to predict.
        results: results table handed to write_results.

    Returns:
        The value returned by write_results; the rows are stored with label 'baseline'
        and classifier index -1.
    """
    r2_total = 0
    rmse_total = 0
    absError_total = 0

    targets = df[trait]
    folds = cross_validation.KFold(len(df), n_folds=5, shuffle=True, random_state=42)
    for train, test in folds: #cross validation to estimate accuracy
        trainMat_target = targets.iloc[train]
        testMat_target = targets.iloc[test]

        #Baseline: always predict the average value of the training matrix
        averageValue = sum(trainMat_target)/len(trainMat_target)
        predicted = [averageValue for _ in range(len(testMat_target))]

        r2_total += r2_score(testMat_target,predicted)
        rmse_total += math.sqrt(mean_squared_error(testMat_target,predicted))

        #Absolute errors are summed over every test sample of every fold
        for actual, guess in zip(testMat_target, predicted):
            absError_total += abs(actual-guess)

    return write_results(results, 'baseline', -1, trait, r2_total / 5, rmse_total / 5, absError_total / 5)

In [None]:
#Names of the text feature sets; each one is read from data/text_features/<name>.csv
text_features = ['unigrams','bigrams','trigrams','pos_unigrams','pos_bigrams','pos_trigrams','char_unigrams',
                'char_bigrams','char_trigrams']
dfs = [] #one DataFrame per entry of text_features, in the same order
labels = [] #for every DataFrame in dfs: the list of its feature column names

for feature in text_features:
    df = pd.read_csv("data/text_features/"+feature+".csv")
    df = df.drop("Unnamed: 0",axis=1)
    dfs.append(df)
    
#Index 6 is 'char_unigrams': its id column is called 'id', so copy it to 'userid'
#(which becomes the last column) and drop the original
dfs[6]['userid'] = dfs[6]['id']
dfs[6] = dfs[6].drop('id',axis=1)

#Index 8 is 'char_trigrams': same id -> userid renaming
dfs[8]['userid'] = dfs[8]['id']
dfs[8] = dfs[8].drop('id',axis=1)

for df in dfs:
    #Every column except the last one is treated as a feature
    #NOTE(review): assumes 'userid' is the last column of every frame -- confirm for the
    #files that are not renamed above
    labels.append(list(df.columns.values[:-1]))
    
#Ground truth table; predict and predict_baseline read 'userid' and the bfi_* columns from it
groundTruth = pd.read_csv(GROUND_TRUTH_FILE)
groundTruth = groundTruth.drop('Unnamed: 0',axis=1)

In [None]:
#Names of the image feature sets; each one is read from <IMAGE_FEATURE_FOLDER>features_<name>.csv
image_features = ['basic','color','faceDetection','objects2','scene','sift','texture']

image_dfs = [] #one DataFrame per entry of image_features, one row per userid
image_labels = [] #for every DataFrame in image_dfs: the list of its feature column names

for feature in image_features:
    #The quotes around the file name pieces were mismatched before, which is a syntax error
    df = pd.read_csv(IMAGE_FEATURE_FOLDER+"features_"+feature+".csv")
    if feature != 'basic':
        df = df.drop("Unnamed: 0",axis=1)
    df['userid'] = df['id']
    df = df.drop("id",axis=1)

    #One row per user: mean of every feature over that user's rows
    df = df.groupby('userid').mean()
    df['userid'] = df.index
    #Keep 'userid' as a plain column only; having it as index name and column at the same
    #time makes newer pandas versions refuse to merge on it as ambiguous
    df = df.reset_index(drop=True)
    df = df.drop('imageNum',axis=1)
    image_dfs.append(df)

    #'userid' was added last, so every column before it is a feature
    image_labels.append(list(df.columns.values[:-1]))

In [None]:
#Baseline run. The baseline only looks at the trait columns; dfs[0]/labels[0] merely decide
#which users remain after the merge with the ground truth and the NaN filtering
results = predict_baseline(dfs[0],groundTruth,results,labels[0])

In [None]:
#Each text feature set on its own. Only plain SVR and plain linear regression are switched
#off; the other flags keep their default of True, so this runs random forest,
#random forest + PCA, SVR + PCA and linear regression + PCA
loop_start = time.time()
for index,feature in enumerate(text_features):
    print('***',feature)
    results = predict(dfs[index], feature, groundTruth, results, labels[index], svr=False, linearRegression=False)
print("Entire loop took",time.time()-loop_start,"seconds")

In [None]:
#Each image feature set on its own, with the same classifier selection as the text loop:
#random forest, random forest + PCA, SVR + PCA and linear regression + PCA
loop_start = time.time()
for index,feature in enumerate(image_features):
    print('***',feature)
    results = predict(image_dfs[index], feature, groundTruth, results, image_labels[index], 
                      svr=False, linearRegression=False)
print("Entire loop took",time.time()-loop_start,"seconds")

In [None]:
#Combine all image feature sets into one frame (pandas joins on the shared column names)
images_all = image_dfs[0].merge(image_dfs[1])
for df in image_dfs[2:]:
    images_all = images_all.merge(df)

#Flat list of every image feature name. It is built as a new list so that image_labels[0]
#is not extended in place, and without a loop variable called `labels`, which would
#overwrite the per-text-feature `labels` list that later cells still index.
images_all_labels = [label for feature_labels in image_labels for label in feature_labels]

In [None]:
#All image features together, stored under the label 'all'. Runs random forest,
#random forest + PCA, SVR + PCA and linear regression + PCA (flags not listed default to True)
results = predict(images_all, 'all', groundTruth, results, images_all_labels, 
                      svr=False, linearRegression=False)

In [None]:
#Combine all text feature sets into one frame, joined on 'userid'
text_all = dfs[0].merge(dfs[1],on='userid')
for df in dfs[2:]:
    text_all = text_all.merge(df,on='userid')

#Every column of the combined frame except 'userid' is a feature
text_all_labels = [i for i in text_all.columns.values if i != 'userid']

In [None]:
#All text features together, stored under the label 'text_all'. Runs random forest,
#random forest + PCA, SVR + PCA and linear regression + PCA (flags not listed default to True)
results = predict(text_all, 'text_all', groundTruth, results, text_all_labels, 
                      svr=False, linearRegression=False)

In [None]:
#Text and image features in one frame, joined on 'userid'
all_features = text_all.merge(images_all,on='userid')
#Every column of the combined frame except 'userid' is a feature
all_labels = [i for i in all_features.columns.values if i != 'userid']

In [None]:
#Text and image features together, stored under the label 'text_image_all'. Runs random forest,
#random forest + PCA, SVR + PCA and linear regression + PCA (flags not listed default to True)
results = predict(all_features, 'text_image_all', groundTruth, results, all_labels, 
                      svr=False, linearRegression=False)

In [None]:
#Each text feature set on its own. Plain SVR and plain random forest are switched off; the
#other flags keep their default of True, so this runs linear regression,
#random forest + PCA, SVR + PCA and linear regression + PCA.
#`labels` has to be the per-text-feature list built when the text features were loaded
loop_start = time.time()
for index,feature in enumerate(text_features):
    print('***',feature)
    results = predict(dfs[index], feature, groundTruth, results, labels[index], svr=False, randomForest=False)
print("Entire loop took",time.time()-loop_start,"seconds")

In [None]:
#Each image feature set on its own: linear regression, random forest + PCA, SVR + PCA and
#linear regression + PCA (plain SVR and plain random forest are switched off)
loop_start = time.time()
for index,feature in enumerate(image_features):
    print('***',feature)
    results = predict(image_dfs[index], feature, groundTruth, results, image_labels[index], 
                      svr=False, randomForest=False)
print("Entire loop took",time.time()-loop_start,"seconds")

In [None]:
#All image features together (label 'all'): linear regression, random forest + PCA,
#SVR + PCA and linear regression + PCA
results = predict(images_all, 'all', groundTruth, results, images_all_labels, 
                      svr=False, randomForest=False)

In [None]:
#All text features together (label 'text_all'): linear regression, random forest + PCA,
#SVR + PCA and linear regression + PCA
results = predict(text_all, 'text_all', groundTruth, results, text_all_labels, 
                      svr=False, randomForest=False)

In [None]:
#Text and image features together (label 'text_image_all'): linear regression,
#random forest + PCA, SVR + PCA and linear regression + PCA
results = predict(all_features, 'text_image_all', groundTruth, results, all_labels, 
                      svr=False, randomForest=False)

In [None]:
#Each text feature set on its own, PCA variants only: random forest + PCA and
#linear regression + PCA (the two flags that are not switched off)
loop_start = time.time()
for index,feature in enumerate(text_features):
    print('***',feature)
    results = predict(dfs[index], feature, groundTruth, results, labels[index], svr=False, randomForest=False,
                     linearRegression=False,svr_pca=False)
print("Entire loop took",time.time()-loop_start,"seconds")

In [None]:
#Each image feature set on its own, PCA variants only: random forest + PCA and
#linear regression + PCA (the two flags that are not switched off)
loop_start = time.time()
for index,feature in enumerate(image_features):
    print('***',feature)
    results = predict(image_dfs[index], feature, groundTruth, results, image_labels[index], 
                      svr=False, randomForest=False, linearRegression=False, svr_pca=False)
print("Entire loop took",time.time()-loop_start,"seconds")

In [None]:
#All image features together (label 'all'): random forest + PCA and linear regression + PCA
results = predict(images_all, 'all', groundTruth, results, images_all_labels, 
                      svr=False, randomForest=False, linearRegression=False,svr_pca=False)

In [None]:
#All text features together (label 'text_all'): random forest + PCA and linear regression + PCA
results = predict(text_all, 'text_all', groundTruth, results, text_all_labels, 
                      svr=False, randomForest=False,linearRegression=False,svr_pca=False)

In [None]:
#Text and image features together (label 'text_image_all'): random forest + PCA and
#linear regression + PCA
results = predict(all_features, 'text_image_all', groundTruth, results, all_labels, 
                      svr=False, randomForest=False, linearRegression=False,svr_pca=False)

In [None]:
#Each image feature set on its own, plain SVR only: every other flag is switched off and
#`svr` keeps its default of True
loop_start = time.time()
for index,feature in enumerate(image_features):
    print('***',feature)
    results = predict(image_dfs[index], feature, groundTruth, results, image_labels[index], 
                      randomForest=False, randomForest_pca=False, linearRegression=False, svr_pca=False,
                      linearRegression_pca=False)
print("Entire loop took",time.time()-loop_start,"seconds")

In [None]:
#Each text feature set on its own, plain SVR only: every other flag is switched off and
#`svr` keeps its default of True
loop_start = time.time()
for index,feature in enumerate(text_features):
    print('***',feature)
    results = predict(dfs[index], feature, groundTruth, results, labels[index],
                      randomForest=False, randomForest_pca=False, linearRegression=False, svr_pca=False,
                      linearRegression_pca=False)
print("Entire loop took",time.time()-loop_start,"seconds")