# Data Processing Notebook

In this notebook we showcase data processing steps done unto data upon retrieval from Kaggle.

### Download and Import Data (48.1s)

In [41]:
import os
import shutil
import kagglehub

# Download dataset from kagglehub
path = kagglehub.dataset_download("tunguz/big-five-personality-test")
print(path)
print("Dataset path:", path)

# Specify target directory for data
target_folder = 'data/'
os.makedirs(target_folder, exist_ok=True)

# Check the contents of the dataset path
print("Files in the dataset:", os.listdir(path))

for file_name in os.listdir(path):
    source = os.path.join(path, file_name)
    destination = os.path.join(target_folder, file_name)
    
    # Check if the file exists
    if os.path.exists(source):
        shutil.move(source, destination)
        print(f"Moved {file_name} to {target_folder}")
    else:
        print(f"File does not exist: {source}")

print("Dataset moved to:", target_folder)


/Users/juliann/.cache/kagglehub/datasets/tunguz/big-five-personality-test/versions/1
Dataset path: /Users/juliann/.cache/kagglehub/datasets/tunguz/big-five-personality-test/versions/1
Files in the dataset: []
Dataset moved to: data/


In [42]:
import pandas as pd
import matplotlib.pyplot as plt
import kagglehub
import shutil
import os

### Clean into desired format (~40s)

In [43]:
data = pd.read_csv("../data/IPIP-FFM-data-8Nov2018/data-final.csv")
rows = []
cols = data.columns[0].split('\t')
for index in range(len(data)):
    value = data.iloc[index, 0]
    row = (value.split('\t'))
    rows.append(row)
data = pd.DataFrame(rows, columns=cols)

In [44]:
print(len(data))

1015341


### Normalize Data Types (~90s)

In [45]:
non_numeric_cols = ['dateload', 'country']
numeric_cols = [col for col in data.columns if col not in non_numeric_cols]

# Convert only the selected columns to numeric
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric, errors='coerce')

### Filter for only 1 user per entry (more accurate)

In [46]:
# Filter the data where 'ipc' column is equal to 1
data = data[data['IPC'] == 1]

In [47]:
print(len(data))

696845


### Set up trait score columns

In [48]:
import pandas as pd
import numpy as np

# Define score column names
score_cols = ["O score", "C score", "E score", "A score", "N score"]

# Efficiently add all new columns at once with zero values
data = pd.concat([data, pd.DataFrame(0, index=data.index, columns=score_cols)], axis=1)
data


Unnamed: 0,EXT1,EXT2,EXT3,EXT4,EXT5,EXT6,EXT7,EXT8,EXT9,EXT10,EST1,EST2,EST3,EST4,EST5,EST6,EST7,EST8,EST9,EST10,AGR1,AGR2,AGR3,AGR4,AGR5,AGR6,AGR7,AGR8,AGR9,AGR10,CSN1,CSN2,CSN3,CSN4,CSN5,CSN6,CSN7,CSN8,CSN9,CSN10,OPN1,OPN2,OPN3,OPN4,OPN5,OPN6,OPN7,OPN8,OPN9,OPN10,EXT1_E,EXT2_E,EXT3_E,EXT4_E,EXT5_E,EXT6_E,EXT7_E,EXT8_E,EXT9_E,EXT10_E,EST1_E,EST2_E,EST3_E,EST4_E,EST5_E,EST6_E,EST7_E,EST8_E,EST9_E,EST10_E,AGR1_E,AGR2_E,AGR3_E,AGR4_E,AGR5_E,AGR6_E,AGR7_E,AGR8_E,AGR9_E,AGR10_E,CSN1_E,CSN2_E,CSN3_E,CSN4_E,CSN5_E,CSN6_E,CSN7_E,CSN8_E,CSN9_E,CSN10_E,OPN1_E,OPN2_E,OPN3_E,OPN4_E,OPN5_E,OPN6_E,OPN7_E,OPN8_E,OPN9_E,OPN10_E,dateload,screenw,screenh,introelapse,testelapse,endelapse,IPC,country,lat_appx_lots_of_err,long_appx_lots_of_err,O score,C score,E score,A score,N score
0,4.0,1.0,5.0,2.0,5.0,1.0,5.0,2.0,4.0,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,5.0,2.0,4.0,2.0,3.0,2.0,4.0,3.0,4.0,3.0,4.0,3.0,2.0,2.0,4.0,4.0,2.0,4.0,4.0,5.0,1.0,4.0,1.0,4.0,1.0,5.0,3.0,4.0,5.0,9419.0,5491.0,3959.0,4821.0,5611.0,2756.0,2388.0,2113.0,5900.0,4110.0,6135.0,4150.0,5739.0,6364.0,3663.0,5070.0,5709.0,4285.0,2587.0,3997.0,4750.0,5475.0,11641.0,3115.0,3207.0,3260.0,10235.0,5897.0,1758.0,3081.0,6602.0,5457.0,1569.0,2129.0,3762.0,4420.0,9382.0,5286.0,4983.0,6339.0,3146.0,4067.0,2959.0,3411.0,2170.0,4920.0,4436.0,3116.0,2992.0,4354.0,2016-03-03 02:01:01,768.0,1024.0,9.0,234.0,6,1,GB,51.5448,0.1991,0,0,0,0,0
1,3.0,5.0,3.0,4.0,3.0,3.0,2.0,5.0,1.0,5.0,2.0,3.0,4.0,1.0,3.0,1.0,2.0,1.0,3.0,1.0,1.0,4.0,1.0,5.0,1.0,5.0,3.0,4.0,5.0,3.0,3.0,2.0,5.0,3.0,3.0,1.0,3.0,3.0,5.0,3.0,1.0,2.0,4.0,2.0,3.0,1.0,4.0,2.0,5.0,3.0,7235.0,3598.0,3315.0,2564.0,2976.0,3050.0,4787.0,3228.0,3465.0,3309.0,9036.0,2406.0,3484.0,3359.0,3061.0,2539.0,4226.0,2962.0,1799.0,1607.0,2158.0,2090.0,2143.0,2807.0,3422.0,5324.0,4494.0,3627.0,1850.0,1747.0,5163.0,5240.0,7208.0,2783.0,4103.0,3431.0,3347.0,2399.0,3360.0,5595.0,2624.0,4985.0,1684.0,3026.0,4742.0,3336.0,2718.0,3374.0,3096.0,3019.0,2016-03-03 02:01:20,1360.0,768.0,12.0,179.0,11,1,MY,3.1698,101.7060,0,0,0,0,0
2,2.0,3.0,4.0,4.0,3.0,2.0,1.0,3.0,2.0,5.0,4.0,4.0,4.0,2.0,2.0,2.0,2.0,2.0,1.0,3.0,1.0,4.0,1.0,4.0,2.0,4.0,1.0,4.0,4.0,3.0,4.0,2.0,2.0,2.0,3.0,3.0,4.0,2.0,4.0,2.0,5.0,1.0,2.0,1.0,4.0,2.0,5.0,3.0,4.0,4.0,4657.0,3549.0,2543.0,3335.0,5847.0,2540.0,4922.0,3142.0,14621.0,2191.0,5128.0,3675.0,3442.0,4546.0,8275.0,2185.0,2164.0,1175.0,3813.0,1593.0,1089.0,2203.0,3386.0,1464.0,2562.0,1493.0,3067.0,13719.0,3892.0,4100.0,4286.0,4775.0,2713.0,2813.0,4237.0,6308.0,2690.0,1516.0,2379.0,2983.0,1930.0,1470.0,1644.0,1683.0,2229.0,8114.0,2043.0,6295.0,1585.0,2529.0,2016-03-03 02:01:56,1366.0,768.0,3.0,186.0,7,1,GB,54.9119,-1.3833,0,0,0,0,0
3,2.0,2.0,2.0,3.0,4.0,2.0,2.0,4.0,1.0,4.0,3.0,3.0,3.0,2.0,3.0,2.0,2.0,2.0,4.0,3.0,2.0,4.0,3.0,4.0,2.0,4.0,2.0,4.0,3.0,4.0,2.0,4.0,4.0,4.0,1.0,2.0,2.0,3.0,1.0,4.0,4.0,2.0,5.0,2.0,3.0,1.0,4.0,4.0,3.0,3.0,3996.0,2896.0,5096.0,4240.0,5168.0,5456.0,4360.0,4496.0,5240.0,4000.0,3736.0,4616.0,3015.0,2711.0,3960.0,4064.0,4208.0,2936.0,7336.0,3896.0,6062.0,11952.0,1040.0,2264.0,3664.0,3049.0,4912.0,7545.0,4632.0,6896.0,2824.0,520.0,2368.0,3225.0,2848.0,6264.0,3760.0,10472.0,3192.0,7704.0,3456.0,6665.0,1977.0,3728.0,4128.0,3776.0,2984.0,4192.0,3480.0,3257.0,2016-03-03 02:02:02,1920.0,1200.0,186.0,219.0,7,1,GB,51.7500,-1.2500,0,0,0,0,0
5,3.0,3.0,4.0,2.0,4.0,2.0,2.0,3.0,3.0,4.0,3.0,4.0,3.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,3.0,1.0,4.0,2.0,3.0,2.0,3.0,4.0,4.0,3.0,2.0,4.0,1.0,3.0,2.0,4.0,3.0,4.0,3.0,5.0,1.0,5.0,1.0,3.0,1.0,5.0,4.0,5.0,2.0,4834.0,5064.0,1160.0,2664.0,6711.0,3344.0,2512.0,6264.0,6992.0,4592.0,2808.0,1776.0,3280.0,4520.0,2640.0,5408.0,3647.0,3183.0,1575.0,672.0,6375.0,4727.0,3775.0,1647.0,1233.0,8694.0,2904.0,2152.0,2856.0,2848.0,4288.0,4360.0,7328.0,3976.0,7895.0,2640.0,1760.0,5720.0,9032.0,3928.0,2104.0,5488.0,3656.0,4352.0,2681.0,3272.0,2640.0,1568.0,1640.0,3192.0,2016-03-03 02:03:12,1600.0,1000.0,4.0,196.0,3,1,SE,59.3333,18.0500,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1015334,4.0,1.0,4.0,2.0,4.0,3.0,3.0,2.0,3.0,2.0,2.0,4.0,3.0,2.0,5.0,2.0,4.0,3.0,1.0,2.0,1.0,4.0,4.0,5.0,2.0,5.0,3.0,4.0,5.0,3.0,3.0,2.0,3.0,4.0,3.0,2.0,4.0,3.0,4.0,4.0,3.0,3.0,5.0,3.0,4.0,2.0,5.0,2.0,3.0,4.0,4916.0,2861.0,3960.0,3368.0,2828.0,2814.0,2315.0,3202.0,5529.0,2585.0,3309.0,4060.0,2599.0,10593.0,2545.0,4542.0,2050.0,3853.0,6665.0,1994.0,4729.0,3517.0,2232.0,6358.0,5090.0,1822.0,11459.0,8763.0,4244.0,5472.0,6835.0,3280.0,4365.0,2884.0,2605.0,3799.0,2799.0,2192.0,1256.0,5690.0,4615.0,6676.0,2559.0,4325.0,1792.0,4317.0,4452.0,2053.0,6474.0,3851.0,2018-11-08 12:02:14,2560.0,1080.0,36.0,209.0,16,1,GB,52.4189,-1.5054,0,0,0,0,0
1015335,5.0,1.0,5.0,2.0,4.0,2.0,3.0,1.0,5.0,1.0,5.0,2.0,5.0,2.0,5.0,4.0,5.0,5.0,5.0,4.0,1.0,5.0,5.0,4.0,2.0,4.0,2.0,3.0,4.0,5.0,2.0,5.0,4.0,3.0,3.0,4.0,3.0,3.0,2.0,4.0,4.0,1.0,4.0,2.0,3.0,2.0,5.0,3.0,4.0,4.0,11415.0,2950.0,3521.0,4432.0,2166.0,1765.0,4969.0,3181.0,4134.0,2266.0,5985.0,4881.0,1883.0,4266.0,6166.0,1818.0,6268.0,2884.0,2033.0,2669.0,25270.0,1734.0,1862.0,3117.0,5320.0,5248.0,8548.0,5834.0,2534.0,5018.0,5947.0,2484.0,24037.0,3618.0,2129.0,3001.0,2050.0,12751.0,3049.0,6216.0,2117.0,6267.0,1717.0,2834.0,3771.0,3350.0,2552.0,4383.0,5401.0,1615.0,2018-11-08 12:02:38,375.0,667.0,10.0,245.0,9,1,TR,41.0186,28.9647,0,0,0,0,0
1015337,4.0,3.0,4.0,3.0,3.0,3.0,4.0,4.0,3.0,3.0,4.0,3.0,5.0,1.0,5.0,5.0,4.0,4.0,4.0,5.0,2.0,4.0,1.0,4.0,3.0,5.0,3.0,3.0,5.0,3.0,3.0,2.0,3.0,4.0,3.0,3.0,2.0,3.0,2.0,3.0,4.0,1.0,5.0,1.0,5.0,1.0,3.0,4.0,5.0,4.0,2382.0,1984.0,3696.0,1736.0,2272.0,2327.0,2088.0,2296.0,1585.0,3287.0,1752.0,1728.0,1279.0,1768.0,4503.0,1329.0,1640.0,1464.0,2183.0,953.0,2422.0,1448.0,3216.0,6160.0,2208.0,1513.0,2785.0,3833.0,3280.0,1184.0,2096.0,1880.0,3209.0,1744.0,4392.0,1943.0,2263.0,1559.0,1304.0,2176.0,2560.0,6632.0,2312.0,2376.0,2969.0,2271.0,4064.0,1144.0,2936.0,1615.0,2018-11-08 12:07:18,1920.0,1080.0,3.0,122.0,7,1,US,38.0000,-97.0000,0,0,0,0,0
1015339,2.0,4.0,3.0,4.0,2.0,2.0,1.0,4.0,2.0,4.0,4.0,3.0,4.0,2.0,4.0,4.0,2.0,2.0,4.0,4.0,2.0,3.0,2.0,4.0,3.0,4.0,2.0,4.0,4.0,3.0,4.0,2.0,4.0,2.0,2.0,2.0,4.0,2.0,4.0,4.0,5.0,2.0,4.0,2.0,3.0,2.0,4.0,5.0,5.0,3.0,8647.0,2664.0,3544.0,2263.0,2248.0,3024.0,4704.0,3383.0,2488.0,3183.0,2423.0,5807.0,1384.0,2696.0,2800.0,1847.0,4655.0,1232.0,5064.0,4921.0,2982.0,5584.0,2567.0,2168.0,6320.0,3055.0,2580.0,2816.0,2544.0,3744.0,5168.0,3903.0,37726.0,2735.0,1367.0,5056.0,3216.0,3320.0,2263.0,1415.0,5024.0,4664.0,4792.0,6471.0,1873.0,3136.0,3129.0,2799.0,7184.0,2526.0,2018-11-08 12:08:34,1920.0,1080.0,6.0,212.0,8,1,US,34.1067,-117.8067,0,0,0,0,0


### Score traits based on scoring rubic (scource: ipip.ori.org)

Sum survey answers by trait into total score columns (~30 min)

In [49]:
import pandas as pd
from tqdm import tqdm

# Ensure all columns are displayed
pd.set_option("display.max_columns", None)

# Enable tqdm for Pandas `apply()`
tqdm.pandas(desc="Processing rows")

mapping = {'O score': 'OPN', 'C score': 'CSN', 'E score': 'EXT', 'A score': 'AGR', 'N score': 'EST'}
survey_cols = data.columns[:50]  # Limit to first 50 columns

# Define function to sum scores for a row
def compute_scores(row):
    for col, code in mapping.items():
        relevant_cols = [c for c in survey_cols if c.startswith(code)]
        row[col] = row[relevant_cols].sum()
    return row

# Apply function row-wise with progress bar
data = data.progress_apply(compute_scores, axis=1)




Processing rows: 100%|██████████| 696845/696845 [45:21<00:00, 256.08it/s]   


In [50]:
data

Unnamed: 0,EXT1,EXT2,EXT3,EXT4,EXT5,EXT6,EXT7,EXT8,EXT9,EXT10,EST1,EST2,EST3,EST4,EST5,EST6,EST7,EST8,EST9,EST10,AGR1,AGR2,AGR3,AGR4,AGR5,AGR6,AGR7,AGR8,AGR9,AGR10,CSN1,CSN2,CSN3,CSN4,CSN5,CSN6,CSN7,CSN8,CSN9,CSN10,OPN1,OPN2,OPN3,OPN4,OPN5,OPN6,OPN7,OPN8,OPN9,OPN10,EXT1_E,EXT2_E,EXT3_E,EXT4_E,EXT5_E,EXT6_E,EXT7_E,EXT8_E,EXT9_E,EXT10_E,EST1_E,EST2_E,EST3_E,EST4_E,EST5_E,EST6_E,EST7_E,EST8_E,EST9_E,EST10_E,AGR1_E,AGR2_E,AGR3_E,AGR4_E,AGR5_E,AGR6_E,AGR7_E,AGR8_E,AGR9_E,AGR10_E,CSN1_E,CSN2_E,CSN3_E,CSN4_E,CSN5_E,CSN6_E,CSN7_E,CSN8_E,CSN9_E,CSN10_E,OPN1_E,OPN2_E,OPN3_E,OPN4_E,OPN5_E,OPN6_E,OPN7_E,OPN8_E,OPN9_E,OPN10_E,dateload,screenw,screenh,introelapse,testelapse,endelapse,IPC,country,lat_appx_lots_of_err,long_appx_lots_of_err,O score,C score,E score,A score,N score
0,4.0,1.0,5.0,2.0,5.0,1.0,5.0,2.0,4.0,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,5.0,2.0,4.0,2.0,3.0,2.0,4.0,3.0,4.0,3.0,4.0,3.0,2.0,2.0,4.0,4.0,2.0,4.0,4.0,5.0,1.0,4.0,1.0,4.0,1.0,5.0,3.0,4.0,5.0,9419.0,5491.0,3959.0,4821.0,5611.0,2756.0,2388.0,2113.0,5900.0,4110.0,6135.0,4150.0,5739.0,6364.0,3663.0,5070.0,5709.0,4285.0,2587.0,3997.0,4750.0,5475.0,11641.0,3115.0,3207.0,3260.0,10235.0,5897.0,1758.0,3081.0,6602.0,5457.0,1569.0,2129.0,3762.0,4420.0,9382.0,5286.0,4983.0,6339.0,3146.0,4067.0,2959.0,3411.0,2170.0,4920.0,4436.0,3116.0,2992.0,4354.0,2016-03-03 02:01:01,768.0,1024.0,9.0,234.0,6,1,GB,51.5448,0.1991,33.0,32.0,30.0,31.0,24.0
1,3.0,5.0,3.0,4.0,3.0,3.0,2.0,5.0,1.0,5.0,2.0,3.0,4.0,1.0,3.0,1.0,2.0,1.0,3.0,1.0,1.0,4.0,1.0,5.0,1.0,5.0,3.0,4.0,5.0,3.0,3.0,2.0,5.0,3.0,3.0,1.0,3.0,3.0,5.0,3.0,1.0,2.0,4.0,2.0,3.0,1.0,4.0,2.0,5.0,3.0,7235.0,3598.0,3315.0,2564.0,2976.0,3050.0,4787.0,3228.0,3465.0,3309.0,9036.0,2406.0,3484.0,3359.0,3061.0,2539.0,4226.0,2962.0,1799.0,1607.0,2158.0,2090.0,2143.0,2807.0,3422.0,5324.0,4494.0,3627.0,1850.0,1747.0,5163.0,5240.0,7208.0,2783.0,4103.0,3431.0,3347.0,2399.0,3360.0,5595.0,2624.0,4985.0,1684.0,3026.0,4742.0,3336.0,2718.0,3374.0,3096.0,3019.0,2016-03-03 02:01:20,1360.0,768.0,12.0,179.0,11,1,MY,3.1698,101.7060,27.0,31.0,34.0,32.0,21.0
2,2.0,3.0,4.0,4.0,3.0,2.0,1.0,3.0,2.0,5.0,4.0,4.0,4.0,2.0,2.0,2.0,2.0,2.0,1.0,3.0,1.0,4.0,1.0,4.0,2.0,4.0,1.0,4.0,4.0,3.0,4.0,2.0,2.0,2.0,3.0,3.0,4.0,2.0,4.0,2.0,5.0,1.0,2.0,1.0,4.0,2.0,5.0,3.0,4.0,4.0,4657.0,3549.0,2543.0,3335.0,5847.0,2540.0,4922.0,3142.0,14621.0,2191.0,5128.0,3675.0,3442.0,4546.0,8275.0,2185.0,2164.0,1175.0,3813.0,1593.0,1089.0,2203.0,3386.0,1464.0,2562.0,1493.0,3067.0,13719.0,3892.0,4100.0,4286.0,4775.0,2713.0,2813.0,4237.0,6308.0,2690.0,1516.0,2379.0,2983.0,1930.0,1470.0,1644.0,1683.0,2229.0,8114.0,2043.0,6295.0,1585.0,2529.0,2016-03-03 02:01:56,1366.0,768.0,3.0,186.0,7,1,GB,54.9119,-1.3833,31.0,28.0,29.0,28.0,26.0
3,2.0,2.0,2.0,3.0,4.0,2.0,2.0,4.0,1.0,4.0,3.0,3.0,3.0,2.0,3.0,2.0,2.0,2.0,4.0,3.0,2.0,4.0,3.0,4.0,2.0,4.0,2.0,4.0,3.0,4.0,2.0,4.0,4.0,4.0,1.0,2.0,2.0,3.0,1.0,4.0,4.0,2.0,5.0,2.0,3.0,1.0,4.0,4.0,3.0,3.0,3996.0,2896.0,5096.0,4240.0,5168.0,5456.0,4360.0,4496.0,5240.0,4000.0,3736.0,4616.0,3015.0,2711.0,3960.0,4064.0,4208.0,2936.0,7336.0,3896.0,6062.0,11952.0,1040.0,2264.0,3664.0,3049.0,4912.0,7545.0,4632.0,6896.0,2824.0,520.0,2368.0,3225.0,2848.0,6264.0,3760.0,10472.0,3192.0,7704.0,3456.0,6665.0,1977.0,3728.0,4128.0,3776.0,2984.0,4192.0,3480.0,3257.0,2016-03-03 02:02:02,1920.0,1200.0,186.0,219.0,7,1,GB,51.7500,-1.2500,31.0,27.0,26.0,32.0,27.0
5,3.0,3.0,4.0,2.0,4.0,2.0,2.0,3.0,3.0,4.0,3.0,4.0,3.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,3.0,1.0,4.0,2.0,3.0,2.0,3.0,4.0,4.0,3.0,2.0,4.0,1.0,3.0,2.0,4.0,3.0,4.0,3.0,5.0,1.0,5.0,1.0,3.0,1.0,5.0,4.0,5.0,2.0,4834.0,5064.0,1160.0,2664.0,6711.0,3344.0,2512.0,6264.0,6992.0,4592.0,2808.0,1776.0,3280.0,4520.0,2640.0,5408.0,3647.0,3183.0,1575.0,672.0,6375.0,4727.0,3775.0,1647.0,1233.0,8694.0,2904.0,2152.0,2856.0,2848.0,4288.0,4360.0,7328.0,3976.0,7895.0,2640.0,1760.0,5720.0,9032.0,3928.0,2104.0,5488.0,3656.0,4352.0,2681.0,3272.0,2640.0,1568.0,1640.0,3192.0,2016-03-03 02:03:12,1600.0,1000.0,4.0,196.0,3,1,SE,59.3333,18.0500,32.0,29.0,30.0,28.0,22.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1015334,4.0,1.0,4.0,2.0,4.0,3.0,3.0,2.0,3.0,2.0,2.0,4.0,3.0,2.0,5.0,2.0,4.0,3.0,1.0,2.0,1.0,4.0,4.0,5.0,2.0,5.0,3.0,4.0,5.0,3.0,3.0,2.0,3.0,4.0,3.0,2.0,4.0,3.0,4.0,4.0,3.0,3.0,5.0,3.0,4.0,2.0,5.0,2.0,3.0,4.0,4916.0,2861.0,3960.0,3368.0,2828.0,2814.0,2315.0,3202.0,5529.0,2585.0,3309.0,4060.0,2599.0,10593.0,2545.0,4542.0,2050.0,3853.0,6665.0,1994.0,4729.0,3517.0,2232.0,6358.0,5090.0,1822.0,11459.0,8763.0,4244.0,5472.0,6835.0,3280.0,4365.0,2884.0,2605.0,3799.0,2799.0,2192.0,1256.0,5690.0,4615.0,6676.0,2559.0,4325.0,1792.0,4317.0,4452.0,2053.0,6474.0,3851.0,2018-11-08 12:02:14,2560.0,1080.0,36.0,209.0,16,1,GB,52.4189,-1.5054,34.0,32.0,28.0,36.0,28.0
1015335,5.0,1.0,5.0,2.0,4.0,2.0,3.0,1.0,5.0,1.0,5.0,2.0,5.0,2.0,5.0,4.0,5.0,5.0,5.0,4.0,1.0,5.0,5.0,4.0,2.0,4.0,2.0,3.0,4.0,5.0,2.0,5.0,4.0,3.0,3.0,4.0,3.0,3.0,2.0,4.0,4.0,1.0,4.0,2.0,3.0,2.0,5.0,3.0,4.0,4.0,11415.0,2950.0,3521.0,4432.0,2166.0,1765.0,4969.0,3181.0,4134.0,2266.0,5985.0,4881.0,1883.0,4266.0,6166.0,1818.0,6268.0,2884.0,2033.0,2669.0,25270.0,1734.0,1862.0,3117.0,5320.0,5248.0,8548.0,5834.0,2534.0,5018.0,5947.0,2484.0,24037.0,3618.0,2129.0,3001.0,2050.0,12751.0,3049.0,6216.0,2117.0,6267.0,1717.0,2834.0,3771.0,3350.0,2552.0,4383.0,5401.0,1615.0,2018-11-08 12:02:38,375.0,667.0,10.0,245.0,9,1,TR,41.0186,28.9647,32.0,33.0,29.0,35.0,42.0
1015337,4.0,3.0,4.0,3.0,3.0,3.0,4.0,4.0,3.0,3.0,4.0,3.0,5.0,1.0,5.0,5.0,4.0,4.0,4.0,5.0,2.0,4.0,1.0,4.0,3.0,5.0,3.0,3.0,5.0,3.0,3.0,2.0,3.0,4.0,3.0,3.0,2.0,3.0,2.0,3.0,4.0,1.0,5.0,1.0,5.0,1.0,3.0,4.0,5.0,4.0,2382.0,1984.0,3696.0,1736.0,2272.0,2327.0,2088.0,2296.0,1585.0,3287.0,1752.0,1728.0,1279.0,1768.0,4503.0,1329.0,1640.0,1464.0,2183.0,953.0,2422.0,1448.0,3216.0,6160.0,2208.0,1513.0,2785.0,3833.0,3280.0,1184.0,2096.0,1880.0,3209.0,1744.0,4392.0,1943.0,2263.0,1559.0,1304.0,2176.0,2560.0,6632.0,2312.0,2376.0,2969.0,2271.0,4064.0,1144.0,2936.0,1615.0,2018-11-08 12:07:18,1920.0,1080.0,3.0,122.0,7,1,US,38.0000,-97.0000,33.0,28.0,34.0,33.0,40.0
1015339,2.0,4.0,3.0,4.0,2.0,2.0,1.0,4.0,2.0,4.0,4.0,3.0,4.0,2.0,4.0,4.0,2.0,2.0,4.0,4.0,2.0,3.0,2.0,4.0,3.0,4.0,2.0,4.0,4.0,3.0,4.0,2.0,4.0,2.0,2.0,2.0,4.0,2.0,4.0,4.0,5.0,2.0,4.0,2.0,3.0,2.0,4.0,5.0,5.0,3.0,8647.0,2664.0,3544.0,2263.0,2248.0,3024.0,4704.0,3383.0,2488.0,3183.0,2423.0,5807.0,1384.0,2696.0,2800.0,1847.0,4655.0,1232.0,5064.0,4921.0,2982.0,5584.0,2567.0,2168.0,6320.0,3055.0,2580.0,2816.0,2544.0,3744.0,5168.0,3903.0,37726.0,2735.0,1367.0,5056.0,3216.0,3320.0,2263.0,1415.0,5024.0,4664.0,4792.0,6471.0,1873.0,3136.0,3129.0,2799.0,7184.0,2526.0,2018-11-08 12:08:34,1920.0,1080.0,6.0,212.0,8,1,US,34.1067,-117.8067,35.0,30.0,28.0,31.0,33.0


### Remove row's containing NaNs in survey answers

In [51]:
survey_answer_cols = data.iloc[:, :50]
top_nan_cols = survey_answer_cols.isna().sum().sort_values(ascending=False).head(10)

data_cleaned = data.loc[survey_answer_cols.dropna().index]
data = data_cleaned

### Save full dataset and trait scores to local CSV files

In [52]:
data.to_csv('../data/cleaned_data_v2.csv')
#data.iloc[:, -5:].to_csv("../data/trait_scores.csv", index=False)