In [1]:
import os
import numpy as np
import pandas as pd
import pickle
from preprocessing_helper import *

### 1. Get directories related information and load in MoonBoard 2016 data

In [2]:
# Locate the input files relative to the working directory.
# NOTE(review): assumes the notebook is run from <project>/preprocessing — confirm.
cwd = os.getcwd()
parent_wd = cwd.replace('/preprocessing', '')
raw_data_path = '/'.join([parent_wd, 'raw_data', 'moonGen_scrape_2016_final'])
hold_feature_path = '/'.join([parent_wd, 'raw_data', 'HoldFeature2016.csv'])

In [3]:
# Load the scraped problems.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open(raw_data_path, 'rb') as raw_file:
    MoonBoard_2016_raw = pickle.load(raw_file)

In [4]:
# Every column is read as text so that 'Difficulties' can be split character by character.
features = pd.read_csv(hold_feature_path, dtype=str)

# Map each (x, y) hold coordinate to the integer array spelled out by its 'Difficulties' string.
feature_dict = {
    (int(x_coord), int(y_coord)): np.array(list(difficulties)).astype(int)
    for x_coord, y_coord, difficulties in zip(
        features['X_coord'], features['Y_coord'], features['Difficulties'])
}

### 2. Separate data into different categories:
- benchmarked with user ratings
- benchmarked without user ratings
- non-benchmarked with user ratings
- non-benchmarked without user ratings

In [185]:
def classify_and_reorganize_data(raw_data, save_path, delta_xy_mode = False, print_result = False):
    """
    Encode every scraped MoonBoard problem as arrays and sort the problems into categories.

    Input:
    - raw_data: dict mapping problem key -> problem dict. Each problem dict is read through the keys
      'start', 'mid', 'end' (lists of (x, y) hold coordinates), 'grade', 'user_grade'
      (None when there is no user grading) and 'is_benchmark'. raw_data is not modified.
    - save_path: path passed to save_pickle together with the returned dictionary.
    - delta_xy_mode:
      if set False(default), the x_vector compiled will be of shape (10, n_holds) (6 hold features, x, y, is_start, is_end)
      if set True, the x_vector compiled will be of shape (14, n_holds) (6 hold features, x, y, dx1, dy1, dx2, dy2, is_start, is_end)
    - print_result: if set True, print one line for every problem that is processed successfully.

    Relies on the module-level names feature_dict, grade_map and save_pickle.

    Classify and process the raw data into 4 categories/8 dictionaries:
    - X_dict_benchmark_withgrade: the input data that is benchmarked and contains user grading.
    - Y_dict_benchmark_withgrade: the output for raw data that is benchmarked and contains user grading.
    - X_dict_benchmark_nograde: the input data that is benchmarked and does not contain user grading.
    - Y_dict_benchmark_nograde: the output for raw data that is benchmarked and does not contain user grading.
    - X_dict_withgrade: the input data that is not benchmarked and contains user grading.
    - Y_dict_withgrade: the output for raw data that is not benchmarked and contains user grading.
    - X_dict_nograde: the input data that is not benchmarked and does not contain user grading.
    - Y_dict_nograde: the output for raw data that is not benchmarked and does not contain user grading.
    For the ones that do not have user grading, the shape of each item in Y_dict is (2, 1): (grade, is_benchmarked)
    For the ones that have user grading, the shape of each item in Y_dict is (3, 1): (grade, is_benchmarked, user_grade)

    Output:
    - dict holding the 8 dictionaries above plus 'list_fail', the keys of the problems that could
      not be processed. A failed problem appears in none of the X/Y dictionaries.
    """
    category_names = ('benchmark_withgrade', 'benchmark_nograde', 'withgrade', 'nograde')
    output = {}
    for name in category_names:
        output['X_dict_' + name] = {}
        output['Y_dict_' + name] = {}
    list_fail = []

    for key, item in raw_data.items():
        # Build both arrays before storing either one, so that a problem whose label cannot be
        # built never ends up in an X dictionary without its Y counterpart.
        try:
            x_vectors = _build_hold_vectors(item, delta_xy_mode)
            y_vector = _build_label_vector(item)
            category = (('benchmark_' if item['is_benchmark'] else '')
                        + ('nograde' if item['user_grade'] is None else 'withgrade'))
        except Exception:
            # Malformed problems are skipped on purpose; Exception (rather than a bare except)
            # still lets KeyboardInterrupt stop a long run.
            print('Raw data with key %s contains error' %key)
            list_fail.append(key)
            continue

        output['X_dict_' + category][key] = x_vectors
        output['Y_dict_' + category][key] = y_vector
        if print_result:
            print('Complete processing of %s' %key)

    output['list_fail'] = list_fail

    save_pickle(output, save_path)
    print('result saved.')
    return output


def _build_hold_vectors(item, delta_xy_mode):
    """Encode the holds of one problem as a (10, n_holds) or (14, n_holds) array; raises on malformed input."""
    n_start = len(item['start'])
    n_mid = len(item['mid'])
    n_end = len(item['end'])
    if n_start > 2 or n_end > 2:
        raise ValueError('a problem can have at most 2 start holds and 2 end holds')

    # Within each group the holds are ordered by their y coordinate. sorted() is used instead of
    # list.sort() so that the caller's raw data is not reordered in place.
    by_y = lambda hold: hold[1]
    combined_list = (sorted(item['start'], key = by_y)
                     + sorted(item['mid'], key = by_y)
                     + sorted(item['end'], key = by_y))

    n_hold = n_start + n_mid + n_end
    x_vectors = np.zeros((14 if delta_xy_mode else 10, n_hold))
    for i, (x, y) in enumerate(combined_list):
        x_vectors[0:6, i] = feature_dict[(x, y)] # 6 hand features
        x_vectors[6:8, i] = [x, y] #(x, y)

    if delta_xy_mode:
        coords = x_vectors[6:8]
        # displacement from the previous hold (stays 0 for the first hold)
        x_vectors[8:10, 1:] = coords[:, 1:] - coords[:, :-1]
        # displacement from the hold two steps back (stays 0 for the first two holds)
        x_vectors[10:12, 2:] = coords[:, 2:] - coords[:, :-2]

    # The last two rows are the is_start / is_end flags in both layouts.
    x_vectors[-2, :n_start] = 1
    x_vectors[-1, n_start + n_mid:] = 1
    return x_vectors


def _build_label_vector(item):
    """Build the (2, 1) or (3, 1) label column: grade, is_benchmark and, if present, user grade."""
    rows = [[grade_map[item['grade']]], [int(item['is_benchmark'])]]
    if item['user_grade'] is not None:
        rows.append([grade_map[item['user_grade']]])
    return np.array(rows)

In [172]:
# classify_and_reorganize_data reads grade_map as a module-level name, so this cell
# has to run before that function is called.
grade_map = get_grade_map()

In [181]:
# NOTE(review): output_xy_mode is only assigned by the '/processed_data_xy_mode' cell further
# down, so on a fresh kernel this cell raises NameError unless that cell has already run.
output_xy_mode['list_fail']

['307940',
 '307837',
 '307748',
 '307711',
 '307661',
 '307602',
 '307599',
 '304387']

In [186]:
# NOTE(review): the file and the variable are named "deltaxy", but delta_xy_mode is False, so this
# call builds the same 10-row arrays as the '/processed_data_xy_mode' cell. Confirm whether True
# was intended, and whether produce_sequence (which later consumes this file) accepts 14-row arrays.
save_path = cwd + '/processed_data_deltaxy_mode'
output_deltaxy_mode = classify_and_reorganize_data(MoonBoard_2016_raw, save_path, delta_xy_mode = False)

Raw data with key 307940 contains error
Raw data with key 307837 contains error
Raw data with key 307748 contains error
Raw data with key 307711 contains error
Raw data with key 307661 contains error
Raw data with key 307602 contains error
Raw data with key 307599 contains error
Raw data with key 304387 contains error
result saved.


In [187]:
# Build the default (delta_xy_mode = False, 10-row) encoding and save it under the working directory.
save_path = cwd + '/processed_data_xy_mode'
output_xy_mode = classify_and_reorganize_data(MoonBoard_2016_raw, save_path, delta_xy_mode = False)

Raw data with key 307940 contains error
Raw data with key 307837 contains error
Raw data with key 307748 contains error
Raw data with key 307711 contains error
Raw data with key 307661 contains error
Raw data with key 307602 contains error
Raw data with key 307599 contains error
Raw data with key 304387 contains error
result saved.


### 3. Data Preprocessing with sequence generator

In [6]:
# Reload the processed dictionaries from disk as the input of the sequence generator.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
raw_data_forseq_path = '/'.join([parent_wd, 'preprocessing', 'processed_data_deltaxy_mode'])
with open(raw_data_forseq_path, 'rb') as processed_file:
    MoonBoard_2016_raw_forseq = pickle.load(processed_file)

In [9]:
# Pool the four categories into a single X dictionary and a single Y dictionary.
# Categories are applied in this order, so a later category wins if a key were repeated.
X_dict_merge = {}
Y_dict_merge = {}
for category in ('benchmark_withgrade', 'benchmark_nograde', 'withgrade', 'nograde'):
    X_dict_merge.update(MoonBoard_2016_raw_forseq['X_dict_' + category])
    Y_dict_merge.update(MoonBoard_2016_raw_forseq['Y_dict_' + category])

In [10]:
# Try the sequence generator on a single problem, asking for one candidate (n_return = 1).
output = produce_sequence(keyNum = '337509', X_dict = X_dict_merge, n_return = 1)

After Beamer search, the most possible hand sequence and the successRate:
[0, 0, 1, 2, 3, 4, 5] ['LH', 'RH', 'LH', 'RH', 'LH', 'RH', 'LH'] 229.17442900816357
[13.966610165238235, 3.771272258651297, 3.462239147685987, 4.2965295841566284, 1.0226156288345547, 9.826095003804213, 40.82321619767785]


In [15]:
# Inspect the successScoreSequence attribute of the first returned candidate.
output[0].successScoreSequence

[13.966610165238235,
 3.771272258651297,
 3.462239147685987,
 4.2965295841566284,
 1.0226156288345547,
 9.826095003804213,
 40.82321619767785]

In [22]:
# Rows 6:8 of a problem array are the (x, y) hold coordinates; pick the columns in handSequence order.
X_dict_merge['337509'][6:8, output[0].handSequence]

array([[10., 10.,  5.,  8.,  5.,  3.,  1.],
       [ 4.,  4.,  6., 10., 13., 16., 17.]])

In [29]:
# Encode the hand labels as numbers: 'LH' -> -1, 'RH' -> +1 (any other label would give 0).
(np.array(output[0].handOperator) == 'LH')*(-1) + (np.array(output[0].handOperator) == 'RH')*1

array([-1,  1, -1,  1, -1,  1, -1])

In [30]:
# Stack the coordinates (2 rows), the numeric hand encoding and successScoreSequence into one array.
np.vstack([X_dict_merge['337509'][6:8, output[0].handSequence], 
          (np.array(output[0].handOperator) == 'LH')*(-1) + (np.array(output[0].handOperator) == 'RH')*1, 
          output[0].successScoreSequence])

array([[10.        , 10.        ,  5.        ,  8.        ,  5.        ,
         3.        ,  1.        ],
       [ 4.        ,  4.        ,  6.        , 10.        , 13.        ,
        16.        , 17.        ],
       [-1.        ,  1.        , -1.        ,  1.        , -1.        ,
         1.        , -1.        ],
       [13.96661017,  3.77127226,  3.46223915,  4.29652958,  1.02261563,
         9.826095  , 40.8232162 ]])

In [45]:
def generate_organized_sequence_data(raw_data, save_path, checkpoint_every = 1000):
    """
    Run produce_sequence on every problem and stack its first result into one array per problem.

    Input:
    - raw_data: dict mapping problem key -> hold array whose rows 6:8 are read as the (x, y)
      coordinates of the holds (e.g. the merged X dictionaries of classify_and_reorganize_data).
      The whole dict is also passed to produce_sequence as X_dict.
    - save_path: path passed to save_pickle together with the result dictionary.
    - checkpoint_every: the partial result is saved after every checkpoint_every problems, so that
      an interrupted run keeps most of its work. Pass 1 to save after every problem, or None/0
      to save only once at the end.

    For every problem the stored array has 4 rows, one column per entry of handSequence:
    x, y, hand (-1 for 'LH', +1 for 'RH', 0 for any other label) and the success score.

    Relies on the names produce_sequence and save_pickle being in scope.

    Output:
    - dict with
      'X_dict_seq': problem key -> stacked array, only for the problems that succeeded
      'list_fail': keys of the problems for which the sequence could not be produced
      This same dictionary is what gets saved at save_path.
    """
    X_dict_seq = {}
    list_fail = []
    final_output = {'X_dict_seq': X_dict_seq,
                    'list_fail': list_fail}

    for n_done, (key, hold_array) in enumerate(raw_data.items(), start = 1):
        try:
            best = produce_sequence(keyNum = key, X_dict = raw_data, n_return = 1)[0]
            hand_operator = np.array(best.handOperator)
            hand_sign = (hand_operator == 'LH')*(-1) + (hand_operator == 'RH')*1
            X_dict_seq[key] = np.vstack([
                hold_array[6:8, best.handSequence],
                hand_sign,
                best.successScoreSequence])
        except Exception:
            # Problems that cannot be sequenced are skipped on purpose; Exception (rather than a
            # bare except) still lets KeyboardInterrupt stop a long run.
            print('data with key %s contains error' %key)
            list_fail.append(key)

        # Saving the whole dictionary after every single problem makes the run quadratic in I/O,
        # so intermediate saves happen only every checkpoint_every problems.
        if checkpoint_every and n_done % checkpoint_every == 0:
            save_pickle(final_output, save_path)

    # Save the complete result (including list_fail), not just X_dict_seq.
    save_pickle(final_output, save_path)
    print('result saved.')
    return final_output

In [None]:
# Generate the sequence representation for every merged problem. Note that raw_data here is
# X_dict_merge (already-encoded hold arrays), not the scraped MoonBoard dictionary.
save_path_seq = cwd + '/processed_data_seq'
output_seq = generate_organized_sequence_data(raw_data = X_dict_merge, save_path = save_path_seq)

After Beamer search, the most possible hand sequence and the successRate:
[0, 0, 1, 2, 3, 4, 5] ['LH', 'RH', 'LH', 'RH', 'LH', 'RH', 'LH'] 229.17442900816357
[13.966610165238235, 3.771272258651297, 3.462239147685987, 4.2965295841566284, 1.0226156288345547, 9.826095003804213, 40.82321619767785]
After Beamer search, the most possible hand sequence and the successRate:
[0, 0, 1, 2, 3, 4, 5] ['LH', 'RH', 'LH', 'RH', 'RH', 'LH', 'RH'] 25.662714212807135
[17.769336928223957, 8.89216531778502, 1.8701731248530076, 0.08932942698730434, 5.863121552017952, 1.4618647551633823, 8.585814486631532]
After Beamer search, the most possible hand sequence and the successRate:
[0, 1, 3, 4, 5, 6] ['LH', 'RH', 'LH', 'RH', 'LH', 'RH'] 515.7494242864443
[13.966610165238235, 0.2998804229301691, 8.806838882687279, 22.23281394481997, 12.580977395651514, 25.78157891381218]
After Beamer search, the most possible hand sequence and the successRate:
[0, 0, 2, 3, 4] ['LH', 'RH', 'LH', 'RH', 'LH'] 121.18568122292079
[17

In [49]:
# Number of problems for which a sequence was produced (failed keys are not stored in X_dict_seq).
len(output_seq['X_dict_seq'])

30557

In [50]:
# Show where the sequence data is written.
save_path_seq

'/Users/jrchang612/MoonBoardRNN/preprocessing/processed_data_seq'

In [51]:
# Persist the complete result dictionary (X_dict_seq and list_fail) at save_path_seq,
# overwriting whatever generate_organized_sequence_data wrote there.
save_pickle(output_seq, save_path_seq)