# ETL
## Development Datasets
Create development datasets consisting of the first `NROWS` rows of each raw file.

In [1]:
# Imports
import pandas as pd
import re


In [2]:
# Configuration
# Intended number of leading rows kept per development dataset.
# NOTE(review): no cell visible in this notebook reads NROWS — confirm it is still needed.
NROWS = 100

# Directory of the original files and directory of their downsampled copies
DIR_RAW = "data/raw"
DIR_DEVELOPMENT = "data/development"

# Training sets
FILEPATH_TRAIN_CORE_RAW = DIR_RAW + "/sample_skeleton_train.csv"
FILEPATH_TRAIN_CORE_DEVELOPMENT = DIR_DEVELOPMENT + "/sample_skeleton_train.csv"
FILEPATH_TRAIN_COMMON_FEATURES_RAW = DIR_RAW + "/common_features_train.csv"
FILEPATH_TRAIN_COMMON_FEATURES_DEVELOPMENT = DIR_DEVELOPMENT + "/common_features_train.csv"

# Test sets
FILEPATH_TEST_CORE_RAW = DIR_RAW + "/sample_skeleton_test.csv"
FILEPATH_TEST_CORE_DEVELOPMENT = DIR_DEVELOPMENT + "/sample_skeleton_test.csv"
FILEPATH_TEST_COMMON_FEATURES_RAW = DIR_RAW + "/common_features_test.csv"
FILEPATH_TEST_COMMON_FEATURES_DEVELOPMENT = DIR_DEVELOPMENT + "/common_features_test.csv"

## Core Datasets
### Training Set

In [3]:
# Load the development copy of the core training set and preview it.
# The file is read without a header row, so columns get integer labels.
df = pd.read_csv(
    FILEPATH_TRAIN_CORE_DEVELOPMENT,
    header=None,
    index_col=None,
)
df.head()

Unnamed: 0,0,1,2,3,4,5
0,1,0,0,bacff91692951881,9,21090522181.021090645531.021090934451....
1,2,0,0,bacff91692951881,10,21091097321.021090462841.021090990351....
2,3,1,0,bacff91692951881,20,21090897311.021090475601.050995117692....
3,4,0,0,bacff91692951881,13,30193516651.021090503641.021090833881....
4,5,0,0,bacff91692951881,9,20549456631.030193516651.021691721791....


### Test Set

In [4]:
# Load the development copy of the core test set and preview it.
# The file is read without a header row, so columns get integer labels.
df = pd.read_csv(
    FILEPATH_TEST_CORE_DEVELOPMENT,
    header=None,
    index_col=None,
)
df.head()

Unnamed: 0,0,1,2,3,4,5
0,1,0,0,23bd0f75de327c60,14,21691810781.030193516651.020555871431....
1,2,0,0,23bd0f75de327c60,15,20556627321.020683168931.020789873281....
2,3,0,0,23bd0f75de327c60,12,20683154051.020565395121.030193516651....
3,4,0,0,23bd0f75de327c60,11,50996861712.9957321090689921.0207880102...
4,5,0,0,543b0cd53c7d5858,11,20683170931.050893553232.63906210902041...


## Common Datasets
### Training Set

In [5]:
# Load the development copy of the common-features training set and preview it.
# The file is read without a header row, so columns get integer labels.
df = pd.read_csv(
    FILEPATH_TRAIN_COMMON_FEATURES_DEVELOPMENT,
    header=None,
    index_col=None,
)
df.head()

Unnamed: 0,0,1,2
0,84dceed2e3a667f8,343,101313191.012534387741.012634387791.0...
1,0000350f0c2121e7,811,127_1437162241.94591127_1435146270.69315...
2,000091a89d1867ab,7,12534387731.012434387691.012234387611....
3,0001a4114b0ae8bf,231,150_1439166842.3979150_1439407981.070561...
4,0001def19d7cb335,964,150_1439091500.84715150_1439330134.44265...


### Test Set

In [6]:
# Load the development copy of the common-features test set and preview it.
# The file is read without a header row, so columns get integer labels.
df = pd.read_csv(
    FILEPATH_TEST_COMMON_FEATURES_DEVELOPMENT,
    header=None,
    index_col=None,
)
df.head()

Unnamed: 0,0,1,2
0,810d5366057b3f58,1025,1014127971.012534387721.012634387781.0...
1,0001970d9ebf72cf,126,150_1438896862.03693150_1438840840.15444...
2,0010d0b9633bb5b0,250,12634387771.012734387821.012838648861....
3,0012aad1f55312b6,170,12734387821.012534387721.012434387681....
4,0013e5c24e8dd3a6,617,150_1438743040.15444150_1439005790.98095...


In [4]:
# Decode the packed feature string of the first core training sample into
# parallel lists of feature names, ids and values.
#
# The row is read explicitly instead of being taken from `df`: at this point
# in the notebook `df` has been rebound to a common-features frame, which has
# no column 5, so relying on it only works with out-of-order execution.
core_train_first_row = pd.read_csv(
    FILEPATH_TRAIN_CORE_DEVELOPMENT, index_col=None, header=None, nrows=1
)
feature_string = core_train_first_row[5].values[0]

# "\x01" delimits features; inside one feature, "\x02" and "\x03" delimit the
# three fields, which are unpacked in the order name, id, value.
features = feature_string.split("\x01")
feature_names = []
feature_ids = []
feature_values = []
for feature in features:
    # Named `feature_id` rather than `id` so the builtin is not shadowed for
    # every cell that runs afterwards.
    name, feature_id, value = re.split("[\x02\x03]", feature)
    feature_names.append(name)
    feature_ids.append(int(feature_id))
    feature_values.append(float(value))

print(feature_names)
print(feature_ids)
print(feature_values)


    


['210', '210', '210', '216', '301', '205', '206', '207', '508']
[9052218, 9064553, 9093445, 9154780, 9351665, 4186222, 8316799, 8416205, 9355039]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.69315]


In [5]:
# Create the downsampled common-features training set from the leading rows of the raw file.
# NOTE(review): the row count is hard-coded here while the constants cell defines NROWS = 100 —
# confirm which sample size is intended before switching to the constant.
df = pd.read_csv(FILEPATH_TRAIN_COMMON_FEATURES_RAW, nrows=10000, index_col=False, header=None)
# header=False (not None): to_csv documents `header` as a bool or a list of column aliases.
# index=False keeps the row index out of the file, so it round-trips with header=None reads.
df.to_csv(FILEPATH_TRAIN_COMMON_FEATURES_DEVELOPMENT, index=False, header=False)



In [6]:
# Preview the first rows of whatever frame is currently bound to `df`.
df.head()

Unnamed: 0,0,1,2
0,84dceed2e3a667f8,343,101313191.012534387741.012634387791.0...
1,0000350f0c2121e7,811,127_1437162241.94591127_1435146270.69315...
2,000091a89d1867ab,7,12534387731.012434387691.012234387611....
3,0001a4114b0ae8bf,231,150_1439166842.3979150_1439407981.070561...
4,0001def19d7cb335,964,150_1439091500.84715150_1439330134.44265...


In [7]:
# Read the first 1000 rows of the staged impressions file and preview them.
# NOTE(review): in the rendered preview the first data row (0, 1, 2, 4, 3) looks like
# leaked integer column labels rather than a real sample — check the step that writes this file.
fp = "data/staged/impressions.csv"
df = pd.read_csv(
    fp,
    index_col=False,
    nrows=1000,
)
df.head()

Unnamed: 0,sample_id,click_label,conversion_label,num_core_features,common_features_index
0,0,1,2,4,3
1,1,0,0,9,bacff91692951881
2,2,0,0,10,bacff91692951881
3,3,1,0,20,bacff91692951881
4,4,0,0,13,bacff91692951881
