In [None]:
import joblib
import pyarrow.parquet as pq
import fastparquet
import pandas as pd

In [None]:
# Original code from https://programmer.ink/think/5d45867ef3982.html

## Load data model

In [None]:
# Load the data-model spreadsheet. header=[0, 1] reads the first two rows as a
# two-level column header, so columns are addressed with (level0, level1) tuples.
# NOTE(review): absolute, user-specific path - consider a configurable data directory.
data_model = pd.read_excel('/home/justin/Data/BigDataCallTracesDataModel-RC7-MLfields.xlsx', header=[0,1])

In [None]:
# Preview the first rows of the data model.
print(data_model.head())

### Load data

In [None]:
from pathlib import Path
# Alternative input location, currently disabled:
# parquet_data_path = Path('/home/justin/Code/ran_arcd/project/data/interim/tigo_parquet')
# Read the parquet file at parquet_data_path into a pandas DataFrame.
# NOTE(review): absolute, user-specific path - consider a configurable data directory.
parquet_data_path = Path('/home/justin/Data/cdr_full.gz.parquet')
df = pd.read_parquet(parquet_data_path)

### Select relevant categories

In [None]:
def check_unique_category(df, database_name, unique_column):
    """Report whether ``df`` can be indexed by ``unique_column``.

    Args:
        df: Indexable object (e.g. a pandas DataFrame) to probe.
        database_name: Label of the database style being probed. It is not used
            by the lookup; it is accepted so callers can pass the
            (database_name, unique_column) pair unchanged.
        unique_column: Key whose presence marks the database style.

    Returns:
        True if ``df[unique_column]`` succeeds, False if the lookup fails.
    """
    try:
        df[unique_column]
    except (LookupError, TypeError, ValueError):
        # Missing key/index, or a key the container cannot accept. Anything
        # else (KeyboardInterrupt, SystemExit, MemoryError, ...) is not a
        # "column absent" signal and is allowed to propagate.
        return False
    return True

In [None]:
# ---- Legacy ----
# Every marker is a (database_name, unique_column) pair; the column is the one
# probed to recognise that database style. Variable names are kept as they
# were (note that the GSM and LTE markers also carry the `sybase_umts_` prefix).
sybase_umts_tup = ('SyBase UMTS', 'initial_lac')        # Unique to SyBase UMTS
sybase_umts_gsm = ('SyBase GSM', 'start_cell_id_lac')   # Unique to SyBase GSM
sybase_umts_lte = ('SyBase LTE', 'enb_ue_s1ap_id')      # Unique to SyBase LTE

# ---- BigData Model ----
avro_out = ('Avro Schema Output Name', 'interface')     # Unique to BigData Model

# The loose helper names finish holding the last pair, exactly as before.
database_name, unique_column = avro_out

# List of database names / unique column pairs
# [(database_name, unique_column), ...]
database_style_lst = [avro_out]

In [None]:
# Check whether valid database type: probe df once per candidate marker pair.
result_list = [
    check_unique_category(df, marker[0], marker[1])
    for marker in database_style_lst
]

# (database_name, unique_column, found) triples, in database_style_lst order.
database_check_results = [
    (marker[0], marker[1], found)
    for marker, found in zip(database_style_lst, result_list)
]

In [None]:
# Show the (database_name, unique_column, found) triples computed above.
print(database_check_results)

In [None]:
# List the data model's column keys, to look up the exact header pairs used below.
print(data_model.columns.values)

In [None]:
"""
Get column names to keep as inputs
"""
# Each key is a two-element tuple used to index one column of data_model.
# feature_names: the column whose values are selected as feature/target names.
feature_names = ('BIGDATA MODEL', 'Output Avro Schema Name')
# features_of_interest: the column compared against 'Y' to pick input features.
features_of_interest = ('Nex - TSNG (5.2)', 'Relevant for ML?')
# targets_column: the column compared against 'Y' to pick targets.
targets_column = ('Nex - TSNG (5.2)', 'Label?')

# data_model[features_of_interest] = data_model[features_of_interest] == 'Y'

# print(features_of_interest)
# print(data_model.head())
# database_check_results

In [None]:
# Names flagged 'Y' as ML inputs, and names flagged 'Y' as labels.
# A single .loc call does the row filter and column pick in one step.
features = data_model.loc[data_model[features_of_interest] == 'Y', feature_names]
targets = data_model.loc[data_model[targets_column] == 'Y', feature_names]

In [None]:
# Show the names selected as input features.
print(features)

In [None]:
# Show the names selected as targets.
print(targets)

### Isolate relevant columns / targets

In [None]:
# Keep only the feature columns. This rebinds df, so the full frame read from
# parquet is no longer reachable under this name.
# NOTE(review): the next cell indexes df with the target names; that lookup can
# only succeed if every target is also among the features - confirm.
df = df[features]

In [None]:
# Show the target columns of the (already feature-restricted) df.
print(df[targets])

## Get file loader

In [None]:
### Parquet loader

In [None]:
from pathlib import Path
# parquet_data_path = Path('/home/justin/Code/ran_arcd/project/data/interim/tigo_parquet')

In [None]:
# Open the same parquet path again through pyarrow (df above was read from it with pandas).
pq_dataset = pq.ParquetDataset(parquet_data_path)

In [None]:
# Read the dataset's contents via pyarrow.
table = pq_dataset.read()

In [None]:
# Summary statistics of df (restricted to the feature columns by an earlier cell).
df.describe()

In [None]:
# Column names currently present in df.
df.columns.values

In [None]:
# Print pyarrow's text representation of the table read above.
print(table)

In [None]:
# table.

In [None]:
# Column names as seen by pyarrow, for comparison with df.columns above.
table.column_names

### Petastorm

In [None]:
from pyspark.sql import SparkSession

# Build the SparkSession used by the Spark cells below (getOrCreate returns
# the session produced by this builder chain).
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [None]:
### Spark loader

In [None]:
# import pyspark as spark
# df_pd = spark.read.parquet("...").sample(0.1, seed=42).toPandas()

In [None]:
# Locations under the project's interim data directory.
# NOTE(review): absolute, user-specific path.
path_base = "/home/justin/Code/ran_arcd/project/data/interim/"
checkpoint_path = f"{path_base}checkpoint"
# Pattern covering the individual parquet part files.
table_path_base = f"{path_base}tigo_parquet/part.*.parquet"
table_path_base_file = table_path_base
print(table_path_base_file)

In [None]:
# Count the rows in the parquet files matched by table_path_base_file.
# NOTE(review): train_size is not used by any other cell visible in this notebook.
train_size = spark.read.parquet(table_path_base_file).count()
# test_size = spark.read.parquet(table_path_base_file + "test").count()

In [None]:
# Load the same parquet files as a Spark DataFrame.
spark_df = spark.read.parquet(table_path_base_file)

In [None]:
# Partition count before repartitioning.
print(spark_df.rdd.getNumPartitions())

In [None]:
# Target number of partitions. NOTE(review): 10000 is unexplained here - confirm
# it suits the data size.
partition_count = 10000
# Rebinds spark_df to the repartitioned DataFrame.
spark_df = spark_df.repartition(partition_count) # resilient distributed dataset

In [None]:
# Partition count after repartitioning.
print(spark_df.rdd.getNumPartitions())

In [None]:
# Obtain an iterator over the rows of spark_df via its underlying RDD.
iterator = spark_df.rdd.toLocalIterator()

In [None]:
# Pull and print one row; each run of this cell advances the iterator.
print(next(iterator))

In [None]:
# `result` is not defined anywhere in this notebook (NameError on a fresh
# kernel); inspect the Spark DataFrame loaded above instead.
print(type(spark_df))

In [None]:
# `result` is not defined anywhere in this notebook (NameError on a fresh
# kernel); inspect the RDD behind the Spark DataFrame loaded above instead.
print(type(spark_df.rdd))

In [None]:
# `result` is not defined anywhere in this notebook (NameError on a fresh
# kernel); describe the Spark DataFrame loaded above instead.
print(spark_df.describe())

In [None]:
# Convert every column of the Spark DataFrame into a pandas DataFrame.
# NOTE(review): presumably this materialises all rows in local memory - confirm
# the data is small enough before running.
pd_df = spark_df.select('*').toPandas()

In [None]:
import os

# Workaround for Arrow issue: tell pyarrow to skip files whose names start
# with "_" in the parquet directory.
# table_path_base is a glob pattern for the part files, so appending
# "train"/"test" to it can never name a real directory; list the directory
# that holds the part files instead.
parquet_dir = os.path.dirname(table_path_base)
underscore_files = [f for f in os.listdir(parquet_dir) if f.startswith("_")]

# NOTE(review): EXCLUDED_PARQUET_PATHS is looked up defensively because it may
# not exist in every pyarrow version - confirm against the installed release.
excluded_paths = getattr(pq, "EXCLUDED_PARQUET_PATHS", None)
if excluded_paths is not None:
    excluded_paths.update(underscore_files)

In [None]:
# img_size = 299

def transform_reader(reader, batch_size, img_size=299):
    """Turn a Petastorm reader into a shuffled, fixed-size-batch dataset.

    Args:
        reader: Petastorm reader handed to ``make_petastorm_dataset``.
        batch_size: Number of samples per emitted batch; incomplete final
            batches are dropped (``drop_remainder=True``).
        img_size: Side length used when reshaping decoded images to
            ``(-1, img_size, img_size, 3)``. Previously read from a global that
            only exists in a commented-out line, which raised NameError.

    Returns:
        The dataset produced by mapping, unbatching, shuffling and batching.

    NOTE(review): ``tf``, ``decode_raw``, ``preprocess_input`` and ``unbatch``
    are not imported in the visible notebook, and ``make_petastorm_dataset`` is
    only imported in a later cell - they must be in scope before calling this.
    """

    def transform_input(x):
        # Raw bytes -> uint8 image tensor -> model-specific preprocessing.
        img_bytes = tf.reshape(decode_raw(x.image, tf.uint8), (-1, img_size, img_size, 3))
        inputs = preprocess_input(tf.cast(img_bytes, tf.float32))
        # Shift labels down by one.
        outputs = x.label - 1
        return (inputs, outputs)

    return (
        make_petastorm_dataset(reader)
        .map(transform_input)
        .apply(unbatch())
        .shuffle(400, seed=42)
        .batch(batch_size, drop_remainder=True)
    )

In [None]:
# https://docs.azuredatabricks.net/_static/notebooks/deep-learning/petastorm.html
from petastorm import make_batch_reader
from petastorm.tf_utils import make_petastorm_dataset

In [None]:
# Load using pyarrow
# NOTE(review): petastorm_dataset_url, tf, keras and get_model are not defined
# in the visible notebook - they must be in scope before this cell runs.
with make_batch_reader(petastorm_dataset_url, num_epochs=100) as reader:
    dataset = make_petastorm_dataset(reader).map(
        lambda x: (
            tf.reshape(x.features, [-1, 28, 28, 1]),
            tf.one_hot(x.label, 10),
        )
    )
    model = get_model()
    optimizer = keras.optimizers.Adadelta()
    model.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    model.fit(dataset, steps_per_epoch=10, epochs=10)


## Mini-batch iterator

In [None]:
def iter_minibatches(minibatch_size=1000, drop_last=True):
    """Yield the shuffled training data in fixed-size chunks.

    Loads the training set with ``load_data()``, shuffles data and labels
    together with a fixed seed, then yields ``(X, y)`` numpy arrays of
    ``minibatch_size`` samples each.

    Args:
        minibatch_size: Number of samples per yielded batch.
        drop_last: If True (default, the original behaviour) a trailing batch
            smaller than ``minibatch_size`` is discarded. Pass False to also
            yield that final partial batch so no samples are lost.

    Yields:
        Tuple ``(X, y)`` of numpy arrays built from the accumulated samples.

    NOTE(review): ``load_data`` and ``shuffle`` are not defined or imported in
    the visible notebook - they must be in scope before this generator runs.
    """
    # Local import: numpy is not imported at the top of this notebook, so the
    # original np.array calls raised NameError.
    import numpy as np

    train_data, train_label, train_weight, test_data, test_label, test_file = load_data()
    # random_state=0 fixes the permutation so every run sees the same order.
    train_data, train_label = shuffle(train_data, train_label, random_state=0)
    print(type(train_label), train_label)

    X, y = [], []
    for data_x, label_y in zip(train_data, train_label):
        X.append(data_x)
        y.append(label_y)

        if len(X) >= minibatch_size:
            yield np.array(X), np.array(y)
            X, y = [], []

    # Leftover samples that did not fill a whole batch.
    if X and not drop_last:
        yield np.array(X), np.array(y)

### LightGBM (LGB) Incremental Training Process

In [None]:
def lightgbmTest(minibatch_size=10000):
    """Train a LightGBM regressor incrementally over mini-batches.

    Each mini-batch from ``iter_minibatches`` is split 90/10 into train and
    validation parts, and training continues from the booster produced by the
    previous batch (``init_model`` plus ``keep_training_booster=True``).

    Args:
        minibatch_size: Samples per mini-batch requested from
            ``iter_minibatches``. Defaults to 10000, the value that was
            previously hard-coded.

    Returns:
        The booster after the last mini-batch, or None if no batch was yielded.

    NOTE(review): ``train_test_split`` is not imported in the visible notebook.
    NOTE(review): both ``'num_trees'`` in params and ``num_boost_round`` set a
    number of boosting rounds - confirm which one is meant to apply.
    NOTE(review): ``early_stopping_rounds`` / ``verbose_eval`` may not be
    accepted by every LightGBM version - confirm against the installed release.
    """
    import lightgbm as lgb

    # Step 1: no model yet; fixed training parameters.
    gbm = None
    params = {
        'task': 'train',
        'application': 'regression',  # objective function
        'boosting_type': 'gbdt',
        'learning_rate': 0.01,
        'num_leaves': 50,
        'tree_learner': 'serial',
        'min_data_in_leaf': 100,
        'metric': ['l1', 'l2', 'rmse'],  # l1: mae, l2: mse
        'max_bin': 255,
        'num_trees': 300
    }

    # Step 2: stream the data, minibatch_size samples at a time.
    minibatch_train_iterators = iter_minibatches(minibatch_size=minibatch_size)

    for i, (X_, y_) in enumerate(minibatch_train_iterators):
        X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.1, random_state=0)
        # Flatten both label arrays the same way (only y_train was flattened before).
        y_train = y_train.ravel()
        y_test = y_test.ravel()
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

        # Step 3: continue training from the previous booster, if any.
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=1000,
                        valid_sets=lgb_eval,
                        init_model=gbm,  # None on the first batch
                        # feature_name=x_cols,
                        early_stopping_rounds=10,
                        verbose_eval=False,
                        keep_training_booster=True)

        print("{} time".format(i))  # index of the current batch
        # Map metric name -> value from the training-set evaluation.
        score_train = {s[1]: s[2] for s in gbm.eval_train()}
        print('The score of the current model in the training set is: mae=%.4f, mse=%.4f, rmse=%.4f'
              % (score_train['l1'], score_train['l2'], score_train['rmse']))

    return gbm

## LightGBM (LGB): Run Incremental Training and Save the Trained Model

In [None]:
# lightgbm Incremental training
# NOTE(review): load_data, train_test_split and compute_loss are not defined in
# the visible notebook - they must be in scope before this cell runs.
print('lightgbm Incremental training')
(train_data, train_label, train_weight,
 test_data, test_label, test_file) = load_data()
print(train_label.shape, train_data.shape)
train_X, test_X, train_Y, test_Y = train_test_split(
    train_data, train_label, test_size=0.1, random_state=0
)
# train_X, train_Y = shuffle(train_data, train_label, random_state=0)  # random_state=0 is used to record the scrambling position to ensure that each scrambling position remains unchanged.

# NOTE(review): lightgbmTest() reloads the data itself through iter_minibatches,
# so rows in test_X appear to be part of what it trains on - confirm the
# reported loss is meant to be measured this way.
gbm = lightgbmTest()
pred_Y = gbm.predict(test_X)
loss_value = compute_loss(test_Y, pred_Y)
print('compute_loss:{}'.format(loss_value))

# gbm.save_model('lightgbmtest.model')
# Model Storage
joblib.dump(gbm, 'loan_model.pkl')
# Model Loading
gbm = joblib.load('loan_model.pkl')