In [None]:
# Notebook of useful code snippets and libraries for data science and analysis

In [None]:
import gc
import json
from multiprocessing import Pool

import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd

In [None]:
# Raise the pandas display limits so large frames are not truncated when shown
for option_name, option_limit in (('display.max_rows', 100),
                                  ('display.max_columns', 1000),
                                  ('display.width', 1000)):
    pd.set_option(option_name, option_limit)

In [None]:
# JSON file -> dataframe with one row per record found under 'results'

with open('Assessor-Search-Results.json', 'r', encoding="utf-8") as json_file:
    json_work = json.load(json_file)

# After normalising, every cell of the 'results' column holds a sequence of
# records; gather them all into one flat list before building the final frame.
df = pd.json_normalize(json_work)

flat_records = []
for record_group in df['results'].values.tolist():
    flat_records.extend(record_group)

df = pd.DataFrame(flat_records)

In [None]:
# Create a MSSQL Connection, import from SQL to pandas dataframe
import os

import pyodbc
import pandas as pd

# Connection settings are read from environment variables so real credentials
# never have to be typed into (or committed with) the notebook. The MSSQL_*
# variable names are this notebook's own convention; the fallbacks are
# placeholders only and must be replaced before the cell can connect.
config = dict(server=os.environ.get('MSSQL_SERVER', 'SERVER'),
              # 1433 is SQL Server's default TCP port
              port=int(os.environ.get('MSSQL_PORT', 1433)),
              database=os.environ.get('MSSQL_DATABASE', 'DATABASE'),
              username=os.environ.get('MSSQL_USERNAME', 'USERNAME'),
              password=os.environ.get('MSSQL_PASSWORD', 'PASSWORD'))

# ODBC connection-string template, filled from `config` below
cxn_str = ('SERVER={server},{port};' + 'DATABASE={database};' +
           'UID={username};' + 'PWD={password}')

# The connection is left open so later cells can reuse `cxn`;
# call cxn.close() once you are finished with it.
cxn = pyodbc.connect(r'DRIVER={SQL Server Native Client 11.0};' + cxn_str.format(**config))

# Replace the placeholder query with your own SELECT statement
df = pd.read_sql("SELECT * FROM TABLE;", cxn)

In [None]:
def reduce_mem_usage(df):
    """
    Downcast the columns of a dataframe to smaller dtypes to reduce memory usage.

    - Signed / unsigned integer columns are cast to the narrowest integer
      type of the same signedness whose range contains the column's min
      and max (bounds inclusive).
    - Float columns are cast to float16 or float32 when the column's min and
      max lie strictly inside that type's range, otherwise to float64.
    - Object and string-dtype columns are converted to 'category'.
    - Every other dtype (bool, datetime, timedelta, complex, categorical,
      nullable extension types, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, after its columns have been converted.

    Notes
    -----
    The float downcast only checks the value range, not precision, so values
    may be rounded when a column is narrowed to float16 / float32.
    Memory usage before and after, and the relative saving, are printed.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f}MB'.format(start_mem))

    # Candidate targets per numpy dtype kind, ordered narrowest first
    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (categorical, nullable ints, tz-aware datetimes, ...)
        # have no safe numpy downcast here, so they are skipped.
        if not isinstance(col_type, np.dtype):
            continue

        kind = col_type.kind
        if kind in int_candidates:
            c_min = df[col].min()
            c_max = df[col].max()
            if pd.isna(c_min) or pd.isna(c_max):
                # Empty column: nothing to measure, keep the dtype
                continue
            # Plain Python ints make the range checks independent of numpy
            # scalar promotion rules.
            c_min, c_max = int(c_min), int(c_max)
            for candidate in int_candidates[kind]:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            # NaN / inf extremes fail every comparison and fall back to float64
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
        # bool, datetime, timedelta, complex, ...: left as they are

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f}MB'.format(end_mem))
    # Guard against a zero-byte starting frame
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

To read a data file incrementally using pandas, use the `chunksize` parameter, which specifies the number of rows to read/write at a time.

In [None]:
# With chunksize set, read_csv hands back a sequential file reader
# (TextFileReader) that yields 'chunksize' lines on every step.
# To read the file from the start again, call read_csv once more.
incremental_dataframe = pd.read_csv(
    filepath_or_buffer="train.csv",
    chunksize=100000,  # Number of lines to read.
)

Then you can train on your data incrementally using XGBoost¹ or LightGBM. For LightGBM you have to pass the argument keep_training_booster=True to its .train method, and for XGBoost you have to pass three arguments to its .train method.

In [None]:
# LightGBM settings.
# NOTE(review): LightGBM documents keep_training_booster as a keyword argument
# of lgb.train rather than a params entry — confirm it is also passed there.
lgb_params = {
    'keep_training_booster': True,
    'objective': 'regression',
    'verbosity': 100,
}
# XGBoost settings. First three are for incremental learning:
# refresh the existing trees on the new data instead of growing new ones.
xgb_params = {
    'updater': 'refresh',  # the XGBoost parameter is named 'updater', not 'update'
    'process_type': 'update',
    'refresh_leaf': True,
    # NOTE(review): newer XGBoost releases replace 'silent' with 'verbosity' —
    # confirm against the installed version.
    'silent': False,
}

On each step we will save our estimator and then pass it as an argument during the next step.

In [None]:
# For saving regressor for next use.
lgb_estimator = None
xgb_estimator = None

# NOTE(review): assumes lightgbm is imported as `lgb`, xgboost as `xgb`, and
# that `preprocess` is defined by you — none of them is set up in this cell.
# NOTE(review): xgb_params uses process_type='update', which XGBoost documents
# as starting from an existing model — confirm the first chunk (where
# xgb_model is None) trains as intended.
for df in incremental_dataframe:
    df = preprocess(df)

    # Split data as you like: `split_data` is a placeholder for your own
    # splitter and must return (xtrain, ytrain, xvalid, yvalid).
    xtrain, ytrain, xvalid, yvalid = split_data(df)

    lgb_estimator = lgb.train(lgb_params,
                              # Pass partially trained model:
                              init_model=lgb_estimator,
                              train_set=lgb.Dataset(xtrain, ytrain),
                              valid_sets=lgb.Dataset(xvalid, yvalid),
                              num_boost_round=10,
                              # Keep the returned booster trainable for the next chunk:
                              keep_training_booster=True)

    # Store the result in xgb_estimator so the next chunk continues from it
    # (previously it was stored under another name and never passed on).
    xgb_estimator = xgb.train(xgb_params,
                              dtrain=xgb.DMatrix(xtrain, ytrain),
                              # evals is a list of (DMatrix, name) pairs:
                              evals=[(xgb.DMatrix(xvalid, yvalid), "Valid")],
                              # Pass partially trained model:
                              xgb_model=xgb_estimator)
    # Alias kept for any cell that still refers to the old variable name
    xgb_model = xgb_estimator

    # Free the chunk before the next one is read
    del df, xtrain, ytrain, xvalid, yvalid
    gc.collect()

CatBoost's incremental learning method is in progress.²
To speed things up a bit more, and if your chunks are still sufficiently big, you can parallelize your preprocessing method using Python's multiprocessing library functions like this:

In [None]:
n_jobs = 4
# One pool serves every chunk: starting worker processes is expensive, and the
# `with` block releases the workers even if preprocessing raises.
with Pool(n_jobs) as p:
    for df in incremental_dataframe:
        # Split the chunk into n_jobs contiguous row groups by position, so
        # every piece handed to a worker is a DataFrame slice of the chunk.
        row_groups = np.array_split(np.arange(len(df)), n_jobs)
        f_ = p.map(preprocess, [df.iloc[rows] for rows in row_groups])
        # Stitch the preprocessed pieces back together in their original order
        f_ = pd.concat(f_, axis=0, ignore_index=True)

        # And then your model training ...

For an introduction to parallel programming in Python, read my post here:

https://towardsdatascience.com/speed-up-your-algorithms-part-1-pytorch-56d8a4ae7051