In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/chess-games/chess_games.csv


In [2]:
import pandas as pd
import dask.dataframe as dd
import os

import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

In [3]:
%%writefile file.yaml
# Ingestion settings for the chess games CSV. The notebook loads this file
# with testutility.read_config_file and reads the keys noted below.

# Extension appended to file_name when the notebook builds the source path.
file_type: csv
# NOTE(review): dataset_name is not read by the visible notebook code.
dataset_name: ChessGames
# Base name (without extension) of the input file.
file_name: chess_games
# NOTE(review): table_name is not read by the visible notebook code - confirm
# this value is intended for this dataset.
table_name: edsurv
# Separator passed to pandas.read_csv when the source file is loaded.
inbound_delimiter: ","
# Separator for the exported file; same value as the '|' used by the export step.
outbound_delimiter: "|"
# NOTE(review): skip_leading_rows is not read by the visible notebook code.
skip_leading_rows: 1
# Expected CSV headers; col_header_val compares them case-insensitively
# against the loaded file's column names.
columns: 
    - Event
    - White
    - Black
    - Result
    - UTCDate
    - UTCTime
    - WhiteElo
    - BlackElo
    - WhiteRatingDiff
    - BlackRatingDiff
    - ECO
    - Opening
    - TimeControl
    - Termination
    - AN

Overwriting file.yaml


In [4]:
%%writefile testutility.py
import logging
import subprocess
import yaml
import datetime 
import gc
import re
import os

################
# File Reading #
################

def read_config_file(filepath):
    """Parse the YAML file at ``filepath`` and return the loaded object.

    A ``yaml.YAMLError`` raised while parsing is reported through
    ``logging.error`` and is not re-raised; the function then returns ``None``.
    """
    with open(filepath, 'r') as handle:
        try:
            parsed = yaml.safe_load(handle)
        except yaml.YAMLError as parse_error:
            logging.error(parse_error)
            return None
    return parsed


def replacer(string, char):
    """Collapse each run of two or more consecutive ``char`` into one ``char``.

    ``char`` is matched as literal text, so regex metacharacters such as
    ``|``, ``.`` or ``\\`` are safe to pass.

    Args:
        string: Text to clean up.
        char: The character (or literal substring) whose repeats are collapsed.

    Returns:
        ``string`` with every repeated run of ``char`` reduced to a single
        occurrence. An empty ``char`` leaves ``string`` unchanged.
    """
    if not char:
        # Nothing to collapse; an empty pattern cannot be repeated meaningfully.
        return string
    # Escape the literal and group it so the quantifier applies to all of it.
    pattern = '(?:' + re.escape(char) + '){2,}'
    # A callable replacement keeps backslashes in ``char`` from being read as
    # template escapes by re.sub.
    return re.sub(pattern, lambda _match: char, string)

def col_header_val(df, table_config):
    """Check the frame's column names against the configured column list.

    The frame's headers are normalised in place (lower-cased, spaces replaced
    by underscores), so the caller's ``df`` keeps the normalised names after
    the call. The names under ``table_config['columns']`` get the same
    normalisation before the comparison, which ignores column order.

    Args:
        df: DataFrame whose headers are validated (its ``columns`` are mutated).
        table_config: Mapping with a ``'columns'`` list of expected names.

    Returns:
        None. The outcome is printed; on a mismatch the differing names are
        printed and both column lists are logged at INFO level.
    """
    # Normalise the headers on the caller's frame.
    df.columns = df.columns.str.lower()
    df.columns = df.columns.str.replace(' ','_')

    # Apply the same normalisation to the expected names so that a YAML entry
    # containing a space can still match its underscored header.
    expected_col = sorted(
        name.lower().replace(' ', '_') for name in table_config['columns']
    )
    # Compare names only; reindexing the frame would copy all of its data.
    actual_col = sorted(df.columns)

    if actual_col == expected_col:
        print("column name and column length validation passed")
    else:
        print("column name and column length validation failed")
        # Sorted so the reported names come out in a stable order.
        mismatched_columns_file = sorted(set(actual_col).difference(expected_col))
        print("Following File columns are not in the YAML file",mismatched_columns_file)
        missing_YAML_file = sorted(set(expected_col).difference(actual_col))
        print("Following YAML columns are not in the file uploaded",missing_YAML_file)
        logging.info(f'df columns: {actual_col}')
        logging.info(f'expected columns: {expected_col}')

Overwriting testutility.py


In [5]:
# Read config file
import testutility as util
config_data = util.read_config_file("file.yaml")
# read_config_file logs YAML errors and returns None instead of raising, so
# stop here with a clear message rather than with a TypeError in a later cell.
if config_data is None:
    raise ValueError("file.yaml is empty or could not be parsed; check the logged YAML error")

In [6]:
# Display the object returned by read_config_file for file.yaml.
config_data

{'file_type': 'csv',
 'dataset_name': 'ChessGames',
 'file_name': 'chess_games',
 'table_name': 'edsurv',
 'inbound_delimiter': ',',
 'outbound_delimiter': '|',
 'skip_leading_rows': 1,
 'columns': ['Event',
  'White',
  'Black',
  'Result',
  'UTCDate',
  'UTCTime',
  'WhiteElo',
  'BlackElo',
  'WhiteRatingDiff',
  'BlackRatingDiff',
  'ECO',
  'Opening',
  'TimeControl',
  'Termination',
  'AN']}

In [7]:
%%time
# Build the CSV path from the YAML settings, then load it with the configured separator
file_type = config_data['file_type']
source_file = f"../input/chess-games/{config_data['file_name']}.{file_type}"
df = pd.read_csv(source_file, sep=config_data['inbound_delimiter'])

CPU times: user 54.1 s, sys: 5.44 s, total: 59.5 s
Wall time: 1min 30s


In [8]:
# Compare the loaded CSV's headers with the YAML column list. The helper also
# lower-cases df's column names and replaces spaces with underscores in place.
util.col_header_val(df, config_data)

column name and column length validation passed


In [9]:
%%time
# Load the full CSV with pandas, then report its dimensions and on-disk size.
csv_path = '../input/chess-games/chess_games.csv'
df = pd.read_csv(csv_path)
row_count, column_count = df.shape
print(f'Number of Rows: {row_count}, Number of Columns: {column_count}')
size_gb = os.path.getsize(csv_path) / 10**9
print(f"Size: {size_gb:.2f} GB")

Number of Rows: 6256184, Number of Columns: 15
Size: 4.38 GB
CPU times: user 53.9 s, sys: 5.03 s, total: 59 s
Wall time: 1min 6s


In [10]:
%%time
# Load the same CSV through dask.dataframe, then report its dimensions and on-disk size.
csv_path = '../input/chess-games/chess_games.csv'
df = dd.read_csv(csv_path)
row_count = len(df)
column_count = len(df.columns)
print(f'Number of Rows: {row_count}, Number of Columns: {column_count}')
size_gb = os.path.getsize(csv_path) / 10**9
print(f"Size: {size_gb:.2f} GB")

Number of Rows: 6256184, Number of Columns: 15
Size: 4.38 GB
CPU times: user 38.2 s, sys: 9.06 s, total: 47.3 s
Wall time: 36.7 s


In [11]:
# List the column labels of the frame created by dd.read_csv.
df.columns

Index(['Event', 'White', 'Black', 'Result', 'UTCDate', 'UTCTime', 'WhiteElo',
       'BlackElo', 'WhiteRatingDiff', 'BlackRatingDiff', 'ECO', 'Opening',
       'TimeControl', 'Termination', 'AN'],
      dtype='object')

In [12]:
import datetime
import csv
import gzip
import inspect

# pandas renamed to_csv's ``line_terminator`` keyword to ``lineterminator``
# (deprecated in 1.5, removed in 2.0). Use whichever spelling the installed
# pandas accepts so the export does not fail with a TypeError.
to_csv_params = inspect.signature(pd.DataFrame.to_csv).parameters
terminator_kwarg = 'lineterminator' if 'lineterminator' in to_csv_params else 'line_terminator'

# Write csv in gz format in pipe separated text file (|)
# The separator comes from the YAML config (outbound_delimiter) rather than
# being repeated here as a literal.
# NOTE(review): df is the dask DataFrame from dd.read_csv; dask may treat a
# path without '*' as a directory of part files rather than one .gz file -
# confirm the output layout is what is wanted.
df.to_csv('chess_games.gz',
          sep=config_data['outbound_delimiter'],
          header=True,
          index=False,
          quoting=csv.QUOTE_ALL,
          compression='gzip',
          quotechar='"',
          doublequote=True,
          **{terminator_kwarg: '\n'})

KeyboardInterrupt: 