### This notebook shows how to convert a CSV file into a TFRecord file so that it can be used in the later stages of the pipeline. TFX provides methods to do this as well, but let's first see how to do it manually.

In [1]:
import os
import csv
import tensorflow as tf

In [2]:
## Build the path to data/data.csv underneath the current working directory
csv_file_path = os.path.join(os.getcwd(),"data","data.csv")
os.path.exists(csv_file_path)    ## evaluates to True when something exists at that path

True

In [3]:
csv_file_path

'C:\\Users\\atulkumarrai\\PycharmProjects\\Ineuron practice\\Ineuron_practice\\AIOps\\AIOps_Projects\\data\\data.csv'

In [7]:
pwd

'C:\\Users\\atulkumarrai\\PycharmProjects\\Ineuron practice\\Ineuron_practice\\AIOps\\AIOps_Projects'

In [8]:
## reading csv file

## newline="" is what the csv module documentation recommends when opening a file for csv readers,
## so that newlines embedded inside quoted fields are handled by the csv parser itself
with open(csv_file_path, newline="") as csv_file:
    reader = csv.DictReader(csv_file, delimiter=",", quotechar='"')     ## quotechar='"' means quoted fields in the csv are wrapped in double quotes
    for row in reader:
        print(row)      ## every row maps column name -> value, and all values are strings
        break           ## only peek at the first data row; `row` stays bound after the loop
        

OrderedDict([('pickup_community_area', '60'), ('fare', '27.05'), ('trip_start_month', '10'), ('trip_start_hour', '2'), ('trip_start_day', '3'), ('trip_start_timestamp', '1380593700'), ('pickup_latitude', '41.836150155'), ('pickup_longitude', '-87.648787952'), ('dropoff_latitude', ''), ('dropoff_longitude', ''), ('trip_miles', '12.6'), ('pickup_census_tract', ''), ('dropoff_census_tract', ''), ('payment_type', 'Cash'), ('company', 'Taxi Affiliation Services'), ('trip_seconds', '1380'), ('dropoff_community_area', ''), ('tips', '0.0')])


In [9]:
row["pickup_community_area"]



'60'

### Now that we are able to read the CSV contents, we will create the TFRecord writer (`tf.io.TFRecordWriter`), because we need to write a TFRecord file. We will create a separate folder for this.

In [10]:
tf_record_dir = os.path.join(os.getcwd(),"tf_record_files")
tf_record_dir


'C:\\Users\\atulkumarrai\\PycharmProjects\\Ineuron practice\\Ineuron_practice\\AIOps\\AIOps_Projects\\tf_record_files'

In [11]:
### Now let's make a directory
os.makedirs(tf_record_dir, exist_ok=True)

In [12]:
tf_record_file_name = os.path.join(tf_record_dir,"data.tfrecord")   ## This is the name of the converted file with tfrecord extension

In [13]:
tf_record_writer = tf.io.TFRecordWriter(tf_record_file_name)   ## This will create an empty tfrecord file; the writer stays open until close() is called after the csv rows are written

### We will define 3 helper functions using the tf.train module: one for bytes conversion, one for int conversion and one for float conversion

In [19]:
def _byte_feature(value):
    """Encode the string `value` to bytes and wrap it in a one-element tf.train.BytesList."""
    return tf.train.BytesList(value=[value.encode()])

In [15]:
def _int64_feature(value):
    """Wrap `value` in a one-element tf.train.Int64List."""
    int_values = [value]
    return tf.train.Int64List(value=int_values)

In [16]:
def _float_feature(value):
    """Wrap `value` in a one-element tf.train.FloatList."""
    float_values = [value]
    return tf.train.FloatList(value=float_values)

In [25]:
type(_byte_feature(value="Atul Rai"))

tensorflow.core.example.feature_pb2.BytesList

In [26]:
_byte_feature(value="Atul Rai")

value: "Atul Rai"

In [22]:
_int64_feature(value=30)

value: 30

In [24]:
type(_float_feature(value=34.4))

tensorflow.core.example.feature_pb2.FloatList

### Just to understand: a TFRecord file is built from 3 concepts: 1) Feature, 2) Features and 3) Example. 1) A Feature is a single column value, 2) Features is the set of all the columns of one row, and 3) an Example is one row (one record). A Feature can only hold one of three types: bytes (used for strings), int and float. Now let's modify our CSV-reading code accordingly.

In [31]:
## One Example whose Features hold a single bytes feature keyed "name"
example = tf.train.Example(
    features=tf.train.Features(
        feature={
            "name": tf.train.Feature(bytes_list=_byte_feature("Atul Rai")),
        }
    )
)

In [29]:
example    ##This is one row(example) in the tf record file

features {
  feature {
    key: "name"
    value {
      bytes_list {
        value: "Atul Rai"
      }
    }
  }
}

In [32]:
## Same Example, now with a bytes feature ("name") and an int64 feature ("Age")
example = tf.train.Example(
    features=tf.train.Features(
        feature={
            "name": tf.train.Feature(bytes_list=_byte_feature("Atul Rai")),
            "Age": tf.train.Feature(int64_list=_int64_feature(24)),
        }
    )
)

In [34]:
example   ## It shows one example (row) that now holds two features (columns)

features {
  feature {
    key: "Age"
    value {
      int64_list {
        value: 24
      }
    }
  }
  feature {
    key: "name"
    value {
      bytes_list {
        value: "Atul Rai"
      }
    }
  }
}

In [35]:
## One Example covering all three supported value types: bytes, int64 and float
example = tf.train.Example(
    features=tf.train.Features(
        feature={
            "Name": tf.train.Feature(bytes_list=_byte_feature("Atul Rai")),
            "Age": tf.train.Feature(int64_list=_int64_feature(24)),
            "Marks": tf.train.Feature(float_list=_float_feature(78.8)),
        }
    )
)

In [36]:
example

features {
  feature {
    key: "Age"
    value {
      int64_list {
        value: 24
      }
    }
  }
  feature {
    key: "Marks"
    value {
      float_list {
        value: 78.80000305175781
      }
    }
  }
  feature {
    key: "Name"
    value {
      bytes_list {
        value: "Atul Rai"
      }
    }
  }
}

In [38]:
### Now we will serialize the example value
example.SerializeToString()

b'\n7\n\x14\n\x04Name\x12\x0c\n\n\n\x08Atul Rai\n\x0c\n\x03Age\x12\x05\x1a\x03\n\x01\x18\n\x11\n\x05Marks\x12\x08\x12\x06\n\x04\x9a\x99\x9dB'

#### The bytes above are the serialized form of the example. This is how TFRecord files store the data, and that's why the format is platform independent

In [39]:
## Let's modify our functions below so that we can use them properly
def _byte_feature(value):
    """Build a tf.train.Feature holding `value` in a one-element bytes_list.

    Args:
        value: The text to store. A str is encoded with str.encode();
            a value that is already `bytes` is stored as-is.

    Returns:
        A tf.train.Feature whose bytes_list contains the single encoded value.
    """
    # A BytesList stores raw bytes, so text must be encoded first. Input that is
    # already bytes is passed through instead of failing on the missing .encode().
    if not isinstance(value, bytes):
        value = value.encode()
    return tf.train.Feature(bytes_list = tf.train.BytesList(value=[value]))
                            
def _int64_feature(value):
    """Build a tf.train.Feature holding `value` in a one-element int64_list."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)

def _float_feature(value):
    """Build a tf.train.Feature holding `value` in a one-element float_list."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

In [40]:
## The helpers now return complete tf.train.Feature objects, so they go straight into the dict
example_1 = tf.train.Example(
    features=tf.train.Features(
        feature={
            "Name": _byte_feature("Atul Rai"),
            "Age": _int64_feature(24),
            "Marks": _float_feature(78.8),
        }
    )
)

In [41]:
example_1

features {
  feature {
    key: "Age"
    value {
      int64_list {
        value: 24
      }
    }
  }
  feature {
    key: "Marks"
    value {
      float_list {
        value: 78.80000305175781
      }
    }
  }
  feature {
    key: "Name"
    value {
      bytes_list {
        value: "Atul Rai"
      }
    }
  }
}

## Now let's create the TFRecord file for the CSV file we have already read, using the same method as above

In [52]:
## reading csv file

## Convert every csv row into a tf.train.Example and write it to the tfrecord file.
## try/finally makes sure the writer is closed even when a row fails to convert
## (for example float()/int() raising ValueError on a blank value).
try:
    ## newline="" is what the csv module documentation recommends for files handed to csv readers
    with open(csv_file_path, newline="") as csv_file:
        reader = csv.DictReader(csv_file, delimiter=",", quotechar='"')     ## quotechar='"' means quoted fields in the csv are wrapped in double quotes
        for row in reader:
            ## All csv values arrive as strings, so the numeric columns are converted explicitly
            feature = {
                "pickup_community_area": _byte_feature(row["pickup_community_area"]),
                "fare": _float_feature(float(row["fare"])),
                "trip_start_month": _int64_feature(int(row["trip_start_month"])),
                "trip_start_day": _int64_feature(int(row["trip_start_day"])),
            }

            features = tf.train.Features(feature = feature)
            example = tf.train.Example(features = features)

            ## Now we will write into the file using serialization
            tf_record_writer.write(example.SerializeToString())
finally:
    tf_record_writer.close()


In [56]:
example ## This contains the example of our csv

features {
  feature {
    key: "fare"
    value {
      float_list {
        value: 5.650000095367432
      }
    }
  }
  feature {
    key: "pickup_community_area"
    value {
      bytes_list {
        value: "61"
      }
    }
  }
  feature {
    key: "trip_start_day"
    value {
      int64_list {
        value: 6
      }
    }
  }
  feature {
    key: "trip_start_month"
    value {
      int64_list {
        value: 4
      }
    }
  }
}

### A TFRecord file is a binary file, so we can't open and read it as text in our Jupyter notebook. Instead, we can check in our directory that the TFRecord file has some considerable size. We checked and found that the size of the file is more than 1.72 MB, which means the 4 columns have been successfully converted into the TFRecord file.

## All of this can be performed in one go using the CsvExampleGen class of TFX, which we are going to see in Testing.ipynb