# CSV Parsing and Data Manipulation Demo

In [48]:
#require "csv"
#use "dataframe.ml"

type dataframe = { header : string list; data : string list list; }
val loadfile : string -> dataframe = <fun>
val save_df : dataframe -> string -> unit = <fun>
val print_df : dataframe -> unit = <fun>
val get_encodings : string list -> (string * float) list = <fun>
val encode : string list -> float list = <fun>
val string_to_float : string list list -> float list list = <fun>
val cols_to_float : dataframe -> float list list = <fun>
val pre_process : dataframe -> dataframe = <fun>
val find_index : 'a -> 'a list -> int = <fun>
val indices_from_col_lst : dataframe -> string list -> int list = <fun>
val rename_cols_helper : dataframe -> int list -> string list -> dataframe =
  <fun>
val rename_cols : dataframe -> string list -> string list -> dataframe =
  <fun>
val rename_cols_i : dataframe -> int list -> string list -> dataframe = <fun>
val select_cols_helper : dataframe -> int list -> dataframe = <fun>
val select_cols : dataframe -> string list -> dataframe = <fun>
val select_cols_i : 

In [49]:
(* Load the dataset: [loadfile] reads "income.csv" and returns a [dataframe],
   i.e. a [header] (the column names) plus [data]. In the value echoed below,
   [data] holds one string list per column, in header order. *)

let incomes = loadfile "income.csv"

val incomes : dataframe =
  {header =
    ["age"; "education"; "educational-num"; "marital-status"; "occupation";
     "relationship"; "race"; "income"];
   data =
    [["25"; "38"; "28"; "44"; "18"; "34"; "29"; "63"; "24"; "55"; "65"];
     ["11th"; "HS-grad"; "Assoc-acdm"; "Some-college"; "Some-college";
      "10th"; "HS-grad"; "Prof-school"; "Some-college"; "7th-8th"; "HS-grad"];
     ["7"; "9"; "12"; "10"; "10"; "6"; "9"; "15"; "10"; "4"; "9"];
     ["Never-married"; "Married-civ-spouse"; "Married-civ-spouse";
      "Married-civ-spouse"; "Never-married"; "Never-married";
      "Never-married"; "Married-civ-spouse"; "Never-married";
      "Married-civ-spouse"; "Married-civ-spouse"];
     ["Machine-op-inspct"; "Farming-fishing"; "Protective-serv";
      "Machine-op-inspct"; "?"; "Other-service"; "?"; "Prof-specialty";
      "Other-service"; "Craft-repair"; "Machine-op-inspct"];
     ["Own-child"; "Husband"; "Husband"; "Husband"; "Own-child";
      "Not-in-family"; "Unmarried"; "Husb

Though the dataset has been loaded, the output above shows how it is stored internally and is hard to read. For a more readable view, use print_df.

In [50]:
print_df incomes

age education    educational-num marital-status     occupation        relationship  race  income
25  11th         7               Never-married      Machine-op-inspct Own-child     Black <=50K
38  HS-grad      9               Married-civ-spouse Farming-fishing   Husband       White <=50K
28  Assoc-acdm   12              Married-civ-spouse Protective-serv   Husband       White >50K
44  Some-college 10              Married-civ-spouse Machine-op-inspct Husband       Black >50K
18  Some-college 10              Never-married      ?                 Own-child     White <=50K
34  10th         6               Never-married      Other-service     Not-in-family White <=50K
29  HS-grad      9               Never-married      ?                 Unmarried     Black <=50K
63  Prof-school  15              Married-civ-spouse Prof-specialty    Husband       White >50K
24  Some-college 10              Never-married      Other-service     Unmarried     White <=50K
55  7th-8th      4               Married-c

- : unit = ()


Suppose we want to remove the educational-num, marital-status and relationship columns to get a smaller dataset with, hopefully, more relevant features:

In [51]:
(* Build a smaller dataframe containing only the five columns named here. *)
let selected_features = select_cols incomes ["age"; "education"; "occupation"; "race"; "income"]

val selected_features : dataframe =
  {header = ["age"; "education"; "occupation"; "race"; "income"];
   data =
    [["25"; "38"; "28"; "44"; "18"; "34"; "29"; "63"; "24"; "55"; "65"];
     ["11th"; "HS-grad"; "Assoc-acdm"; "Some-college"; "Some-college";
      "10th"; "HS-grad"; "Prof-school"; "Some-college"; "7th-8th"; "HS-grad"];
     ["Machine-op-inspct"; "Farming-fishing"; "Protective-serv";
      "Machine-op-inspct"; "?"; "Other-service"; "?"; "Prof-specialty";
      "Other-service"; "Craft-repair"; "Machine-op-inspct"];
     ["Black"; "White"; "White"; "Black"; "White"; "White"; "Black"; "White";
      "White"; "White"; "White"];
     ["<=50K"; "<=50K"; ">50K"; ">50K"; "<=50K"; "<=50K"; "<=50K"; ">50K";
      "<=50K"; "<=50K"; ">50K"]]}


In [52]:
print_df selected_features

age education    occupation        race  income
25  11th         Machine-op-inspct Black <=50K
38  HS-grad      Farming-fishing   White <=50K
28  Assoc-acdm   Protective-serv   White >50K
44  Some-college Machine-op-inspct Black >50K
18  Some-college ?                 White <=50K
34  10th         Other-service     White <=50K
29  HS-grad      ?                 Black <=50K
63  Prof-school  Prof-specialty    White >50K
24  Some-college Other-service     White <=50K
55  7th-8th      Craft-repair      White <=50K
65  HS-grad      Machine-op-inspct White >50K



- : unit = ()


The same selection can be made by passing the column indices to the select_cols_i function:

In [53]:
(* Same selection by position: 0-based indices 0, 1, 4, 6 and 7 of
   [incomes.header] are "age", "education", "occupation", "race" and
   "income". *)
let selected_features_i = select_cols_i incomes [0; 1; 4; 6; 7]

val selected_features_i : dataframe =
  {header = ["age"; "education"; "occupation"; "race"; "income"];
   data =
    [["25"; "38"; "28"; "44"; "18"; "34"; "29"; "63"; "24"; "55"; "65"];
     ["11th"; "HS-grad"; "Assoc-acdm"; "Some-college"; "Some-college";
      "10th"; "HS-grad"; "Prof-school"; "Some-college"; "7th-8th"; "HS-grad"];
     ["Machine-op-inspct"; "Farming-fishing"; "Protective-serv";
      "Machine-op-inspct"; "?"; "Other-service"; "?"; "Prof-specialty";
      "Other-service"; "Craft-repair"; "Machine-op-inspct"];
     ["Black"; "White"; "White"; "Black"; "White"; "White"; "Black"; "White";
      "White"; "White"; "White"];
     ["<=50K"; "<=50K"; ">50K"; ">50K"; "<=50K"; "<=50K"; "<=50K"; ">50K";
      "<=50K"; "<=50K"; ">50K"]]}


In [54]:
print_df selected_features_i

age education    occupation        race  income
25  11th         Machine-op-inspct Black <=50K
38  HS-grad      Farming-fishing   White <=50K
28  Assoc-acdm   Protective-serv   White >50K
44  Some-college Machine-op-inspct Black >50K
18  Some-college ?                 White <=50K
34  10th         Other-service     White <=50K
29  HS-grad      ?                 Black <=50K
63  Prof-school  Prof-specialty    White >50K
24  Some-college Other-service     White <=50K
55  7th-8th      Craft-repair      White <=50K
65  HS-grad      Machine-op-inspct White >50K



- : unit = ()


We see that in the 'occupation' column, 2 values are missing and denoted with a '?'. Other datasets might represent missing data using 'NaN' or 'null'. Whatever the case, it would be better to fill in a value than to delete the entire row (which might discard valuable data). Here, it seems reasonable to fill it in with 'Other-service'. We can do that using update (which, again, has an index-based counterpart for updating by column index).

In [56]:
(* Fill in missing occupations: every entry of the "occupation" column for
   which the predicate holds (here, entries equal to "?") is replaced by
   "Other-service". The echoed result shows the other columns unchanged. *)
let missing_data_filled = 
    update selected_features "occupation" (fun x -> x = "?") "Other-service"

val missing_data_filled : dataframe =
  {header = ["age"; "education"; "occupation"; "race"; "income"];
   data =
    [["25"; "38"; "28"; "44"; "18"; "34"; "29"; "63"; "24"; "55"; "65"];
     ["11th"; "HS-grad"; "Assoc-acdm"; "Some-college"; "Some-college";
      "10th"; "HS-grad"; "Prof-school"; "Some-college"; "7th-8th"; "HS-grad"];
     ["Machine-op-inspct"; "Farming-fishing"; "Protective-serv";
      "Machine-op-inspct"; "Other-service"; "Other-service"; "Other-service";
      "Prof-specialty"; "Other-service"; "Craft-repair"; "Machine-op-inspct"];
     ["Black"; "White"; "White"; "Black"; "White"; "White"; "Black"; "White";
      "White"; "White"; "White"];
     ["<=50K"; "<=50K"; ">50K"; ">50K"; "<=50K"; "<=50K"; "<=50K"; ">50K";
      "<=50K"; "<=50K"; ">50K"]]}


In [57]:
print_df missing_data_filled

age education    occupation        race  income
25  11th         Machine-op-inspct Black <=50K
38  HS-grad      Farming-fishing   White <=50K
28  Assoc-acdm   Protective-serv   White >50K
44  Some-college Machine-op-inspct Black >50K
18  Some-college Other-service     White <=50K
34  10th         Other-service     White <=50K
29  HS-grad      Other-service     Black <=50K
63  Prof-school  Prof-specialty    White >50K
24  Some-college Other-service     White <=50K
55  7th-8th      Craft-repair      White <=50K
65  HS-grad      Machine-op-inspct White >50K



- : unit = ()


Suppose we wish to remove the data for people with a 7th-8th grade education. We can use filter for this (which also has a filter_i counterpart that identifies the column by index).

In [71]:
(* Keep only the rows whose "education" value satisfies the predicate, i.e. is
   not "7th-8th"; the echoed result shows that row dropped from every column. *)
let mid_school_removed = filter missing_data_filled "education" (fun x -> x <> "7th-8th")

val mid_school_removed : dataframe =
  {header = ["age"; "education"; "occupation"; "race"; "income"];
   data =
    [["25"; "38"; "28"; "44"; "18"; "34"; "29"; "63"; "24"; "65"];
     ["11th"; "HS-grad"; "Assoc-acdm"; "Some-college"; "Some-college";
      "10th"; "HS-grad"; "Prof-school"; "Some-college"; "HS-grad"];
     ["Machine-op-inspct"; "Farming-fishing"; "Protective-serv";
      "Machine-op-inspct"; "Other-service"; "Other-service"; "Other-service";
      "Prof-specialty"; "Other-service"; "Machine-op-inspct"];
     ["Black"; "White"; "White"; "Black"; "White"; "White"; "Black"; "White";
      "White"; "White"];
     ["<=50K"; "<=50K"; ">50K"; ">50K"; "<=50K"; "<=50K"; "<=50K"; ">50K";
      "<=50K"; ">50K"]]}


In [72]:
print_df mid_school_removed

age education    occupation        race  income
25  11th         Machine-op-inspct Black <=50K
38  HS-grad      Farming-fishing   White <=50K
28  Assoc-acdm   Protective-serv   White >50K
44  Some-college Machine-op-inspct Black >50K
18  Some-college Other-service     White <=50K
34  10th         Other-service     White <=50K
29  HS-grad      Other-service     Black <=50K
63  Prof-school  Prof-specialty    White >50K
24  Some-college Other-service     White <=50K
65  HS-grad      Machine-op-inspct White >50K



- : unit = ()


In [78]:
(* Split the data into features and target: the four listed columns become the
   x values and "income" the y values. In the echoed result every column is a
   float list (the categorical strings appear as small numeric codes), with
   6 entries per column in the train part and 4 in the test part.
   NOTE(review): 0.5 is presumably the split ratio, yet the 10 rows are split
   6/4 rather than 5/5 -- confirm how train_test_split interprets it. *)
let x_train, x_test, y_train, y_test = 
    train_test_split mid_school_removed ["age"; "education"; "occupation"; "race"] "income" 0.5

val x_train : float list list =
  [[25.; 38.; 28.; 44.; 18.; 34.]; [2.; 4.; 3.; 6.; 6.; 1.];
   [2.; 1.; 5.; 2.; 3.; 3.]; [1.; 2.; 2.; 1.; 2.; 2.]]
val x_test : float list list =
  [[29.; 63.; 24.; 65.]; [4.; 5.; 6.; 4.]; [3.; 4.; 3.; 2.];
   [1.; 2.; 2.; 2.]]
val y_train : float list = [1.; 1.; 2.; 2.; 1.; 1.]
val y_test : float list = [1.; 2.; 1.; 2.]
