# Logistic Regression Multiple Features

## Normalisasi Data

In [166]:
import "github.com/go-gota/gota/dataframe"

In [167]:
// getDataframe opens the CSV file at path file and parses it into a gota
// DataFrame.
//
// It returns a non-nil error when the file cannot be opened or when the CSV
// content cannot be parsed. ReadCSV does not return an error value itself; it
// records failures on the DataFrame's Err field, so that field is surfaced
// here as the function's error. On error the returned DataFrame is the zero
// value and must not be used.
func getDataframe(file string) (dataframe.DataFrame, error) {
	f, err := os.Open(file)
	if err != nil {
		return dataframe.DataFrame{}, err
	}
	defer f.Close()

	df := dataframe.ReadCSV(f)
	if df.Err != nil {
		return dataframe.DataFrame{}, df.Err
	}
	return df, nil
}

In [171]:
%%
df, err := getDataframe("../data/diabetes2.csv")
if err != nil {
    fmt.Println(err)
    return
}

fmt.Println(df)
fmt.Println(df.Select([]int{0,1,2,3,4}).Describe())
fmt.Println(df.Select([]int{5,6,7}).Describe())

[768x9] DataFrame

    Pregnancies Glucose BloodPressure SkinThickness Insulin BMI       ...
 0: 6           148     72            35            0       33.600000 ...
 1: 1           85      66            29            0       26.600000 ...
 2: 8           183     64            0             0       23.300000 ...
 3: 1           89      66            23            94      28.100000 ...
 4: 0           137     40            35            168     43.100000 ...
 5: 5           116     74            0             0       25.600000 ...
 6: 3           78      50            32            88      31.000000 ...
 7: 10          115     0             0             0       35.300000 ...
 8: 2           197     70            45            543     30.500000 ...
 9: 8           125     96            0             0       0.000000  ...
    ...         ...     ...           ...           ...     ...       ...
    <int>       <int>   <int>         <int>         <int>   <float>   ...

Not Showing: Diabe

In [197]:
%%
minMax := []map[string]float64{
    map[string]float64{ "min" : 0.0, "max" : 17.0 },
    map[string]float64{ "min" : 0.0, "max" : 199.0 },
    map[string]float64{ "min" : 0.0, "max" : 122.0 },
    map[string]float64{ "min" : 0.0, "max" : 99.0 },
    map[string]float64{ "min" : 0.0, "max" : 846.0 },
    map[string]float64{ "min" : 0.0, "max" : 67.1 },
    map[string]float64{ "min" : 0.078000, "max" : 2.42 },
    map[string]float64{ "min" : 21.0, "max" : 81.0 },
} 

// Open the loan dataset file.
f, err := os.Open("../data/diabetes2.csv")
if err != nil {
    log.Fatal(err)
}
defer f.Close()

// Create a new CSV reader reading from the opened file.
reader := csv.NewReader(f)
reader.FieldsPerRecord = 9

// Read in all of the CSV records
rawCSVData, err := reader.ReadAll()
if err != nil {
    log.Fatal(err)
}

// Create the output file.
f, err = os.Create("01-clean_diabetes.csv")
if err != nil {
    log.Fatal(err)
}
defer f.Close()

// Create a CSV writer.
w := csv.NewWriter(f)

// Sequentially move the rows writing out the parsed values.
for idx, record := range rawCSVData {

    // Skip the header row.
    if idx == 0 {
        // Write the header to the output file.
        if err := w.Write(record); err != nil {
            log.Fatal(err)
        }
        continue
    }

    // Initialize a slice to hold our parsed values.
    outRecord := make([]string, 9)

    for i, v := range record {
        if i == 8 {
            outRecord[i] = v
            continue
        }
        value, err := strconv.ParseFloat(v, 64)
        if err != nil {
            log.Fatal(err)
        }
        outRecord[i] = strconv.FormatFloat((value-minMax[i]["min"])/(minMax[i]["max"]-minMax[i]["min"]), 'f', 4, 64)
    }
    
    // Write the record to the output file.
    if err := w.Write(outRecord); err != nil {
        log.Fatal(err)
    }
}

// Write any buffered data to the underlying writer (standard output).
w.Flush()

if err := w.Error(); err != nil {
    log.Fatal(err)
} 

## Membagi Data menjadi Training dan Test Set

In [190]:
%%
// Load the normalised dataset; on failure print the error and end the cell.
df, err := getDataframe("01-clean_diabetes.csv")
if err != nil {
    fmt.Println(err)
    return
}

// Four fifths of the rows go to the training set, the rest to the test set.
trainingNum := (4 * df.Nrow()) / 5

// Build the list of row indices 0..Nrow-1 and shuffle it.
allIdx := make([]int, df.Nrow())
for i := range allIdx {
    allIdx[i] = i
}
rand.Shuffle(len(allIdx), func(i, j int) { allIdx[i], allIdx[j] = allIdx[j], allIdx[i] })

// Use the shuffled indices for the split.
trainingDF := df.Subset(allIdx[:trainingNum])
testDF := df.Subset(allIdx[trainingNum:])

// Output files paired with the frame each one receives, in write order.
outputs := []struct {
    path string
    data dataframe.DataFrame
}{
    {path: "01-train-diabetes.csv", data: trainingDF},
    {path: "01-test-diabetes.csv", data: testDF},
}

for _, out := range outputs {
    f, err := os.Create(out.path)
    if err != nil {
        log.Fatal(err)
    }

    w := bufio.NewWriter(f)
    if err := out.data.WriteCSV(w); err != nil {
        log.Fatal(err)
    }

    // Flush explicitly so that no buffered rows are left behind, regardless
    // of how WriteCSV treats the writer it is handed.
    if err := w.Flush(); err != nil {
        log.Fatal(err)
    }

    // Close inside the loop (not via defer, which would only run when the
    // cell ends) and check the result so a failed write-back is reported.
    if err := f.Close(); err != nil {
        log.Fatal(err)
    }
}

## Menggunakan goml untuk Logistic Regression

Seperti yang disebutkan di awal tulisan ini, ada banyak library di golang yang mengimplementasikan logistic regression. Menurut saya, yang paling populer adalah GoLearn dan goml. Kita akan menggunakan salah satunya, yaitu goml.

In [191]:
import (
    "github.com/cdipaolo/goml/base"
    "github.com/cdipaolo/goml/linear"
    "gonum.org/v1/plot" 
    "gonum.org/v1/plot/plotter" 
    "gonum.org/v1/plot/plotutil" 
    "github.com/janpfeifer/gonb/gonbui"
)    

In [192]:
// ConfusionMatrix collects the outcome counts and the derived metrics of a
// binary classifier evaluated on a labelled test set.
type ConfusionMatrix struct {
	// Number of test samples whose actual label is positive / negative.
	positive, negative int

	// Correct predictions, split by predicted class.
	truePositive, trueNegative int

	// Wrong predictions, split by predicted class.
	falsePositive, falseNegative int

	// Metrics derived from the counts above.
	recall, precision, accuracy float64
}

In [193]:
// String renders the confusion matrix as a tab-indented, multi-line report:
// the six counts first, then a blank line, then recall, precision and
// accuracy with two decimals each.
func (cm ConfusionMatrix) String() string {
	const layout = "\tPositives: %d\n" +
		"\tNegatives: %d\n" +
		"\tTrue Positives: %d\n" +
		"\tTrue Negatives: %d\n" +
		"\tFalse Positives: %d\n" +
		"\tFalse Negatives: %d\n" +
		"\n" +
		"\tRecall: %.2f\n" +
		"\tPrecision: %.2f\n" +
		"\tAccuracy: %.2f\n"

	return fmt.Sprintf(
		layout,
		cm.positive,
		cm.negative,
		cm.truePositive,
		cm.trueNegative,
		cm.falsePositive,
		cm.falseNegative,
		cm.recall,
		cm.precision,
		cm.accuracy,
	)
}

In [194]:
// tryValues trains a logistic regression model on the training data with the
// given hyperparameters and evaluates it on the test data.
//
// Parameters:
//   - learningRate, regularization, iterations: passed unchanged to
//     linear.NewLogistic together with the base.BatchGA optimisation method.
//   - decisionBoundary: a test sample is counted as predicted positive when
//     the first element of the model's prediction is >= this value.
//   - xTrain, yTrain: training features and labels.
//   - xTest, yTest: test features and labels. yTest is indexed in step with
//     xTest, so it must hold at least len(xTest) entries.
//
// Returns the confusion matrix of the test run and the trained model, or the
// error reported by model.Learn or model.Predict.
//
// A metric whose denominator is zero (no actual positives, no predicted
// positives, or no counted samples) is reported as 0 instead of NaN.
func tryValues(learningRate float64, regularization float64, iterations int, decisionBoundary float64, xTrain, xTest [][]float64, yTrain, yTest []float64) (*ConfusionMatrix, *linear.Logistic, error) {
	cm := ConfusionMatrix{}

	// Count the actual class sizes; only labels that are exactly 1 or 0 count.
	for _, y := range yTest {
		if y == 1.0 {
			cm.positive++
		}
		if y == 0.0 {
			cm.negative++
		}
	}

	// Instantiate and train the model. Anything the model writes to its
	// Output writer is discarded.
	// NOTE(review): ioutil.Discard is deprecated in favour of io.Discard
	// (Go 1.16+); kept here so the cell needs no additional import.
	model := linear.NewLogistic(base.BatchGA, learningRate, regularization, iterations, xTrain, yTrain)
	model.Output = ioutil.Discard
	if err := model.Learn(); err != nil {
		return nil, nil, err
	}

	// Evaluate the model on the test data.
	for i := range xTest {
		prediction, err := model.Predict(xTest[i])
		if err != nil {
			return nil, nil, err
		}
		positive := prediction[0] >= decisionBoundary

		// Labels other than 0 and 1 (after truncation to int) are not tallied.
		switch y := int(yTest[i]); {
		case y == 1 && positive:
			cm.truePositive++
		case y == 1:
			cm.falseNegative++
		case y == 0 && positive:
			cm.falsePositive++
		case y == 0:
			cm.trueNegative++
		}
	}

	// Calculate the evaluation metrics, guarding against empty denominators.
	cm.recall = safeRatio(cm.truePositive, cm.positive)
	cm.precision = safeRatio(cm.truePositive, cm.truePositive+cm.falsePositive)
	cm.accuracy = safeRatio(cm.truePositive+cm.trueNegative, cm.positive+cm.negative)
	return &cm, model, nil
}

// safeRatio returns num/den as a float64, or 0 when den is 0.
func safeRatio(num, den int) float64 {
	if den == 0 {
		return 0
	}
	return float64(num) / float64(den)
}

In [198]:
%%
// Hyperparameters of the single training run performed by this cell.
const (
    learningRate     = 0.0001
    regularization   = 0.0
    iterations       = 1000
    decisionBoundary = 0.5
)

// Load the training split; on failure print the error and end the cell.
xTrain, yTrain, err := base.LoadDataFromCSV("./01-train-diabetes.csv")
if err != nil {
    fmt.Println(err)
    return
}

// Load the test split the same way.
xTest, yTest, err := base.LoadDataFromCSV("./01-test-diabetes.csv")
if err != nil {
    fmt.Println(err)
    return
}

// Disabled grid search over iteration counts and decision boundaries, kept
// for reference.
/*var maxAccuracy float64
var maxAccuracyCM *ConfusionMatrix
var maxAccuracyDb float64
var maxAccuracyIter int
var maxAccuracyModel *linear.Logistic

//Try different parameters to get the best model
for iter := 100; iter < 3000; iter += 500 {
    for db := 0.05; db < 1.0; db += 0.01 {
        cm, model, err := tryValues(0.0001, 0.0, iter, db, xTrain, xTest, yTrain, yTest)
        if err != nil {
            fmt.Println(err)
            return
        }
        if cm.accuracy > maxAccuracy {
            maxAccuracy = cm.accuracy
            maxAccuracyCM = cm
            maxAccuracyDb = db
            maxAccuracyModel = model
            maxAccuracyIter = iter
        }
    }
}

fmt.Printf("Maximum accuracy: %.2f\n\n", maxAccuracy)
fmt.Printf("with Model: %s\n\n", maxAccuracyModel)
fmt.Printf("with Confusion Matrix:\n%s\n\n", maxAccuracyCM)
fmt.Printf("with Decision Boundary: %.2f\n", maxAccuracyDb)
fmt.Printf("with Num Iterations: %d\n", maxAccuracyIter)
*/

// Train once with the fixed hyperparameters and score the result on the
// test split.
matrix, fitted, err := tryValues(learningRate, regularization, iterations, decisionBoundary, xTrain, xTest, yTrain, yTest)
if err != nil {
    fmt.Println(err)
    return
}

// Report the fitted model followed by its confusion matrix.
fmt.Printf("with Model: %s\n\n", fitted)
fmt.Printf("with Confusion Matrix:\n%s\n\n", matrix)

with Model: h(θ,x) = 1 / (1 + exp(-θx))
θx = -1.533 + 0.60099(x[1]) + 1.02816(x[2]) + -0.48679(x[3]) + -0.06320(x[4]) + 0.32023(x[5]) + 0.32382(x[6]) + 0.41202(x[7]) + 0.72926(x[8])

with Confusion Matrix:
	Positives: 67
	Negatives: 87
	True Positives: 4
	True Negatives: 86
	False Positives: 1
	False Negatives: 63

	Recall: 0.06
	Precision: 0.80
	Accuracy: 0.58


