# Homework 1 for Security Analytics 
## Author: Kandarp Khandwala - kkhandw1
---
### Decision Trees
- Data:

    * Homework2.csv contains static-analysis information for 99 binary files: 50 files are benign/good and 49 are malicious/bad.
    * The target variable is “label”; its value is either “good” or “bad”. All other columns are features.

- Task:

    * Build a classifier with a decision tree. Use the whole data set for both training and testing purposes. Submit your best model in terms of prediction accuracy.
    * What are the 10 most important features? Why?


In [1]:
import os
import numpy as np
import pandas as pd
import math
import matplotlib.patches as patches
import matplotlib.pylab as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import metrics
from sklearn import datasets
from IPython.display import Image

%matplotlib inline

## Helper functions

In [2]:
# A function that gives a visual representation of the decision tree
def Decision_Tree_Image(decision_tree, feature_names, name="temp"):
    """Return the PNG rendering of a decision tree for inline display.

    The graphviz export and `dot` rendering steps are currently disabled
    (kept below as comments), so `decision_tree` and `feature_names` are
    not used and 'images/<name>.png' is expected to already exist.
    """
    # Disabled step 1: export the decision tree to graphviz format
    #dot_file = tree.export_graphviz(decision_tree.tree_, out_file='images/' + name + '.dot', feature_names=feature_names)

    # Disabled step 2: call graphviz to turn the .dot file into a .png
    #os.system("dot -Tpng images/" + name + ".dot -o images/" + name + ".png")

    # Hand the image file back so the notebook can show it
    png_path = 'images/' + name + '.png'
    return Image(filename=png_path)

# A function to plot the data
def Plot_Data(data, v1, v2, tv):
    """Scatter-plot column `v2` against column `v1`, colored by column `tv`.

    Points whose `tv` value equals 0 are drawn in red, all others in blue.
    Note that the figure size is set through `plt.rcParams`, i.e. globally.
    """
    # Use a 12 x 8 figure
    plt.rcParams['figure.figsize'] = [12.0, 8.0]

    # One color per row, chosen from the target column
    color = []
    for label in data[tv]:
        color.append("red" if label == 0 else "blue")

    xs = data[v1]
    ys = data[v2]

    # Draw the points and label both axes with the column names
    plt.scatter(xs, ys, c=color, s=50)
    plt.xlabel(v1)
    plt.ylabel(v2)

    # Leave a small margin around the data on each axis
    plt.xlim([min(xs) - 1, max(xs) + 1])
    plt.ylim([min(ys) - .05, max(ys) + .05])
    


In [3]:
def entropy(target):
    """Return the normalized entropy of a vector of class labels.

    Parameters:
        target: sequence of non-negative integer class labels (the input
            format required by `np.bincount`).

    Returns:
        0 when at most one distinct label is present (a pure or empty
        set); otherwise the entropy computed with logarithm base
        `len(np.bincount(target))`, i.e. (largest label + 1). For labels
        0/1 this is the usual base-2 entropy in [0, 1].
    """
    # Number of samples
    n = len(target)
    # Count how frequently each label value occurs
    counts = np.bincount(target).astype(float)
    # Label values that never occur contribute nothing (0 * log 0 := 0).
    # Dropping them, rather than returning 0 as soon as any bin is empty,
    # keeps a set such as [0, 0, 2, 2] from being reported as pure.
    present = counts[counts > 0]
    # A set with zero or one distinct label is pure
    if len(present) <= 1:
        return 0
    # Same base as before: the number of bins, including empty ones
    base = len(counts)
    total = 0
    for count in present:
        total += math.log(count/n, base) * count/n
    # The sum above is non-positive; flip the sign
    return -1 * total

def information_gain(feature, threshold, target):
    """Return the information gain of splitting on `feature < threshold`.

    The gain is the entropy of `target` minus the size-weighted entropies
    of the two groups produced by the split (rows where the feature is
    not below the threshold, and rows where it is).
    """
    # Work on numpy arrays so boolean masks can be used for selection
    labels = np.array(target)
    below = np.array(feature) < threshold

    # Start from the entropy of the unsplit labels
    gain = entropy(labels)

    # Bin 0 holds the rows with feature >= threshold, bin 1 those below it;
    # a side with no rows beyond the last non-empty bin is simply absent.
    for level, size in enumerate(np.bincount(below).astype(float)):
        gain -= size/len(below) * entropy(labels[below == level])

    return gain

## Load the data

In [4]:
# Read the homework CSV into a DataFrame; the file handle is closed on exit
with open("homework2.csv") as csv_handle:
    data = pd.read_csv(csv_handle)

# Bare expression so the notebook renders the DataFrame
data

Unnamed: 0,check_sum,compile_date,datadir_IMAGE_DIRECTORY_ENTRY_BASERELOC_size,datadir_IMAGE_DIRECTORY_ENTRY_EXPORT_size,datadir_IMAGE_DIRECTORY_ENTRY_IAT_size,datadir_IMAGE_DIRECTORY_ENTRY_IMPORT_size,datadir_IMAGE_DIRECTORY_ENTRY_RESOURCE_size,debug_size,export_size,generated_check_sum,...,sec_vasize_upx3,size_code,size_image,size_initdata,size_uninit,std_section_names,total_size_pe,virtual_address,virtual_size,virtual_size_2
0,0,585810474,0,0,44,40,0,0,0,98624,...,0.0,13824,180224,43008,65536,0,85504,4096,13352,65536
1,0,1218437803,0,0,468,100,1048,0,0,53913,...,0.0,4096,20480,12288,0,1,20480,4096,3346,2182
2,98299,1297813288,0,0,372,40,1660,28,0,113512,...,0.0,36864,53248,12288,0,1,64512,4096,33504,5156
3,104924,708992537,4612,0,0,2842,6144,0,0,104924,...,0.0,67072,114688,13824,0,0,81920,4096,66596,456
4,150326,1276781438,188,154,308,180,84152,0,154,150326,...,0.0,6656,110592,89088,0,0,97280,4096,6532,2074
5,0,1378220696,0,0,464,120,0,0,0,100137,...,0.0,45568,81920,26112,0,1,66560,4096,45562,5962
6,84663,1094353745,0,0,0,1280,8360,0,0,48166,...,0.0,2048,40960,17408,0,0,22510,4096,1243,91
7,0,1377863401,0,0,748,160,168,0,0,74165,...,0.0,18944,45056,10752,0,1,29184,4096,18906,4322
8,0,967728842,0,0,560,180,30208,28,0,112179,...,0.0,37888,102400,20480,0,0,72704,4096,37510,18780
9,0,1259933759,0,0,228,100,32768,0,0,68538,...,0.0,8192,53248,12288,0,1,53248,4096,8192,1466
