In [1]:
import numpy as np
import pandas as pd
from numpy import asarray
from numpy import savetxt

In [2]:
def dataExtraction(path):
    df = pd.read_csv(path, header=None).drop_duplicates()
    num_cols = len(df.columns)
    n_bins = 20

    #print(len(df))
    df = df.drop(df[df[num_cols - 1] > 2.0].index)
    #print(len(df))
    df.replace([np.inf, -np.inf], np.nan)
    df.dropna(axis=0)
    #print(len(df))
    
    # Figure out this binning stuff later --> see if you get any useful predictions
    #y = pd.cut(df[num_cols - 1], n_bins, retbins=True)[0]

    mask = np.any(np.isnan(df) | np.equal(df, np.inf) | np.equal(df, -np.inf), axis=1)
    new_data = df[~mask]
    
    # Save the cleaned up data
    final_label = "clean_" + path.split('/')[-1]
    new_path = ""
    for val in path.split('/')[:len(path.split('/'))-1]:
        new_path += val +"/"
    new_path += final_label
    print(new_path)
    savetxt(new_path, new_data, delimiter=',', fmt='%.5f')
    X, y = np.hsplit(new_data, [num_cols - 1])
    
    return X,y

In [3]:
def binning(y, max_ratio = 1.1, intervals = 0.05):
    output_size = int(max_ratio / intervals)
    output_bin = []
    num_rows = y.shape[0]
    num_cols = 1
    
    for data in np.nditer(y):
        index = int(data / intervals)
        output_bin.append(index)
    
    return np.array(output_bin).reshape(num_rows, num_cols)

# Basic tests for the binning (obviously should be way more robust but whatever)
x = np.array([.99, .93, .61]).reshape(3,1)
y_correct = np.array([19, 18, 12]).reshape(3,1)

x1 = np.array([1.0007, .95000005, .949999]).reshape(3,1)
y_correct1 = np.array([20, 19, 18]).reshape(3,1)

print(y_correct == binning(x))
print(y_correct1 == binning(x1))

[[ True]
 [ True]
 [ True]]
[[ True]
 [ True]
 [ True]]


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate
from sklearn.ensemble import GradientBoostingClassifier

def crossFoldValidation(X, y, model):
    cv_results = cross_validate(model, X, y, cv=5)

    return cv_results

def random_forest(X,y):
    return RandomForestClassifier(max_depth=10, random_state=0).fit(X, y)

def mlp_classifier(X,y):
    return MLPClassifier(random_state=1, max_iter=300).fit(X, y)

def svm_classifier(X,y):
    return svm.SVC(decision_function_shape='ovo').fit(X, y)

def gradient_boosting(X,y):
    return GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=10).fit(X, y)

In [5]:
# Quick check to see if the binning will work
X,y = dataExtraction('data_sketching/clean_data/snappy.csv')
print("Simd Results: ")
dataExtraction('data_sketching/clean_data/simd.csv')
print("###############################")
print("RLE Results: ")
dataExtraction('data_sketching/clean_data/rle.csv')
print("###############################")
print("Zlib Results: ")
dataExtraction('data_sketching/clean_data/zlib.csv')
print("###############################")
print("Zstandard Results: ")
dataExtraction('data_sketching/clean_data/zstandard.csv')
# y = binning(y)
# rf = random_forest(X,y)
# mlp = mlp_classifier(X,y)
# svm = svm_classifier(X,y)

data_sketching/clean_data/clean_snappy.csv
Simd Results: 
data_sketching/clean_data/clean_simd.csv
###############################
RLE Results: 
data_sketching/clean_data/clean_rle.csv
###############################
Zlib Results: 
data_sketching/clean_data/clean_zlib.csv
###############################
Zstandard Results: 
data_sketching/clean_data/clean_zstandard.csv


(                0             1             2         3         4         5   \
 1     2.041630e-02  2.050020e-02  1.968380e-02  0.019768  0.019920  0.020195   
 2     3.619930e+14  0.000000e+00  3.619930e+14  0.000008  0.000015  0.000031   
 3     4.704310e+09  1.013180e-02  4.704310e+09  0.009521  0.010590  0.009155   
 4     2.062990e-02  2.017210e-02  2.003480e-02  0.019959  0.020477  0.019180   
 5     2.044680e-02  2.117920e-02  1.989750e-02  0.020996  0.019531  0.020081   
 ...            ...           ...           ...       ...       ...       ...   
 1367  1.220700e-04  1.401300e-45  1.220700e-04  0.000122  0.000671  0.001099   
 1368 -1.291780e+08  9.918210e-05  1.754760e-04  0.000252  0.000374  0.000610   
 1369 -8.149290e+21  1.885990e-02 -8.149290e+21  0.020019  0.021210  0.019867   
 1370 -1.248830e+20  2.037050e-02 -1.248830e+20  0.019547  0.019699  0.019775   
 1371 -1.538670e+02  2.008060e-02 -1.538670e+02  0.020538  0.020569  0.019806   
 
             6         7  

In [56]:
import datetime
def run_ml(path):
    X, y = dataExtraction(path)
    y = binning(y)
    rf = random_forest(X,y)
    mlp = mlp_classifier(X,y)
    svm_c = svm_classifier(X,y)
    xgboost = gradient_boosting(X,y)
    
#     rf_scores = crossFoldValidation(X, y, rf)
#     xgboost_scores = crossFoldValidation(X,y,xgboost)
#     mlp_scores = crossFoldValidation(X, y, mlp)
#     svm_scores = crossFoldValidation(X, y, svm_c)
        
#     print("RF Avg: ", sum(rf_scores['test_score'])/5.0)
#     print("XGBoost Avg: ", sum(xgboost_scores['test_score'])/5.0)
#     print("MLP Scores: ", sum(mlp_scores['test_score'])/5.0)
#     print("SVM Scores: ", sum(svm_scores['test_score'])/5.0)
    
    begin_time = datetime.datetime.now()
    print(rf.predict(X))
    print(datetime.datetime.now() - begin_time)
    
    begin_time = datetime.datetime.now()
    print(mlp.predict(X))
    print(datetime.datetime.now() - begin_time)
    
    begin_time = datetime.datetime.now()
    print(svm_c.predict(X))
    print(datetime.datetime.now() - begin_time)
    
    

# print("Simd Results: ")
# run_ml('data_sketching/clean_data/simd.csv')
# print("###############################")
# print("RLE Results: ")
# run_ml('data_sketching/clean_data/rle.csv')
# print("###############################")
# print("Zlib Results: ")
# run_ml('data_sketching/clean_data/zlib.csv')
# print("###############################")
print("Zstandard Results: ")
run_ml('data_sketching/clean_data/zstandard.csv')

Zstandard Results: 


  return RandomForestClassifier(max_depth=10, random_state=0).fit(X, y)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


[19 19 12 20 20 12 18 18 13 13 12 18 19 20 19 12 13 18 19 19 18 19 12 12
 19 18 20 18 18 19 18 19 19 18 19 19 18 19 19 18 19 18 19 19 13 20 12 19
 19 12 13 20 19 19 12 19 13 13 19 19 19 19 19 19 19 19 19 20 20 18 19 18
 19 19 19 18 12 19 18 19 19 19 18 18 13 12 18 12 18 18 19 19 19 19 19 19
 19 19 19 19 13 12 18 20 18 19 12 19 20 19 19 19 12 20 19 12 19 18 19 19
 13 13 19 18 12 18 18 18 19 20 12 18 20 18 19 18 18 12 19 19 19 12 19 18
 19 20 18 19 13 12 18 19 20 18 18 18 19 20 19 18 12 20 18 19 12 12 18 13
 19 13 19 20 19 20 19 20 19 20 12 18 18 18 19 13 19 12 19 19 20 20 19 19
 20 12 12 19 18 18 20 18 20 19 12 19 19 19 20 19 12 12 12 12 19 12 19 19
 12 18 19 20 19 19 12 19 18 18 18 18 19 20 19 19 20 19 19 12 20 12 13 19
 19 20 19 18 18 12 19 18 18 18 20 20 19 19 18 18 12 19 18 19 19 19 19 18
 18 18 12 18 13 18 19 20 19 20 12 19 19 19 18 19 19 18 20 19 19 19 13 18
 20 19 19 13 18 18 19 18 20 12 13 19 18 12 19 18 12 13 19 18 19 19 19 12
 18 19 19 19 19 20 19 19 19 19 19 12 20 12 19 20 18