In [1]:
import time, datetime
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

class Printer:
    
    def __init__(self, logFile, initTime=time.time()):
        self.initTime = initTime
        self.log = self.LogInit(logFile) if logFile is not None else None
        self.Print('Printer Inited')
    
    def LogInit(self, filename, mode='w+'):
        return open(filename, mode)
    
    def LogClose(self,):
        self.log.close()
    
    def Print(self, msg, end='\n', timeInput=True, log=True):
        output = ' {}'.format(msg) if timeInput == False else '[{} ({:9.3f} Sec)] {}'.format(str(datetime.datetime.now()), time.time() - self.initTime, msg)
        print(output, end=end)
        if log:
            self.log.write(output + end)
        
    def PrintKnownPredictReport(self, true, predict):
        accuracy = metrics.accuracy_score(true, predict)
        self.Print('Accuracy: ', end='')
        self.Print(accuracy, timeInput=False)
        accuracyNor = metrics.accuracy_score(true, predict, normalize=False)
        self.Print('Accuracy(Sameples):', end='')
        self.Print(accuracyNor, timeInput=False)

        fpr, tpr, thresholds = metrics.roc_curve(true, predict)
        self.Print('ROC curve:')
        self.Print('-- TPR = TP / (TP + FN): ', end='')
        self.Print(tpr, timeInput=False)
        self.Print('-- FPR = FP / (FP + TN): ', end='')
        self.Print(fpr, timeInput=False)
        self.Print('-- Thresholds: ', end='')
        self.Print(thresholds, timeInput=False)

        auc = metrics.auc(fpr, tpr)
        self.Print('AUC: ', end='')
        self.Print(auc, timeInput=False)
        
    def DrawScatter(self, X, Y, Z, C, xlabel, ylabel, zlabel, figsize=(8, 6), dpi=400):
        fig = plg.figure(figsize=figsize, dpi=dpi)
        ax = Axes3D(fig)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_zlabel(zlabel)
        ax.scatter(X, Y, Z, C)
        plt.show()

In [65]:
from os import listdir
from os.path import join, isfile, isdir
import pandas as pd

class Filer:
    
    def __init__(self, printer):
        self.printer = printer
        self.printer.Print('Filer Inited.')
    
    def getFiles(self, filePath):
        files = []
        if(isfile(filePath)):
            return [filePath]
        for f in listdir(filePath):
            if isfile(join(filePath, f)):
                files.append(join(filePath, f))
            elif isdir(join(filePath, f)):
                files.extend(self.getFiles(join(filePath, f)))
        return files
    
    def concatFiles(self, filesPath, dtype=None, dropDupSet=None):
        self.printer.Print('Concating files from path: %s' % filesPath)
        if dropDupSet is not None:
            self.printer.Print('Rows will be droped base on duplicate field check: %s' % str(dropDupSet))
        folder = filesPath
        files = self.getFiles(filesPath)
        colDataNames = ['FileID', 'CustomerID', 'QueryTS', 'ProductID']
        colVirNames = ['FileID', 'VirusRate']
        count = 0
        
        _data = pd.DataFrame()
        for filename in files:
            self.printer.Print('- (File %d) %s concating...' %  (count + 1, filename))
            _read = pd.read_csv(filename, names=colDataNames, dtype=dtype)
            _data = pd.concat([_data, _read], axis=0)
            if dropDupSet is not None:
                beforeDropLen = len(_data)
                _data = _data.drop_duplicates(subset=dropDupSet)
                self.printer.Print('-- After concate, original length: %d, after drop length: %s.' % (beforeDropLen, len(_data)))
            del _read
            self.printer.Print('- %s done.' % filename)
            count += 1
        self.printer.Print('Files concated.')
        return _data

In [87]:
import pandas as pd
import numpy as np
from numpy import array

modelFile = './model_saves/Sklearn/SkLogisticReg.pkl'
logFile = './logs/SkLogisticReg_log_%s.log' % str(time.time())
printer = Printer(logFile)
filer = Filer(printer=printer)

#_data = filer.concatFiles('./data/train_data.csv', dropDupSet=['FileID', 'CustomerID', 'ProductID']) # Read whole data
#_data = filer.concatFiles('./data/train_data/_03/0301.csv', dtype={'FileID': str, 'CustomerID':  str, 'QueryTS': int, 'ProductID': str}, dropDupSet=['FileID', 'CustomerID', 'ProductID']) # Read whole data
_data = filer.concatFiles('./data/exam/', dtype={'FileID': str, 'CustomerID':  str, 'QueryTS': int, 'ProductID': str}, dropDupSet=['FileID', 'CustomerID', 'ProductID']) # Read whole data
_train_set = pd.read_csv('./data/training-set.csv', names=['FileID', 'VirusRate'], dtype={'FileID': str, 'VirusRate': float}) # Read sets VirusRate filed file.
excTrain = pd.read_csv('./data/exception/exception_train.txt', names=['FileID'], dtype={'FileID': str}) # Read excepted data
_data = pd.merge(_data, _train_set, how='left', on='FileID') # Merge with set.csv VirusRate field
df = _data.loc[pd.merge(_data, excTrain, how='left', on='FileID', indicator=True)['_merge'] == 'left_only'] # Remove excepted data

df.VirusRate = df.VirusRate.fillna(float(0.5))
df.CustomerID = LabelEncoder().fit_transform(df.CustomerID) # CustomerID - label transform
#df.QueryTS = MinMaxScaler().fit_transform(df.QueryTS.values.reshape(len(df.QueryTS), 1)) # QueryTS - MinMaxScale to 0 - 1 
df.ProductID = LabelEncoder().fit_transform(df.ProductID)
#dummyProductID = pd.get_dummies(df.ProductID)
#dummyProductID = dummyProductID.rename(columns=lambda x: 'ProductID_' + x)
#df = df.drop('ProductID', axis=1)
#df = df.join(dummyProductID) # ProductID - OneHot encode

#unknownRate = df[df.VirusRate == float(0.5)]
#knownRate = df[df.VirusRate != float(0.5)]
#knownRateSample = knownRate.sample(frac=0.2)
train_X = df.drop(labels=['FileID', 'VirusRate'], axis=1)
train_y = df.VirusRate

[2018-02-08 04:51:32.953449 ( 4372.207 Sec)] Printer Inited
[2018-02-08 04:51:32.953449 ( 4372.207 Sec)] Filer Inited.
[2018-02-08 04:51:32.969074 ( 4372.222 Sec)] Concating files from path: ./data/exam/
[2018-02-08 04:51:32.969074 ( 4372.222 Sec)] Rows will be droped base on duplicate field check: ['FileID', 'CustomerID', 'ProductID']
[2018-02-08 04:51:32.969074 ( 4372.222 Sec)] - (File 1) ./data/exam/0301-Copy1.csv concating...
[2018-02-08 04:51:33.656487 ( 4372.910 Sec)] -- After concate, original length: 475569, after drop length: 207224.
[2018-02-08 04:51:33.656487 ( 4372.910 Sec)] - ./data/exam/0301-Copy1.csv done.
[2018-02-08 04:51:33.656487 ( 4372.910 Sec)] - (File 2) ./data/exam/0302-Copy1.csv concating...
[2018-02-08 04:51:34.484622 ( 4373.738 Sec)] -- After concate, original length: 640906, after drop length: 432224.
[2018-02-08 04:51:34.484622 ( 4373.738 Sec)] - ./data/exam/0302-Copy1.csv done.
[2018-02-08 04:51:34.484622 ( 4373.738 Sec)] - (File 3) ./data/exam/0303-Copy1.c

In [88]:
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions
from sklearn import linear_model, metrics
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.externals import joblib

modelFile = './model_saves/Sklearn/SkLogisticReg_%s.pkl' % str(time.time())
logFile = './logs/SkLogisticReg_log_%s.log' % str(time.time())
printer = Printer(logFile)
filer = Filer(printer=printer)

logistic = linear_model.LogisticRegression()
logistic = joblib.load(modelFile) if isfile(modelFile) else logistic
printer.Print('-- Model inited.')
printer.Print('-- Training...')
logistic_model = logistic.fit(df.drop(labels=['FileID', 'VirusRate'], axis=1), train_y)
joblib.dump(logistic, modelFile)
printer.Print('-- Model saved.')

nowRate = df[df.VirusRate != float(0.5)]
pred = logistic_model.predict(nowRate)
prob = logistic_model.predict_proba(nowRate)
dfVirusRate = pd.DataFrame({'VirusRate': prob[:,:1].flatten()})
whole = pd.merge(unknownRate, dfVirusRate, left_index=True, right_index=True)

printer.Print('-- Coef: ', end='')
printer.Print(logistic_model.coef_, timeInput=False)
printer.Print('-- Intercept: ', end='')
printer.Print(logistic_model.intercept_, timeInput=False)
printer.Print('-- Predict: ', end='')
printer.Print(test_y_pre, timeInput=False)
printer.Print('-- Proba: ', end='')
printer.Print(test_y_pro, timeInput=False)
printer.PrintKnownPredictReport(nowRate.VirusRate, test_y_pre)

#plot_decision_regions(knownRate.drop(labels=['FileID', 'VirusRate'], axis=1), knownRate.VirusRate, clf=logistic)
#plt.show()

printer.Print('[-- Done --]')


[2018-02-08 04:51:47.609174 ( 4386.862 Sec)] Printer Inited
[2018-02-08 04:51:47.609174 ( 4386.862 Sec)] Filer Inited.
[2018-02-08 04:51:47.609174 ( 4386.862 Sec)] -- Model inited.
[2018-02-08 04:51:47.609174 ( 4386.862 Sec)] -- Training...


ValueError: Unknown label type: 'continuous'

In [None]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 61942383 entries, 0 to 61942382
Data columns (total 5 columns):
FileID        object
CustomerID    object
QueryTS       int64
ProductID     object
VirusRate     float64
dtypes: float64(1), int64(1), object(3)
memory usage: 15.3 GB

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, linear_model, metrics
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cross_validation import train_test_split
from sklearn.externals import joblib
from sklearn.utils import multiclass

trainBool = False
testBool = True
modelFile = './model_saves/Sklearn/SkLogisticReg.pkl'
logFile = './logs/SkLogisticReg_log_%s.log' % str(time.time())
printer = Printer(logFile)
filer = Filer(printer=printer)

if trainBool:
    _data03 = filer.concatFiles('./data/train_data/_04/')
    _train = pd.read_csv('./data/training-set.csv', names=['FileID', 'VirusRate'], dtype={'FileID': str, 'VirusRate': float})
    excTrain = pd.read_csv('./data/exception/exception_train.txt', names=['FileID'])
    _train = _train.loc[pd.merge(_train, excTrain, how='left', on='FileID', indicator=True)['_merge'] == 'left_only']
    df = pd.merge(_data03.copy(True), _train, how='left', on='FileID')
    df = df[df.VirusRate.notnull()]
    dfOrigin = df.copy(True)

if testBool:
    _data04 = filer.concatFiles('./data/train_data/_05/')
    #_test = pd.read_csv('./data/testing-set.csv', names=['FileID', 'VirusRate'], dtype={'FileID': str, 'VirusRate': float})
    _train = pd.read_csv('./data/training-set.csv', names=['FileID', 'VirusRate'], dtype={'FileID': str, 'VirusRate': float})
    excTrain = pd.read_csv('./data/exception/exception_train.txt', names=['FileID'])
    _train = _train.loc[pd.merge(_train, excTrain, how='left', on='FileID', indicator=True)['_merge'] == 'left_only']
    df2 = pd.merge(_data04.copy(True), _train, how='left', on='FileID')
    df2.VirusRate.fillna(float(0.0), inplace=True)
    df2Origin = df2.copy(True)
    
if trainBool:
    df.CustomerID = LabelEncoder().fit_transform(df.CustomerID)
    df.ProductID = LabelEncoder().fit_transform(df.ProductID)
    #train_X, test_X, train_y, test_y = train_test_split(df.drop(labels=['FileID', 'VirusRate'], axis=1), df.VirusRate, test_size=0.1)
    train_X = df.drop(labels=['FileID', 'VirusRate'], axis=1)
    train_y = df.VirusRate

if testBool:
    df2.CustomerID = LabelEncoder().fit_transform(df2.CustomerID)
    df2.ProductID = LabelEncoder().fit_transform(df2.ProductID)
    test_X = df2.drop(labels=['FileID', 'VirusRate'], axis=1)
    test_y = df2.VirusRate

#logistic = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg')
logistic = linear_model.LogisticRegression(solver='sag')
logistic = joblib.load(modelFile) if isfile(modelFile) else logistic
printer.Print('Model inited.')
if trainBool:
    printer.Print('Training...')
    logistic_model = logistic.fit(train_X, train_y)
    joblib.dump(logistic, modelFile)
    printer.Print('Model saved.')
    
if testBool:
    printer.Print('Predicting...')
    logistic_model = logistic
    test_y_predict = logistic_model.predict(test_X)
    test_y_proba = logistic_model.predict_proba(test_X)
    dfVirusRate = pd.DataFrame({'VirusRate': test_y_proba[:,:1].flatten()})
    whole = pd.merge(test_X, dfVirusRate, left_index=True, right_index=True)
    
    printer.Print('Coef: ', end='')
    printer.Print(logistic_model.coef_, timeInput=False)
    printer.Print('Intercept: ', end='')
    printer.Print(logistic_model.intercept_, timeInput=False)
    printer.Print('Predict: ', end='')
    printer.Print(test_y_predict, timeInput=False)
    printer.Print('')
    printer.Print('Whole:', timeInput=False)
    printer.Print(whole, timeInput=False)

    printer.PrintKnownPredictReport(test_y, test_y_predict)

printer.Print('Done')
printer.LogClose()
