In [60]:
import time, datetime
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

class Printer:
    
    def __init__(self, logFile, initTime=time.time()):
        self.initTime = initTime
        self.log = self.LogInit(logFile) if logFile is not None else None
        self.Print('Printer Inited')
    
    def LogInit(self, filename, mode='w+'):
        return open(filename, mode)
    
    def LogClose(self,):
        self.log.close()
    
    def Print(self, msg, end='\n', timeInput=True, log=True):
        output = ' {}'.format(msg) if timeInput == False else '[{} ({:9.3f} Sec)] {}'.format(str(datetime.datetime.now()), time.time() - self.initTime, msg)
        print(output, end=end)
        if log:
            self.log.write(output + end)
        
    def PrintJudgeResult(self, true, predict):
        accuracy = metrics.accuracy_score(true, predict)
        self.Print('Accuracy: ', end='')
        self.Print(accuracy, timeInput=False)
        accuracyNor = metrics.accuracy_score(true, predict, normalize=False)
        self.Print('Accuracy(Sameples):', end='')
        self.Print(accuracyNor, timeInput=False)

        fpr, tpr, thresholds = metrics.roc_curve(true, predict)
        self.Print('ROC curve:')
        self.Print('-- TPR = TP / (TP + FN): ', end='')
        self.Print(tpr, timeInput=False)
        self.Print('-- FPR = FP / (FP + TN): ', end='')
        self.Print(fpr, timeInput=False)
        self.Print('-- Thresholds: ', end='')
        self.Print(thresholds, timeInput=False)

        auc = metrics.auc(fpr, tpr)
        self.Print('AUC: ', end='')
        self.Print(auc, timeInput=False)
        
    def DrawScatter(self, X, Y, Z, C, xlabel, ylabel, zlabel, figsize=(8, 6), dpi=400):
        fig = plg.figure(figsize=figsize, dpi=dpi)
        ax = Axes3D(fig)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_zlabel(zlabel)
        ax.scatter(X, Y, Z, C)
        plt.show()

In [56]:
from os import listdir
from os.path import join, isfile, isdir
import pandas as pd

class Filer:
    
    def __init__(self, printer):
        self.printer = printer
        self.printer.Print('Filer Inited.')
    
    def getFiles(self, filePath):
        files = []
        if(isfile(filePath)):
            return [filePath]
        for f in listdir(filePath):
            if isfile(join(filePath, f)):
                files.append(join(filePath, f))
            elif isdir(join(filePath, f)):
                files.extend(self.getFiles(join(filePath, f)))
        return files
    
    def concatFiles(self, filesPath):
        self.printer.Print('Concating files from path: %s' % filesPath)
        folder = filesPath
        files = self.getFiles(filesPath)
        colDataNames = ['FileID', 'CustomerID', 'QueryTS', 'ProductID']
        colVirNames = ['FileID', 'VirusRate']
        count = 0
        
        _data = pd.DataFrame()
        for filename in files:
            self.printer.Print('-- (File %d) %s concating...' %  (count + 1, filename), end='')
            _read = pd.read_csv(filename, names=colDataNames, dtype={'FileID': str, 'CustomerID': str, 'ProductID': str})
            _data = pd.concat([_data, _read], axis=0)
            del _read
            self.printer.Print('%s down.' % filename, timeInput=False)
            count += 1
        self.printer.Print('Files concated.')
        return _data

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, linear_model, metrics
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cross_validation import train_test_split
from sklearn.externals import joblib

trainBool = True
testBool = False
modelFile = './model_saves/Sklearn/SkLogisticReg.pkl'
logFile = './logs/SkLogisticReg_log_%s.log' % str(time.time())
printer = Printer(logFile)
filer = Filer(printer=printer)

if trainBool:
    _data03 = filer.concatFiles('./data/train_data/_03/')
    _train = pd.read_csv('./data/training-set.csv', names=['FileID', 'VirusRate'], dtype={'FileID': str, 'VirusRate': float})
    excTrain = pd.read_csv('./data/exception/exception_train.txt', names=['FileID'])
    _train = train.loc[pd.merge(train, excTrain, how='left', on='FileID', indicator=True)['_merge'] == 'left_only']
    df = pd.merge(_data03.copy(True), _train, how='left', on='FileID')
    df = df[df.VirusRate.notnull()]
    dfOrigin = df.copy(True)

if testBool:
    _data04 = filer.concatFiles('./data/train_data/_04/')
    #_test = pd.read_csv('./data/testing-set.csv', names=['FileID', 'VirusRate'], dtype={'FileID': str, 'VirusRate': float})
    _train = pd.read_csv('./data/training-set.csv', names=['FileID', 'VirusRate'], dtype={'FileID': str, 'VirusRate': float})
    excTrain = pd.read_csv('./data/exception/exception_train.txt', names=['FileID'])
    _train = train.loc[pd.merge(train, excTrain, how='left', on='FileID', indicator=True)['_merge'] == 'left_only']
    df2 = pd.merge(_data04.copy(True), _train, how='left', on='FileID')
    df2Origin = df2.copy(True)

if trainBool:
    df.CustomerID = LabelEncoder().fit_transform(df.CustomerID)
    df.ProductID = LabelEncoder().fit_transform(df.ProductID)
    #train_X, test_X, train_y, test_y = train_test_split(df.drop(columns=['FileID', 'VirusRate']), df.VirusRate, test_size=0.1)
    train_X = df.drop(columns=['FileID', 'VirusRate'])
    train_y = df.VirusRate

if testBool:
    df2.CustomerID = LabelEncoder().fit_transform(df2.CustomerID)
    df2.ProductID = LabelEncoder().fit_transform(df2.ProductID)
    test_X = df2.drop(columns=['FileID', 'VirusRate'])
    test_y = df2.VirusRate

#logistic = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg')
logistic = linear_model.LogisticRegression(solver='sag')
logistic = joblib.load(modelFile) if isfile(modelFile) else logistic
printer.Print('Model inited.')
if trainBool:
    printer.Print('Training...')
    logistic_model = logistic.fit(train_X, train_y)
    joblib.dump(logistic, modelFile)
    printer.Print('Model saved.')
    
if testBool:
    logistic_model = logistic
    test_y_predict = logistic_model.predict(test_X)
    test_y_proba = logistic_model.predict_proba(test_X)
    dfVirusRate = pd.DataFrame({'VirusRate': test_y_proba[:,:1].flatten()})
    whole = pd.merge(test_X, dfVirusRate, left_index=True, right_index=True)

    printer.Print('Coef: ', end='')
    printer.Print(logistic_model.coef_, timeInput=False)
    printer.Print('Intercept: ', end='')
    printer.Print(logistic_model.intercept_, timeInput=False)
    printer.Print('Predict: ', end='')
    printer.Print(test_y_predict, timeInput=False)
    printer.Print('')
    printer.Print('Whole:', timeInput=False)
    printer.Print(whole, timeInput=False)

    printer.PrintJudgeResult(test_y, test_y_predict)

printer.Print('Done')
printer.LogClose()


[2018-02-01 06:56:20.118283 ( 1398.546 Sec)] Printer Inited
[2018-02-01 06:56:20.119319 ( 1398.547 Sec)] Filer Inited.
[2018-02-01 06:56:20.119596 ( 1398.547 Sec)] Concating files from path: ./data/train_data/_03/
[2018-02-01 06:56:20.242206 ( 1398.670 Sec)] -- (File 1) ./data/train_data/_03/0322.csv concating...

In [None]:
from os import listdir
from os.path import join, isfile, isdir
import time, datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import preprocessing, linear_model, metrics
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cross_validation import train_test_split
from sklearn.externals import joblib


initTime = time.time()
def overridePrint(msg, end='\n', timeInput=True):
    print(' {}'.format(msg) if timeInput == False else '[{} ({:9.3f} Sec)] {}'.format(str(datetime.datetime.now()), time.time() - initTime, msg), end=end)

def getFiles(filePath):
    files = []
    if(isfile(filePath)):
        return [filePath]
    for f in listdir(filePath):
        if isfile(join(filePath, f)):
            files.append(join(filePath, f))
        elif isdir(join(filePath, f)):
            files.extend(getFiles(join(filePath, f)))
    return files

modelFile = './model_saves/Sklearn/SkLogisticReg.pkl'

overridePrint('Inited...')
folder = './data/train_data/_05/'
files = getFiles(folder)
columns = ['FileID', 'CustomerID', 'QueryTS', 'ProductID']
count = 0
_data = pd.DataFrame()
overridePrint('Processing Path: %s' % folder)
overridePrint('Loading %d files: ' % len(files))
for filename in files:
    overridePrint('-- (File %d) %s concating...' %  (count + 1, filename), end='')
    _read = pd.read_csv(filename, names=columns, dtype={'FileID': str, 'CustomerID': str, 'ProductID': str})
    _data = pd.concat([_data, _read], axis=0)
    del _read
    overridePrint('%s down.' % filename, timeInput=False)
    count += 1

_test = pd.read_csv('./data/testing-set.csv', names=['FileID', 'VirusRate'], dtype={'FileID': str, 'VirusRate':float})
df = pd.merge(_data.copy(True), _test, how='left', on='FileID')
df = df[df.VirusRate.notnull()]
dfOrigin = df.copy(True)

df.CustomerID = LabelEncoder().fit_transform(df.CustomerID)
#df.CustomerID = preprocessing.minmax_scale(df.CustomerID, feature_range=(0, 1))
df.QueryTS = preprocessing.minmax_scale(df.QueryTS, feature_range=(0, 1))
df.ProductID = LabelEncoder().fit_transform(df.ProductID)
#df.ProductID = preprocessing.minmax_scale(df.ProductID, feature_range=(0, 1))

test_X = df
test_y = df.VirusRate

logistic = linear_model.LogisticRegression(solver='sag')
logistic = joblib.load(modelFile) if isfile(modelFile) else logistic
logistic_model = logistic
test_y_predict = logistic_model.predict(df.drop(columns=['FileID', 'VirusRate']))
test_y_proba = logistic_model.predict_proba(df.drop(columns=['FileID', 'VirusRate']))
dfVirusRate = pd.DataFrame({'VirusRate': test_y_proba[:,:1].flatten()})
whole = pd.merge(test_X.drop(columns=['VirusRate']), dfVirusRate, left_index=True, right_index=True)

print(logistic_model.coef_)
print(logistic_model.intercept_ )
print(test_y_predict)
print(test_y_proba)
print('Whole')
print(whole)

#accuracy = metrics.accuracy_score(test_y, test_y_predict)
#print(accuracy)
whole.drop(columns=['CustomerID', 'QueryTS', 'ProductID']).to_csv('./result/sklearn_result.csv', sep=',', index=False)

print('Done')
