In [None]:
#Importing Libraries

import os
import time
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import sklearn.utils

import seaborn as sns

In [None]:
def load_data(filename):
    """
    Read a CSV file of mining-pool address data into a DataFrame.

    Input:
    filename -> Location of the CSV file (anything accepted by pandas.read_csv)

    Output:
    DataFrame holding the parsed contents of the file
    """
    return pd.read_csv(filename)

In [None]:
# Load the two labelled extracts.
# Both CSV files were extracted from Google BigQuery, one per value of is_miner
# (more details in the accompanying document).

data_true = load_data("mining_pool_data_true.csv")    # rows with is_miner = true
data_false = load_data("mining_pool_data_false.csv")  # rows with is_miner = false

# Report (rows, columns) of each extract, one per line
print(data_true.shape, data_false.shape, sep="\n")

In [None]:
# Preview the first rows of the is_miner = true extract
data_true.head()

In [None]:
# Preview the first rows of the is_miner = false extract
data_false.head()

In [None]:
# Combine data_true and data_false into a single frame, then shuffle the rows.
# join='inner' keeps only the columns that both frames have in common;
# ignore_index=True renumbers the combined rows 0..n-1.

frames = [data_true, data_false]
data = pd.concat(frames, join='inner', ignore_index=True)

# A fixed random_state makes the shuffled row order reproducible, so re-running
# the notebook gives the same data order for every later cell.
data = sklearn.utils.shuffle(data, random_state=42)

In [None]:
# Store the combined data.
# index=False: the row index still carries each row's position in the
# concatenated frame (all data_true rows come before the data_false rows), so
# writing it out would add an unnamed column that gives away the is_miner label.
data.to_csv('mining_data_final.csv', index=False)

In [None]:
# Exploratory data analysis
# Preview the first rows of the combined, shuffled data
data.head()

In [None]:
# Column names, dtypes and non-null counts of the combined data
data.info()

In [None]:
# Summary statistics of the combined data
data.describe()

In [None]:
# Count the missing values in every column
data.isnull().sum()

# Observation recorded from the output: no null values were found except in
# stddev_output_idle_time and stddev_input_idle_time

In [None]:
# Drop stddev_output_idle_time and stddev_input_idle_time: they contain null
# values, and the data already has mean_output_idle_time and mean_input_idle_time.
#
# Reassigning instead of inplace=True, together with errors='ignore', keeps this
# cell idempotent: running it a second time is a no-op rather than a KeyError.
data = data.drop(
    columns=['stddev_output_idle_time', 'stddev_input_idle_time'],
    errors='ignore',
)

In [None]:
# Re-inspect columns, dtypes and non-null counts after the stddev columns were dropped
data.info()

In [None]:
# Correlation matrix

# Correlate every column except the label ('is_miner') and 'address'
data_temp = data.drop(columns=['is_miner', 'address'])
corr = data_temp.corr()

# Diverging palette centred on 0, with the colour scale fixed to [-1, 1]
palette = sns.diverging_palette(20, 220, n=200)
ax = sns.heatmap(
    corr,
    vmin=-1,
    vmax=1,
    center=0,
    cmap=palette,
    square=True,
    linewidths=0.005,
)

# Rotate the x tick labels by 45 degrees, right-aligned
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=45, horizontalalignment='right')

# Write the figure to disk without surrounding padding
plt.savefig('mining_corr.png', bbox_inches='tight', pad_inches=0.0)

In [None]:
# Histogram (20 bins) of the 'input_active_months' column.
# Title and axis labels let the figure stand on its own; the trailing ';'
# suppresses the textual repr that would otherwise be echoed as cell output.
ax = data['input_active_months'].hist(bins=20)
ax.set(
    title='Distribution of input_active_months',
    xlabel='input_active_months',
    ylabel='Number of rows',
);

In [None]:
# Histogram (20 bins) of the 'output_active_months' column.
# Title and axis labels let the figure stand on its own; the trailing ';'
# suppresses the textual repr that would otherwise be echoed as cell output.
ax = data['output_active_months'].hist(bins=20)
ax.set(
    title='Distribution of output_active_months',
    xlabel='output_active_months',
    ylabel='Number of rows',
);

In [None]:
# Train/test split

# Features: every column except the label ('is_miner') and 'address'
features = data.drop(labels=['is_miner', 'address'], axis=1)

# Labels as a plain array
target = data['is_miner'].values

# Positional row numbers are split alongside the data so that the rows of the
# test set can later be looked up in `data` with .iloc
indices = range(len(features))

# Hold out 20% of the rows for testing. The fixed random_state makes the
# partition (and therefore the reported metrics) reproducible across runs.
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    features, target, indices, test_size=0.2, random_state=42
)

In [None]:
# Fit a logistic-regression classifier on the training split and time the fit.
# NOTE: despite its name, `rf` is a LogisticRegression with default settings.
rf = LogisticRegression()

start = time.time()
rf.fit(X_train, y_train)
elapsed = time.time() - start

# Elapsed wall-clock time in whole seconds, rounded down
training_time = int(math.floor(elapsed))
print(training_time)

In [None]:
# Predicted labels for the held-out test features
y_pred = rf.predict(X_test) 

In [None]:
# Score of the fitted model on the held-out test set
# (for scikit-learn classifiers, score() is the mean accuracy)
score = rf.score(X_test, y_test)
print(score)

In [None]:
# Confusion matrix on the test set: rows are the true labels, columns the predicted labels
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Boolean mask over the test set: True where the model predicts True
# although the row is labelled False
labelled_false = (y_test == False)
predicted_true = (y_pred == True)
false_positives = labelled_false & predicted_true


In [None]:
# Rows of `data` that belong to the test set, in the order given by indices_test
data_test = data.iloc[indices_test]

# Restrict the test rows to those flagged by the false_positives mask
false_positive_rows = data_test.iloc[false_positives]

print('False Positive addresses')
print(false_positive_rows.shape)

# Show the first 15 false positives
false_positive_rows.head(15)

