# Batch Training Machine Learning

In [35]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [36]:
# Load the 2015 feature table from the attached Kaggle dataset.
features_csv = '/kaggle/input/features-2015/features_2015_new.csv'
df = pd.read_csv(features_csv)

In [37]:
## Reference only: loop that rewrites the 'label' column to '1' for every
## non-'white' row and to '0' for 'white' rows.
## It is wrapped in a triple-quoted string, so it is NOT executed; the cell only
## evaluates the string itself.

'''
for i, row in df.iterrows():
    if row['label']!='white':
        df.at[i, 'label'] = '1'
    else:
        df.at[i, 'label'] = '0'
        
'''

# Feature Selection for Machine Learning Model

In [38]:
# Display the loaded frame to inspect its columns before selecting features.
df

In [39]:
# Remove the 'address', 'day' and 'label' columns from the frame.
# NOTE: df is rebound here, so re-running this cell raises KeyError because the
# columns are already gone.
df = df.drop(columns=['address', 'day', 'label'])

## Independent Variables

In [40]:
# Feature matrix: every remaining column except the target.
# Selecting by name (instead of "all but the last column") keeps the target out
# of X even if the CSV column order changes. The result is a DataFrame, not a
# NumPy array.
X = df.drop(columns=['label_final'])

In [41]:
# Display the feature matrix.
X

## Dependent Variables

In [42]:
# Target vector: the 'label_final' column of the frame.
y=df['label_final']

In [43]:
# Number of rows per class label; shows how imbalanced the target is.
y.value_counts()

In [44]:
## The ratio of white addresses to ransomware addresses is approximately 50:1, so the dataset is heavily imbalanced.
## To address this, we oversample the minority class: ransomware records are randomly duplicated until both classes are equally represented.

## OverSampling the Dataset

In [45]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
# instantiating the random oversampler 
ros = RandomOverSampler()
# resampling X, y
X, y = ros.fit_resample(X, y)

# new class distribution 
print(Counter(y))

## Normalizing the Dataset

In [46]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X)
X=x_scaled

## Splitting the Dataset into train and test

In [47]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

## XGBoost Classifier

In [48]:
import xgboost as xgb
# XGBoost classifier created with the library's default hyperparameters.
model = xgb.XGBClassifier()
# Fit on the training split.
model.fit(X_train, y_train)

In [49]:
# Predict class labels for the held-out test features with the fitted model.
y_pred = model.predict(X_test)

## Accuracy Score

In [50]:
from sklearn.metrics import accuracy_score
# Share of test samples whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

## Confusion Matrix

In [51]:
from sklearn.metrics import confusion_matrix
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

## Precision

In [52]:
from sklearn.metrics import precision_score
# Macro average: unweighted mean of the per-class precision values.
precision_score(y_true=y_test, y_pred=y_pred, average='macro')

## Recall

In [53]:
from sklearn.metrics import precision_recall_fscore_support
# Per-class precision, recall, F-score and support (no averaging requested).
precision_recall_fscore_support(y_true=y_test, y_pred=y_pred)

## Logistic Regression

In [54]:
from sklearn.linear_model import LogisticRegression

In [55]:
# Logistic regression with default settings, fitted on the training split
# (fit returns the estimator itself, so construction and fitting can be chained).
model = LogisticRegression().fit(X_train, y_train)

In [56]:
# Predict class labels for the held-out test features with the fitted model.
y_pred = model.predict(X_test)

In [57]:
from sklearn.metrics import accuracy_score
# Share of test samples whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

In [58]:
from sklearn.metrics import confusion_matrix
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [59]:
from sklearn.metrics import precision_score
# Macro average: unweighted mean of the per-class precision values.
precision_score(y_true=y_test, y_pred=y_pred, average='macro')

In [60]:
from sklearn.metrics import precision_recall_fscore_support
# Per-class precision, recall, F-score and support (no averaging requested).
precision_recall_fscore_support(y_true=y_test, y_pred=y_pred)

## Random Forest

In [61]:
from sklearn.ensemble import RandomForestClassifier
# Random forests are stochastic (bootstrap samples, random feature subsets).
# Fixing random_state makes the fitted model, and therefore every metric
# reported below, reproducible across runs.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

In [62]:
# Predict class labels for the held-out test features with the fitted model.
y_pred = model.predict(X_test)

In [63]:
from sklearn.metrics import accuracy_score
# Share of test samples whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

In [64]:
from sklearn.metrics import confusion_matrix
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [65]:
from sklearn.metrics import precision_score
# Macro average: unweighted mean of the per-class precision values.
precision_score(y_true=y_test, y_pred=y_pred, average='macro')

In [66]:
from sklearn.metrics import precision_recall_fscore_support
# Per-class precision, recall, F-score and support (no averaging requested).
precision_recall_fscore_support(y_true=y_test, y_pred=y_pred)

## Sliding Window

In [82]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import warnings
# Silences every warning category from this point on.
# NOTE(review): this probably also hides pandas' SettingWithCopyWarning for the
# column assignment on a filtered slice in the loop below - confirm, and prefer
# a narrower filter.
warnings.filterwarnings('ignore')

In [83]:
# Reload the untouched feature table (the earlier cells dropped the 'day' column).
features_csv = '/kaggle/input/features-2015/features_2015_new.csv'
df = pd.read_csv(features_csv)

In [61]:
# Order the rows by 'day' (ascending) and renumber the index 0..n-1, then keep
# an independent copy (df1) that the windowed experiment reads from.
df = df.sort_values(by="day").reset_index(drop=True)
df1 = df.copy()

In [85]:
# Distinct 'day' values in order of first appearance (df1 is sorted by day).
day_list = df1['day'].drop_duplicates().tolist()

In [64]:
# Peek at the first four entries of day_list.
day_list[:4]

In [87]:
# Rows belonging to the first four distinct days; they start the training window.
seed_days = day_list[:4]
df_new = df1.loc[df1['day'].isin(seed_days)]

In [88]:
df_new
# df_new holds the rows of the first 4 distinct days (the initial training window)

In [66]:
# Keep only the days after the first 4 (those four seed the training window).
# The list is rebuilt from df1 rather than mutated with `del day_list[:4]`, so
# re-running this cell no longer strips four more days each time.
day_list = df1['day'].unique().tolist()[4:]

In [69]:
# Feature columns used for both training and prediction.
feature_cols = ['sum in-degree','sum out-degree (BTCH)','sum in-amount','sum out-amount (BTCH)','count (BTCH)','weight (BTCH)']
# Number of most recent days the model is trained on.
window_size = 4

# One frame per day already present in df_new (the seed days with their
# ground-truth labels). Keeping per-day frames avoids rescanning and
# re-concatenating the growing df_new on every iteration.
day_frames = [df_new[df_new['day'] == d] for d in df_new['day'].unique()]
predicted_frames = []

for i in day_list:
    # Training data: rows of the last `window_size` days seen so far.
    df_new1 = pd.concat(day_frames[-window_size:], axis=0)
    X = df_new1[feature_cols]
    y = df_new1['label_final']
    # Seeded so the whole walk-forward run is reproducible.
    model = RandomForestClassifier(random_state=42).fit(X, y)
    # Rows of the next day, labelled with the model's predictions.
    day_rows = df1[df1['day'] == i]
    predict_labels = model.predict(day_rows[feature_cols])
    # assign() returns a new frame with 'label_final' replaced by the predictions.
    # The original called drop() and discarded the result, then wrote into a
    # filtered slice of df1.
    df_temp = day_rows.assign(label_final=predict_labels)
    # NOTE(review): from here on the window contains predicted, not true, labels,
    # so later models are trained on earlier predictions - confirm this is intended.
    day_frames.append(df_temp)
    predicted_frames.append(df_temp)

# Seed rows followed by every predicted day, in day order, with a fresh 0..n-1 index.
df_new = pd.concat([df_new] + predicted_frames, axis=0, ignore_index=True)

## Accuracy

In [75]:
from sklearn.metrics import accuracy_score
# evaluate predictions
# Score only the days that were actually predicted. The first 4 days in df_new
# keep their ground-truth labels (they seeded the training window), so including
# them counts guaranteed matches and inflates the score.
# df is sorted by day and row-aligned with df_new, so one positional mask fits both.
predicted_rows = ~df['day'].isin(df['day'].unique()[:4]).to_numpy()
accuracy = accuracy_score(df.loc[predicted_rows, 'label_final'], df_new.loc[predicted_rows, 'label_final'])
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [76]:
# Number of rows per label in df_new's 'label_final' column.
df_new['label_final'].value_counts()

## Precision

In [77]:
from sklearn.metrics import precision_score
# Score only the predicted days: the first 4 days in df_new keep their true
# labels (training seed) and would otherwise inflate the metric.
# df is sorted by day and row-aligned with df_new, so one positional mask fits both.
predicted_rows = ~df['day'].isin(df['day'].unique()[:4]).to_numpy()
precision_score(df.loc[predicted_rows, 'label_final'], df_new.loc[predicted_rows, 'label_final'], average='macro')

## Recall

In [79]:
from sklearn.metrics import precision_recall_fscore_support
# Score only the predicted days: the first 4 days in df_new keep their true
# labels (training seed) and would otherwise inflate the metrics.
# df is sorted by day and row-aligned with df_new, so one positional mask fits both.
predicted_rows = ~df['day'].isin(df['day'].unique()[:4]).to_numpy()
precision_recall_fscore_support(df.loc[predicted_rows, 'label_final'], df_new.loc[predicted_rows, 'label_final'], average='macro')

## Confusion Matrix

In [81]:
from sklearn.metrics import confusion_matrix
# Count only the predicted days: the first 4 days in df_new keep their true
# labels (training seed) and would only add guaranteed matches to the diagonal.
# df is sorted by day and row-aligned with df_new, so one positional mask fits both.
predicted_rows = ~df['day'].isin(df['day'].unique()[:4]).to_numpy()
confusion_matrix(df.loc[predicted_rows, 'label_final'], df_new.loc[predicted_rows, 'label_final'])

## Expanding Window

In [90]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import warnings
# Silences every warning category from this point on.
# NOTE(review): this probably also hides pandas' SettingWithCopyWarning for the
# column assignment on a filtered slice in the loop below - confirm, and prefer
# a narrower filter.
warnings.filterwarnings('ignore')

In [91]:
# Reload the feature table, order it by 'day' and renumber the index.
df = (
    pd.read_csv('/kaggle/input/features-2015/features_2015_new.csv')
    .sort_values(by="day")
    .reset_index(drop=True)
)
# Independent working copy used by the loop below.
df1 = df.copy()
all_days = df1['day'].unique().tolist()
# The first four days seed the training set; the remaining days are predicted one by one.
df_new = df1[df1['day'].isin(all_days[:4])]
day_list = all_days[4:]

In [92]:
# Feature columns used for both training and prediction.
feature_cols = ['sum in-degree','sum out-degree (BTCH)','sum in-amount','sum out-amount (BTCH)','count (BTCH)','weight (BTCH)']

for i in day_list:
    # Expanding window: train on everything accumulated so far.
    X = df_new[feature_cols]
    y = df_new['label_final']
    # Seeded so the whole walk-forward run is reproducible.
    model = RandomForestClassifier(random_state=42).fit(X, y)
    # Rows of the next day, labelled with the model's predictions.
    day_rows = df1[df1['day'] == i]
    predict_labels = model.predict(day_rows[feature_cols])
    # assign() returns a new frame with 'label_final' replaced by the predictions.
    # The original called drop() and discarded the result, then wrote into a
    # filtered slice of df1.
    df_temp = day_rows.assign(label_final=predict_labels)
    # NOTE(review): the appended rows carry predicted labels, so later models are
    # trained on earlier predictions - confirm this is intended.
    df_new = pd.concat([df_new, df_temp], axis=0, ignore_index=True)
    

## Accuracy

In [93]:
from sklearn.metrics import accuracy_score
# evaluate predictions
# Score only the days that were actually predicted. The first 4 days in df_new
# keep their ground-truth labels (they seeded the training set), so including
# them counts guaranteed matches and inflates the score.
# df is sorted by day and row-aligned with df_new, so one positional mask fits both.
predicted_rows = ~df['day'].isin(df['day'].unique()[:4]).to_numpy()
accuracy = accuracy_score(df.loc[predicted_rows, 'label_final'], df_new.loc[predicted_rows, 'label_final'])
print("Accuracy: %.2f%%" % (accuracy * 100.0))

## Precision

In [94]:
from sklearn.metrics import precision_score
# Score only the predicted days: the first 4 days in df_new keep their true
# labels (training seed) and would otherwise inflate the metric.
# df is sorted by day and row-aligned with df_new, so one positional mask fits both.
predicted_rows = ~df['day'].isin(df['day'].unique()[:4]).to_numpy()
precision_score(df.loc[predicted_rows, 'label_final'], df_new.loc[predicted_rows, 'label_final'], average='macro')

## Recall

In [96]:
from sklearn.metrics import precision_recall_fscore_support
# Score only the predicted days: the first 4 days in df_new keep their true
# labels (training seed) and would otherwise inflate the metrics.
# df is sorted by day and row-aligned with df_new, so one positional mask fits both.
predicted_rows = ~df['day'].isin(df['day'].unique()[:4]).to_numpy()
precision_recall_fscore_support(df.loc[predicted_rows, 'label_final'], df_new.loc[predicted_rows, 'label_final'], average='macro')

## Confusion Matrix

In [95]:
from sklearn.metrics import confusion_matrix
# Count only the predicted days: the first 4 days in df_new keep their true
# labels (training seed) and would only add guaranteed matches to the diagonal.
# df is sorted by day and row-aligned with df_new, so one positional mask fits both.
predicted_rows = ~df['day'].isin(df['day'].unique()[:4]).to_numpy()
confusion_matrix(df.loc[predicted_rows, 'label_final'], df_new.loc[predicted_rows, 'label_final'])

In [97]:
# Number of rows per label in df_new's 'label_final' column.
df_new['label_final'].value_counts()