In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, name) for name in filenames):
        print(input_path)

# Any results you write to the current directory are saved as output.

In [None]:
from scipy.sparse import csr_matrix
import numpy as np

def read_libsvm(fname, num_features=0):
    '''
    Read a libsvm-formatted file into a sparse feature matrix and a label array.

    Each non-blank line is expected to look like
    ``<label> <col>:<value> <col>:<value> ...``; blank lines are skipped.

    Parameters:
        fname: path of the libsvm file to read.
        num_features: feature count to use. When 0 (the default) it is
            inferred as the largest column index seen plus one.

    Returns:
        X: scipy.sparse.csr_matrix [1] with one row per example. Note that it
           has ``num_features + 1`` columns, i.e. one more than the count
           returned below.
        y: numpy array holding the integer label of every example.
        num_features: the feature count that was provided or inferred.

    Example usage:

        X_train, y_train, num_features = read_libsvm('data_train')
        X_test, y_test, _ = read_libsvm('data_test', num_features)

    [1] https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html
    '''
    data = []
    y = []
    row_ind = []
    col_ind = []
    with open(fname) as f:
        # Stream the file line by line instead of loading it all with readlines().
        for line in f:
            elements = line.split()
            if not elements:
                # Blank line (e.g. a stray newline at the end of the file).
                continue
            # Use the number of examples read so far as the row index so that
            # skipped blank lines never leave empty rows in the matrix.
            row = len(y)
            y.append(int(elements[0]))
            for el in elements[1:]:
                c, v = el.split(":")
                row_ind.append(row)
                col_ind.append(int(c))
                data.append(float(v))
    if num_features == 0:
        # max() raises on an empty sequence, so fall back to 0 when the file
        # contained no feature entries at all.
        num_features = max(col_ind) + 1 if col_ind else 0
    X = csr_matrix((data, (row_ind, col_ind)), shape=(len(y), num_features + 1))

    return X, np.array(y), num_features


In [None]:
# Directory holding the data splits and the feature count handed to
# read_libsvm, defined once instead of being repeated for every split.
DATA_DIR = '/kaggle/input/ml-fall2019-android-malware/data/data/data-splits'
NUM_FEATURES = 360

X_train, y_train, num_features = read_libsvm(DATA_DIR + '/data.train', NUM_FEATURES)
X_test, y_test, _ = read_libsvm(DATA_DIR + '/data.test', NUM_FEATURES)
X_eval, y_eval, _ = read_libsvm(DATA_DIR + '/data.eval.anon', NUM_FEATURES)

# Dense copies of the sparse matrices, first as DataFrames ...
X_train_df = pd.DataFrame(X_train.toarray())
X_test_df = pd.DataFrame(X_test.toarray())
X_eval_df = pd.DataFrame(X_eval.toarray())

# ... and as plain numpy arrays, which is what fit()/predict() are called with.
X_train_np = X_train_df.to_numpy()
X_test_np = X_test_df.to_numpy()
X_eval_np = X_eval_df.to_numpy()

In [None]:
def initial_setup(l, iteration=100000, flags=True):
    """
    Store the training hyperparameters in module-level globals.

    l         -- stored in the global ``lr`` (step size used by ``fit``).
    iteration -- stored in the global ``iters`` (number of update steps run by ``fit``).
    flags     -- stored in the global ``flag`` (whether ``fit`` and ``prob``
                 prepend a column of ones to their input).
    """
    global lr, iters, flag
    lr, iters, flag = l, iteration, flags
    
    
def intercept(X):
    """Return a new array equal to the 2-D array ``X`` with a leading column of ones."""
    n_rows = X.shape[0]
    ones_column = np.ones((n_rows, 1))
    return np.concatenate([ones_column, X], axis=1)

def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)); works element-wise on numpy arrays."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator
    
    
def loss(h, y):
    """
    Mean binary cross-entropy between predicted probabilities ``h`` and labels ``y``.

    Both arguments are used as numpy arrays of the same shape; ``h`` is
    presumably strictly between 0 and 1, since its logarithm is taken.
    """
    positive_term = -y * np.log(h)
    negative_term = (1 - y) * np.log(1 - h)
    return (positive_term - negative_term).mean()
    
def fit(X, y):
    """
    Learn logistic-regression weights with full-batch gradient descent.

    Reads the globals ``flag``, ``iters`` and ``lr`` and stores the learned
    weight vector in the global ``theta``. Nothing is returned.

    X -- 2-D numpy array of features, one row per example. When ``flag`` is
         true a column of ones is prepended via ``intercept``.
    y -- 1-D numpy array with one label per row of ``X`` (presumably 0/1).
    """
    global theta
    if flag:
        X = intercept(X)
    # weights initialization
    theta = np.zeros(X.shape[1])

    for _ in range(iters):
        h = sigmoid(np.dot(X, theta))
        # Gradient of the mean cross-entropy loss with respect to theta.
        gradient = np.dot(X.T, (h - y)) / y.size
        theta -= lr * gradient
        # The predictions are not recomputed after the step: nothing reads
        # them, and doing so doubled the cost of every iteration.
    
def prob(X):
    """
    Return the predicted probability for every row of ``X``.

    Uses the global weight vector ``theta``; when the global ``flag`` is true
    a column of ones is prepended to ``X`` first via ``intercept``.
    """
    features = intercept(X) if flag else X
    return sigmoid(np.dot(features, theta))
    
def predict(X):
    """Return hard labels: the probabilities from ``prob`` rounded to whole numbers."""
    probabilities = prob(X)
    return probabilities.round()


> TRAIN ACCURACY

In [None]:
# Train with step size 0.001 for 100000 updates, then label the training set itself.
initial_setup(l=0.001, iteration=100000)
fit(X_train_np, y_train)
preds = predict(X_train_np)

# predict() yields floats; turn them into plain Python ints.
preds_new = [int(v) for v in preds]


In [None]:
import sklearn.metrics as metrics
# Score the training-set predictions. Each metric is computed once and the
# stored value is printed, instead of calling the scorer a second time.
accuracy = metrics.accuracy_score(y_train, preds_new)
precision = metrics.precision_score(y_train, preds_new)
recall = metrics.recall_score(y_train, preds_new)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

TEST ACCURACY

In [None]:
import sklearn.metrics as metrics

acc_list = []
learn_rate = [0.1, 0.01, 0.001, 0.0001]
its = [10, 100, 1000, 10000, 100000]

# Retrain from scratch for every (learning rate, iterations) pair and score on
# the test split. acc_list is filled in loop order, so the accuracies for
# learn_rate[k] sit at positions k*len(its) .. (k+1)*len(its) - 1.
for l in learn_rate:
    for it in its:
        print("learning rate",l,"----","iterations",it)
        initial_setup(l, it)
        fit(X_train_np, y_train)
        preds = predict(X_test_np)
        preds_new = [int(v) for v in preds]
        # Compute each metric once and reuse it for printing and bookkeeping.
        accuracy = metrics.accuracy_score(y_test, preds_new)
        precision = metrics.precision_score(y_test, preds_new)
        recall = metrics.recall_score(y_test, preds_new)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        acc_list.append(accuracy)

# NOTE: once the loop ends, the globals theta, preds_new, precision and recall
# all belong to the LAST pair tried (learn_rate[-1], its[-1]), not to the
# best-scoring one.


In [None]:
# Iteration counts; used as the x-axis values when the accuracies are plotted.
its = [10,100,1000,10000,100000]


In [None]:
import matplotlib.pyplot as plt
# One accuracy-vs-iterations figure per learning rate. acc_list stores
# len(its) consecutive accuracies for each entry of learn_rate, in order.
for idx, rate in enumerate(learn_rate):
    start = idx * len(its)
    plt.plot(its, acc_list[start:start + len(its)])
    # Title each figure so the four plots can be told apart.
    plt.title('Learning rate {}'.format(rate))
    plt.ylabel('Accuracy')
    plt.xlabel('Iterations')
    plt.show()

In [None]:
# Score the current test-set predictions held in preds_new. Each metric is
# computed once and the stored value is printed.
accuracy = metrics.accuracy_score(y_test, preds_new)
precision = metrics.precision_score(y_test, preds_new)
recall = metrics.recall_score(y_test, preds_new)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

In [None]:
# F1 is the harmonic mean of precision and recall. When both are zero the
# formula would divide by zero, so report 0.0 in that case.
F1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
F1

> EVAL DATA

In [None]:
# Label the evaluation split. predict() relies on the global theta, so these
# predictions come from whichever fit() call ran most recently.
preds_2 = predict(X_eval_np)
preds_2


In [None]:
# Convert the float predictions to plain Python ints for the submission file.
preds_new_e = [int(v) for v in preds_2]
# Show only the first few labels rather than dumping the whole list.
preds_new_e[:10]

CSV CODE

In [None]:
# Read the evaluation example ids, one per line of the file.
with open("/kaggle/input/ml-fall2019-android-malware/data/data/data-splits/eval.id") as f:
    lines = f.readlines()

# Strip the trailing newline from every id.
eval_id_list = [ele.rstrip("\n") for ele in lines]

In [None]:
# Pair every evaluation id with its predicted label.
list_of_tuples = list(zip(eval_id_list, preds_new_e))

# Turn the (id, label) pairs into a two-column DataFrame.
df_new = pd.DataFrame(list_of_tuples, columns = ['example_id', 'label'])

# Display the frame as the cell output.
df_new

In [None]:
# index=False keeps the file to exactly the two columns example_id,label;
# without it pandas also writes the row index as an unnamed first column.
df_new.to_csv(r'df_new.csv', index=False)
from IPython.display import FileLink
# Show a link to the written file as the cell output.
FileLink(r'df_new.csv')