In [1]:
import pandas as pd
import numpy as np
import PyPDF2
import textract
import pdfplumber
import base64
import re
import io
import os
from collections import Counter
from os import path
from glob import glob  
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import chi2
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, auc
from sklearn.metrics import roc_curve


In [2]:
'''
#Basic Code Snippet that is used to extract structure of the pdf
#From this we identify keywords with / key before them using the regex.
#Uncomment and run this to see the raw pdf data

import base64
import re
import io

filename = 'MaliciousPDF.pdf'
filenameV2 = 'MaliciousPDF.txt'
encoding = 'utf-8'
regex = '\/[^\s\n\r]+\s'

with open(filename, mode="r",encoding='utf-8',errors='ignore') as pdf_file:
    encoded_string = pdf_file.readlines()
    print(encoded_string)
    
    with io.open(filenameV2, "w", encoding="utf-8") as f:
        f.writelines(encoded_string)

'''

'\n#Basic Code Snippet that is used to extract structure of the pdf\n#From this we identify keywords with / key before them using the regex.\n#Uncomment and run this to see the raw pdf data\n\nimport base64\nimport re\nimport io\n\nfilename = \'MaliciousPDF.pdf\'\nfilenameV2 = \'MaliciousPDF.txt\'\nencoding = \'utf-8\'\nregex = \'\\/[^\\s\n\r]+\\s\'\n\nwith open(filename, mode="r",encoding=\'utf-8\',errors=\'ignore\') as pdf_file:\n    encoded_string = pdf_file.readlines()\n    print(encoded_string)\n    \n    with io.open(filenameV2, "w", encoding="utf-8") as f:\n        f.writelines(encoded_string)\n\n'

In [43]:
# Feature extraction: use a regex to find every candidate PDF keyword
# (structural tokens such as obj/endobj/stream, plus /Name tokens), then
# keep only the ones that contain no special characters or digits.

# Pattern for PDF structural keywords and /Name tokens.
# Written as a raw string so the backslash reaches the regex engine untouched
# (in a normal string literal "\/" and "\s" are invalid escape sequences and
# trigger a warning on current Python versions).
# "\S+" is the same character class as the former "[^\s\n\r]+", because "\s"
# already covers "\n" and "\r".
regex = r'(endobj|obj|endstream|stream|startxref|xref|trailer|/\S+)'

def FilteringFunction(match):
    """Return True when everything after the first character of ``match`` is alphanumeric.

    The first character is always skipped, whatever it is; an empty
    remainder yields False.
    """
    remainder = match[1:]
    return remainder.isalnum()

def FilteringFunctionOnlyAlphabets(match):
    """Return True when everything after the first character of ``match`` is alphabetic.

    The first character is always skipped, whatever it is; an empty
    remainder yields False.
    """
    remainder = match[1:]
    return remainder.isalpha()

def GetKeywords(fileNamePDF) :
    """Extract the purely alphabetic keyword matches from one file.

    The file is read as UTF-8 text with undecodable bytes ignored, scanned
    with the module-level ``regex``, and the matches are filtered through
    ``FilteringFunctionOnlyAlphabets``.

    Returns:
        (keywords, length): the list of surviving matches in order of
        appearance, and the number of characters in the decoded text.
    """
    with open(fileNamePDF, mode="r",encoding='utf-8',errors='ignore') as pdf_handle:
        decoded_text = pdf_handle.read()

    alphabetic_matches = [
        token
        for token in re.findall(regex, decoded_text)
        if FilteringFunctionOnlyAlphabets(token)
    ]
    return alphabetic_matches, len(decoded_text)

def weightage(number_of_times_word_appeared, textLength, number_of_documents=1):
    """Compute the tf, idf and tf*idf weights for a single keyword count.

    Args:
        number_of_times_word_appeared: occurrences of the keyword (must be non-zero).
        textLength: length of the text the keyword was counted in (must be non-zero).
        number_of_documents: numerator of the idf ratio; defaults to 1.

    Returns:
        (tf, idf, tf_idf) where tf = count / textLength,
        idf = log(number_of_documents / count) and tf_idf = tf * idf.
        With the default of one document, idf is log(1 / count).
    """
    occurrences = float(number_of_times_word_appeared)
    term_frequency = number_of_times_word_appeared / float(textLength)
    inverse_document_frequency = np.log(number_of_documents / occurrences)
    return (
        term_frequency,
        inverse_document_frequency,
        term_frequency * inverse_document_frequency,
    )

def GetKeywordVector(matches, textLength):
    """Build a per-keyword table of counts and weights for one document.

    Args:
        matches: iterable of keyword strings found in the document.
        textLength: length of the document text, passed on to ``weightage``.

    Returns:
        A DataFrame with columns ``0`` (keyword), ``1`` (count), ``'tf'``,
        ``'idf'`` and ``'tf_idf'``, sorted by ``'tf_idf'`` ascending.
        When ``matches`` is empty, an empty DataFrame with those same
        columns is returned so that callers can still index column ``0``.
    """
    keyword_counts = Counter(matches)
    if not keyword_counts:
        # A frame built from an empty list has no columns at all, so the
        # df[1] lookup below would raise KeyError; return the normal schema
        # with zero rows instead.
        return pd.DataFrame(columns=[0, 1, 'tf', 'idf', 'tf_idf'])

    df = pd.DataFrame(list(keyword_counts.items()))

    # Evaluate the weights once per keyword rather than once per output column.
    weights = [weightage(count, textLength) for count in df[1]]
    df['tf'] = [weight[0] for weight in weights]
    df['idf'] = [weight[1] for weight in weights]
    df['tf_idf'] = [weight[2] for weight in weights]

    return df.sort_values('tf_idf', ascending=True)


# Shared vocabulary state used by the helpers below.
KeyWordDictionary = dict()  # keyword -> index assigned by FillIndex
KeyWordIndex = 0            # next index FillIndex will hand out
KeyWordList = list()

def resetKeyWordContainers():
    """Empty the shared keyword dictionary and restart index numbering at 0.

    The dictionary is cleared in place, so existing references to it see the
    change. ``KeyWordList`` is not touched.
    """
    global KeyWordIndex

    KeyWordDictionary.clear()
    KeyWordIndex = 0

def FillIndex(keyword):
    """Register ``keyword`` in the shared dictionary under the next free index.

    Keywords that are already registered keep their existing index and the
    counter is not advanced.
    """
    global KeyWordIndex

    if keyword in KeyWordDictionary:
        return

    KeyWordDictionary[keyword] = KeyWordIndex
    KeyWordIndex += 1
            
def FillIndices(Vector):
    """Register every entry of column ``0`` of ``Vector`` via ``FillIndex``."""
    for keyword in Vector[0]:
        FillIndex(keyword)

def InsertZeroes(Vector) :
    """Align one document's keyword table with the shared keyword dictionary.

    Args:
        Vector: DataFrame whose column ``0`` holds the keyword of each row.

    Returns:
        A list of length ``KeyWordIndex``. Slot ``KeyWordDictionary[k]`` holds
        the document's row for keyword ``k`` as a plain list, or
        ``[k, 0, 0, 0, 0]`` when the document has no row for ``k``. Slots that
        no dictionary entry points at remain ``None``.
    """
    # Positional location of each keyword's row inside Vector.
    position_of = {
        row[0]: position
        for position, (_, row) in enumerate(Vector.iterrows())
    }

    aligned_rows = [None] * KeyWordIndex
    for keyword, slot in KeyWordDictionary.items():
        if keyword in position_of:
            aligned_rows[slot] = Vector.iloc[position_of[keyword]].to_list()
        else:
            # Keyword absent from this document: zero-filled placeholder row.
            aligned_rows[slot] = [keyword, 0,0,0,0]

    return aligned_rows
    
def ExtractVectorsFromDirectory(directory, FileExtension):
    """Build the keyword-count matrix for every matching file in a directory.

    Args:
        directory: directory to scan (not recursive).
        FileExtension: extension without the dot, e.g. ``'pdf'``.

    Returns:
        A float array of shape ``(number_of_files, number_of_keywords)``.
        Row ``i`` belongs to the ``i``-th file in sorted path order and column
        ``j`` is the count of the keyword whose ``KeyWordDictionary`` index is
        ``j`` (0 when the file does not contain it). A directory without
        matching files yields an array of shape ``(0, 0)``.

    Side effects:
        Resets and refills the shared ``KeyWordDictionary`` / ``KeyWordIndex``
        and stores the keywords, in index order, in ``KeyWordList``.
    """
    global KeyWordList

    # glob yields paths in arbitrary order; sorting makes the row order
    # stable, which matters because callers attach labels by row position.
    files = sorted(glob(path.join(directory,"*.{}".format(FileExtension))))

    keyword_frames = []
    for file_name in files:
        matches, text_length = GetKeywords(file_name)
        keyword_frames.append(GetKeywordVector(matches, text_length))

    # Build the shared vocabulary across all files, then pad each file's
    # table so every file has a row for every keyword.
    resetKeyWordContainers()
    for frame in keyword_frames:
        FillIndices(frame)
    KeyWordList = list(KeyWordDictionary.keys())

    aligned_rows = [InsertZeroes(frame) for frame in keyword_frames]

    # Pick out the count (position 1 of each row) as a number. Stacking the
    # mixed [keyword, count, tf, idf, tf_idf] rows into one array would make
    # NumPy coerce every value, counts included, to a string.
    counts = [[row[1] for row in file_rows] for file_rows in aligned_rows]
    return np.array(counts, dtype=float).reshape(len(files), KeyWordIndex)

def method_chi2(data, labels, features = 20):
    """Keep the ``features`` best columns of ``data`` according to the chi2 score.

    Args:
        data: feature matrix passed to ``SelectKBest``.
        labels: class label for each row of ``data``.
        features: value passed as ``k`` to ``SelectKBest``.

    Returns:
        (reduced_data, support): the transformed matrix and the selector's
        ``get_support()`` result.
    """
    selector = SelectKBest(score_func=chi2, k=features).fit(data, labels)
    support_mask = selector.get_support()
    reduced_data = selector.transform(data)
    return reduced_data, support_mask

def FeatureReduction(directory=None, labels=None, features=70):
    """Extract keyword counts from the PDFs in a directory and reduce them with chi2.

    Args:
        directory: directory containing the ``*.pdf`` files; defaults to the
            current working directory.
        labels: one class label per file, in the same order as the rows
            returned by ``ExtractVectorsFromDirectory``; defaults to
            ``[1, 0]``, which only fits a directory with exactly two PDFs.
        features: number of features to keep, passed to ``method_chi2``.

    Returns:
        (reduced_data, labels) where ``labels`` is a NumPy array.

    Raises:
        ValueError: if the number of labels differs from the number of files.
    """
    if directory is None:
        directory = os.getcwd()
    labels = np.array([1,0]) if labels is None else np.asarray(labels)

    data = ExtractVectorsFromDirectory(directory,'pdf')

    # Fail early with an actionable message instead of a generic shape error
    # from inside the feature selector.
    if len(labels) != len(data):
        raise ValueError(
            "Got {} label(s) for {} PDF file(s) in {!r}; pass one label per file.".format(
                len(labels), len(data), directory
            )
        )

    ReducedData, _ = method_chi2(data, labels, features)
    return ReducedData, labels

def Classification(random_state=100):
    """Train an MLP on the reduced keyword features and print its predictions.

    The features from ``FeatureReduction`` are min-max scaled per column to
    [0, 1], split into train and test parts, and used to fit an
    ``MLPClassifier``. The train/test arrays and the predictions for the test
    part are printed; nothing is returned.

    Args:
        random_state: seed used for both the classifier and the train/test
            split, so that repeated runs give the same split and output.
    """
    data, labels = FeatureReduction()

    scaled = minmax_scale(data,feature_range=(0, 1),axis = 0)
    dataset = np.array(scaled , "float32")

    classifier = MLPClassifier(max_iter=50000, alpha=1.0, random_state=random_state,tol=0.000000001)
    # Seed the split as well: without it the samples that land in the test
    # set change from run to run.
    X_train, X_test, y_train, y_test = train_test_split(
        dataset, labels, random_state=random_state
    )
    classifier.fit(X_train, y_train)
    print(X_train)
    print(y_train)
    print(X_test)
    print(y_test)
    predictions = classifier.predict(X_test)
    print(predictions)

Classification()

  return f(*args, **kwargs)


[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]
[0]
[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[1]
[0]
