# Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
root_path = "/content/drive/My Drive/Backend/"

In [0]:
import numpy as np
from collections import Counter

In [0]:
def get_data():
    filename = []
    type = []

    for x in list(os.walk(root_path+"Files/"))[0][-1]:
        filename.append(x)
        type.append(x.split(".")[-1].lower())

    print(Counter(type))
    idx = list(range(0,len(filename)))
    
    filename = np.array(filename)
    type = np.array(type)
    
    np.random.shuffle(idx)
    filename = filename[idx]
    type = type[idx]

    return filename, type

In [103]:
t, u = get_data()

Counter({'docx': 224, 'pdf': 182, 'doc': 169, 'jpg': 147, 'sldprt': 143, 'csv': 136, 'png': 117, 'cbr': 94, 'a': 78, 'pptx': 60, 'ppt': 24, 'sldasm': 19, 'gif': 2, 'jpeg': 1, 'slddrt': 1, 'slddrw': 1})


# Text Processing

In [0]:
import textwrap
import numpy as np
import tensorflow as tf

In [0]:
def feature_extractor(filename):
    string = ""  
    with open(filename, "rb") as file:
        string = str("{0:b}".format(int.from_bytes(file.read(), byteorder='big')))
    file.close()
    
    return list(map(lambda x:int(x,2), textwrap.wrap(string, 12))), (filename.split(".")[-1]).lower()

In [0]:
p_update = {
    "theta1": 0,
    "theta2": 0,
    "phi1": 0,
    "phi2": 0
}

In [0]:
def data_sampler(data_array, samples, min_sample_size, max_sample_size, p_update):
    
    theta1 = p_update["theta1"] + len(data_array)//2
    theta2 = p_update["theta2"] + len(data_array)//10
    
    indices = np.random.normal(loc = theta1, 
                               scale = theta2, 
                               size = (samples,1)).astype(np.int32)
    
    phi1 = p_update["phi1"] + (min_sample_size + max_sample_size)//2
    phi2 = p_update["phi2"] + np.ceil((max_sample_size - min_sample_size)/np.sqrt(samples))
    
    sizes = np.clip(np.random.normal(loc = phi1, 
                             scale = phi2, 
                             size = (samples,1)).astype(np.int32), min_sample_size, max_sample_size)
    
    param = {
                "theta1": theta1, 
                "theta2": theta2,
                "phi1": min_sample_size,
                "phi2": max_sample_size,
                "dist1": "normal",
                "dist2": "normal"
            }
    
    return np.array(list(map(lambda st,sz: data_array[st[0]:st[0]+sz[0]], indices,sizes))), param

In [0]:
def padding(samples, max_sample_length):
    return list(map(lambda x: np.array((x+["0"]*(max_sample_length-len(x)))[:max_sample_length]), samples))

# Modelling

In [0]:
from tensorflow.keras.layers import Layer
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer,Conv1D,GRU,Dense

In [0]:
class MeanLayer(Layer):

    """ Identity transform layer that Flatten the result of Conv1D Layer.
    """

    def __init__(self, *args, **kwargs):
        self.is_placeholder = True
        super(MeanLayer, self).__init__(*args, **kwargs)

    def call(self, inputs):
        kl_batch = K.sqrt(K.mean(K.square(inputs), axis=-1))
        self.add_loss(K.mean(K.reshape(kl_batch, (kl_batch.shape[0], -1)), axis=-1) , inputs=kl_batch)
        return kl_batch

In [0]:
Time_steps=100
feature_length=4096

In [0]:
model=Sequential()
model.add(InputLayer(input_shape=(Time_steps, feature_length), batch_size=8))

model.add(GRU(Time_steps,return_sequences=True))
model.add(GRU(Time_steps,return_sequences=True))

model.add(Conv1D(filters=100, kernel_size=32))
model.add(Conv1D(filters=50, kernel_size=32))

model.add(MeanLayer())

model.add(Dense(16))
model.add(Dense(num_of_classes, activation="sigmoid"))

In [0]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_4 (GRU)                  (8, 100, 100)             1259100   
_________________________________________________________________
gru_5 (GRU)                  (8, 100, 100)             60300     
_________________________________________________________________
conv1d_4 (Conv1D)            (8, 69, 100)              320100    
_________________________________________________________________
conv1d_5 (Conv1D)            (8, 38, 50)               160050    
_________________________________________________________________
mean_layer_2 (MeanLayer)     (8, 38)                   0         
_________________________________________________________________
dense_4 (Dense)              (8, 16)                   624       
_________________________________________________________________
dense_5 (Dense)              (8, 4)                    68        
Total para

# Feathure Extraction

In [0]:
min_sample_length = 50
max_sample_length = 100

In [0]:
file_type_dict = {
    'docx': 0, 
    'pdf': 1, 
    'doc': 2, 
    'jpg': 3,
    'gif': 3, 
    'jpeg': 3, 
    'sldprt': 4,
    'slddrt': 4, 
    'slddrw': 4,
    'sldasm': 4,
    'csv': 5, 
    'png': 6, 
    'cbr': 7, 
    'a': 8, 
    'pptx': 9, 
    'ppt': 9
}

In [0]:
def model_data(file_list, type_list, n_samples, min_sample_length, max_sample_length, p_update):
  data, type = [], []
  
  for i in range(len(file_list)):
      data_arr = feature_extractor(root_path + "Files/"+ file_list[i])
      feat = data_sampler(data_arr[0], n_samples, min_sample_length, max_sample_length, p_update)

      data = data + padding(feat[0], max_sample_length)
      type = type + [file_type_dict[type_list[i]]]*n_samples

  data = np.array(data)
  type = np.array(type)

  t = np.array(range(0,80))
  np.random.shuffle(t)
  
  return data[t], type[t], feat[1]

In [0]:
model1=model
model1.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['accuracy'])

# Training

In [0]:
"""
def clone(model):
  model_copy= keras.models.clone_model(model1)
  model_copy.build((None, 10)) # replace 10 with number of variables in input layer
  model_copy.compile(optimizer='rmsprop', loss='categorical_crossentropy')
  model_copy.set_weights(model.get_weights())
"""

In [0]:

cost = 1e12

param = 0
model_copy = model



In [0]:
name = -1

In [0]:
from tensorflow.keras.models import load_model

In [94]:
mdir = list(os.walk(root_path+"Model/"))[0][-1]
if len(mdir) != 0:
    model = load_model("{}Model/{}.h5".format(root_path, name))

0

In [0]:
datax, typex, param = 0, 0, 0
for i in range(0,len(t),10):
        datax, typex, param = model_data(
                        file_list = t[i:min(i+10, len(t))],
                        type_list = u[i:min(i+10, len(t))],
                        n_samples = 20,
                        min_sample_length = min_sample_length,
                        max_sample_length = max_sample_length,
                        p_update = p_update)
        break

In [89]:
for epoch in range(epochs(10)):
    for i in range(0,len(t),10):
        datax, typex, param = model_data(
                        file_list = t[i:min(i+10, len(t))],
                        type_list = u[i:min(i+10, len(t))],
                        n_samples = 20,
                        min_sample_length = min_sample_length,
                        max_sample_length = max_sample_length,
                        p_update = p_update)
          x_data = []

          for i in range(datax.shape[0]):
              x_data.append(np.eye(4096)[datax[i].astype(np.int32)])

          x_data = np.array(x_data)
          y_data = np.eye(num_of_classes)[typex]
          model_copy.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['accuracy'])
          m=model_copy.fit(x_data,y_data)
          d=m.history["loss"]
          cc_loss=(d[0][0]+d[0][1]+d[0][2]+d[0][3])/4

          if cc_loss < cost:
              cost=cc_loss
              model=tf.keras.models.clone_model(
                model_copy,
                input_tensors=None
            )

              model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['accuracy'])
              model.set_weights(model_copy.get_weights())
              p_update = {
                            "theta1": np.random.randint(-10, 11),
                            "theta2": np.random.randint(-50, 51),
                            "phi1": np.random.randint(-5, 6),
                            "phi2": np.random.randint(-5, 6)
                            }


        if i%50==0 and i!=0:
          print(i, "model Saved.....\n\n\n")
          model.save(root_path + '/Model/my_model{}.h5'.format((name + 1)%5)

Index:    0
['2338.doc' '2270.pdf'
 'twin_collective_magnet_holder_6x6x4mm_twin_collective07.SLDPRT'
 'EmniyetValfi_1.SLDASM' 'wavelet.pptx' 'Basic Concepts in Sociology.pptx'
 'GM132M4_1.SLDASM'
 'twin_collective_head_i2c_connector_housing_twin_collective15.SLDPRT'
 'exoplanets.1382366914.csv' '2080.docx']
['doc' 'pdf' 'sldprt' 'sldasm' 'pptx' 'pptx' 'sldasm' 'sldprt' 'csv'
 'docx']


Index:   10
['EmniyetValfi_2c.SLDPRT' '1994.pdf' 'vecfields_UT2 (1).ppt'
 'exoplanets.1345642951.csv' '1865.docx' 'exoplanets.1344525839.csv'
 '2190.doc' '1978.docx'
 'Augmented CycleGAN - Learning Many-to-Many Mappings from Unpaired Data.pdf'
 '1987.docx']
['sldprt' 'pdf' 'ppt' 'csv' 'docx' 'csv' 'doc' 'docx' 'pdf' 'docx']


Index:   20
['You Only Look Once - Unified, Real Time Object Detection.pdf' '2196.doc'
 '35Ford.jpg' '2016.docx' 'images.jpg' 'libadsiisex.a' '2334.pdf'
 'GM225M4_05.SLDPRT' 'solosvx.jpg' 'AlcanWTmap.jpg']
['pdf' 'doc' 'jpg' 'docx' 'jpg' 'a' 'pdf' 'sldprt' 'jpg' 'jpg']


Index:   30

In [0]:
print("Last Saved Model was {}".format(name))

In [0]:
!free -mh

              total        used        free      shared  buff/cache   available
Mem:            12G        1.1G        9.8G        904K        1.8G         11G
Swap:            0B          0B          0B


In [0]:
x_data.shape

In [0]:
model.fit(x_data, y_data,epochs=10)