# Generate C code

## Importing necessary libraries

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import os
import re
import glob
import numpy as np
import random 
import sys
import io
from __future__ import print_function
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import get_file

## 1.Preprocessing data

I am going to use the C code available freely in github from linux(The source code of kernel)
I downloaded the data form github and reuploaded to colab runtime 

### Download,Unzip the file and set the directory

In [15]:
!wget --header 'Host: codeload.github.com' --user-agent 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0' --header 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' --header 'Accept-Language: en-US,en;q=0.5' --referer 'https://github.com/torvalds/linux/tree/master' --header 'Cookie: _octo=GH1.1.1945079404.1596119282; logged_in=no; _ga=GA1.2.1536439124.1596119283; tz=Asia%2FKolkata; _gat=1' --header 'Upgrade-Insecure-Requests: 1' 'https://codeload.github.com/torvalds/linux/zip/master' --output-document 'linux-master.zip'

--2020-07-30 15:24:12--  https://codeload.github.com/torvalds/linux/zip/master
Resolving codeload.github.com (codeload.github.com)... 140.82.113.9
Connecting to codeload.github.com (codeload.github.com)|140.82.113.9|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘linux-master.zip’

linux-master.zip        [  <=>               ] 211.22M  8.03MB/s    in 35s     

2020-07-30 15:24:48 (6.03 MB/s) - ‘linux-master.zip’ saved [221481289]



Unzip the file downloaded 

In [16]:
! unzip linux-master.zip 

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: linux-master/sound/usb/mixer_s1810c.c  
  inflating: linux-master/sound/usb/mixer_s1810c.h  
  inflating: linux-master/sound/usb/mixer_scarlett.c  
  inflating: linux-master/sound/usb/mixer_scarlett.h  
  inflating: linux-master/sound/usb/mixer_scarlett_gen2.c  
  inflating: linux-master/sound/usb/mixer_scarlett_gen2.h  
  inflating: linux-master/sound/usb/mixer_us16x08.c  
  inflating: linux-master/sound/usb/mixer_us16x08.h  
  inflating: linux-master/sound/usb/pcm.c  
  inflating: linux-master/sound/usb/pcm.h  
  inflating: linux-master/sound/usb/power.c  
  inflating: linux-master/sound/usb/power.h  
  inflating: linux-master/sound/usb/proc.c  
  inflating: linux-master/sound/usb/proc.h  
  inflating: linux-master/sound/usb/quirks-table.h  
  inflating: linux-master/sound/usb/quirks.c  
  inflating: linux-master/sound/usb/quirks.h  
  inflating: linux-master/sound/usb/stream.c  
  inflating: linux-master/s

Set the current working Directory

In [4]:
path = "/content/linux-master"

In [5]:
os.chdir(path)            #Setting the directory

See all the different types of files present 

In [6]:
file_names = os.listdir()
print(file_names)

['.cocciconfig', 'LICENSES', 'include', 'tools', 'mm', 'certs', 'samples', 'scripts', '.gitattributes', '.clang-format', '.get_maintainer.ignore', 'init', 'MAINTAINERS', 'fs', 'Makefile', 'lib', 'ipc', 'net', 'CREDITS', '.gitignore', 'Kbuild', 'security', 'usr', 'drivers', 'sound', 'Kconfig', '.mailmap', 'Documentation', 'crypto', 'arch', 'COPYING', 'virt', 'README', 'kernel', 'block']


### Load the C code 

In [7]:
!ls

arch	 CREDITS	fs	 Kbuild   LICENSES     net	security  virt
block	 crypto		include  Kconfig  MAINTAINERS  README	sound
certs	 Documentation	init	 kernel   Makefile     samples	tools
COPYING  drivers	ipc	 lib	  mm	       scripts	usr


Use glob to find all the file with file names ending with .c

In [8]:
#Find only the C files 
c_files = glob.glob("/content/linux-master/**/*.c", recursive = True)

See the number of files we got 

In [9]:
print(len(c_files))

28974


In [10]:
#Lets load all the c code in a list

#initilize an empty list
codes = list ()
c = 0
for file in c_files:
  c += 1
  if c < 17351: #We will have 17350 files I take thi much files due to memory constraints 

    code = open(file, "r", encoding = 'UTF-8')
    codes.append(code.read())
    code.close()
  else:
    break

In [11]:
#See one file
print(codes[7000])

/*
 *  linux/drivers/scsi/esas2r/esas2r_vda.c
 *      esas2r driver VDA firmware interface functions
 *
 *  Copyright (c) 2001-2013 ATTO Technology, Inc.
 *  (mailto:linuxdrivers@attotech.com)
 */
/*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
/*
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; version 2 of the License.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  NO WARRANTY
 *  THE PROGRAM IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR
 *  CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT
 *  LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT,
 *  MERCHANTABILITY OR FI

In [12]:
text = "\n".join(codes)
print("The total number of characters in the entire code : {}".format(len(text)))

The total number of characters in the entire code : 359820930


Due to memory limitation we will not load all the text

In [36]:
top_n = 40000
text = text[:top_n]

In [37]:
len(text)

40000

## Convert characters to integers 

In [28]:
#create characters to index mapping
chars = sorted(list(set(text)))
#set will make use of no repeating character present 
#we then convert that set into list by typecasting it and then sort the list 

char_index = dict((c,i) for i,c in enumerate(chars))
#char index will store the character as key and the integer as value 

index_char = dict((i,c) for i,c in enumerate(chars))
#index char will store the integer as key and character as value 

See how does the index_char dictonary looks like 

In [29]:
index_char

{0: '\t',
 1: '\n',
 2: ' ',
 3: '!',
 4: '"',
 5: '#',
 6: '%',
 7: '&',
 8: "'",
 9: '(',
 10: ')',
 11: '*',
 12: '+',
 13: ',',
 14: '-',
 15: '.',
 16: '/',
 17: '0',
 18: '1',
 19: '2',
 20: '3',
 21: '4',
 22: '5',
 23: '6',
 24: '7',
 25: '8',
 26: '9',
 27: ':',
 28: ';',
 29: '<',
 30: '=',
 31: '>',
 32: '?',
 33: '@',
 34: 'A',
 35: 'B',
 36: 'C',
 37: 'D',
 38: 'E',
 39: 'F',
 40: 'G',
 41: 'H',
 42: 'I',
 43: 'K',
 44: 'L',
 45: 'M',
 46: 'N',
 47: 'O',
 48: 'P',
 49: 'R',
 50: 'S',
 51: 'T',
 52: 'U',
 53: 'V',
 54: 'W',
 55: 'X',
 56: 'Y',
 57: 'Z',
 58: '[',
 59: '\\',
 60: ']',
 61: '^',
 62: '_',
 63: 'a',
 64: 'b',
 65: 'c',
 66: 'd',
 67: 'e',
 68: 'f',
 69: 'g',
 70: 'h',
 71: 'i',
 72: 'j',
 73: 'k',
 74: 'l',
 75: 'm',
 76: 'n',
 77: 'o',
 78: 'p',
 79: 'q',
 80: 'r',
 81: 's',
 82: 't',
 83: 'u',
 84: 'v',
 85: 'w',
 86: 'x',
 87: 'y',
 88: 'z',
 89: '{',
 90: '|',
 91: '}',
 92: '~'}

Vocabulary is the total number of unique characters/words present in the dataset
Here vocabulary is defined as the total number of unique characters present in the dataset 

In [30]:
print("Vocabulary size : {}".format(len(chars)))

Vocabulary size : 93


## Divide the data into input and output

In [38]:
# Define length for each question 

MAX_LEN = 100               #Number of input character Xin each sequence 
STEP = 3                    #Increment between each sequence(window size) 
VOCAB_SIZE = len(chars)     #Total number of unique characters 


sentences = []              #X
next_chars = []             #Y

for i in range(0,len(text) - MAX_LEN, STEP ):
    sentences.append(text[i: i + MAX_LEN])
    next_chars.append(text[i + MAX_LEN])

In [39]:
print("Number of training examples : {}".format(len(sentences)))

Number of training examples : 13300


In [40]:
sentences

['// SPDX-License-Identifier: GPL-2.0-only\n/*\n * multiorder.c: Multi-order radix tree entry testing\n *',
 'SPDX-License-Identifier: GPL-2.0-only\n/*\n * multiorder.c: Multi-order radix tree entry testing\n * Co',
 'X-License-Identifier: GPL-2.0-only\n/*\n * multiorder.c: Multi-order radix tree entry testing\n * Copyr',
 'icense-Identifier: GPL-2.0-only\n/*\n * multiorder.c: Multi-order radix tree entry testing\n * Copyrigh',
 'nse-Identifier: GPL-2.0-only\n/*\n * multiorder.c: Multi-order radix tree entry testing\n * Copyright (',
 '-Identifier: GPL-2.0-only\n/*\n * multiorder.c: Multi-order radix tree entry testing\n * Copyright (c) ',
 'entifier: GPL-2.0-only\n/*\n * multiorder.c: Multi-order radix tree entry testing\n * Copyright (c) 201',
 'ifier: GPL-2.0-only\n/*\n * multiorder.c: Multi-order radix tree entry testing\n * Copyright (c) 2016 I',
 'er: GPL-2.0-only\n/*\n * multiorder.c: Multi-order radix tree entry testing\n * Copyright (c) 2016 Inte',
 ' GPL-2.0-only\n/*\n * mult

## Create the input and output using created sequence 

Samples: Number of datapoints
Timesteps: Length of seq
features: dimentionality of onehot encoding matrix where each character is being represented 

In [41]:
# Create X and Y
X = np.zeros((len(sentences),MAX_LEN,VOCAB_SIZE), dtype = np.bool )
Y = np.zeros((len(sentences),VOCAB_SIZE), dtype = np.bool )

for i, sentence in enumerate (sentences):
  for t, char in enumerate(sentence):
    X[i,t, char_index[char]] = 1
  Y[i,char_index[next_chars[i]]] = 1

In [43]:
#Print the shape the X and Y
print("Shape of X : {}".format(X.shape))
print("Shape of Y : {}".format(Y.shape))

Shape of X : (13300, 100, 93)
Shape of Y : (13300, 93)


## Lstm

In [44]:
#Lets create a lstm model 
model = Sequential()
model.add(LSTM(128, input_shape =(MAX_LEN,VOCAB_SIZE), return_sequences=True,dropout=0.3))
model.add(LSTM(128, dropout = 0.5))
model.add(Dense(VOCAB_SIZE, activation="softmax"))

optimizer = Adam(lr =0.01 )
model.compile(loss = "categorical_crossentropy", optimizer = optimizer, metrics = ['acc'])

In [45]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100, 128)          113664    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 93)                11997     
Total params: 257,245
Trainable params: 257,245
Non-trainable params: 0
_________________________________________________________________


In [46]:
model.fit(X,Y,batch_size=128,epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7efe10130160>

## Generate code 

Created a function that will make the next haracter predictions based on temperatures. If temperature is greater than 1 the generated characters wil be more versatile and diverse. If temperature is less than 1 the  generated characters will be more conservative

In [47]:
def sample(preds, temperature = 1.0):
  preds = np.asarray(preds).astype('float64')
  preds = np.log(preds)/temperature
  exp_preds = np.exp(preds)
  preds = exp_preds/np.sum(exp_preds)
  probas = np.random.multimonial(1,preds,1)
  return np.argmax(probas)


In [49]:
np.random.multinomial(10,[0.05,0.9,0.005], size = 2)

array([[0, 8, 2],
       [1, 9, 0]])

In [None]:
 #generate code
 start_index = random.randint(0, len(text) - MAX_LEN - 1) #picks random code to start

 for diversity in[0.5, 1.0, 1.5]:
   print("-"*50,"diversity:",diversity)
   generated = ''
   sentence = text[start_index: start_index + MAX_LEN]
   generated += sentence
   print("----Generating with seeds:"+ sentence +" ")
   sys.stdout.write(generated)

   for i in range(1000):
     x_perd = np.zeros

