[View in Colaboratory](https://colab.research.google.com/github/khare19yash/Google_colab/blob/master/Untitled2.ipynb)

In [0]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os

!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
# Authenticate the Colab user and create a PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
# Attach the application-default credentials to the PyDrive auth object.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client the following cell uses to list and download files.
drive = GoogleDrive(gauth)

In [3]:
# Choose a local (Colab) directory to store the data.
local_download_path = os.path.expanduser('~/data')
# exist_ok=True tolerates a directory that already exists (e.g. when the cell
# is re-run) while still surfacing real failures such as permission errors,
# which a bare `except: pass` would silently hide.
os.makedirs(local_download_path, exist_ok=True)

# List every file whose parent is the given Drive folder, using the Drive
# query syntax: https://developers.google.com/drive/v2/web/search-parameters
file_list = drive.ListFile(
    {'q': "'1jbeyGqPmjfVBMvgZ4cW4X92ykLutDJFM' in parents"}).GetList()

for f in file_list:
  # Create a file handle by id and download its content into the local folder.
  print('title: %s, id: %s' % (f['title'], f['id']))
  fname = os.path.join(local_download_path, f['title'])
  print('downloading to {}'.format(fname))
  f_ = drive.CreateFile({'id': f['id']})
  f_.GetContentFile(fname)

title: data_test.csv, id: 1tYEfju0YCYWAHguShJckG5ZpU_Ojg0f4
downloading to /root/data/data_test.csv
title: data_train.csv, id: 1odT96GUX7x_Dfvn-sFJdMLewnpPTLp6O
downloading to /root/data/data_train.csv
title: readme, id: 1VcpZb1V7ZnROtAbfXoFSLLsBpRcbansd
downloading to /root/data/readme


In [4]:
# Read the training data into a DataFrame.
# NOTE(review): the path is hard-coded to /root/data, the location the
# download cell reported writing to.
PATH = '/root/data/data_train.csv'

raw_train_data = pd.read_csv(PATH)
# Print the first rows as a quick sanity check of the loaded columns.
print(raw_train_data.head())


   id  num1  num2  num3  num4  num5  num6  num7  num8  num9   ...    cat6  \
0   0     2     5     0     1     0     0     0     0     0   ...     NaN   
1   1     1     7     0     0     1     0     0     0     0   ...     NaN   
2   2     5     9     0     0     1     0     0     0     0   ...     NaN   
3   3     0     2     1     0     0     0     0     0     0   ...     0.0   
4   4     0     0     1     0     0     0     0     0     0   ...     NaN   

   cat7  cat8  cat9  cat10  cat11  cat12  cat13  cat14  target  
0     0   1.0     4    1.0      0    0.0      1     12       0  
1     0   NaN    11    1.0      1    2.0      1     19       0  
2     0   NaN    14    1.0      1    2.0      1     60       0  
3     0   1.0    11    1.0      1    3.0      1    104       0  
4     0   NaN    14    1.0      1    2.0      1     82       0  

[5 rows x 58 columns]


In [5]:
# Read the test data into a DataFrame.
# NOTE: PATH is reassigned here; it previously pointed at the training CSV.
PATH = '/root/data/data_test.csv'

raw_test_data = pd.read_csv(PATH)
print(raw_test_data.shape)
# Keep the test ids as a NumPy array.
test_id = raw_test_data['id'].values

(892816, 57)


In [6]:
# Count the missing (NaN) values in each column of the training data.
total_na = raw_train_data.isna().sum()
print(total_na)

id             0
num1           0
num2           0
num3           0
num4           0
num5           0
num6           0
num7           0
num8           0
num9           0
num10          0
num11          0
num12          0
num13          0
num14          0
num15          0
num16          0
num17          0
num18     107909
num19          5
num20          1
num21          0
num22      42667
num23          0
der1           0
der2           0
der3           0
der4           0
der5           0
der6           0
der7           0
der8           0
der9           0
der10          0
der11          0
der12          0
der13          0
der14          0
der15          0
der16          0
der17          0
der18          0
der19          0
cat1         217
cat2          83
cat3        5814
cat4         107
cat5           5
cat6      411792
cat7           0
cat8      266928
cat9           0
cat10      11503
cat11          0
cat12        570
cat13          0
cat14          0
target         0
dtype: int64


**Data Preprocessing**



1.   Replace missing values in numerical variables with the column mean.
2.   Replace missing values in categorical variables with a new "NA" category.

3.  Scale all numerical values to the range 0 to 1 (min-max normalization).

4.   Label-encode the categorical values, then one-hot encode them.
















In [0]:


def one_hot_encoding(data,column_names):
  """One-hot encode the listed columns of a DataFrame.

  Args:
    data: DataFrame to encode.
    column_names: list of column labels to expand; the same labels are used
      as prefixes for the generated indicator columns.

  Returns:
    The DataFrame produced by ``pd.get_dummies`` with the listed columns
    replaced by their indicator columns.
  """
  return pd.get_dummies(data, prefix=column_names, columns=column_names)


def normalize_values(data,column_names):
  """Min-max scale the given columns of ``data`` to the range 0 to 1.

  The selected columns are overwritten on ``data`` itself and the same
  object is returned.

  Args:
    data: DataFrame holding the columns to scale.
    column_names: list of column labels (or a single label) to scale.

  Returns:
    ``data`` with each selected column rescaled as (x - min) / (max - min).
    A column whose values are all equal has a zero range; it is scaled to 0
    instead of being divided by zero and turned into NaN.
  """
  num_min = data[column_names].min()
  num_max = data[column_names].max()

  norm_val = num_max - num_min
  # A constant column has max == min. Dividing by that zero range would turn
  # the whole column into NaN; dividing by 1 keeps (x - min) == 0 instead.
  if isinstance(norm_val, pd.Series):
    norm_val = norm_val.replace(0, 1)
  elif norm_val == 0:
    norm_val = 1

  data[column_names] = (data[column_names] - num_min) / norm_val

  return data


def preprocess_data(data):
  """Impute, encode and scale a raw feature frame.

  Columns are selected by position, so ``data`` must follow this layout
  (with any target column already removed): column 0 is skipped, columns
  1-23 are numerical, 24-26 derived numerical, 27-42 derived categorical
  and 43 onwards categorical.

  Steps, in order:
    1. Drop 'cat6' and 'cat8'.
    2. Fill missing numerical values with the column mean.
    3. Fill missing categorical values with the string "NA".
    4. Replace every object-typed column by its integer category codes.
    5. One-hot encode the derived-categorical and categorical columns.
    6. Min-max scale the numerical and derived-numerical columns.

  The caller's frame is not modified: the columns are dropped first, which
  yields a new frame, and every later assignment targets that new frame.

  NOTE(review): the means, the min/max values and the set of one-hot columns
  are all derived from the frame passed in, so calling this separately on
  training and test data can produce different column sets.

  Args:
    data: raw DataFrame in the layout described above.

  Returns:
    A new DataFrame with the preprocessing steps applied.

  Raises:
    ValueError: if 'cat6' or 'cat8' is not among the categorical columns.
  """
  column_names = list(data.columns.values)
  num_columns = column_names[1:24]
  der_numerical_columns = column_names[24:27]
  der_categorical_columns = column_names[27:43]
  cat_columns = column_names[43:]

  # These columns are removed entirely rather than imputed.
  drop_columns = ['cat6','cat8']
  for column in drop_columns:
    cat_columns.remove(column)

  # drop() returns a new frame; doing it before any assignment keeps the
  # caller's DataFrame untouched.
  data = data.drop(columns = drop_columns)

  # Handle missing values in numerical variables.
  data[num_columns] = data[num_columns].fillna(data[num_columns].mean())

  # Handle missing values in categorical variables with an explicit category.
  data[cat_columns] = data[cat_columns].fillna("NA")

  # Label encoding: object columns become integer category codes.
  cat_datatype_columns = data.select_dtypes(['object']).columns
  for column in cat_datatype_columns:
    data[column] = data[column].astype('category').cat.codes

  # One-hot encoding.
  data = one_hot_encoding(data,der_categorical_columns)
  data = one_hot_encoding(data,cat_columns)

  # Normalize numerical values.
  data = normalize_values(data,num_columns)
  data = normalize_values(data,der_numerical_columns)

  return data

In [8]:
# Separate the target values from the training data.
target = raw_train_data['target'].values
raw_data = raw_train_data.drop(columns = ['target'])
# NOTE: this prints the shape of the original frame (target column included),
# not the shape of raw_data.
print(raw_train_data.shape)

(596000, 58)


In [9]:
# Preprocess the raw training data (the target column was removed above).
prep_train_data = preprocess_data(raw_data)
print(prep_train_data.shape)

(596000, 363)


In [10]:
# Preprocess the raw test data.
# NOTE(review): the test frame is encoded independently of the training frame,
# so the two results can end up with different columns (compare the shapes
# printed by this cell and the previous one).
prep_test_data = preprocess_data(raw_test_data)
print(prep_test_data.shape)

(892816, 367)


In [18]:
# Compare the encoded column sets: list the columns produced for the test
# data that have no counterpart in the training data.
train_columns = prep_train_data.columns.values
# Take the column labels explicitly, mirroring train_columns, instead of
# relying on iteration over a DataFrame yielding its column labels.
test_columns = prep_test_data.columns.values

s = set(test_columns) - set(train_columns)
print(s)

{'der11_20', 'der8_1', 'der14_28', 'der13_14', 'der12_11', 'der13_15'}


In [0]:
# Define training and validation size

# Rows used for training and for validation; 500000 + 96000 equals the
# 596000 rows reported for the training file.
n_training = 500000
n_validation = 96000
# Number of rows reported for the test file.
n_test = 892816


    
#Divide data into training set and validation set

# removing id column
def get_train_test_data(prep_train_data,target,prep_test_data,n_val=None,n_train=None,seed=None):
  """Shuffle the training data and split it into training and validation sets.

  Args:
    prep_train_data: preprocessed training DataFrame containing an 'id' column.
    target: 1-D array of labels, one per row of ``prep_train_data``.
    prep_test_data: preprocessed test DataFrame containing an 'id' column.
    n_val: number of shuffled rows taken for validation; defaults to the
      module-level ``n_validation``.
    n_train: number of rows taken after the validation rows for training;
      defaults to the module-level ``n_training``.
    seed: optional integer seed for a reproducible shuffle. When None the
      global NumPy random state is used.

  Returns:
    ``(train_set, val_set, test)`` where ``train_set`` and ``val_set`` are
    ``(features, labels)`` tuples of NumPy arrays and ``test`` is the test
    feature array. The 'id' column is excluded from all feature arrays.

  Raises:
    ValueError: if ``target`` does not hold exactly one label per training row.
  """
  if n_val is None:
    n_val = n_validation
  if n_train is None:
    n_train = n_training

  # The id column is dropped so it is not used as a feature.
  features = prep_train_data.drop(columns = ['id']).values
  test = prep_test_data.drop(columns = ['id']).values
  labels = np.asarray(target)

  n_rows, _ = features.shape
  if labels.shape[0] != n_rows:
    raise ValueError(
        'target has {} labels but the training data has {} rows'.format(
            labels.shape[0], n_rows))

  # Shuffle features and labels with the same permutation to keep them aligned.
  rng = np.random if seed is None else np.random.RandomState(seed)
  indices = rng.permutation(n_rows)
  features = features[indices]
  labels = labels[indices]

  # The first n_val shuffled rows form the validation set, the next n_train
  # rows the training set.
  x_val = features[:n_val]
  y_val = labels[:n_val]
  val_set = (x_val,y_val)

  x_train = features[n_val:n_val+n_train]
  y_train = labels[n_val:n_val+n_train]
  train_set = (x_train,y_train)

  print(x_train.shape)
  print(y_train.shape)
  print(x_val.shape)
  print(y_val.shape)
  return train_set,val_set,test
  

class Model():
  """Two-class softmax (logistic) regression on the TensorFlow 1.x graph API.

  Usage: call ``build_model(...)`` once to construct the graph, then
  ``train(n_epochs)`` to train, print the validation accuracy and obtain
  the test-set predictions.
  """
  def __init__(self):
    self.lrate = 0.0001      # learning rate passed to the Adam optimizer
    self.batch_size = 128    # examples per batch for all three datasets
    self.ntrain = 500000     # training-set size (not read by the methods below)
    self.nclasses = 2        # number of classes / logit columns
    self.ntest = 892816      # test-set size (not read by the methods below)

  def get_data(self,prep_train_data,target,prep_test_data):
    """Create the train/validation/test datasets and one shared iterator.

    Sets ``self.X`` (float32 feature batch), ``self.label`` (one-hot labels),
    ``self.m`` (feature count) and the three iterator initializers
    ``self.train_init``, ``self.val_init`` and ``self.test_init``.

    Raises:
      ValueError: if the training and test data have different feature counts.
    """
    with tf.name_scope('data'):
      train,val,test = get_train_test_data(prep_train_data,target,prep_test_data)

      # One weight matrix serves every dataset, so the feature counts must
      # agree; fail early with an actionable message if they do not.
      n_features = train[0].shape[1]
      if test.shape[1] != n_features:
        raise ValueError(
            'training data has {} features but test data has {}; align the '
            'preprocessed columns before building the model'.format(
                n_features, test.shape[1]))

      # The model consumes float32 (see the cast below). Converting up front
      # halves the size of the arrays handed to from_tensor_slices.
      train = (train[0].astype(np.float32), train[1])
      val = (val[0].astype(np.float32), val[1])
      test = test.astype(np.float32)

      train_data = tf.data.Dataset.from_tensor_slices(train)
      train_data = train_data.shuffle(100000)
      train_data = train_data.batch(self.batch_size)

      val_data = tf.data.Dataset.from_tensor_slices(val)
      val_data = val_data.batch(self.batch_size)

      # The shared iterator yields (features, label) pairs. The test set has
      # no labels, so it is paired with placeholder zeros of the label dtype
      # to give it the same structure; they are never used for predictions.
      placeholder_labels = np.zeros(test.shape[0], dtype=train[1].dtype)
      test_data = tf.data.Dataset.from_tensor_slices((test, placeholder_labels))
      test_data = test_data.batch(self.batch_size)

      # Reinitializable iterator: one get_next() op, switched between the
      # datasets by running the matching initializer.
      iterator = tf.data.Iterator.from_structure(train_data.output_types,
                                                train_data.output_shapes)

      X , label = iterator.get_next()
      _,self.m = X.shape
      self.X = tf.cast(X,dtype=tf.float32)
      self.label = tf.one_hot(label,self.nclasses)

      self.train_init = iterator.make_initializer(train_data)
      self.val_init = iterator.make_initializer(val_data)
      self.test_init = iterator.make_initializer(test_data)

  def inference(self):
    """Create the weights and bias and define ``self.logits = X.w + b``."""
    with tf.variable_scope('logreg',reuse=tf.AUTO_REUSE) as scope:
      # w is initialized from a normal distribution with mean 0 and stddev 0.01,
      # b is initialized to zero.
      w = tf.get_variable(name='weights',dtype=tf.float32,shape=[self.m,self.nclasses],
                          initializer = tf.random_normal_initializer(0 , 0.01))
      b = tf.get_variable(name='bias',shape=[self.nclasses],
                         initializer = tf.zeros_initializer())

      # The model returns unnormalized logits; softmax is applied in the loss
      # and in eval_model.
      self.logits = tf.matmul(self.X,w) + b

  def create_loss(self):
    """Define ``self.loss`` as the mean softmax cross-entropy over the batch."""
    with tf.name_scope('loss'):
      entropy = tf.nn.softmax_cross_entropy_with_logits(logits = self.logits , labels = self.label)
      self.loss = tf.reduce_mean(entropy,name='loss')

  def create_optimizer(self):
    """Define ``self.optimizer``, the Adam training op minimizing the loss."""
    with tf.name_scope('optimizer'):
      self.optimizer = tf.train.AdamOptimizer(self.lrate).minimize(self.loss)

  def eval_model(self):
    """Define the prediction ops and the per-batch count of correct predictions.

    ``self.accuracy`` is the NUMBER of correct predictions in a batch (a sum,
    not a ratio); ``train`` divides the total by the number of examples.
    """
    with tf.name_scope('eval'):
      self.preds = tf.nn.softmax(self.logits)
      self.predicted_labels = tf.argmax(self.preds,1)
      correct_preds = tf.equal(self.predicted_labels,tf.argmax(self.label,1))
      self.accuracy = tf.reduce_sum(tf.cast(correct_preds,dtype=tf.float32))

  def build_model(self,prep_train_data,target,prep_test_data=None):
    """Build the full graph: data pipeline, logits, loss, optimizer, eval ops.

    Args:
      prep_train_data: preprocessed training DataFrame.
      target: array of training labels.
      prep_test_data: preprocessed test DataFrame. When omitted, the
        notebook-level variable of the same name is used, which keeps
        existing two-argument calls working.

    Raises:
      NameError: if ``prep_test_data`` is neither passed nor defined globally.
    """
    if prep_test_data is None:
      try:
        prep_test_data = globals()['prep_test_data']
      except KeyError:
        raise NameError(
            "name 'prep_test_data' is not defined; pass it to build_model")
    self.get_data(prep_train_data,target,prep_test_data)
    self.inference()
    self.create_loss()
    self.create_optimizer()
    self.eval_model()

  def train(self,n_epochs):
    """Train the model, print validation accuracy and predict the test set.

    Args:
      n_epochs: number of passes over the training dataset.

    Returns:
      A list with one array of predicted class indices per test batch.
    """
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
      sess.run(init)
      for epoch in range(n_epochs):
        sess.run(self.train_init)
        total_loss = 0.0
        n_batches = 0

        # The dataset signals the end of an epoch with OutOfRangeError.
        try:
          while True:
            _,batch_loss = sess.run([self.optimizer,self.loss])
            total_loss += batch_loss
            n_batches += 1
            if n_batches%100 == 0:
              print('Step {} : Loss {}'.format(n_batches,batch_loss))

        except tf.errors.OutOfRangeError:
          pass
        # max(..., 1) avoids a ZeroDivisionError on an empty training set.
        print('Average loss at epoch {} is {}'.format(epoch,total_loss/max(n_batches,1)))

      # Validation accuracy: accumulate the per-batch correct counts and
      # divide by the number of examples actually evaluated.
      sess.run(self.val_init)
      total_correct = 0
      n_examples = 0
      try:
        while True:
          correct,batch_labels = sess.run([self.accuracy,self.predicted_labels])
          total_correct += correct
          n_examples += len(batch_labels)
      except tf.errors.OutOfRangeError:
        pass
      print('Average Validation accuracy {}'.format(total_correct / max(n_examples,1)))

      # Test-set predictions, collected batch by batch.
      sess.run(self.test_init)
      predictions = []
      try:
        while True:
          test_pred = sess.run(self.predicted_labels)
          predictions.append(test_pred)
      except tf.errors.OutOfRangeError:
        pass

    return predictions
       
      




In [12]:
# Build the model graph: data pipeline, logits, loss, optimizer and eval ops.
# NOTE: build_model also reads the notebook-level prep_test_data variable.
model = Model()
model.build_model(prep_train_data,target)


(500000, 362)
(500000,)
(96000, 362)
(96000,)


ValueError: ignored

In [10]:
# Train the model for 10 epochs; train() also prints the validation accuracy
# and returns the test-set predictions.
predictions = model.train(10)

Step 100 : Loss 0.42489755153656006
Step 200 : Loss 0.30327871441841125
Step 300 : Loss 0.2568695843219757
Step 400 : Loss 0.19157129526138306
Step 500 : Loss 0.15567736327648163
Step 600 : Loss 0.16196492314338684
Step 700 : Loss 0.15422973036766052
Step 800 : Loss 0.149203360080719
Step 900 : Loss 0.10402834415435791
Step 1000 : Loss 0.1669762283563614
Step 1100 : Loss 0.09747090190649033
Step 1200 : Loss 0.14110711216926575
Step 1300 : Loss 0.18918120861053467
Step 1400 : Loss 0.11476323008537292
Step 1500 : Loss 0.11519405245780945
Step 1600 : Loss 0.18558967113494873
Step 1700 : Loss 0.09138542413711548
Step 1800 : Loss 0.1651363968849182
Step 1900 : Loss 0.18966278433799744
Step 2000 : Loss 0.10729599744081497
Step 2100 : Loss 0.23808132112026215
Step 2200 : Loss 0.1375671774148941
Step 2300 : Loss 0.1386905312538147
Step 2400 : Loss 0.18598881363868713
Step 2500 : Loss 0.1924814134836197
Step 2600 : Loss 0.08814670145511627
Step 2700 : Loss 0.18665307760238647
Step 2800 : Loss 0

In [11]:
# Flatten the per-batch outputs in pred_logits into one flat list.
# NOTE(review): `pred_logits` is not assigned in any cell shown above, so this
# cell fails on a fresh kernel. Presumably it was returned by an earlier
# version of model.train(); confirm and define it explicitly.
predictions = [logits for sub_list in pred_logits for logits in sub_list]
print(len(pred_logits))

96000


In [12]:
# Inspect the first entry of pred_logits.
# NOTE(review): depends on `pred_logits`, which no visible cell defines.
pred_logits[0]

array([0.95937926, 0.04062074], dtype=float32)