<a href="https://colab.research.google.com/github/johnanisere/promotion_prediction/blob/master/Promotion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,StandardScaler
from keras.utils import normalize
import matplotlib.pyplot as plt
from keras.layers import Dense
from keras.models import Sequential
from keras.layers import Dropout
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Locations of the labelled training CSV and the unlabelled test CSV
csv_file_url = "https://firebasestorage.googleapis.com/v0/b/titor-staging.appspot.com/o/train.csv?alt=media&token=60305f3e-2273-4e5e-b5f5-61242abfa0a5"
test_csv_file_url = "https://firebasestorage.googleapis.com/v0/b/titor-staging.appspot.com/o/test.csv?alt=media&token=203cae0f-82d6-4fb7-9e5c-a5a9c23293fe"

# Download both files (training file first) and keep what get_file returns;
# later cells hand these values straight to pd.read_csv.
csv_file, test_csv_file = [
    tf.keras.utils.get_file(local_name, remote_url)
    for local_name, remote_url in (
        ('data.csv', csv_file_url),
        ('test_data.csv', test_csv_file_url),
    )
]

Using TensorFlow backend.


Downloading data from https://firebasestorage.googleapis.com/v0/b/titor-staging.appspot.com/o/train.csv?alt=media&token=60305f3e-2273-4e5e-b5f5-61242abfa0a5
Downloading data from https://firebasestorage.googleapis.com/v0/b/titor-staging.appspot.com/o/test.csv?alt=media&token=203cae0f-82d6-4fb7-9e5c-a5a9c23293fe


In [0]:
# Load both CSV files and discard the columns that are not fed to the model
IRRELEVANT_COLUMNS = [
    'Gender',
    'Channel_of_Recruitment',
    'Year_of_birth',
    'State_Of_Origin',
    'Marital_Status',
    'EmployeeNo',
]

dataset = pd.read_csv(csv_file).drop(columns=IRRELEVANT_COLUMNS)
X = dataset.iloc[:, :12].values   # first 12 remaining columns: the features
y = dataset.iloc[:, -1].values    # last remaining column: the value to predict

# The test file goes through the same column drop; every remaining column is a feature
test_dataset = pd.read_csv(test_csv_file).drop(columns=IRRELEVANT_COLUMNS)
test_X = test_dataset.values

In [0]:
# Taking care of missing data
#
# Only column 1 is imputed: its missing entries are replaced by the most
# frequent value found in that column of the training features.
# The imputer is fitted on the training features alone and the SAME fitted
# imputer is applied to the test features, so both sets are filled with an
# identical value. (Fitting a second imputer on the test set, as was done
# before, could fill the two sets with different values.)

imputer = SimpleImputer(strategy='most_frequent')
imputer = imputer.fit(X[:, 1:2])
X[:, 1:2] = imputer.transform(X[:, 1:2])

# `test_imputer` is kept as an alias in case another cell refers to that name
test_imputer = imputer
test_X[:, 1:2] = test_imputer.transform(test_X[:, 1:2])


In [0]:
# Encoding categorical data
#
# Step 1: the columns listed below are replaced by integer codes.
# Step 2: columns 0 and 1 are expanded into one-hot columns.
#
# Every encoder is fitted ONCE and then applied to both the training and the
# test features. Previously the test features got their own freshly fitted
# encoders, so a category could receive a different integer code (or the
# one-hot block a different width/column order) than during training, making
# the test matrix incompatible with what the model was trained on.
#
# NOTE: this cell rewrites X and test_X; re-run the loading and imputing cells
# before executing it a second time.

LABEL_ENCODED_COLUMNS = [0, 1, 8, 9, 10, -1]

for col in LABEL_ENCODED_COLUMNS:
    # The code table is built from the values seen in either set, so the test
    # set can never contain a label the encoder does not know about.
    label_encoder = LabelEncoder().fit(np.concatenate([X[:, col], test_X[:, col]]))
    X[:, col] = label_encoder.transform(X[:, col])
    test_X[:, col] = label_encoder.transform(test_X[:, col])

# One-hot layout is learned from the training features only; with
# handle_unknown='ignore' a code that appears only in the test set becomes an
# all-zero row instead of raising.
onehotencoder = OneHotEncoder(handle_unknown='ignore')

# NOTE(review): the slices [:, :2] and [:, 3:] skip column 2, so that feature
# never reaches the model. It is left as-is here because using [:, 2:] adds one
# input column and the network's input size has to change together with it.
transformed = onehotencoder.fit_transform(X[:, :2]).toarray()
X = np.concatenate([transformed, X[:, 3:]], axis=1)

transformed = onehotencoder.transform(test_X[:, :2]).toarray()
test_X = np.concatenate([transformed, test_X[:, 3:]], axis=1)



In [0]:
# Hold out 20% of the labelled rows; the remaining 80% are used for fitting.
# random_state is fixed so the same rows land in each part on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [0]:
# Feature Scaling
# The scaler is fitted on the training split only; the held-out split and the
# submission features are then transformed with those same fitted statistics.

sc = StandardScaler().fit(X_train)
X_train, X_test, X_pred = [
    sc.transform(features) for features in (X_train, X_test, test_X)
]


In [0]:


BATCH_SIZE=2
EPOCHS=100
# Fraction of units dropped after each hidden layer while training.
# Must lie in [0, 1): the previous value of 1 is outside that range and,
# depending on the Keras version, is silently ignored, rejected, or removes
# every activation.
DROPOUT_RATE=0.1

# Initialising the ANN
classifier=Sequential()

# Adding the first hidden layer (its input width is taken from the training
# matrix instead of being hard-coded) followed by dropout
classifier.add(Dense(activation='relu',units=11,kernel_initializer='uniform',input_dim=X_train.shape[1]))
classifier.add(Dropout(rate=DROPOUT_RATE))

# Adding the second hidden layer with dropout
classifier.add(Dense(activation='relu',units=11,kernel_initializer='uniform'))
classifier.add(Dropout(rate=DROPOUT_RATE))


# Adding the output layer: a single sigmoid unit
classifier.add(Dense(activation='sigmoid',units=1,kernel_initializer='uniform'))

# Compiling the ANN
classifier.compile(optimizer='adam', loss='binary_crossentropy',metrics=['accuracy'])

# Train on the training split; the held-out split is only used to report
# validation loss/accuracy after every epoch.
history = classifier.fit(X_train,y_train,validation_data=(X_test,y_test),batch_size=BATCH_SIZE,epochs=EPOCHS)




Train on 30649 samples, validate on 7663 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100

In [0]:
# Plot the per-epoch training/validation curves recorded by classifier.fit.

# The accuracy metric is stored under 'acc' by some Keras versions and under
# 'accuracy' by others; use whichever key this run produced.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
acc = history.history[acc_key]
val_acc = history.history['val_' + acc_key]

loss = history.history['loss']
val_loss = history.history['val_loss']

# Use the number of epochs actually recorded so the x-axis always matches the
# length of the curves, even if training stopped before EPOCHS.
epochs_range = range(len(loss))

plt.figure(figsize=(24, 12))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')

# Save before show(): the figure is written to disk while it is still current
plt.savefig('./foo.png')
plt.show()

In [0]:
# Predict on the submission features, turn the outputs into 0/1 labels and
# write the submission file.

# Model outputs at or above this percentage are labelled 1, otherwise 0
PROMOTION_THRESHOLD_PERCENT = 49.5

X_pred = sc.transform(test_X)
y_pred = classifier.predict(X_pred)

# Vectorised form of the former element-by-element loop: each output is
# widened to double precision, scaled to a percentage and compared with the
# threshold, exactly as float(pred)*100 >= 49.5 did.
prediction = (y_pred.astype(float) * 100 >= PROMOTION_THRESHOLD_PERCENT).astype(int)

# As before, y_pred ends up holding the hard 0/1 labels (in its original dtype)
y_pred = prediction.astype(y_pred.dtype)

# Select the identifier column by name rather than by position, so the result
# does not depend on the column order of the CSV.
employee_no = pd.read_csv(test_csv_file)[['EmployeeNo']].values
result = np.concatenate([employee_no, prediction], axis=1)

# Keep the DataFrame itself in `submission` (to_csv returns None when given a path)
submission = pd.DataFrame(result, columns=["EmployeeNo", "Promoted_or_Not"])
submission.to_csv("final_submission.csv", index=False)

# google.colab only exists inside Colab; elsewhere the CSV has already been
# written, so report that instead of failing on the import.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; final_submission.csv was written to the working directory.")
else:
    files.download("final_submission.csv")

