## Import all the libraries

In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import StandardScaler  
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix

In [40]:
# Reading the roo_data.csv file using pandas library
train_data = pd.read_csv('roo_data.csv')

In [41]:
# Printing top 20 rows of data
train_data.head(20)

Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,Hours working per day,...,Interested Type of Books,Salary Range Expected,In a Realtionship?,Gentle or Tuff behaviour?,Management or Technical,Salary/work,hard/smart worker,worked in teams ever?,Introvert,Suggested Job Role
0,69,63,78,87,94,94,87,84,61,9,...,Prayer books,salary,no,stubborn,Management,salary,hard worker,yes,no,Database Developer
1,78,62,73,60,71,70,73,84,91,12,...,Childrens,salary,yes,gentle,Technical,salary,hard worker,no,yes,Portal Administrator
2,71,86,91,87,61,81,72,72,94,11,...,Travel,Work,no,stubborn,Management,work,hard worker,no,yes,Portal Administrator
3,76,87,60,84,89,73,62,88,69,7,...,Romance,Work,yes,gentle,Management,work,smart worker,yes,yes,Systems Security Administrator
4,92,62,90,67,71,89,73,71,73,4,...,Cookbooks,salary,no,stubborn,Management,work,hard worker,yes,yes,Business Systems Analyst
5,88,86,62,79,93,84,69,71,82,11,...,Self help,salary,no,stubborn,Management,work,smart worker,yes,no,Software Systems Engineer
6,93,77,69,79,90,93,73,63,77,6,...,Drama,Work,yes,gentle,Technical,salary,hard worker,yes,yes,Database Developer
7,84,72,88,62,66,63,78,94,60,12,...,Romance,salary,no,stubborn,Technical,salary,smart worker,no,no,Business Intelligence Analyst
8,73,66,66,81,81,69,61,87,90,10,...,Math,Work,no,gentle,Technical,work,hard worker,yes,yes,Business Systems Analyst
9,62,76,85,91,82,69,63,63,81,10,...,Religion-Spirituality,Work,yes,gentle,Management,salary,hard worker,yes,no,CRM Technical Developer


In [42]:
df = train_data.copy()

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 39 columns):
 #   Column                                    Non-Null Count  Dtype 
---  ------                                    --------------  ----- 
 0   Acedamic percentage in Operating Systems  20000 non-null  int64 
 1   percentage in Algorithms                  20000 non-null  int64 
 2   Percentage in Programming Concepts        20000 non-null  int64 
 3   Percentage in Software Engineering        20000 non-null  int64 
 4   Percentage in Computer Networks           20000 non-null  int64 
 5   Percentage in Electronics Subjects        20000 non-null  int64 
 6   Percentage in Computer Architecture       20000 non-null  int64 
 7   Percentage in Mathematics                 20000 non-null  int64 
 8   Percentage in Communication skills        20000 non-null  int64 
 9   Hours working per day                     20000 non-null  int64 
 10  Logical quotient rating                   2000

## Data Preprocessing

In [5]:
df.columns

Index(['Acedamic percentage in Operating Systems', 'percentage in Algorithms',
       'Percentage in Programming Concepts',
       'Percentage in Software Engineering', 'Percentage in Computer Networks',
       'Percentage in Electronics Subjects',
       'Percentage in Computer Architecture', 'Percentage in Mathematics',
       'Percentage in Communication skills', 'Hours working per day',
       'Logical quotient rating', 'hackathons', 'coding skills rating',
       'public speaking points', 'can work long time before system?',
       'self-learning capability?', 'Extra-courses did', 'certifications',
       'workshops', 'talenttests taken?', 'olympiads',
       'reading and writing skills', 'memory capability score',
       'Interested subjects', 'interested career area ', 'Job/Higher Studies?',
       'Type of company want to settle in?',
       'Taken inputs from seniors or elders', 'interested in games',
       'Interested Type of Books', 'Salary Range Expected',
       'In a Rea

In [6]:
df.rename(columns = {'Acedamic percentage in Operating Systems':'Acedamic percentage in ML', 
                     'percentage in Algorithms':'Percentage in AI',
                     'Percentage in Programming Concepts':'Percentage in GA',
                     'Percentage in Software Engineering':'Percentage in DMG',
                     'Percentage in Computer Networks':'Percentage in NLP',
                     'Percentage in Computer Architecture':'Percentage in DL',
                     'Percentage in Mathematics':'Percentage in LA',
                     'Percentage in Electronics Subjects':'Percentage in NSC',
                     'Percentage in Communication skills':'Percentage in PSOSM'},inplace = True)

In [8]:
#Printing columns name

df.columns

Index(['Acedamic percentage in ML', 'Percentage in AI', 'Percentage in GA',
       'Percentage in DMG', 'Percentage in NLP', 'Percentage in NSC',
       'Percentage in DL', 'Percentage in LA', 'Percentage in PSOSM',
       'Hours working per day', 'Logical quotient rating', 'hackathons',
       'coding skills rating', 'public speaking points',
       'can work long time before system?', 'self-learning capability?',
       'Extra-courses did', 'certifications', 'workshops',
       'talenttests taken?', 'olympiads', 'reading and writing skills',
       'memory capability score', 'Interested subjects',
       'interested career area ', 'Job/Higher Studies?',
       'Type of company want to settle in?',
       'Taken inputs from seniors or elders', 'interested in games',
       'Interested Type of Books', 'Salary Range Expected',
       'In a Realtionship?', 'Gentle or Tuff behaviour?',
       'Management or Technical', 'Salary/work', 'hard/smart worker',
       'worked in teams ever?', 'I

In [9]:
# Converting Percentage values of the subject to Grades on a scale of 10 to 4
# Where 4 is the lowest grade and 10 is the highest grade

subject_columns = ['Acedamic percentage in ML', 'Percentage in AI', 'Percentage in GA', 'Percentage in DMG', 'Percentage in NLP','Percentage in DL',               
'Percentage in LA','Percentage in NSC','Percentage in PSOSM']

for i in subject_columns:
    df.loc[(df[i] < 40), i] = 4
    df.loc[(df[i] >= 40) & (df[i] < 50), i] = 5
    df.loc[(df[i] >= 50) & (df[i] < 60), i] = 6
    df.loc[(df[i] >= 60) & (df[i] < 70), i] = 7
    df.loc[(df[i] >= 70) & (df[i] < 80), i] = 8
    df.loc[(df[i] >= 80) & (df[i] < 90), i] = 9
    df.loc[(df[i] >= 90), i] = 10    
    

In [10]:
# Converting Logical quotient rating column to Low, Medium, High

df.loc[df['Logical quotient rating'] < 4, 'Logical quotient rating'] = 1
df.loc[(df['Logical quotient rating'] >= 4) & (df['Logical quotient rating'] < 8), 'Logical quotient rating'] = 2
df.loc[df['Logical quotient rating'] >= 8, 'Logical quotient rating'] = 3

In [11]:
df['Type of company want to settle in?'].unique()

array(['Web Services', 'SAaS services', 'Sales and Marketing',
       'Testing and Maintainance Services', 'product development', 'BPA',
       'Service Based', 'Product based', 'Cloud Services', 'Finance'],
      dtype=object)

In [47]:
# Converting column values into three types of categories for attribute Type of company want to settle in?

job_categories = {'Product Based':['product development', 'BPA','Product based'],
                 'Service Based':['Web Services','SAaS services','Testing and Maintainance Services','Service Based','Cloud Services'],
                 'Sales Based':['Sales and Marketing','Finance']}

for jobs in job_categories:
    df.loc[df['Type of company want to settle in?'].isin(job_categories[jobs]),'Type of company want to settle in?'] = jobs

In [48]:
df['Type of company want to settle in?'].unique()

array(['Service Based', 'Sales Based', 'Product Based'], dtype=object)

In [14]:
train_data['Suggested Job Role'].unique()

array(['Database Developer', 'Portal Administrator',
       'Systems Security Administrator', 'Business Systems Analyst',
       'Software Systems Engineer', 'Business Intelligence Analyst',
       'CRM Technical Developer', 'Mobile Applications Developer',
       'UX Designer', 'Quality Assurance Associate', 'Web Developer',
       'Information Security Analyst', 'CRM Business Analyst',
       'Technical Support', 'Project Manager',
       'Information Technology Manager', 'Programmer Analyst',
       'Design & UX', 'Solutions Architect', 'Systems Analyst',
       'Network Security Administrator', 'Data Architect',
       'Software Developer', 'E-Commerce Analyst',
       'Technical Services/Help Desk/Tech Support',
       'Information Technology Auditor', 'Database Manager',
       'Applications Developer', 'Database Administrator',
       'Network Engineer', 'Software Engineer', 'Technical Engineer',
       'Network Security Engineer',
       'Software Quality Assurance (QA) / Testi

In [15]:
# Converting Suggested Job Role column into 7 categories

job_role = {'Software Developer':['Database Developer', 'CRM Technical Developer', 'Mobile Applications Developer', 'Web Developer', 'Software Developer', 'Applications Developer'],
            'Engineer':['Software Engineer', 'Software Systems Engineer', 'Technical Engineer', 'Network Security Engineer', 'Network Engineer',  'Quality Assurance Associate'],
            'Designer':['Solutions Architect','Data Architect', 'Design & UX', 'UX Designer'],
            'Manager':['Information Technology Auditor', 'Information Technology Manager', 'Project Manager', 'Database Manager'],
            'Analyst':['Business Systems Analyst', 'CRM Business Analyst', 'Information Security Analyst',  'Programmer Analyst', 'E-Commerce Analyst', 'Business Intelligence Analyst', 'Systems Analyst'],
           'Administrator':[ 'Systems Security Administrator', 'Portal Administrator', 'Database Administrator',  'Network Security Administrator'],
           'Site Reliability and Testing':['Software Quality Assurance (QA) / Testing', 'Technical Support', 'Technical Services/Help Desk/Tech Support'] }

for jobs in job_role:
    df.loc[df['Suggested Job Role'].isin(job_role[jobs]), 'Suggested Job Role'] = jobs
    

In [16]:
df['Suggested Job Role'].unique()

array(['Software Developer', 'Administrator', 'Analyst', 'Engineer',
       'Designer/Architect', 'Technical Support/ Testing', 'Manager'],
      dtype=object)

In [17]:
df.head()

Unnamed: 0,Acedamic percentage in ML,Percentage in AI,Percentage in GA,Percentage in DMG,Percentage in NLP,Percentage in NSC,Percentage in DL,Percentage in LA,Percentage in PSOSM,Hours working per day,...,Interested Type of Books,Salary Range Expected,In a Realtionship?,Gentle or Tuff behaviour?,Management or Technical,Salary/work,hard/smart worker,worked in teams ever?,Introvert,Suggested Job Role
0,7,7,8,9,10,10,9,9,7,9,...,Prayer books,salary,no,stubborn,Management,salary,hard worker,yes,no,Software Developer
1,8,7,8,7,8,8,8,9,10,12,...,Childrens,salary,yes,gentle,Technical,salary,hard worker,no,yes,Administrator
2,8,9,10,9,7,9,8,8,10,11,...,Travel,Work,no,stubborn,Management,work,hard worker,no,yes,Administrator
3,8,9,7,9,9,8,7,9,7,7,...,Romance,Work,yes,gentle,Management,work,smart worker,yes,yes,Administrator
4,10,7,10,7,8,9,8,8,8,4,...,Cookbooks,salary,no,stubborn,Management,work,hard worker,yes,yes,Analyst


## Label Encoding of Categorical Column

In [18]:
categorical_cols = train_data.select_dtypes(include=['object']).columns.tolist()
categorical_cols

['can work long time before system?',
 'self-learning capability?',
 'Extra-courses did',
 'certifications',
 'workshops',
 'talenttests taken?',
 'olympiads',
 'reading and writing skills',
 'memory capability score',
 'Interested subjects',
 'interested career area ',
 'Job/Higher Studies?',
 'Type of company want to settle in?',
 'Taken inputs from seniors or elders',
 'interested in games',
 'Interested Type of Books',
 'Salary Range Expected',
 'In a Realtionship?',
 'Gentle or Tuff behaviour?',
 'Management or Technical',
 'Salary/work',
 'hard/smart worker',
 'worked in teams ever?',
 'Introvert',
 'Suggested Job Role']

In [19]:
# Converting All Categorical columns to numerical for model training using Label Encoder from SK Learn
for cols in categorical_cols:
    labelencod_x = LabelEncoder()
    df[cols] = labelencod_x.fit_transform(df[cols])

In [20]:
X_feature = df.iloc[:,:-1].values  
y_labels = df.iloc[:,-1].values

In [21]:
X_feature.shape

(20000, 38)

In [22]:
y_labels.shape

(20000,)

In [23]:
from keras.utils import np_utils

encoded_y = np_utils.to_categorical(y_labels)

In [24]:
encoded_y.shape

(20000, 7)

In [25]:
# Different size of train test splits

split_sizes = [0.1, 0.3, 0.4]

x_train_list=[]
x_test_list=[]
y_train_list=[]
y_test_list=[]

for i in range(len(split_sizes)):
    x_train, x_test, y_train, y_test = train_test_split(X_feature, encoded_y, test_size = split_sizes[i], random_state=42)  

    x_train_list.append(x_train)
    x_test_list.append(x_test)
    y_train_list.append(y_train)
    y_test_list.append(y_test)

In [26]:
len(x_train)

12000

In [27]:
x_train_list[i].shape

(12000, 38)

In [28]:
# Standardizing the data so that mean value is 0 and standard variation is 1
for i in range(len(split_sizes)):
    sc = StandardScaler()  
    x_train_list[i]= sc.fit_transform(x_train_list[i])  
    x_test_list[i]= sc.transform(x_test_list[i]) 

## ANN Model

In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [30]:
#Building ANN by defining the keras model

model = Sequential()
model.add(Dense(12, input_shape = (38,), activation = 'relu'))
model.add(Dense(8, activation = 'relu'))
model.add(Dense(7, activation = 'softmax'))


In [31]:
# compiling the keras model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [32]:
#printing the model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 12)                468       
                                                                 
 dense_1 (Dense)             (None, 8)                 104       
                                                                 
 dense_2 (Dense)             (None, 7)                 63        
                                                                 
Total params: 635
Trainable params: 635
Non-trainable params: 0
_________________________________________________________________


In [34]:
#Applying ANN Model with 2 hidden layer of 5 and 4 nodes respectively 
y_preds = []
for i in range(len(split_sizes)):
    model.fit(x_train_list[i], y_train_list[i], epochs = 10)
    y_preds.append(model.predict(x_test_list[i]))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [33]:
y_preds[0]

array([[0.18996835, 0.1830845 , 0.11127157, ..., 0.1889208 , 0.09708864,
        0.06635383],
       [0.17546675, 0.18558313, 0.11211247, ..., 0.18535295, 0.10536599,
        0.07229397],
       [0.14867167, 0.17146705, 0.1193378 , ..., 0.16035706, 0.12552023,
        0.09243604],
       ...,
       [0.12732354, 0.20736559, 0.10567519, ..., 0.18712635, 0.11438012,
        0.08965   ],
       [0.16477315, 0.19326638, 0.11082959, ..., 0.18795216, 0.11006291,
        0.0747167 ],
       [0.1415082 , 0.19371936, 0.10688878, ..., 0.17683409, 0.12399197,
        0.08720057]], dtype=float32)

In [None]:
from sklearn.metrics import classification_report


In [38]:
# Calculating and showing the confusion matrix for each train-test split

In [46]:
# Pritning results
for i in range(len(split_sizes)):
    rounded_labels = np.argmax(y_test_list[i], axis=1)
    rounded_pred = np.argmax(y_preds[i], axis=1)    
    cm = confusion_matrix(rounded_labels, rounded_pred)
    print(classification_report(rounded_labels, rounded_pred))
    print("Confusion Matrix :")
    print(cm)

    # Printing class wise accuracies

    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    print(cm.diagonal())


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       288
           1       0.19      0.59      0.28       382
           2       0.00      0.00      0.00       229
           3       0.17      0.34      0.23       335
           4       0.00      0.00      0.00       235
           5       0.15      0.05      0.07       345
           6       0.00      0.00      0.00       186

    accuracy                           0.18      2000
   macro avg       0.07      0.14      0.08      2000
weighted avg       0.09      0.18      0.11      2000

Confusion Matrix :
[[  0 173   0 101   2  12   0]
 [  0 226   0 126   4  26   0]
 [  0 135   0  80   3  11   0]
 [  0 199   0 114   2  20   0]
 [  0 146   0  74   0  15   0]
 [  0 219   0 106   3  17   0]
 [  0 115   0  56   5  10   0]]
[0.         0.59162304 0.         0.34029851 0.         0.04927536
 0.        ]
              precision    recall  f1-score   support

           0       0.00      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
