In [12]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix,accuracy_score
pd.set_option('future.no_silent_downcasting', True)

In [13]:
data = pd.read_csv('datasets/prediction-data.csv')

In [14]:
## Shows the first 5 rows of the dataframe
data.head()

Unnamed: 0,Logical quotient rating,hackathons,coding skills rating,public speaking points,self-learning capability?,Extra-courses did,certifications,workshops,reading and writing skills,memory capability score,Interested subjects,interested career area,Type of company want to settle in?,Taken inputs from seniors or elders,Interested Type of Books,Management or Technical,hard/smart worker,worked in teams ever?,Introvert,Suggested Job Role
0,5,0,6,2,yes,no,information security,testing,poor,poor,programming,testing,BPA,no,Series,Management,smart worker,yes,no,Applications Developer
1,7,6,4,3,no,yes,shell programming,testing,excellent,medium,Management,system developer,Cloud Services,yes,Autobiographies,Technical,hard worker,no,yes,Applications Developer
2,2,3,9,1,no,yes,information security,testing,excellent,poor,data engineering,Business process analyst,product development,yes,Travel,Technical,smart worker,no,no,Applications Developer
3,2,6,3,5,no,yes,r programming,database security,excellent,poor,networks,testing,Testing and Maintainance Services,yes,Guide,Management,smart worker,yes,yes,Applications Developer
4,2,0,3,4,yes,no,distro making,game development,excellent,medium,Software Engineering,system developer,BPA,no,Health,Technical,hard worker,yes,no,Applications Developer


In [6]:
## Number of rows and columns
data.shape

(6901, 20)

In [7]:
## Getting some informations about the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6901 entries, 0 to 6900
Data columns (total 20 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   Logical quotient rating              6901 non-null   int64 
 1   hackathons                           6901 non-null   int64 
 2   coding skills rating                 6901 non-null   int64 
 3   public speaking points               6901 non-null   int64 
 4   self-learning capability?            6901 non-null   object
 5   Extra-courses did                    6901 non-null   object
 6   certifications                       6901 non-null   object
 7   workshops                            6901 non-null   object
 8   reading and writing skills           6901 non-null   object
 9   memory capability score              6901 non-null   object
 10  Interested subjects                  6901 non-null   object
 11  interested career area               6901 n

In [8]:
## Checking for missing values
data.isnull().sum(axis=0)

Logical quotient rating                0
hackathons                             0
coding skills rating                   0
public speaking points                 0
self-learning capability?              0
Extra-courses did                      0
certifications                         0
workshops                              0
reading and writing skills             0
memory capability score                0
Interested subjects                    0
interested career area                 0
Type of company want to settle in?     0
Taken inputs from seniors or elders    0
Interested Type of Books               0
Management or Technical                0
hard/smart worker                      0
worked in teams ever?                  0
Introvert                              0
Suggested Job Role                     0
dtype: int64

No missing values in the dataset

In [9]:
## statistical Measures of the dataset
data.describe()

Unnamed: 0,Logical quotient rating,hackathons,coding skills rating,public speaking points
count,6901.0,6901.0,6901.0,6901.0
mean,4.991016,2.99971,5.010723,4.988263
std,2.577704,2.010191,2.568347,2.5995
min,1.0,0.0,1.0,1.0
25%,3.0,1.0,3.0,3.0
50%,5.0,3.0,5.0,5.0
75%,7.0,5.0,7.0,7.0
max,9.0,6.0,9.0,9.0


In [10]:
print("List of Numerical features: \n" , data.select_dtypes(include=np.number).columns.tolist())
print("\n\nList of Categorical features: \n" , data.select_dtypes(include=['object']).columns.tolist())

List of Numerical features: 
 ['Logical quotient rating', 'hackathons', 'coding skills rating', 'public speaking points']


List of Categorical features: 
 ['self-learning capability?', 'Extra-courses did', 'certifications', 'workshops', 'reading and writing skills', 'memory capability score', 'Interested subjects', 'interested career area ', 'Type of company want to settle in?', 'Taken inputs from seniors or elders', 'Interested Type of Books', 'Management or Technical', 'hard/smart worker', 'worked in teams ever?', 'Introvert', 'Suggested Job Role']


### Visualization for Categorical Variables

In [10]:
print(data["Logical quotient rating"].value_counts())

Logical quotient rating
6    799
9    784
2    782
5    773
3    772
4    759
1    756
7    752
8    724
Name: count, dtype: int64


In [11]:
print(data["hackathons"].value_counts())

hackathons
5    1033
2    1026
0    1010
6     989
3     966
1     952
4     925
Name: count, dtype: int64


In [12]:
print(data["coding skills rating"].value_counts())

coding skills rating
4    787
5    777
2    776
6    774
8    767
7    766
9    761
3    755
1    738
Name: count, dtype: int64


In [13]:
print(data["public speaking points"].value_counts())

public speaking points
7    807
1    799
8    777
2    770
3    766
4    760
9    758
6    740
5    724
Name: count, dtype: int64


In [14]:
print(data["interested career area "].value_counts())

interested career area 
system developer            1178
security                    1177
Business process analyst    1154
developer                   1145
testing                     1128
cloud computing             1119
Name: count, dtype: int64


In [15]:
print(data['Suggested Job Role'].value_counts())

Suggested Job Role
Network Security Engineer                    630
Software Engineer                            590
UX Designer                                  589
Software Developer                           587
Database Developer                           581
Software Quality Assurance (QA) / Testing    571
Web Developer                                570
CRM Technical Developer                      567
Technical Support                            565
Systems Security Administrator               562
Applications Developer                       551
Mobile Applications Developer                538
Name: count, dtype: int64


In [16]:
print(data.columns)

Index(['Logical quotient rating', 'hackathons', 'coding skills rating',
       'public speaking points', 'self-learning capability?',
       'Extra-courses did', 'certifications', 'workshops',
       'reading and writing skills', 'memory capability score',
       'Interested subjects', 'interested career area ',
       'Type of company want to settle in?',
       'Taken inputs from seniors or elders', 'Interested Type of Books',
       'Management or Technical', 'hard/smart worker', 'worked in teams ever?',
       'Introvert', 'Suggested Job Role'],
      dtype='object')


# **Feature Engineering**

### Encoding the categorical features

In [15]:
## encoding "self-learning capability?" column.
data.replace({'self-learning capability?':{'yes':1,'no':0}},inplace=True)



## encoding "Extra-courses did" column.
data.replace({'Extra-courses did':{'yes':1,'no':0}},inplace=True)



## encoding "certifications" column.
data.replace({'certifications':{'r programming' : 0,
'information security': 1,
'shell programming' : 2,
'machine learning' : 3,
'full stack': 4,
'hadoop' : 5,
'python' : 6,
'distro making' : 7,
'app development' : 8
}},inplace=True)



## encoding "workshops" column.
data.replace({'workshops':{'database security' :0,
'system designing' : 1,
'web technologies' : 2,
'hacking' : 3,
'testing' : 4,
'data science' : 5,
'game development' : 6,
'cloud computing' : 7
}},inplace=True)



## encoding "reading and writing skills" column.
data.replace({'reading and writing skills':{'poor':0,'medium':1, 'excellent':2}},inplace=True)



## encoding "memory capability score" column.
data.replace({'memory capability score':{'poor':0,'medium':1, 'excellent':2}},inplace=True)



## encoding "Interested subjects" column.
data.replace({'Interested subjects':{'Software Engineering': 0, 'IOT': 1, 'cloud computing': 2, 'programming': 3, 'networks': 4,
    'Computer Architecture': 5, 'data engineering': 6, 'hacking': 7, 'Management': 8, 'parallel computing': 9}},inplace=True)



## encoding "interested career area" column.
data.replace({'interested career area ':{'system developer': 0, 'security': 1, 'Business process analyst': 2, 'developer': 3, 'testing': 4,
    'cloud computing': 5}},inplace=True)



## encoding "Type of company want to settle in?" column.
data.replace({'Type of company want to settle in?':{'Service Based': 0, 'Web Services': 1, 'BPA': 2, 'Testing and Maintainance Services': 3,
    'Product based': 4, 'Finance': 5, 'Cloud Services': 6, 'product development': 7,
    'Sales and Marketing': 8, 'SAaS services': 9}},inplace=True)



## encoding "Taken inputs from seniors or elders" column.
data.replace({'Taken inputs from seniors or elders':{'yes': 1, 'no': 0}},inplace=True)



## encoding "Interested Type of Books" column.
data.replace({'Interested Type of Books':{'Guide': 0, 'Health': 1, 'Self help': 2, 'Horror': 3, 'Biographies': 4, 'Science fiction': 5,
    'Satire': 6, 'Childrens': 7, 'Autobiographies': 8, 'Prayer books': 9, 'Fantasy': 10, 'Journals': 11,
    'Trilogy': 12, 'Anthology': 13, 'Encyclopedias': 14, 'Drama': 15, 'Mystery': 16, 'History': 17,
    'Science': 18, 'Dictionaries': 19, 'Diaries': 20, 'Religion-Spirituality': 21, 'Action and Adventure': 22,
    'Poetry': 23, 'Cookbooks': 24, 'Comics': 25, 'Art': 26, 'Travel': 27, 'Series': 28, 'Math': 29, 'Romance': 30}},inplace=True)



## encoding "Management or Technical" column.
data.replace({'Management or Technical':{'Management': 0, 'Technical': 1}},inplace=True)



## encoding "hard/smart worker" column.
data.replace({'hard/smart worker':{'hard worker': 0, 'smart worker': 1}},inplace=True)



## encoding "worked in teams ever?" column.
data.replace({'worked in teams ever?':{'yes': 1, 'no': 0}},inplace=True)



## encoding "Introvert" column.
data.replace({'Introvert':{'yes': 1, 'no': 0}},inplace=True)


In [16]:
X = data.drop(columns='Suggested Job Role', axis=1)
X = pd.DataFrame(X)
Y = data['Suggested Job Role']

### Head Values

In [17]:
data.head()

Unnamed: 0,Logical quotient rating,hackathons,coding skills rating,public speaking points,self-learning capability?,Extra-courses did,certifications,workshops,reading and writing skills,memory capability score,Interested subjects,interested career area,Type of company want to settle in?,Taken inputs from seniors or elders,Interested Type of Books,Management or Technical,hard/smart worker,worked in teams ever?,Introvert,Suggested Job Role
0,5,0,6,2,1,0,1,4,0,0,3,4,2,0,28,0,1,1,0,Applications Developer
1,7,6,4,3,0,1,2,4,2,1,8,0,6,1,8,1,0,0,1,Applications Developer
2,2,3,9,1,0,1,1,4,2,0,6,2,7,1,27,1,1,0,0,Applications Developer
3,2,6,3,5,0,1,0,0,2,0,4,4,3,1,0,0,1,1,1,Applications Developer
4,2,0,3,4,1,0,7,6,2,1,0,0,2,0,1,1,0,1,0,Applications Developer


### Model Training


In [18]:
import sklearn
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [27]:
print(X.shape, X_train.shape, X_test.shape)
X_test = pd.DataFrame(X_test, columns=X.columns) 



(6901, 19) (5520, 19) (1381, 19)


In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

### Decision Tree

In [29]:
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier(random_state=1)
dtree.fit(X_train,Y_train)
Y_pred = dtree.predict(X_test)

print("Precision of Decision Tree Classifier: ",precision_score(Y_test, Y_pred, average='weighted'))
print("  ")

print("F1-Score of Decision Tree Classifier: ",f1_score(Y_test, Y_pred, average='weighted'))
print("  ")

accuracy = accuracy_score(Y_test,Y_pred)
print("Accuracy Score of Decision Tree Classifier: ",accuracy*10)
print("  ")

cm = confusion_matrix(Y_test,Y_pred)
print("confusion matrics = ")
print(cm)



Precision of Decision Tree Classifier:  0.08599760123093472
  
F1-Score of Decision Tree Classifier:  0.08577204602608891
  
Accuracy Score of Decision Tree Classifier:  0.8616944243301955
  
confusion matrics = 
[[10  8  7 16 10 10  7  9  6 11  5 10]
 [ 8  9  5 11 10 14 11  8  7  7 10  5]
 [ 9  9 12 12 10 23 10 11 13  7 14  9]
 [ 9  6  7  9 10  6 13  6 14 10  8  7]
 [ 8 11 10  6 14  9  8 12 11 13 15  8]
 [ 9 11 12 11 17 10 12  8  4  9 10 12]
 [ 9 10 12 10  7  7 16  4 10  7 13  8]
 [ 7 11 11 11  9  9  8  5 11 10  8 11]
 [ 9  6  9  4 10 12  9 10 11 11  6 10]
 [ 6 10 13  7 10  6 10  4  8  8  6 10]
 [ 6 19  7  7  9  9 17 14  9  9  7 11]
 [10 15  7 14 11  7  8 12 12  8  8  8]]


In [32]:
userdata = [['5','0','6','2','1','0','1','4','0','0','3','4','2','0','28','0','1','1','0']]
userdata_df = pd.DataFrame(userdata, columns=X.columns)
ynewclass = dtree.predict(userdata_df)
ynew = dtree.predict_proba(userdata_df)
print("Prediction :",ynewclass)
print("Probabilities of all classes: ", ynew)
print("Probability of Predicted class : ", np.max(ynew))

Prediction : ['Applications Developer']
Probabilities of all classes:  [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Probability of Predicted class :  1.0


### SVM

In [33]:
from sklearn import svm

svm = svm.SVC()
svm.fit(X_train, Y_train)

svm_y_pred = svm.predict(X_test)

print("Precision of SVM Classifier: ",precision_score(Y_test, svm_y_pred, average='weighted'))
print("  ")

print("F1-Score of SVM Classifier: ",f1_score(Y_test, svm_y_pred, average='weighted'))
print("  ")

svm_accuracy = accuracy_score(Y_test,svm_y_pred)
print("Accuracy Score of SVM Classifier: ",svm_accuracy*10)
print("  ")


svm_cm = confusion_matrix(Y_test,svm_y_pred)

print("confusion matrics : ")
print(svm_cm)



Precision of SVM Classifier:  0.0827693948055859
  
F1-Score of SVM Classifier:  0.07217665501883468
  
Accuracy Score of SVM Classifier:  0.8761766835626359
  
confusion matrics : 
[[ 4  3  5  0 32 16  8 18  5 10  6  2]
 [ 7  7  6  0 29 15  9 16  1 11  3  1]
 [ 5  3  3  0 44 14 16 14 16 14 10  0]
 [ 6  4  1  0 25 20 12 18  5  7  5  2]
 [ 7  9  2  0 42 15 10  9 11 11  7  2]
 [ 5  4  4  0 45 11 10 17 11  7 10  1]
 [ 5  7  0  0 31 17 10 16  9  9  9  0]
 [ 3  3  4  0 36 16  9 14  7 11  8  0]
 [ 3  5  5  1 29 17  8 13  9  8  7  2]
 [ 2  6  1  0 25 15 14 14  4 11  6  0]
 [ 3  7  2  1 36 14 11 17  8 14  8  3]
 [11  5  6  0 34 12 12 19  7  8  4  2]]


In [34]:
userdata = [['5','0','6','2','1','0','1','4','0','0','3','4','2','0','28','0','1','1','0']]
userdata_df = pd.DataFrame(userdata, columns=X.columns)
ynewclass = svm.predict(userdata_df)
ynew = svm.decision_function(userdata_df)
print("Prediction :",ynewclass)
print("Probabilities of all classes: ", ynew)
print("Probability of Predicted class : ", np.max(ynew))

Prediction : ['Network Security Engineer']
Probabilities of all classes:  [[ 5.11034706  3.82581666  6.90554591  3.7762976  11.30112683  2.80745029
   6.84284656  0.73821292  6.03489468  6.96231119 10.26766783  0.72770168]]
Probability of Predicted class :  11.30112682703811


### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state = 10)
rf.fit(X_train, Y_train)
rfc_y_pred = rf.predict(X_test)

print("Precision of RandomForest Classifier: ",precision_score(Y_test, rfc_y_pred, average='weighted'))
print("  ")

print("F1-Score of RandomForest Classifier: ",f1_score(Y_test, rfc_y_pred, average='weighted'))
print("  ")

rfc_accuracy = accuracy_score(Y_test,rfc_y_pred)
print("Accuracy of RandomForest Classifier: ",rfc_accuracy*10)
print("  ")

rfc_cm = confusion_matrix(Y_test,rfc_y_pred)

print("confusion matrics=")
print(rfc_cm)

In [46]:
userdata = [['5','0','6','2','1','0','1','4','0','0','3','4','2','0','28','0','1','1','0']]
userdata_df = pd.DataFrame(userdata, columns=X.columns)
ynewclass = rf.predict(userdata_df)
ynew = rf.predict_proba(userdata_df)
print("Prediction :",ynewclass)
print("Probabilities of all classes: ", ynew)
print("Probability of Predicted class : ", np.max(ynew))

Prediction : ['Applications Developer']
Probabilities of all classes:  [[0.68 0.07 0.   0.03 0.06 0.02 0.03 0.01 0.02 0.03 0.02 0.03]]
Probability of Predicted class :  0.68




### Create a .pkl file using Decision Tree

In [None]:
import pickle
pickle.dump(dtree,open('dtmodel.pkl','wb'))

### Create a .pkl file using Random Forest

In [None]:
import pickle
pickle.dump(rf,open('rfmodel.pkl','wb'))

In [None]:
##print(sklearn.__version__)
##print(pd.__version__)