In [12]:
# **************************** IMPORTANT ****************************
'''
This cell configuration settings for the Notebook. 
You can run one role at a time to evaluate the performance of the model
Change the variable names to run for multiple roles

In this model - cosine distance is calculated between the skills and
the course description. Each skill has a weighted score based on the 
popularity of the skill. This is derived by endorsements of the respective
skill by other linkedin connections.
'''


# *******************************************************************
# For each role a list of category names are grouped. 
# Please don't change these variables

label_data_scientist = ['Data Science','Machine Learning',
                           'Data Analysis', 'Business Intelligence',
                           'Data Mining','Data Visualization']

label_software_engineer = ['Software Development','Computer Science',
                           'Programming Languages', 'Software Development',
                           'Web Development','Algorithms and Data Structures']
                                       
# *******************************************************************


# *******************************************************************
# Environment and Config Variables. Change these variables as required.

my_fpath_courses = "../Data/main_coursera.csv"

my_fpath_skills_DataScientist = "../Data/Word2Vec-Google/Word2VecGoogle_DataScientist.csv"

my_fpath_skills_SoftwareEngineer = "../Data/Word2Vec-Google/Word2VecGoogle_DataScientist.csv" 
                                       
# *******************************************************************


In [13]:
# Importing required modules/packages

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk, string


In [14]:
# Downloading the stopwords like i, me, and, is, the etc.

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/DV/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Loading courses and skills data from the CSV files

df_courses = pd.read_csv(my_fpath_courses)

df_DataScientist = pd.read_csv(my_fpath_skills_DataScientist)
df_DataScientist = df_DataScientist.drop('Role', 1)
df_DataScientist.columns = ['Course Id', 'DataScientist_Skill_Score', 'DataScientist_Role_Score']

df_SoftwareEngineer = pd.read_csv(my_fpath_skills_SoftwareEngineer)
df_SoftwareEngineer = df_SoftwareEngineer.drop('Role', 1)
df_SoftwareEngineer.columns = ['Course Id','SoftwareEngineer_Skill_Score', 'SoftwareEngineer_Role_Score']


In [17]:
# Merging the csv files

df_cosdist = df_DataScientist.merge(df_SoftwareEngineer, on = 'Course Id', how = 'outer')


In [18]:
# Exploring data dimensionality, feature names, and feature types.

print(df_courses.shape,"\n")

print(df_cosdist.shape,"\n")

print(df_courses.columns, "\n")

print(df_cosdist.shape,"\n")

print(df_courses.describe(), "\n")

print(df_cosdist.describe(), "\n")


(2213, 18) 

(2213, 5) 

Index(['Unnamed: 0', 'Course Id', 'Course Name', 'Course Description', 'Slug',
       'Provider', 'Universities/Institutions', 'Parent Subject',
       'Child Subject', 'Category', 'Url', 'Length', 'Language',
       'Credential Name', 'Rating', 'Number of Ratings', 'Certificate',
       'Workload'],
      dtype='object') 

(2213, 5) 

        Unnamed: 0    Course Id      Length       Rating  Number of Ratings  \
count  2213.000000  2213.000000  964.000000  2213.000000        2213.000000   
mean   1430.685043  4816.998192    6.063278     2.352785          10.321735   
std     887.770407  3033.878865    2.724669     2.129134         110.680382   
min       0.000000   303.000000    1.000000     0.000000           0.000000   
25%     631.000000  1829.000000    4.000000     0.000000           0.000000   
50%    1455.000000  4880.000000    6.000000     3.000000           1.000000   
75%    2216.000000  7329.000000    7.000000     4.428571           4.000000   
max  

In [32]:
# Quick check to see if the dataframe showing the right results

df_cosdist.head(20)

Unnamed: 0,Course Id,DataScientist_Skill_Score,DataScientist_Role_Score,SoftwareEngineer_Skill_Score,SoftwareEngineer_Role_Score
0,303,0.250289,0.268095,0.250289,0.268095
1,305,0.237911,0.205215,0.237911,0.205215
2,306,0.191053,0.124907,0.191053,0.124907
3,307,0.271128,0.146402,0.271128,0.146402
4,308,0.239865,0.126424,0.239865,0.126424
5,309,0.267912,0.283366,0.267912,0.283366
6,316,0.23568,0.304023,0.23568,0.304023
7,317,0.212704,0.143987,0.212704,0.143987
8,318,0.207079,0.170354,0.207079,0.170354
9,322,0.233237,0.128654,0.233237,0.128654


In [33]:
# Joining two dataframes - Courses and the Cosein Similarity Results based on the 'Course Id' variable. 
# Inner joins: Joins two tables with the common rows. This is a set operateion.

df_courses_score = df_courses.merge(df_cosdist, on ='Course Id', how='inner')

In [34]:
# Tranforming and shaping the data to create the confusion matrix for the ROLE: DATA SCIENTIST

my_DataScientist_skill_score = 'DataScientist_Skill_Score'
my_DataScientist_final_score = 'DataScientist_Final_Score'
y_actu_DataScientist         = ''
y_pred_DataScientist         = ''

df_courses_score[[my_DataScientist_final_score]] = df_courses_score[[my_DataScientist_skill_score]]

df_courses_score['DataScientist_Predict'] = (df_courses_score[[my_DataScientist_final_score]] >= 0.5)

df_courses_score['DataScientist_Label'] = df_courses_score.Category.isin(label_data_scientist)

y_pred_DataScientist = pd.Series(df_courses_score['DataScientist_Predict'], name='Predicted')

y_actu_DataScientist = pd.Series(df_courses_score['DataScientist_Label'], name='Actual')

df_confusion_DataScientist = pd.crosstab(y_actu_DataScientist, y_pred_DataScientist , rownames=['Actual'], colnames=['Predicted'], margins=False)


NameError: name 'my_DataScientist_Role_score' is not defined

In [22]:
# Tranforming and shaping the data to create the confusion matrix for the ROLE: SOFTWARE ENGINEER

my_SoftwareEngineer_skill_score = 'SoftwareEngineer_Skill_Score'
my_SoftwareEngineer_final_score = 'SoftwareEngineer_Final_Score'
y_actu_SoftwareEngineer         = ''
y_pred_SoftwareEngineer         = ''

df_courses_score[[my_SoftwareEngineer_final_score]] = df_courses_score[[my_SoftwareEngineer_skill_score]]

df_courses_score['SoftwareEngineer_Predict'] = (df_courses_score[[my_SoftwareEngineer_final_score]] >= 0.5)

df_courses_score['SoftwareEngineer_Label'] = df_courses_score.Category.isin(label_software_engineer)

y_pred_SoftwareEngineer = pd.Series(df_courses_score['SoftwareEngineer_Predict'], name='Predicted')

y_actu_SoftwareEngineer = pd.Series(df_courses_score['SoftwareEngineer_Label'], name='Actual')

df_confusion_SoftwareEngineer = pd.crosstab(y_actu_SoftwareEngineer, y_pred_SoftwareEngineer , rownames=['Actual'], colnames=['Predicted'], margins=False)


In [25]:
df_confusion_DataScientist


Predicted,False
Actual,Unnamed: 1_level_1
False,2087
True,126


In [26]:
df_confusion_SoftwareEngineer

Predicted,False
Actual,Unnamed: 1_level_1
False,2067
True,146


In [27]:
# Performance summary for the ROLE: DATA SCIENTIST


try:
    tn_DataScientist = df_confusion_DataScientist.iloc[0][False]
except:
    tn_DataScientist = 0
    
try:
    tp_DataScientist =  df_confusion_DataScientist.iloc[1][True]
except:
    tp_DataScientist = 0

    
try:
    fn_DataScientist = df_confusion_DataScientist.iloc[1][False]
except:
    fn_DataScientist = 0
    
try:
    fp_DataScientist =  df_confusion_DataScientist.iloc[0][True]
except:
    fp_DataScientist = 0  
    
    
total_count_DataScientist = tn_DataScientist + tp_DataScientist + fn_DataScientist + fp_DataScientist


print('Data Scientist Accuracy Rate : ', (tn_DataScientist + tp_DataScientist) / total_count_DataScientist)

print('Data Scientist Misclassifcation Rate : ', (fn_DataScientist + fp_DataScientist) / total_count_DataScientist)

print('Data Scientist True Positive Rate : ', tp_DataScientist / (tp_DataScientist + fn_DataScientist))

print('Data Scientist False Positive Rate : ', fp_DataScientist / (tn_DataScientist + fp_DataScientist))


Data Scientist Accuracy Rate :  0.9430637144148215
Data Scientist Misclassifcation Rate :  0.05693628558517849
Data Scientist True Positive Rate :  0.0
Data Scientist False Positive Rate :  0.0


In [28]:
# Performance summary for the ROLE: SOFTWARE ENGINEER


try:
    tn_SoftwareEngineer = df_confusion_SoftwareEngineer.iloc[0][False]
except:
    tn_SoftwareEngineer = 0
    
try:
    tp_SoftwareEngineer =  df_confusion_SoftwareEngineer.iloc[1][True]
except:
    tp_SoftwareEngineer = 0

    
try:
    fn_SoftwareEngineer = df_confusion_SoftwareEngineer.iloc[1][False]
except:
    fn_SoftwareEngineer = 0
    
try:
    fp_SoftwareEngineer =  df_confusion_SoftwareEngineer.iloc[0][True]
except:
    fp_SoftwareEngineer = 0  
    
    
total_count_SoftwareEngineer = tn_SoftwareEngineer + tp_SoftwareEngineer + fn_SoftwareEngineer + fp_SoftwareEngineer


print('Software Engineer Accuracy Rate : ', (tn_SoftwareEngineer + tp_SoftwareEngineer) / total_count_SoftwareEngineer)

print('Software Engineer Misclassifcation Rate : ', (fn_SoftwareEngineer + fp_SoftwareEngineer) / total_count_SoftwareEngineer)

print('Software Engineer True Positive Rate : ', tp_SoftwareEngineer / (tp_SoftwareEngineer + fn_SoftwareEngineer))

print('Software Engineer False Positive Rate : ', fp_SoftwareEngineer / (tn_SoftwareEngineer + fp_SoftwareEngineer))


Software Engineer Accuracy Rate :  0.9340262087663804
Software Engineer Misclassifcation Rate :  0.06597379123361952
Software Engineer True Positive Rate :  0.0
Software Engineer False Positive Rate :  0.0
