In [235]:
# **************************** IMPORTANT ****************************
'''
This cell configuration settings for the Notebook. 
You can run one role at a time to evaluate the performance of the model
Change the variable names to run for multiple roles

In this model - cosine distance is calculated between the skills and
the course description. Each skill has a weighted score based on the 
popularity of the skill. This is derived by endorsements of the respective
skill by other linkedin connections.
'''


# *******************************************************************
# For each role a list of category names are grouped. 
# Please don't change these variables

label_data_scientist = ['Data Science','Machine Learning',
                           'Data Analysis', 'Business Intelligence',
                           'Data Mining','Data Visualization']

label_software_development = ['Software Development','Computer Science',
                           'Programming Languages', 'Software Development',
                           'Web Development','Algorithms and Data Structures']
                                       
# *******************************************************************


# *******************************************************************
# Environment and Config Variables. Change these variables as required.

my_fpath_courses = "../Data/main_coursera.csv"

my_fpath_skills_DataScientist = "../Data/Word2Vec-Google/Word2VecGoogle_DataScientist.csv"

my_fpath_skills_SoftwareDevelopment = "../Data/Word2Vec-Google/Word2VecGoogle_SoftwareDevelopment.csv" 
                                       
# *******************************************************************


In [236]:
# Importing required modules/packages

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk, string


In [237]:
# Downloading the stopwords like i, me, and, is, the etc.

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/DV/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [238]:
# Loading courses and skills data from the CSV files

df_courses = pd.read_csv(my_fpath_courses)

df_DataScientist = pd.read_csv(my_fpath_skills_DataScientist)
df_DataScientist = df_DataScientist.drop('Role', 1)
df_DataScientist.columns = ['Course Id', 'DataScientist_Skill_Score', 'DataScientist_Role_Score']

df_SoftwareDevelopment = pd.read_csv(my_fpath_skills_SoftwareDevelopment)
df_SoftwareDevelopment = df_SoftwareDevelopment.drop('Role', 1)
df_SoftwareDevelopment.columns = ['Course Id','SoftwareDevelopment_Skill_Score', 'SoftwareDevelopment_Role_Score']


FileNotFoundError: File b'../Data/Word2Vec-Google/Word2VecGoogle_SoftwareDevelopment.csv' does not exist

In [239]:
# Merging the csv files

df_cosdist = df_DataScientist.merge(df_SoftwareDevelopment, on = 'Course Id', how = 'outer')


In [240]:
# Exploring data dimensionality, feature names, and feature types.

print(df_courses.shape,"\n")

print(df_cosdist.shape,"\n")

print(df_courses.columns, "\n")

print(df_cosdist.shape,"\n")

print(df_courses.describe(), "\n")

print(df_cosdist.describe(), "\n")


(2213, 18) 

(2213, 5) 

Index(['Unnamed: 0', 'Course Id', 'Course Name', 'Course Description', 'Slug',
       'Provider', 'Universities/Institutions', 'Parent Subject',
       'Child Subject', 'Category', 'Url', 'Length', 'Language',
       'Credential Name', 'Rating', 'Number of Ratings', 'Certificate',
       'Workload'],
      dtype='object') 

(2213, 5) 

        Unnamed: 0    Course Id      Length       Rating  Number of Ratings  \
count  2213.000000  2213.000000  964.000000  2213.000000        2213.000000   
mean   1430.685043  4816.998192    6.063278     2.352785          10.321735   
std     887.770407  3033.878865    2.724669     2.129134         110.680382   
min       0.000000   303.000000    1.000000     0.000000           0.000000   
25%     631.000000  1829.000000    4.000000     0.000000           0.000000   
50%    1455.000000  4880.000000    6.000000     3.000000           1.000000   
75%    2216.000000  7329.000000    7.000000     4.428571           4.000000   
max  

In [241]:
# Quick check to see if the dataframe showing the right results

df_cosdist.head(20)

Unnamed: 0,Course Id,DataScientist_Skill_Score,DataScientist_Role_Score,SoftwareDevelopment_Skill_Score,SoftwareDevelopment_Role_Score
0,303,0.171523,0.268095,0.249944,0.13597
1,305,0.15764,0.205215,0.32853,0.43783
2,306,0.073949,0.124907,0.212947,0.069229
3,307,0.164807,0.146402,0.271968,0.050458
4,308,0.137668,0.126424,0.267256,0.119867
5,309,0.156846,0.283366,0.268586,0.186795
6,316,0.097714,0.304023,0.231349,0.106512
7,317,0.131158,0.143987,0.224335,0.064702
8,318,0.091267,0.170354,0.247371,0.223502
9,322,0.117883,0.128654,0.210427,0.151994


In [242]:
# Joining two dataframes - Courses and the Cosein Similarity Results based on the 'Course Id' variable. 
# Inner joins: Joins two tables with the common rows. This is a set operateion.

df_courses_score = df_courses.merge(df_cosdist, on ='Course Id', how='inner')

In [243]:
df_courses_score.sort_values(by=['SoftwareDevelopment_Skill_Score'], ascending=False).head(40)

Unnamed: 0.1,Unnamed: 0,Course Id,Course Name,Course Description,Slug,Provider,Universities/Institutions,Parent Subject,Child Subject,Category,...,Language,Credential Name,Rating,Number of Ratings,Certificate,Workload,DataScientist_Skill_Score,DataScientist_Role_Score,SoftwareDevelopment_Skill_Score,SoftwareDevelopment_Role_Score
2023,2695,9235,Develop and Deploy Windows Applications on Goo...,Learn to deploy and run Microsoft Windows® app...,coursera-develop-and-deploy-windows-applicatio...,Coursera,Google Cloud|||Google,Programming,Mobile Development,Mobile Development,...,English,,0.0,0,1.0,6-8 hours a week,0.192158,0.165044,0.417759,0.244278
2102,2810,9568,"JavaScript, jQuery, and JSON","In this course, we'll look at the JavaScript l...",coursera-javascript-jquery-and-json,Coursera,University of Michigan,Programming,Web Development,Web Development,...,English,,0.0,0,1.0,10 hours a week,0.208689,0.087937,0.402414,0.120868
1545,2057,6944,Advanced Java Concurrency,Advanced Java Concurrency focuses on the objec...,coursera-advanced-java-concurrency,Coursera,Vanderbilt University,Programming,Programming Languages,Programming Languages,...,English,,0.0,0,1.0,,0.174647,0.152139,0.398325,0.244047
1905,2548,8684,Multiplatform Mobile App Development with Nati...,This course focuses on developing truly cross-...,coursera-multiplatform-mobile-app-development-...,Coursera,The Hong Kong University of Science and Techno...,Programming,Mobile Development,Mobile Development,...,English,,0.0,0,1.0,6-8 hours a week,0.182148,0.158894,0.398021,0.272968
919,1104,4191,"Responsive Website Basics: Code with HTML, CSS...",In this course you will learn three key websit...,coursera-responsive-website-basics-code-with-h...,Coursera,University of London International Programmes|...,Programming,Web Development,Web Development,...,English,Responsive Website Development and Design,4.041667,24,1.0,,0.176972,0.12611,0.38723,0.171339
1904,2547,8683,Multiplatform Mobile App Development with Web ...,This course focuses on developing multiplatfor...,coursera-multiplatform-mobile-app-development-...,Coursera,The Hong Kong University of Science and Techno...,Programming,Mobile Development,Mobile Development,...,English,,0.0,0,1.0,,0.15868,0.243503,0.384977,0.255416
2039,2716,9319,Linux Server Management and Security,"Whether you are accessing a bank website, Netf...",coursera-linux-server-management-and-security,Coursera,University of Colorado System,Computer Science,Information Technology,Information Technology,...,English,,0.0,0,1.0,,0.164308,0.12099,0.384064,0.308126
2100,2808,9566,Building Web Applications in PHP,"In this course, you'll explore the basic struc...",coursera-building-web-applications-in-php,Coursera,University of Michigan,Programming,Web Development,Web Development,...,English,,0.0,0,1.0,8-10 hours a week,0.20741,0.196334,0.374962,0.300837
1194,1581,5446,Java for Android,This MOOC teaches you how to program core feat...,coursera-java-for-android,Coursera,Vanderbilt University,Programming,Android Development,Android Development,...,English,,1.0,1,1.0,,0.171521,0.138278,0.374124,0.147813
575,657,2039,Pattern-Oriented Software Architectures for Co...,Learn how to apply patterns and frameworks to ...,coursera-pattern-oriented-software-architectur...,Coursera,Vanderbilt University,Programming,Software Development,Software Development,...,English,,5.0,3,1.0,6-8 hours a week,0.153376,0.268718,0.372576,0.464903


In [249]:
# Tranforming and shaping the data to create the confusion matrix for the ROLE: DATA SCIENTIST

my_DataScientist_skill_score = 'DataScientist_Role_Score'
my_DataScientist_final_score = 'DataScientist_Final_Score'
y_actu_DataScientist         = ''
y_pred_DataScientist         = ''

df_courses_score[[my_DataScientist_final_score]] = df_courses_score[[my_DataScientist_skill_score]]

df_courses_score['DataScientist_Predict'] = (df_courses_score[[my_DataScientist_final_score]] >= 0.)

df_courses_score['DataScientist_Label'] = df_courses_score.Category.isin(label_data_scientist)

y_pred_DataScientist = pd.Series(df_courses_score['DataScientist_Predict'], name='Predicted')

y_actu_DataScientist = pd.Series(df_courses_score['DataScientist_Label'], name='Actual')

df_confusion_DataScientist = pd.crosstab(y_actu_DataScientist, y_pred_DataScientist , rownames=['Actual'], colnames=['Predicted'], margins=False)


In [250]:
# Tranforming and shaping the data to create the confusion matrix for the ROLE: SOFTWARE ENGINEER

my_SoftwareDevelopment_skill_score = 'SoftwareDevelopment_Role_Score'
my_SoftwareDevelopment_final_score = 'SoftwareDevelopment_Final_Score'
y_actu_SoftwareDevelopment         = ''
y_pred_SoftwareDevelopment         = ''

df_courses_score[[my_SoftwareDevelopment_final_score]] = df_courses_score[[my_SoftwareDevelopment_skill_score]]

df_courses_score['SoftwareDevelopment_Predict'] = (df_courses_score[[my_SoftwareDevelopment_final_score]] >= 0.35)

df_courses_score['SoftwareDevelopment_Label'] = df_courses_score.Category.isin(label_software_engineer)

y_pred_SoftwareDevelopment = pd.Series(df_courses_score['SoftwareDevelopment_Predict'], name='Predicted')

y_actu_SoftwareDevelopment = pd.Series(df_courses_score['SoftwareDevelopment_Label'], name='Actual')

df_confusion_SoftwareDevelopment = pd.crosstab(y_actu_SoftwareDevelopment, y_pred_SoftwareDevelopment , rownames=['Actual'], colnames=['Predicted'], margins=False)


In [251]:
df_confusion_DataScientist


Predicted,False,True
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
False,2085,2
True,125,1


In [252]:
df_confusion_SoftwareDevelopment

Predicted,False,True
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
False,2056,11
True,127,19


In [253]:
# Performance summary for the ROLE: DATA SCIENTIST


try:
    tn_DataScientist = df_confusion_DataScientist.iloc[0][False]
except:
    tn_DataScientist = 0
    
try:
    tp_DataScientist =  df_confusion_DataScientist.iloc[1][True]
except:
    tp_DataScientist = 0

    
try:
    fn_DataScientist = df_confusion_DataScientist.iloc[1][False]
except:
    fn_DataScientist = 0
    
try:
    fp_DataScientist =  df_confusion_DataScientist.iloc[0][True]
except:
    fp_DataScientist = 0  

    
total_count_DataScientist = tn_DataScientist + tp_DataScientist + fn_DataScientist + fp_DataScientist


print('Data Scientist Accuracy Rate : ', (tn_DataScientist + tp_DataScientist) / total_count_DataScientist)

print('Data Scientist Misclassifcation Rate : ', (fn_DataScientist + fp_DataScientist) / total_count_DataScientist)

print('Data Scientist True Positive Rate : ', tp_DataScientist / (tp_DataScientist + fn_DataScientist))

print('Data Scientist False Positive Rate : ', fp_DataScientist / (tn_DataScientist + fp_DataScientist))



Data Scientist Accuracy Rate :  0.9426118391323994
Data Scientist Misclassifcation Rate :  0.05738816086760054
Data Scientist True Positive Rate :  0.007936507936507936
Data Scientist False Positive Rate :  0.0009583133684714902


In [254]:
# Performance summary for the ROLE: SOFTWARE ENGINEER


try:
    tn_SoftwareDevelopment = df_confusion_SoftwareDevelopment.iloc[0][False]
except:
    tn_SoftwareDevelopment = 0
    
try:
    tp_SoftwareDevelopment =  df_confusion_SoftwareDevelopment.iloc[1][True]
except:
    tp_SoftwareDevelopment = 0

    
try:
    fn_SoftwareDevelopment = df_confusion_SoftwareDevelopment.iloc[1][False]
except:
    fn_SoftwareDevelopment = 0
    
try:
    fp_SoftwareDevelopment =  df_confusion_SoftwareDevelopment.iloc[0][True]
except:
    fp_SoftwareDevelopment = 0  
    
    
total_count_SoftwareDevelopment = tn_SoftwareDevelopment + tp_SoftwareDevelopment + fn_SoftwareDevelopment + fp_SoftwareDevelopment


print('Software Development Accuracy Rate : ', (tn_SoftwareDevelopment + tp_SoftwareDevelopment) / total_count_SoftwareDevelopment)

print('Software Development Misclassifcation Rate : ', (fn_SoftwareDevelopment + fp_SoftwareDevelopment) / total_count_SoftwareDevelopment)

print('Software Development True Positive Rate : ', tp_SoftwareDevelopment / (tp_SoftwareDevelopment + fn_SoftwareDevelopment))

print('Software Development False Positive Rate : ', fp_SoftwareDevelopment / (tn_SoftwareDevelopment + fp_SoftwareDevelopment))


Software Development Accuracy Rate :  0.9376412110257569
Software Development Misclassifcation Rate :  0.06235878897424311
Software Development True Positive Rate :  0.13013698630136986
Software Development False Positive Rate :  0.005321722302854378
