In [198]:
# **************************** IMPORTANT ****************************
# This cell configuration settings for the Notebook. 
# You can run one role at a time to evaluate the performance of the model
# Change the variable names to run for multiple roles


# *******************************************************************
# For each role a list of category names are grouped. 
# Please don't change these variables

label_data_scientist = ['Data Science','Machine Learning',
                           'Data Analysis', 'Business Intelligence',
                           'Data Mining','Data Visualization']

label_software_engineer = ['Software Development','Computer Science',
                           'Programming Languages', 'Software Development',
                           'Web Development','Algorithms and Data Structures']
                                       
# *******************************************************************


# *******************************************************************
# Environment and Config Variables. Change these variables as required.

my_fpath_courses = "../Data/main_coursera.csv"

my_fpath_skills_DataScientist = "../Data/Cosine-Distance/Single-Variable/CosDist_DataScientist.csv"

my_fpath_skills_SoftwareEngineer = "../Data/Cosine-Distance/Single-Variable/CosDist_SoftwareEngineer.csv" 
                                       
# *******************************************************************


In [199]:
# Importing required modules/packages

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk, string


In [200]:
# Downloading the stopwords like i, me, and, is, the etc.

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/DV/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [201]:
# Loading courses and skills data from the CSV files

df_courses = pd.read_csv(my_fpath_courses)

df_DataScientist = pd.read_csv(my_fpath_skills_DataScientist)
df_DataScientist = df_DataScientist.drop('Role', 1)
df_DataScientist.columns = ['Course Id', 'DataScientist_Skill_Score', 'DataScientist_Role_Score']

df_SoftwareEngineer = pd.read_csv(my_fpath_skills_SoftwareEngineer)
df_SoftwareEngineer = df_SoftwareEngineer.drop('Role', 1)
df_SoftwareEngineer.columns = ['Course Id','SoftwareEngineer_Skill_Score', 'SoftwareEngineer_Role_Score']


In [202]:
# Merging the csv files

df_cosdist = df_DataScientist.merge(df_SoftwareEngineer, on = 'Course Id', how = 'outer')


In [203]:
# Exploring data dimensionality, feature names, and feature types.

print(df_courses.shape,"\n")

print(df_cosdist.shape,"\n")

print(df_courses.columns, "\n")

print(df_cosdist.shape,"\n")

print(df_courses.describe(), "\n")

print(df_cosdist.describe(), "\n")


(2213, 18) 

(2213, 5) 

Index(['Unnamed: 0', 'Course Id', 'Course Name', 'Course Description', 'Slug',
       'Provider', 'Universities/Institutions', 'Parent Subject',
       'Child Subject', 'Category', 'Url', 'Length', 'Language',
       'Credential Name', 'Rating', 'Number of Ratings', 'Certificate',
       'Workload'],
      dtype='object') 

(2213, 5) 

        Unnamed: 0    Course Id      Length       Rating  Number of Ratings  \
count  2213.000000  2213.000000  964.000000  2213.000000        2213.000000   
mean   1430.685043  4816.998192    6.063278     2.352785          10.321735   
std     887.770407  3033.878865    2.724669     2.129134         110.680382   
min       0.000000   303.000000    1.000000     0.000000           0.000000   
25%     631.000000  1829.000000    4.000000     0.000000           0.000000   
50%    1455.000000  4880.000000    6.000000     3.000000           1.000000   
75%    2216.000000  7329.000000    7.000000     4.428571           4.000000   
max  

In [204]:
# Quick check to see if the dataframe showing the right results

df_cosdist.head(20)

Unnamed: 0,Course Id,DataScientist_Skill_Score,DataScientist_Role_Score,SoftwareEngineer_Skill_Score,SoftwareEngineer_Role_Score
0,303,0.346929,0.744684,0.279033,0.496048
1,305,0.309625,0.535167,0.284386,0.69552
2,306,0.370422,0.449385,0.275596,0.530443
3,307,0.356481,0.41626,0.271005,0.629936
4,308,0.362082,0.284999,0.278486,0.589188
5,309,0.3725,0.52217,0.287818,0.620018
6,316,0.387596,0.310164,0.291896,0.596297
7,317,0.33823,0.375398,0.26923,0.449072
8,318,0.343533,0.473992,0.278539,0.456083
9,322,0.370727,0.642344,0.285038,0.629259


In [205]:
# Joining two dataframes - Courses and the Cosein Similarity Results based on the 'Course Id' variable. 
# Inner joins: Joins two tables with the common rows. This is a set operateion.

df_courses_score = df_courses.merge(df_cosdist, on ='Course Id', how='inner')

In [211]:
# Tranforming and shaping the data to create the confusion matrix for the ROLE: SOFTWARE ENGINEER

my_SoftwareEngineer_skill_score = 'SoftwareEngineer_Skill_Score'
my_SoftwareEngineer_final_score = 'SoftwareEngineer_Final_Score'
my_SoftwareEngineer_predict     = 'SoftwareEngineer_Predict'
my_SoftwareEngineer_label       = 'SoftwareEngineer_Label'
y_actu_SoftwareEngineer         = ''
y_pred_SoftwareEngineer         = ''

df_courses_score[[my_SoftwareEngineer_final_score]] = df_courses_score[[my_SoftwareEngineer_skill_score]]

df_courses_score[[my_SoftwareEngineer_predict]] = (df_courses_score[[my_SoftwareEngineer_final_score]] >= 0.5)

df_courses_score[[my_SoftwareEngineer_label]] = df_courses_score.Category.isin(label_software_engineer)

y_actu_SoftwareEngineer = pd.Series(df_courses_score['SoftwareEngineer_Label'], name='Actual')

y_pred_SoftwareEngineer = pd.Series(df_courses_score['SoftwareEngineer_Predict'], name='Predicted')

df_confusion_SoftwareEngineer = pd.crosstab(y_actu_SoftwareEngineer, y_pred_SoftwareEngineer , rownames=['Actual'], colnames=['Predicted'], margins=False)


In [212]:
df_confusion_SoftwareEngineer


Predicted,False
Actual,Unnamed: 1_level_1
False,2067
True,146


In [213]:
df_confusion_SoftwareEngineer.iloc[1][False]

146

In [214]:
try:
    true_negative = df_confusion_SoftwareEngineer.iloc[0][False]
except:
    true_negative = 0
    
try:
    true_positive =  df_confusion_SoftwareEngineer.iloc[1][True]
except:
    true_positive = 0

    
try:
    false_negative = df_confusion_SoftwareEngineer.iloc[1][False]
except:
    false_negative = 0
    
try:
    false_positive =  df_confusion_SoftwareEngineer.iloc[0][True]
except:
    false_positive = 0  
    
    
total_count = true_negative + true_positive + false_negative + false_positive



print('Accuracy Rate : ', (true_negative + true_positive)/ total_count)

print('Misclassifcation Rate : ', (false_negative + false_positive)/ total_count)

print('True Positive Rate : ', true_positive / (true_positive + false_negative))

print('False Positive Rate : ', false_positive / (true_negative + false_positive))


Accuracy Rate :  0.9340262087663804
Misclassifcation Rate :  0.06597379123361952
True Positive Rate :  0.0
False Positive Rate :  0.0
