In [158]:
import pandas as pd
import warnings

In [None]:
warnings.filterwarnings("ignore")  # to ignore warnings

In [160]:
# Load the dataset
data = pd.read_csv("udemy_courses.csv")
data.head()

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39.0,2017-03-09T16:34:20Z,Business Finance
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5,2016-12-19T19:26:30Z,Business Finance
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3.0,2017-05-30T20:07:24Z,Business Finance
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2.0,2016-12-13T14:57:18Z,Business Finance


In [161]:
data.shape

(3678, 12)

In [162]:
data.isnull().sum() #to check for null values

course_id              0
course_title           0
url                    0
is_paid                0
price                  0
num_subscribers        0
num_reviews            0
num_lectures           0
level                  0
content_duration       0
published_timestamp    0
subject                0
dtype: int64

In [163]:
data.duplicated().sum() #to check for duplicate values

np.int64(6)

In [164]:
data[data.duplicated()]  #to view the duplicate values

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
787,837322,Essentials of money value: Get a financial Life !,https://www.udemy.com/essentials-of-money-value/,True,20,0,0,20,All Levels,0.616667,2016-05-16T18:28:30Z,Business Finance
788,1157298,Introduction to Forex Trading Business For Beg...,https://www.udemy.com/introduction-to-forex-tr...,True,20,0,0,27,Beginner Level,1.5,2017-04-23T16:19:01Z,Business Finance
894,1035638,Understanding Financial Statements,https://www.udemy.com/understanding-financial-...,True,25,0,0,10,All Levels,1.0,2016-12-15T14:56:17Z,Business Finance
1100,1084454,CFA Level 2- Quantitative Methods,https://www.udemy.com/cfa-level-2-quantitative...,True,40,0,0,35,All Levels,5.5,2017-07-02T14:29:35Z,Business Finance
1473,185526,MicroStation - Células,https://www.udemy.com/microstation-celulas/,True,20,0,0,9,Beginner Level,0.616667,2014-04-15T21:48:55Z,Graphic Design
2561,28295,Learn Web Designing & HTML5/CSS3 Essentials in...,https://www.udemy.com/build-beautiful-html5-we...,True,75,43285,525,24,All Levels,4.0,2013-01-03T00:55:31Z,Web Development


In [165]:
data = data.drop_duplicates()
data.shape  #to drop the duplicate values

(3672, 12)

In [166]:
data.columns  #to view the columns in the dataset

Index(['course_id', 'course_title', 'url', 'is_paid', 'price',
       'num_subscribers', 'num_reviews', 'num_lectures', 'level',
       'content_duration', 'published_timestamp', 'subject'],
      dtype='object')

In [167]:
def pop_rem(data):
    data['pop_score'] = 0.6* data['num_subscribers'] + 0.4 * data['num_reviews']
    # sort the dataset by pop_score
    df_sorted = data.sort_values(by='pop_score', ascending=False)[[ 'course_title', 'pop_score']].head(5)
    return df_sorted
    

In [168]:
pop_rem(data)   #to get the top 10 courses based on popularity score

Unnamed: 0,course_title,pop_score
2827,Learn HTML5 Programming From Scratch,164805.4
3032,Coding for Entrepreneurs Basic,96729.0
3230,The Web Developer Bootcamp,83928.4
3232,The Complete Web Developer Course 2.0,77672.0
2783,Build Your First Website in 1 Week with HTML5 ...,74544.2


In [169]:
import neattext.functions as nfx

In [170]:
#  remove stopwords form column 'course_title'
data['course_title'] = data['course_title'].apply(nfx.remove_stopwords) 
# remove special characters from column 'course_title'
data['course_title'] = data['course_title'].apply(nfx.remove_special_characters)
data['course_title'].head()  #to view the first few rows of the column after removing stopwords

0                   Ultimate Investment Banking Course
1    Complete GST Course  Certification  Grow Practice
2     Financial Modeling Business Analysts Consultants
3          Beginner Pro  Financial Analysis Excel 2017
4                     Maximize Profits Trading Options
Name: course_title, dtype: object

In [None]:
data['title_subject'] = data['course_title'] + '' + data['subject'] #to create a new column by concatenating 'course_title' and 'subject'
data.head(1)  # to view the first few rows of the dataset 

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,pop_score,title_subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance,1297.4,Ultimate Investment Banking CourseBusiness Fin...


In [None]:

from sklearn.feature_extraction.text import CountVectorizer
cv  = CountVectorizer(max_features=3000, stop_words='english')  #to create a count vectorizer object with a maximum of 3000 features
vectors = cv.fit_transform(data['title_subject']).toarray()  # to transform the text data into a matrix of token counts
vectors 

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [173]:
feature_names = cv.get_feature_names_out()
print( len(feature_names) )  #to get the number of features


3000


In [None]:
# calculate the cosine similarity between the vectors 
from sklearn.metrics.pairwise import cosine_similarity 
similarity = cosine_similarity(vectors)  #to calculate the cosine similarity between the vectors
similarity  

array([[1.        , 0.16903085, 0.2236068 , ..., 0.        , 0.        ,
        0.        ],
       [0.16903085, 1.        , 0.18898224, ..., 0.        , 0.        ,
        0.        ],
       [0.2236068 , 0.18898224, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.21821789,
        0.43643578],
       [0.        , 0.        , 0.        , ..., 0.21821789, 1.        ,
        0.14285714],
       [0.        , 0.        , 0.        , ..., 0.43643578, 0.14285714,
        1.        ]])

In [175]:
similarity.shape  #to get the shape of the similarity matrix

(3672, 3672)

In [176]:
similarity[0]  #to get the similarity scores for the first course

array([1.        , 0.16903085, 0.2236068 , ..., 0.        , 0.        ,
       0.        ])

In [177]:
sorted(enumerate(similarity[0]), reverse=True, key=lambda x: x[1])  # to sort the similarity scores in descending order

[(0, np.float64(0.9999999999999999)),
 (417, np.float64(0.7745966692414834)),
 (39, np.float64(0.5477225575051662)),
 (657, np.float64(0.5477225575051662)),
 (1066, np.float64(0.5477225575051662)),
 (227, np.float64(0.50709255283711)),
 (945, np.float64(0.50709255283711)),
 (41, np.float64(0.47434164902525683)),
 (137, np.float64(0.47434164902525683)),
 (528, np.float64(0.47434164902525683)),
 (722, np.float64(0.47434164902525683)),
 (240, np.float64(0.4472135954999579)),
 (285, np.float64(0.4472135954999579)),
 (418, np.float64(0.4472135954999579)),
 (450, np.float64(0.4472135954999579)),
 (503, np.float64(0.4472135954999579)),
 (942, np.float64(0.4472135954999579)),
 (849, np.float64(0.44721359549995787)),
 (1183, np.float64(0.40451991747794525)),
 (9, np.float64(0.39999999999999997)),
 (120, np.float64(0.39999999999999997)),
 (162, np.float64(0.39999999999999997)),
 (297, np.float64(0.39999999999999997)),
 (354, np.float64(0.39999999999999997)),
 (369, np.float64(0.39999999999999997

In [178]:
def content_recomm(course):
    course_index = data[data['course_title'] == course].index[0]  #to get the index of the course
    sim = similarity[course_index]  #to get the similarity scores for the course
    course_list = sorted(enumerate(sim), reverse=True, key=lambda x: x[1])[1:6]  #to sort the similarity scores in descending order
    for i in course_list:
        print(data.iloc[i[0]]['course_title'])  #to get the course title for each course in the list
        

In [180]:
content_recomm('Ultimate Investment Banking Course')  #to get the recommended courses for the given course

Investment Banking Recruitment Series
Complete Investment Banking Course 2017
Financial Accounting  Ultimate Beginner Course
Managerial Accounting  Ultimate Beginner Course
Investment Banking Land Job Wall Street
