In [91]:
! pip install neattext




In [92]:
import pandas as pd
import numpy as np
import neattext.functions as nfx
import seaborn as sn

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity,linear_kernel

In [93]:
# Read the Udemy course catalogue from the Kaggle input directory.
df = pd.read_csv(
    "/kaggle/input/udemy-course-recommendation/udemy_course_data.csv"
)

In [94]:
# Preview the first five rows of the raw table.
df.head(n=5)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit,published_date,published_time,year,month,day
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,2017-01-18T20:58:58Z,Business Finance,429400,2017-01-18,20:58:58Z,2017,1,18
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,2017-03-09T16:34:20Z,Business Finance,209400,2017-03-09,16:34:20Z,2017,3,9
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5 hours,2016-12-19T19:26:30Z,Business Finance,97830,2016-12-19,19:26:30Z,2016,12,19
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3 hours,2017-05-30T20:07:24Z,Business Finance,232845,2017-05-30,20:07:24Z,2017,5,30
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2 hours,2016-12-13T14:57:18Z,Business Finance,255200,2016-12-13,14:57:18Z,2016,12,13


In [95]:
# Dimensions of the raw table as a (rows, columns) tuple.
df.shape

(3683, 18)

In [96]:
# Print each column's name, non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   course_id            3683 non-null   int64 
 1   course_title         3683 non-null   object
 2   url                  3683 non-null   object
 3   is_paid              3683 non-null   bool  
 4   price                3683 non-null   int64 
 5   num_subscribers      3683 non-null   int64 
 6   num_reviews          3683 non-null   int64 
 7   num_lectures         3683 non-null   int64 
 8   level                3683 non-null   object
 9   content_duration     3683 non-null   object
 10  published_timestamp  3683 non-null   object
 11  subject              3683 non-null   object
 12  profit               3683 non-null   int64 
 13  published_date       3683 non-null   object
 14  published_time       3682 non-null   object
 15  year                 3683 non-null   int64 
 16  month 

In [97]:
# Count the missing values in every column (sum of the null mask down the rows).
df.isnull().sum(axis=0)

course_id              0
course_title           0
url                    0
is_paid                0
price                  0
num_subscribers        0
num_reviews            0
num_lectures           0
level                  0
content_duration       0
published_timestamp    0
subject                0
profit                 0
published_date         0
published_time         1
year                   0
month                  0
day                    0
dtype: int64

# Cleaning course titles

* Removing stopwords...
  * Common words like 'the', 'is', 'in' removed

* Removing special characters...
  * Symbols like @, #, $, %, etc. stripped out

* Result stored in 'Clean_title' column
  * Text now ready for vectorization


In [98]:
# Build 'Clean_title' from 'course_title' in a single pass:
# first drop stopwords, then strip special characters from what remains.
df['Clean_title'] = (
    df['course_title']
    .apply(nfx.remove_stopwords)
    .apply(nfx.remove_special_characters)
)

In [99]:
# Preview the table again; it now carries the extra 'Clean_title' column.
df.head(n=5)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit,published_date,published_time,year,month,day,Clean_title
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,2017-01-18T20:58:58Z,Business Finance,429400,2017-01-18,20:58:58Z,2017,1,18,Ultimate Investment Banking Course
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,2017-03-09T16:34:20Z,Business Finance,209400,2017-03-09,16:34:20Z,2017,3,9,Complete GST Course Certification Grow Practice
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5 hours,2016-12-19T19:26:30Z,Business Finance,97830,2016-12-19,19:26:30Z,2016,12,19,Financial Modeling Business Analysts Consultants
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3 hours,2017-05-30T20:07:24Z,Business Finance,232845,2017-05-30,20:07:24Z,2017,5,30,Beginner Pro Financial Analysis Excel 2017
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2 hours,2016-12-13T14:57:18Z,Business Finance,255200,2016-12-13,14:57:18Z,2016,12,13,Maximize Profits Trading Options


# Converting text to numerical features using CountVectorizer

* Initializing CountVectorizer...
  * Tokenizing and building vocabulary from 'Clean_title'

* Transforming text data...
  * Text converted into sparse matrix of token counts

* Converting to dense matrix...
  * Matrix format changed for easier viewing or DataFrame conversion


In [100]:
# Fit a bag-of-words vocabulary on the cleaned titles and encode every title
# as a row of token counts (the result is a sparse matrix).
cv = CountVectorizer()
cv_title = cv.fit_transform(df["Clean_title"])

# Dense view of the same counts, shown only for inspection.
cv_title.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

# Creating DataFrame from CountVectorizer output

* Converting sparse matrix to dense format
* Extracting feature names as column headers
* Building DataFrame 'df_words' with word counts per row
* Displaying first 5 rows of 'df_words'


In [101]:
# One row per course, one column per vocabulary term, cells hold token counts.
df_words = pd.DataFrame(
    data=cv_title.todense(),
    columns=cv.get_feature_names_out(),
)
df_words.head(n=5)

Unnamed: 0,000005,001,01,02,10,100,101,101master,102,10k,...,zend,zero,zerotohero,zf2,zinsen,zoho,zombie,zu,zuhause,zur
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Calculating cosine similarity matrix

* Using 'cv_title' word count matrix as input
* Computing pairwise cosine similarity between text entries
* Result: square matrix showing similarity scores (0 to 1) for each pair
* Higher value means more similar text documents


In [102]:
# Pairwise cosine similarity between every pair of title count vectors;
# entry [i, j] compares course i with course j.
cosine_simi = cosine_similarity(cv_title)

cosine_simi

array([[1.        , 0.20412415, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.20412415, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.23570226],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.23570226, 0.        ,
        1.        ]])

# Creating course index mapping

* Creating a pandas Series with:
  * values = DataFrame row indices
  * index = unique course titles from 'course_title' column
* Dropping duplicate course titles to keep unique mappings
* Result: quick lookup from course title to its DataFrame index


In [103]:
# Map each course title to the row index of its first occurrence.
#
# Series.drop_duplicates() compares the *values* (the row indices, which are
# already unique), so it would leave repeated titles in place and
# course_index[title] would return a Series for them. De-duplicating on the
# index instead guarantees that every title maps to exactly one integer.
course_index = pd.Series(df.index, index=df["course_title"])
course_index = course_index[~course_index.index.duplicated(keep="first")]
course_index

course_title
Ultimate Investment Banking Course                                0
Complete GST Course & Certification - Grow Your CA Practice       1
Financial Modeling for Business Analysts and Consultants          2
Beginner to Pro - Financial Analysis in Excel 2017                3
How To Maximize Your Profits Trading Options                      4
                                                               ... 
Learn jQuery from Scratch - Master of JavaScript library       3678
How To Design A WordPress Website With No Coding At All        3679
Learn and Build using Polymer                                  3680
CSS Animations: Create Amazing Effects on Your Website         3681
Using MODX CMS to Build Websites: A Beginner's Guide           3682
Length: 3683, dtype: int64

# Defining course recommendation function

* Input: partial course title and number of recommendations (default 10)
* Step 1: Search for courses containing the input title (case-insensitive)
  * If no matches found, return message "No courses found..."
* Step 2: Get index of first matching course
* Step 3: Retrieve cosine similarity scores for that course against all others
* Step 4: Sort scores from highest to lowest (excluding the course itself)
* Step 5: Select top similar courses based on sorted scores
* Step 6: Build DataFrame with:
  * course_title
  * url
  * price
  * num_subscribers
  * Similarity_Score (how close courses are)
* Step 7: Return top `numrec` recommendations


In [104]:
def recommend_course(title, numrec=10):
    """Recommend the courses most similar to the first course matching ``title``.

    The seed course is the first row of ``df`` whose ``course_title`` contains
    ``title`` (case-insensitive, literal substring match). All other rows are
    ranked by their precomputed similarity to the seed in ``cosine_simi``.

    Parameters
    ----------
    title : str
        Full or partial course title to search for.
    numrec : int, default 10
        Number of recommendations to keep; forwarded to ``DataFrame.head``.

    Returns
    -------
    pandas.DataFrame or str
        A frame with the columns ``course_title``, ``url``, ``price``,
        ``num_subscribers`` and ``Similarity_Score``, ordered from most to
        least similar and never containing the seed course itself. If no
        title matches, a plain message string is returned instead.
    """
    # Boolean mask over the rows of df; row i of df corresponds to row i of
    # cosine_simi, so positions in this mask can index the matrix directly.
    is_match = (
        df['course_title']
        .str.lower()
        .str.contains(title.lower(), regex=False, na=False)
        .to_numpy(dtype=bool)
    )

    if not is_match.any():
        return "No courses found with that title. Try a different keyword."

    # argmax on a boolean array is the position of the first True. Taking the
    # position from the mask (rather than looking the title up in a
    # title -> index mapping) stays unambiguous when several rows share a title.
    idx = int(is_match.argmax())

    # Similarity of the seed course to every course, seed included.
    scores = np.asarray(cosine_simi[idx])

    # Descending order; the stable sort keeps tied scores in row order.
    ranked = (-scores).argsort(kind="stable")

    # Remove the seed by position. Dropping "the first sorted entry" would be
    # wrong whenever another course ties with the seed's self-similarity and
    # sits earlier in the table, or when the seed's own score is not the maximum.
    ranked = ranked[ranked != idx]

    rec_df = df.iloc[ranked][['course_title', 'url', 'price', 'num_subscribers']].copy()
    rec_df['Similarity_Score'] = scores[ranked]

    return rec_df.head(numrec)

# Getting recommendations for "javascript"

* Running recommend_course with title keyword: "javascript"
* Requesting top 20 similar courses based on title content
* Returned DataFrame contains:
  * titles of the courses most similar to the first course whose title contains "javascript" (they need not mention JavaScript themselves)
  * corresponding URLs
  * prices
  * subscriber counts
  * similarity scores
* Displaying the recommendations for review


In [106]:
# Ask for the 20 courses closest to the first title containing "javascript".
ans = recommend_course(title="javascript", numrec=20)

ans

Unnamed: 0,course_title,url,price,num_subscribers,Similarity_Score
2963,PHP MySQL: Learn PHP MySQL with Project,https://www.udemy.com/php-mysql-project-learning/,20,2314,0.478091
2587,Complete Website & CMS in PHP & MySQL From Scr...,https://www.udemy.com/cms-admin-panel-in-php-m...,20,3357,0.46291
2654,Learn E-Commerce Website in PHP & MySQL From S...,https://www.udemy.com/ecommerce-website-in-php...,0,29990,0.46291
2640,Projects in PHP and MySQL,https://www.udemy.com/the-complete-web-develop...,60,10606,0.436436
3339,Learning Object-Oriented JavaScript,https://www.udemy.com/learning-object-oriented...,75,533,0.436436
2672,JavaScript for Beginners Welcome to learning J...,https://www.udemy.com/javascript-for-beginners...,50,10864,0.428571
3288,JavaScript Complete Guide to learning JavaScript,https://www.udemy.com/javascript-complete-guid...,200,1296,0.428571
2755,PHP & MySQL - Learn The Easy Way. Master PHP &...,https://www.udemy.com/learning-php-and-mysql-d...,50,5549,0.419314
3023,JavaScript Intro to learning JavaScript web pr...,https://www.udemy.com/javascript-intro-to-lear...,20,17554,0.400892
2506,Rapid Website Design with Bootstrap,https://www.udemy.com/responsive-website-design/,200,14842,0.377964
