# Content Based Filtering | Recommendation System

# Dataset | Udemy Courses

## Importing Libraries

In [164]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

## Importing Data

In [165]:
udemy = pd.read_csv('udemy_courses.csv')
udemy.head(3)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit,published_date,published_time,year,month,day
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,2017-01-18T20:58:58Z,Business Finance,429400,2017-01-18,20:58:58Z,2017,1,18
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,2017-03-09T16:34:20Z,Business Finance,209400,2017-03-09,16:34:20Z,2017,3,9
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5 hours,2016-12-19T19:26:30Z,Business Finance,97830,2016-12-19,19:26:30Z,2016,12,19


In [166]:
udemy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   course_id            3683 non-null   int64 
 1   course_title         3683 non-null   object
 2   url                  3683 non-null   object
 3   is_paid              3683 non-null   bool  
 4   price                3683 non-null   int64 
 5   num_subscribers      3683 non-null   int64 
 6   num_reviews          3683 non-null   int64 
 7   num_lectures         3683 non-null   int64 
 8   level                3683 non-null   object
 9   content_duration     3683 non-null   object
 10  published_timestamp  3683 non-null   object
 11  subject              3683 non-null   object
 12  profit               3683 non-null   int64 
 13  published_date       3683 non-null   object
 14  published_time       3682 non-null   object
 15  year                 3683 non-null   int64 
 16  month 

In [167]:
udemy.shape

(3683, 18)

## Pre-processing Data

In [168]:
import neattext.functions as nf

In [169]:
# Remove stopwords
udemy['Clean_title'] = udemy['course_title'].apply(nf.remove_stopwords)

In [170]:
#Remove special characters
udemy['Clean_title'] = udemy['Clean_title'].apply(nf.remove_special_characters)

In [171]:
udemy['Clean_title']

0                      Ultimate Investment Banking Course
1       Complete GST Course  Certification  Grow Practice
2        Financial Modeling Business Analysts Consultants
3             Beginner Pro  Financial Analysis Excel 2017
4                        Maximize Profits Trading Options
                              ...                        
3678      Learn jQuery Scratch  Master JavaScript library
3679                      Design WordPress Website Coding
3680                                  Learn Build Polymer
3681        CSS Animations Create Amazing Effects Website
3682              MODX CMS Build Websites Beginners Guide
Name: Clean_title, Length: 3683, dtype: object

In [172]:
from sklearn.feature_extraction.text import CountVectorizer

In [173]:
cv = CountVectorizer()
cv_m = cv.fit_transform(udemy['Clean_title'])
cv_m.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [174]:
df_cv_w = pd.DataFrame(cv_m.toarray(),columns=cv.get_feature_names())
df_cv_w

Unnamed: 0,000005,001,01,02,10,100,101,101master,102,10k,...,zend,zero,zerotohero,zf2,zinsen,zoho,zombie,zu,zuhause,zur
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3678,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3679,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3680,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3681,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Building Our Course Recommender (Cosine Similarity)

In [175]:
from sklearn.metrics.pairwise import cosine_similarity

In [176]:
# cosine similarity matrix
cos_sim_m = cosine_similarity(cv_m)
cos_sim_m

array([[1.        , 0.20412415, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.20412415, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.23570226],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.23570226, 0.        ,
        1.        ]])

In [177]:
def recommend_course(title,numrec = 10):
    
    course_index = pd.Series(
        udemy.index, index=udemy['course_title']).drop_duplicates()

    index = course_index[title]

    scores = list(enumerate(cos_sim_m[index]))

    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)

    selected_course_index = [i[0] for i in sorted_scores[1:]]

    selected_course_score = [i[1] for i in sorted_scores[1:]]

    rec_df = udemy.iloc[selected_course_index]

    rec_df['Similarity_Score'] = selected_course_score

    final_recommended_courses = rec_df[[
        'course_title', 'Similarity_Score', 'url', 'price', 'num_subscribers']]

    return final_recommended_courses.head(numrec)

In [178]:
ans = recommend_course('Beginner to Pro - Financial Analysis in Excel 2017',10)
ans.style.format({'Similarity_Score':'{0:,.2f}%'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_df['Similarity_Score'] = selected_course_score


Unnamed: 0,course_title,Similarity_Score,url,price,num_subscribers
38,Beginner to Pro in Excel: Financial Modeling and Valuation,0.67%,https://www.udemy.com/beginner-to-pro-in-excel-financial-modeling-and-valuation/,195,22257
60,Excel Crash Course: Master Excel for Financial Analysis,0.54%,https://www.udemy.com/excel-crash-course-master-excel-for-financial-analysis/,105,8121
1197,Financial Modeling and Valuation: Complete Beginner to Pro,0.50%,https://www.udemy.com/financial-modeling-and-valuation-complete-beginner-to-pro/,50,249
741,Financial Ratios Using Excel,0.47%,https://www.udemy.com/financialratios/,100,1223
649,Financial Statements Analysis: Learn to Invest Like a Pro!,0.46%,https://www.udemy.com/financial-statements-and-ratios-for-beginner-investors/,50,1237
984,Beginner Financial Analysis: Invest Like Warren Buffett,0.46%,https://www.udemy.com/beginner-financial-analysis-invest-like-warren-buffett/,50,83
103,Introduction to Financial Statement Analysis,0.41%,https://www.udemy.com/introduction-to-financial-statement-analysis/,20,1480
132,Building Financial Statements in Excel,0.41%,https://www.udemy.com/guide-to-building-financial-statements/,35,1181
170,Analysis of Company Financial Statements,0.41%,https://www.udemy.com/analysis-of-company-financial-statements/,20,832
259,Practical Financial Statement Analysis,0.41%,https://www.udemy.com/practical-financial-statement-analysis/,75,22


In [179]:
ans1 = recommend_course('How To Maximize Your Profits Trading Options',10)
ans1.style.format({'Similarity_Score':'{0:,.2f}%'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_df['Similarity_Score'] = selected_course_score


Unnamed: 0,course_title,Similarity_Score,url,price,num_subscribers
410,Trading Options Basics,0.58%,https://www.udemy.com/trading-options-basics/,200,8
43,Options Trading - How to Win with Weekly Options,0.57%,https://www.udemy.com/work-from-home-setup-your-own-options-trading-business/,115,7489
96,Intermediate Options trading concepts for Stocks and Options,0.53%,https://www.udemy.com/intermediate-options-trading-concepts-for-stocks-and-options-traders/,40,2000
138,"Forex Trading with Fixed 'Risk through Options Trading""",0.53%,https://www.udemy.com/forexoptions/,200,611
195,Trading Options For Consistent Returns: Options Basics,0.53%,https://www.udemy.com/trading-options-for-income/,0,4077
444,The Advantages of ETF Options and Index Options Trading,0.53%,https://www.udemy.com/learn-etf-options-and-index-options-trading/,60,52
803,Options Spreads Bundle- the heart of Options Trading,0.53%,https://www.udemy.com/options-spreads-explained/,120,623
11,Trading Options With Money Flow,0.50%,https://www.udemy.com/trading-options-using-money-flow/,200,1380
59,How to Buy Cheap Options - Options Trading Pricing Model,0.50%,https://www.udemy.com/options-black-scholes-model/,200,658
68,How I Make Consistent Returns Trading Options,0.50%,https://www.udemy.com/how-i-make-15-per-month-trading-options/,200,2310


==========

### THANK YOU!