# Content Based Recommendation System

## Data => PreProcessing => Model Building => Deployement

### DOWNLOAD DATASET FROM KAGGLE USING APIS

In [1]:
! pip install kaggle



In [2]:
! mkdir ~/.kaggle

In [3]:
! cp kaggle.json ~/.kaggle

In [4]:
! kaggle datasets download -d andrewmvd/udemy-courses

Downloading udemy-courses.zip to /content
100% 200k/200k [00:00<00:00, 559kB/s]
100% 200k/200k [00:00<00:00, 559kB/s]


In [5]:
! unzip -qq udemy-courses.zip

# EDA

In [6]:
import numpy as np
import pandas as pd

In [7]:
df = pd.read_csv('/content/udemy_courses.csv')

In [8]:
df.head(1)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance


In [9]:
df.tail(1)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
3677,297602,Using MODX CMS to Build Websites: A Beginner's...,https://www.udemy.com/using-modx-cms-to-build-...,True,45,901,36,20,Beginner Level,2.0,2014-09-28T19:51:11Z,Web Development


In [10]:
print(df.shape)

(3678, 12)


Dropping Columns of non usage

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3678 entries, 0 to 3677
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   course_id            3678 non-null   int64  
 1   course_title         3678 non-null   object 
 2   url                  3678 non-null   object 
 3   is_paid              3678 non-null   bool   
 4   price                3678 non-null   int64  
 5   num_subscribers      3678 non-null   int64  
 6   num_reviews          3678 non-null   int64  
 7   num_lectures         3678 non-null   int64  
 8   level                3678 non-null   object 
 9   content_duration     3678 non-null   float64
 10  published_timestamp  3678 non-null   object 
 11  subject              3678 non-null   object 
dtypes: bool(1), float64(1), int64(5), object(5)
memory usage: 319.8+ KB


In [13]:
# course_id, course_title, url, subject are only of use
df = df[['course_id','course_title', 'url', 'subject']]

In [17]:
df.head()

Unnamed: 0,course_id,course_title,url,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,Business Finance
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,Business Finance
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,Business Finance
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,Business Finance
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,Business Finance


## Handling Missing data

In [15]:
df.isnull().sum()

course_id       0
course_title    0
url             0
subject         0
dtype: int64

In [20]:
df.duplicated().sum()

6

In [25]:
df.drop_duplicates(inplace=True) #It will remove all the Duplicate values

In [26]:
df.head()

Unnamed: 0,course_id,course_title,url,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,Business Finance
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,Business Finance
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,Business Finance
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,Business Finance
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,Business Finance


In [28]:
df['course_title']

0                      Ultimate Investment Banking Course
1       Complete GST Course & Certification - Grow You...
2       Financial Modeling for Business Analysts and C...
3       Beginner to Pro - Financial Analysis in Excel ...
4            How To Maximize Your Profits Trading Options
                              ...                        
3673    Learn jQuery from Scratch - Master of JavaScri...
3674    How To Design A WordPress Website With No Codi...
3675                        Learn and Build using Polymer
3676    CSS Animations: Create Amazing Effects on Your...
3677    Using MODX CMS to Build Websites: A Beginner's...
Name: course_title, Length: 3672, dtype: object

In [29]:
# df['course_title'] = df['course_title'].apply(lambda x:x.split())

In [79]:
# df['subject'] = df['subject'].apply(lambda x:x.split())

In [80]:
df.head()

Unnamed: 0,course_id,course_title,url,subject,tags
0,1070968,"[Ultimate, Investment, Banking, Course]",https://www.udemy.com/ultimate-investment-bank...,"[Business, Finance]","[Ultimate, Investment, Banking, Course, Busine..."
1,1113822,"[Complete, GST, Course, &, Certification, -, G...",https://www.udemy.com/goods-and-services-tax/,"[Business, Finance]","[Complete, GST, Course, &, Certification, -, G..."
2,1006314,"[Financial, Modeling, for, Business, Analysts,...",https://www.udemy.com/financial-modeling-for-b...,"[Business, Finance]","[Financial, Modeling, for, Business, Analysts,..."
3,1210588,"[Beginner, to, Pro, -, Financial, Analysis, in...",https://www.udemy.com/complete-excel-finance-c...,"[Business, Finance]","[Beginner, to, Pro, -, Financial, Analysis, in..."
4,1011058,"[How, To, Maximize, Your, Profits, Trading, Op...",https://www.udemy.com/how-to-maximize-your-pro...,"[Business, Finance]","[How, To, Maximize, Your, Profits, Trading, Op..."


String Space Removing as tags can be different

In [81]:
df['course_title'] = df['course_title'].apply(lambda x:[i.replace(" ","") for i in x])
df['subject']=df['subject'].apply(lambda x:[i.replace(" ","") for i in x])

In [82]:
df.head()

Unnamed: 0,course_id,course_title,url,subject,tags
0,1070968,"[Ultimate, Investment, Banking, Course]",https://www.udemy.com/ultimate-investment-bank...,"[Business, Finance]","[Ultimate, Investment, Banking, Course, Busine..."
1,1113822,"[Complete, GST, Course, &, Certification, -, G...",https://www.udemy.com/goods-and-services-tax/,"[Business, Finance]","[Complete, GST, Course, &, Certification, -, G..."
2,1006314,"[Financial, Modeling, for, Business, Analysts,...",https://www.udemy.com/financial-modeling-for-b...,"[Business, Finance]","[Financial, Modeling, for, Business, Analysts,..."
3,1210588,"[Beginner, to, Pro, -, Financial, Analysis, in...",https://www.udemy.com/complete-excel-finance-c...,"[Business, Finance]","[Beginner, to, Pro, -, Financial, Analysis, in..."
4,1011058,"[How, To, Maximize, Your, Profits, Trading, Op...",https://www.udemy.com/how-to-maximize-your-pro...,"[Business, Finance]","[How, To, Maximize, Your, Profits, Trading, Op..."


In [83]:
df['tags'] = df['course_title'] + df['subject']

In [84]:
df.head()

Unnamed: 0,course_id,course_title,url,subject,tags
0,1070968,"[Ultimate, Investment, Banking, Course]",https://www.udemy.com/ultimate-investment-bank...,"[Business, Finance]","[Ultimate, Investment, Banking, Course, Busine..."
1,1113822,"[Complete, GST, Course, &, Certification, -, G...",https://www.udemy.com/goods-and-services-tax/,"[Business, Finance]","[Complete, GST, Course, &, Certification, -, G..."
2,1006314,"[Financial, Modeling, for, Business, Analysts,...",https://www.udemy.com/financial-modeling-for-b...,"[Business, Finance]","[Financial, Modeling, for, Business, Analysts,..."
3,1210588,"[Beginner, to, Pro, -, Financial, Analysis, in...",https://www.udemy.com/complete-excel-finance-c...,"[Business, Finance]","[Beginner, to, Pro, -, Financial, Analysis, in..."
4,1011058,"[How, To, Maximize, Your, Profits, Trading, Op...",https://www.udemy.com/how-to-maximize-your-pro...,"[Business, Finance]","[How, To, Maximize, Your, Profits, Trading, Op..."


In [85]:
new_df = df[['course_title', 'subject','tags']]

In [86]:
new_df

Unnamed: 0,course_title,subject,tags
0,"[Ultimate, Investment, Banking, Course]","[Business, Finance]","[Ultimate, Investment, Banking, Course, Busine..."
1,"[Complete, GST, Course, &, Certification, -, G...","[Business, Finance]","[Complete, GST, Course, &, Certification, -, G..."
2,"[Financial, Modeling, for, Business, Analysts,...","[Business, Finance]","[Financial, Modeling, for, Business, Analysts,..."
3,"[Beginner, to, Pro, -, Financial, Analysis, in...","[Business, Finance]","[Beginner, to, Pro, -, Financial, Analysis, in..."
4,"[How, To, Maximize, Your, Profits, Trading, Op...","[Business, Finance]","[How, To, Maximize, Your, Profits, Trading, Op..."
...,...,...,...
3673,"[Learn, jQuery, from, Scratch, -, Master, of, ...","[Web, Development]","[Learn, jQuery, from, Scratch, -, Master, of, ..."
3674,"[How, To, Design, A, WordPress, Website, With,...","[Web, Development]","[How, To, Design, A, WordPress, Website, With,..."
3675,"[Learn, and, Build, using, Polymer]","[Web, Development]","[Learn, and, Build, using, Polymer, Web, Devel..."
3676,"[CSS, Animations:, Create, Amazing, Effects, o...","[Web, Development]","[CSS, Animations:, Create, Amazing, Effects, o..."


In [87]:
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))


In [88]:
new_df.head()

Unnamed: 0,course_title,subject,tags
0,"[Ultimate, Investment, Banking, Course]","[Business, Finance]",Ultimate Investment Banking Course Business Fi...
1,"[Complete, GST, Course, &, Certification, -, G...","[Business, Finance]",Complete GST Course & Certification - Grow You...
2,"[Financial, Modeling, for, Business, Analysts,...","[Business, Finance]",Financial Modeling for Business Analysts and C...
3,"[Beginner, to, Pro, -, Financial, Analysis, in...","[Business, Finance]",Beginner to Pro - Financial Analysis in Excel ...
4,"[How, To, Maximize, Your, Profits, Trading, Op...","[Business, Finance]",How To Maximize Your Profits Trading Options B...


In [89]:
new_df['tags'][0]

'Ultimate Investment Banking Course Business Finance'

Suggested to lowercase whole data

In [90]:
new_df['tags']= new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']= new_df['tags'].apply(lambda x:x.lower())


In [91]:
new_df.head()

Unnamed: 0,course_title,subject,tags
0,"[Ultimate, Investment, Banking, Course]","[Business, Finance]",ultimate investment banking course business fi...
1,"[Complete, GST, Course, &, Certification, -, G...","[Business, Finance]",complete gst course & certification - grow you...
2,"[Financial, Modeling, for, Business, Analysts,...","[Business, Finance]",financial modeling for business analysts and c...
3,"[Beginner, to, Pro, -, Financial, Analysis, in...","[Business, Finance]",beginner to pro - financial analysis in excel ...
4,"[How, To, Maximize, Your, Profits, Trading, Op...","[Business, Finance]",how to maximize your profits trading options b...


# Text_Vectorization

In [92]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [93]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [94]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0])

In [95]:
# new_df['tags'][0]

In [96]:
cv.get_feature_names_out()

array(['00005', '001', '01', ..., '１週間でホームページのトップ画像を製作できるようになる画像製作講座',
       '６時間でインターバンク市場を攻略', '７日でマスター'], dtype=object)

## Stemming - Basically finding the Root Word

In [97]:
# The idea is
# ['loved', 'loving', 'love'] => ['love', 'love', 'love']

In [98]:
! pip install nltk



In [99]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [100]:
def stem(text):
  y = []
  for i in text.split():
    y.append(ps.stem(i))
  return " ".join(y)

In [101]:
 new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [102]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [103]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [104]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0])

In [105]:
cv.get_feature_names_out()

array(['00005', '001', '01', ..., '１週間でホームページのトップ画像を製作できるようになる画像製作講座',
       '６時間でインターバンク市場を攻略', '７日でマスター'], dtype=object)

## Now Using Cosine Similarity to find the Distance between two vectors

In [106]:
from sklearn.metrics.pairwise import cosine_similarity

In [107]:
similarity = cosine_similarity(vectors)

In [108]:
similarity[0] # Printing the Similairty Matrix for 0th Vector

array([1.        , 0.40824829, 0.40824829, ..., 0.        , 0.        ,
       0.        ])

In [112]:
new_df[new_df['course_title'] == 'ultim invest bank cours busi financ']

Unnamed: 0,course_title,subject,tags


In [110]:
# Driver Method
def recommend(course):
  course_index = new_df[new_df['course_title'] == course].index[0]
  distances = similarity[course_index]
  courses_list=sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]

  for i in courses_list:
    # print(new_df.iloc[i[0]].course_title)
    print(i[0])

In [111]:
recommend('ultim invest bank cours busi financ')

IndexError: index 0 is out of bounds for axis 0 with size 0

## Connection Code

In [None]:
# import pickle

In [None]:
# pickle.dump(new_df,open('movies.pkl', 'wb'))
# pickle.dump(new_df.to_dict(),open('course_dict.pkl', 'wb'))

In [None]:
# dumping that similarity array if each vector
# pickle.dump(similarity,open('similarity.pkl','wb'))