### Import of some necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

In [2]:
warnings.filterwarnings('ignore')

### I- Familiarization with the dataset

In [3]:
# Load the dataset
df = pd.read_csv("datas/Coursera.csv")
df.head()

Unnamed: 0,Course Name,University,Difficulty Level,Course Rating,Course URL,Course Description,Skills
0,Write A Feature Length Screenplay For Film Or ...,Michigan State University,Beginner,4.8,https://www.coursera.org/learn/write-a-feature...,Write a Full Length Feature Film Script In th...,Drama Comedy peering screenwriting film D...
1,Business Strategy: Business Model Canvas Analy...,Coursera Project Network,Beginner,4.8,https://www.coursera.org/learn/canvas-analysis...,"By the end of this guided project, you will be...",Finance business plan persona (user experien...
2,Silicon Thin Film Solar Cells,�cole Polytechnique,Advanced,4.1,https://www.coursera.org/learn/silicon-thin-fi...,This course consists of a general presentation...,chemistry physics Solar Energy film lambda...
3,Finance for Managers,IESE Business School,Intermediate,4.8,https://www.coursera.org/learn/operational-fin...,"When it comes to numbers, there is always more...",accounts receivable dupont analysis analysis...
4,Retrieve Data using Single-Table SQL Queries,Coursera Project Network,Beginner,4.6,https://www.coursera.org/learn/single-table-sq...,In this course you�ll learn how to effectively...,Data Analysis select (sql) database manageme...


### Basic data investigation

In [4]:
print("Size of dataset: ", df.shape)

Size of dataset:  (3522, 7)


In [5]:
print("Number of duplicated rows: ", df.duplicated().sum())

Number of duplicated rows:  98


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3522 entries, 0 to 3521
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Course Name         3522 non-null   object
 1   University          3522 non-null   object
 2   Difficulty Level    3522 non-null   object
 3   Course Rating       3522 non-null   object
 4   Course URL          3522 non-null   object
 5   Course Description  3522 non-null   object
 6   Skills              3522 non-null   object
dtypes: object(7)
memory usage: 192.7+ KB


Nikel: There are no missing values in any of the columns

### II- Data cleaning

In [7]:
# Drop duplicated rows
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [8]:
df.columns

Index(['Course Name', 'University', 'Difficulty Level', 'Course Rating',
       'Course URL', 'Course Description', 'Skills'],
      dtype='object')

In [9]:
def rename_col(col_name):
    col_name = col_name.split(' ')
    col_name = '_'.join(col_name)
    return col_name

In [10]:
print("Columns names before renaming: ", df.columns.to_list())
df.columns = [rename_col(col) for col in df.columns]
print("Columns names after renaming: ", df.columns.to_list())

Columns names before renaming:  ['Course Name', 'University', 'Difficulty Level', 'Course Rating', 'Course URL', 'Course Description', 'Skills']
Columns names after renaming:  ['Course_Name', 'University', 'Difficulty_Level', 'Course_Rating', 'Course_URL', 'Course_Description', 'Skills']


### III- Feature selection

Our goal is to make a content-base recommandation system, so we will only retain features related to the content of a course. We will therefore select:
- **Course_Name**: The name of the course
- **Course_Description**: The descrition of the course
- **Skills**: The topics covered in the course 
- **Difficulty_Level**:  The course's level difficulty

In [11]:
features_selected = ["Course_Name", "Course_Description", "Skills", "Difficulty_Level"]

In [12]:
new_df = df[features_selected]
new_df.head()

Unnamed: 0,Course_Name,Course_Description,Skills,Difficulty_Level
0,Write A Feature Length Screenplay For Film Or ...,Write a Full Length Feature Film Script In th...,Drama Comedy peering screenwriting film D...,Beginner
1,Business Strategy: Business Model Canvas Analy...,"By the end of this guided project, you will be...",Finance business plan persona (user experien...,Beginner
2,Silicon Thin Film Solar Cells,This course consists of a general presentation...,chemistry physics Solar Energy film lambda...,Advanced
3,Finance for Managers,"When it comes to numbers, there is always more...",accounts receivable dupont analysis analysis...,Intermediate
4,Retrieve Data using Single-Table SQL Queries,In this course you�ll learn how to effectively...,Data Analysis select (sql) database manageme...,Beginner


**Then we combine the content of all these features to form a single feature**

In [13]:
new_df["description_key_words"] = ['' for i in range(new_df.shape[0])]
for col in features_selected:
    new_df["description_key_words"] += [' ' for i in range(new_df.shape[0])] + new_df[col]
new_df.head()

Unnamed: 0,Course_Name,Course_Description,Skills,Difficulty_Level,description_key_words
0,Write A Feature Length Screenplay For Film Or ...,Write a Full Length Feature Film Script In th...,Drama Comedy peering screenwriting film D...,Beginner,Write A Feature Length Screenplay For Film Or...
1,Business Strategy: Business Model Canvas Analy...,"By the end of this guided project, you will be...",Finance business plan persona (user experien...,Beginner,Business Strategy: Business Model Canvas Anal...
2,Silicon Thin Film Solar Cells,This course consists of a general presentation...,chemistry physics Solar Energy film lambda...,Advanced,Silicon Thin Film Solar Cells This course con...
3,Finance for Managers,"When it comes to numbers, there is always more...",accounts receivable dupont analysis analysis...,Intermediate,Finance for Managers When it comes to numbers...
4,Retrieve Data using Single-Table SQL Queries,In this course you�ll learn how to effectively...,Data Analysis select (sql) database manageme...,Beginner,Retrieve Data using Single-Table SQL Queries ...


**We maintain now only  **Course_Name** et **description_key_words****

In [14]:
new_df = new_df[["Course_Name", "description_key_words"]]
new_df

Unnamed: 0,Course_Name,description_key_words
0,Write A Feature Length Screenplay For Film Or ...,Write A Feature Length Screenplay For Film Or...
1,Business Strategy: Business Model Canvas Analy...,Business Strategy: Business Model Canvas Anal...
2,Silicon Thin Film Solar Cells,Silicon Thin Film Solar Cells This course con...
3,Finance for Managers,Finance for Managers When it comes to numbers...
4,Retrieve Data using Single-Table SQL Queries,Retrieve Data using Single-Table SQL Queries ...
...,...,...
3517,"Capstone: Retrieving, Processing, and Visualiz...","Capstone: Retrieving, Processing, and Visuali..."
3518,Patrick Henry: Forgotten Founder,Patrick Henry: Forgotten Founder �Give me lib...
3519,Business intelligence and data analytics: Gene...,Business intelligence and data analytics: Gen...
3520,Rigid Body Dynamics,Rigid Body Dynamics This course teaches dynam...


### Data preprocessing

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
import re
import nltk
from nltk.stem import WordNetLemmatizer

In [16]:
new_df["description_key_words"].iloc[5]

' Building Test Automation Framework using Selenium and TestNG Selenium is one of the most widely used functional UI automation testing tools and TestNG is a brilliant testing framework.  Test automation frameworks are a set of guidelines or rules for writing test cases.  They can reduce maintenance costs and testing efforts and will provide a higher return on investment (ROI) for teams looking to optimize their processes.  Testing guidelines include coding standards, test-data management, defining object repositories, reporting guidelines, and logging strategies.  Through hands-on, practical experience, you will go through concepts writing reusable and structure code which is easy to maintain and understand, creating helper classes or utilities, write effective testcases, and generating reports and logs. maintenance  test case  test automation  screenshot  project  helper class  selenium  reusability  debugging  php computer-science software-development Beginner'

In [17]:
my_lematizer = WordNetLemmatizer()

def PreprocessTexte(text):
    
    cleaned_text = re.sub(r'-',' ',text) 
    
    # remove  urls
    cleaned_text = re.sub(r'https?://\S+|www\.\S+|http?://\S+',' ',cleaned_text) 
    # remove html tags
    cleaned_text = re.sub(r'<.*?>',' ',cleaned_text) 
    # replace all numbers 
    cleaned_text = re.sub(r'[0-9]', '', cleaned_text)
    # filtering out miscellaneous text.
    cleaned_text = re.sub(r"\([^()]*\)", "", cleaned_text)
    # remove mentions
    cleaned_text = re.sub('@\S+', '', cleaned_text)  
    # removes ponctuations
    cleaned_text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), '', cleaned_text)  

    cleaned_text = re.sub(r'ML',' Machine Learning ',cleaned_text) 
    
    cleaned_text = re.sub(r'DL',' Deep Learning ',cleaned_text)
    
    cleaned_text = cleaned_text.lower() # 
    cleaned_text = cleaned_text.split()

    # apply lematisation
    cleaned_text = ' '.join([my_lematizer.lemmatize(word) for word in cleaned_text])
    
    return cleaned_text

In [18]:
#nltk.download('wordnet')
#nltk.download('omw-1.4')

In [19]:
new_df["description_key_words"] = new_df["description_key_words"].apply(PreprocessTexte)

In [20]:
new_df["description_key_words"].iloc[5]

'building test automation framework using selenium and testng selenium is one of the most widely used functional ui automation testing tool and testng is a brilliant testing framework test automation framework are a set of guideline or rule for writing test case they can reduce maintenance cost and testing effort and will provide a higher return on investment for team looking to optimize their process testing guideline include coding standard test data management defining object repository reporting guideline and logging strategy through hand on practical experience you will go through concept writing reusable and structure code which is easy to maintain and understand creating helper class or utility write effective testcases and generating report and log maintenance test case test automation screenshot project helper class selenium reusability debugging php computer science software development beginner'

### Vectorisation

In [21]:
vectorizer = CountVectorizer(max_features=10000, stop_words='english')
vectors = vectorizer.fit_transform(new_df["description_key_words"]).toarray()

In [22]:
print("Shape of feature  matrix: ", vectors.shape)
print("Vocabulary size : ", len(vectorizer.vocabulary_))
#print("The vocabulary: ", vectorizer.vocabulary_)

Shape of feature  matrix:  (3424, 10000)
Vocabulary size :  10000


### Recommandation

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
def books_id_recommended(description, vectorizer, vectors, number_of_recommendation=5):
    # preprocess text
    description = [PreprocessTexte(description)]
    
    # do vectorization 
    vect = vectorizer.transform(description)

    # compute similarity with other feature vectors
    similars_vectors = cosine_similarity(vect, vectors)[0]

    # We sort the similarity values in ascending order(The result is a list of indices)
    ordered_similars_vectors = list(similars_vectors.argsort()) 

    # We reverse to order
    reverse_ordered_similars_vectors = [index for index in reversed(ordered_similars_vectors)] 
    
    # We select the number_of_recommendation indices corresponding to the highest similarity coeficients
    best_indexs = reverse_ordered_similars_vectors[1:number_of_recommendation]
        
    return best_indexs

In [25]:
def recommend_me():
    description = input("Enter a  description: ")
    books_index = books_id_recommended(description, vectorizer, vectors, number_of_recommendation=5)
    if books_index != None:
        books_to_recommend = list(new_df.iloc[books_index]["Course_Name"])
        print("Books to recommend to user: ")
        print("------------------------------------------------------------------")
        for i, book in enumerate(books_to_recommend):
            print(f"\t{i+1}- {book}")
        print("------------------------------------------------------------------")
    else:
        print("No book to recommend to you")

In [26]:
quit = 'n'
while quit == 'n':
    recommend_me()
    quit = str(input("Exit ? y(yes) n(no)")).lower()

Enter a  description:  Django for web


Books to recommend to user: 
------------------------------------------------------------------
	1- Building Web Applications in Django
	2- Django Features and Libraries
	3- Introduction to Web Development
	4- Web Application Development: Basic Concepts
------------------------------------------------------------------


Exit ? y(yes) n(no) n
Enter a  description:  Python programming


Books to recommend to user: 
------------------------------------------------------------------
	1- Python Programming Essentials
	2- Python Data Representations
	3- Python Basics
	4- Python Programming: A Concise Introduction
------------------------------------------------------------------


Exit ? y(yes) n(no) y
