In [2]:
pip install neattext

Collecting neattext
  Downloading neattext-0.1.3-py3-none-any.whl (114 kB)
Installing collected packages: neattext
Successfully installed neattext-0.1.3
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import neattext.functions as nfx
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
data = pd.read_csv('udemy_courses.csv')

### Notes:

The `data.columns` command is used to retrieve the column labels of the dataset. The result is a list of column names, providing an overview of the different features or variables present in the dataset.

This information is useful for understanding the structure of the dataset and can be used as a reference when working with specific columns during data analysis, visualization, or feature engineering.

In [5]:
data.columns


Index(['course_id', 'course_title', 'url', 'is_paid', 'price',
       'num_subscribers', 'num_reviews', 'num_lectures', 'level',
       'content_duration', 'published_timestamp', 'subject'],
      dtype='object')

In [6]:
data.head(1)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance


In [7]:
data.isnull().sum()


course_id              0
course_title           0
url                    0
is_paid                0
price                  0
num_subscribers        0
num_reviews            0
num_lectures           0
level                  0
content_duration       0
published_timestamp    0
subject                0
dtype: int64

### Notes:

The `data.isnull().sum()` command is used to count the number of missing values in each column of the dataset. It returns a Series where the index represents the column names, and the corresponding values indicate the count of missing values in each column.

This is a crucial step in data preprocessing as it helps identify columns with missing data, allowing for informed decisions on how to handle or impute missing values. Understanding the extent of missing data is essential for maintaining the quality and integrity of the dataset during analysis and modeling.


In the output, the values indicate the number of missing values in each corresponding column.

In [8]:
data.duplicated().any()



True

### Notes:

The `data.duplicated().any()` command is used to check if there are any duplicate rows in the dataset. It returns a Boolean value – `True` if duplicates are found, and `False` otherwise.

Identifying and handling duplicate rows is crucial in ensuring data quality, especially in scenarios where each row is expected to represent unique observations. Detecting duplicates allows for further investigation and potential removal to prevent skewed analyses or modeling.

In [9]:
data[data.duplicated()]


Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
787,837322,Essentials of money value: Get a financial Life !,https://www.udemy.com/essentials-of-money-value/,True,20,0,0,20,All Levels,0.616667,2016-05-16T18:28:30Z,Business Finance
788,1157298,Introduction to Forex Trading Business For Beg...,https://www.udemy.com/introduction-to-forex-tr...,True,20,0,0,27,Beginner Level,1.5,2017-04-23T16:19:01Z,Business Finance
894,1035638,Understanding Financial Statements,https://www.udemy.com/understanding-financial-...,True,25,0,0,10,All Levels,1.0,2016-12-15T14:56:17Z,Business Finance
1100,1084454,CFA Level 2- Quantitative Methods,https://www.udemy.com/cfa-level-2-quantitative...,True,40,0,0,35,All Levels,5.5,2017-07-02T14:29:35Z,Business Finance
1473,185526,MicroStation - Células,https://www.udemy.com/microstation-celulas/,True,20,0,0,9,Beginner Level,0.616667,2014-04-15T21:48:55Z,Graphic Design
2561,28295,Learn Web Designing & HTML5/CSS3 Essentials in...,https://www.udemy.com/build-beautiful-html5-we...,True,75,43285,525,24,All Levels,4.0,2013-01-03T00:55:31Z,Web Development


In [10]:
data = data.drop_duplicates()


In [11]:
data.shape


(3672, 12)

In [12]:
data['course_title']


0                      Ultimate Investment Banking Course
1       Complete GST Course & Certification - Grow You...
2       Financial Modeling for Business Analysts and C...
3       Beginner to Pro - Financial Analysis in Excel ...
4            How To Maximize Your Profits Trading Options
                              ...                        
3673    Learn jQuery from Scratch - Master of JavaScri...
3674    How To Design A WordPress Website With No Codi...
3675                        Learn and Build using Polymer
3676    CSS Animations: Create Amazing Effects on Your...
3677    Using MODX CMS to Build Websites: A Beginner's...
Name: course_title, Length: 3672, dtype: object

In [13]:
data.columns


Index(['course_id', 'course_title', 'url', 'is_paid', 'price',
       'num_subscribers', 'num_reviews', 'num_lectures', 'level',
       'content_duration', 'published_timestamp', 'subject'],
      dtype='object')

### Popularity-based recommendation system


In [15]:
def popularity_based_recommendation(df,top_n=5):
    # Calculate popularity score for each course
    data['popularity_score'] = 0.6 * data['num_subscribers'] + 0.4 * data['num_reviews']
    
    # Sort courses by popularity score in descending order
    df_sorted = data.sort_values(by='popularity_score', ascending=False)
    
    # Return the recommended courses (course titles and popularity scores)
    recommended_courses = df_sorted[['course_title', 'popularity_score']].head(top_n)
    
    return recommended_courses

Here's an explanation of each line in the `popularity_based_recommendation` function:

1. **Function Definition:**
   - `def popularity_based_recommendation(df, top_n=5):`
     - Defines a function named `popularity_based_recommendation` that takes a DataFrame `df` as input and an optional parameter `top_n` with a default value of 5.

2. **Calculate Popularity Score:**
   - `data['popularity_score'] = 0.6 * data['num_subscribers'] + 0.4 * data['num_reviews']`
     - Calculates a popularity score for each course based on a weighted combination of the number of subscribers and the number of reviews. The weights are 0.6 and 0.4, respectively.

3. **Sort Courses by Popularity Score:**
   - `df_sorted = data.sort_values(by='popularity_score', ascending=False)`
     - Sorts the DataFrame `data` based on the calculated popularity scores in descending order. The resulting DataFrame is stored in `df_sorted`.

4. **Return Recommended Courses:**
   - `recommended_courses = df_sorted[['course_title', 'popularity_score']].head(top_n)`
     - Extracts the columns 'course_title' and 'popularity_score' from the sorted DataFrame, selecting the top `top_n` courses with the highest popularity scores.

5. **Return Statement:**
   - `return recommended_courses`
     - Returns the DataFrame `recommended_courses`, which contains the titles and popularity scores of the recommended courses.

This function essentially provides recommendations based on the popularity score of courses, prioritizing those with higher subscriber counts and reviews. The user can specify the number of top courses they want to receive recommendations for using the `top_n` parameter.



**Explanation:**
- For each course, the popularity score is calculated using the formula: `0.6 * num_subscribers + 0.4 * num_reviews`.
- Let's take 'Course A' as an example:
  - `0.6 * 100 + 0.4 * 50 = 60 + 20 = 80`
  - So, the popularity score for 'Course A' is 80.0.
- Similarly, the calculation is performed for 'Course B' and 'Course C'.
- The resulting DataFrame includes the original columns ('course_title', 'num_subscribers', 'num_reviews') and a new column ('popularity_score') containing the calculated scores.

In this example, the weights of 0.6 and 0.4 represent the relative importance of subscribers and reviews in determining the popularity score. The courses are then sorted based on their popularity scores to provide recommendations.

In [16]:
popularity_based_recommendation(data)


Unnamed: 0,course_title,popularity_score
2827,Learn HTML5 Programming From Scratch,164805.4
3032,Coding for Entrepreneurs Basic,96729.0
3230,The Web Developer Bootcamp,83928.4
3232,The Complete Web Developer Course 2.0,77672.0
2783,Build Your First Website in 1 Week with HTML5 ...,74544.2


### Content-Based Recommendation System


In [17]:
data['course_title'] = data['course_title'].apply(nfx.remove_stopwords)
data['course_title']  =data['course_title'].apply(nfx.remove_special_characters)

Here's an explanation of each line:

```python
# Apply the remove_stopwords function to each element in the 'course_title' column
data['course_title'] = data['course_title'].apply(nfx.remove_stopwords)
```

1. **`data['course_title']`:**
   - Selects the 'course_title' column in the DataFrame `data`.

2. **`.apply(nfx.remove_stopwords)`:**
   - Applies the `remove_stopwords` function from the `nfx` (natural language processing extensions) library to each element in the 'course_title' column.
   - The `remove_stopwords` function is likely designed to remove common stopwords (common words like "the," "and," etc.) from text.

```python
# Apply the remove_special_characters function to each element in the 'course_title' column
data['course_title'] = data['course_title'].apply(nfx.remove_special_characters)
```

3. **`data['course_title']`:**
   - Again, selects the 'course_title' column in the DataFrame `data`.

4. **`.apply(nfx.remove_special_characters)`:**
   - Applies the `remove_special_characters` function from the `nfx` library to each element in the 'course_title' column.
   - The `remove_special_characters` function is likely designed to remove special characters (e.g., punctuation, symbols) from text.

**Overall Explanation:**
- The two lines of code are part of a data preprocessing step for the 'course_title' column.
- The `remove_stopwords` function is applied first to remove common stopwords from each course title.
- The `remove_special_characters` function is then applied to remove any remaining special characters in the course titles.
- These operations are commonly performed in natural language processing (NLP) tasks to clean and preprocess text data before further analysis or modeling. The goal is to enhance the quality of text data by removing irrelevant or noisy elements.

In [18]:
data.sample(5)


Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,popularity_score
2856,708558,Learn HTML Basics Scratch,https://www.udemy.com/learn-html-from-scratch/,False,0,5004,52,21,All Levels,3.0,2015-12-29T19:01:04Z,Web Development,3023.2
2964,964118,Bootstrap basics beginners,https://www.udemy.com/bootstrap-basics-for-beg...,True,40,1152,20,14,All Levels,1.0,2016-09-21T14:49:10Z,Web Development,699.2
2486,965528,Web Development Masterclass Complete Certific...,https://www.udemy.com/web-development-mastercl...,True,200,4090,178,348,All Levels,19.5,2016-10-12T03:14:13Z,Web Development,2525.2
1942,311530,Learn Play Blues Rock Boogie Woogie Piano Today,https://www.udemy.com/learn-how-to-play-blues-...,True,45,2346,11,14,All Levels,0.65,2014-10-08T22:07:46Z,Musical Instruments,1412.0
1294,15611,Discover 7 Secrets Figure Drawing Draw Awesome,https://www.udemy.com/how-to-draw-the-human-fi...,True,40,2634,92,25,All Levels,10.0,2012-04-04T02:24:51Z,Graphic Design,1617.2


In [19]:
data['title_subject']  =data['course_title'] +' '+data['subject']


Cosine similarity is a measure of similarity between two vectors in a multidimensional space. It is a metric used to measure the similarity between two vectors by calculating the cosine of the angle between the two vectors. 

The cosine similarity between two vectors (A and B) is calculated as follows:

Cosine Similarity(A, B) = (A.B)/(||A|| ||B||)

where:

* A and B are the two vectors being compared
* A.B is the dot product of A and B
* ||A|| and ||B|| are the magnitudes of A and B, respectively

The cosine similarity ranges from -1 to 1. A value of 1 indicates that the two vectors are identical, while a value of -1 indicates that the two vectors are pointing in opposite directions. A value of 0 indicates that the two vectors are orthogonal.

Cosine similarity is used in a variety of applications, including:

* Information retrieval: Cosine similarity is used to measure the similarity between documents or queries in order to return relevant results to users. For example, a search engine might use cosine similarity to rank documents based on how similar they are to a user's query.
* Natural language processing: Cosine similarity is used to measure the similarity between words or phrases in order to perform tasks such as machine translation and text summarization.
* Recommender systems: Cosine similarity is used to recommend products or services to users based on their past purchases or browsing history. For example, an online retailer might use cosine similarity to recommend products to a user that are similar to products that the user has purchased in the past.

In [20]:
cv = CountVectorizer(max_features=3000)
vectors = cv.fit_transform(data['title_subject']).toarray()

In [21]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

1. **Vectorizing the Titles:**
   - Each book title is transformed into a numerical format using a technique called "CountVectorizer."
   - CountVectorizer converts the text into a set of numbers, where each number represents the frequency of a word in the title.
   - For example, "Introduction to Python" might become [1, 1, 0, 0, 1, 0, 0, ...] based on the words it contains.

2. **Creating Vectors:**
   - All the book titles are represented as vectors in a high-dimensional space.
   - Each coordinate in the vector corresponds to a unique word in the entire set of titles.

3. **Calculating Cosine Similarity:**
   - Cosine similarity measures the cosine of the angle between two vectors.
   - The formula for cosine similarity between vectors A and B is:
     \[ \text{Cosine Similarity} = \frac{A \cdot B}{\|A\| \cdot \|B\|} \]
   - In simpler terms, it's the dot product of the vectors divided by the product of their magnitudes (lengths).

4. **Interpreting the Score:**
   - The result is a score between 0 and 1.
   - If the score is 1, the vectors point in the same direction, meaning the titles are identical.
   - If the score is 0, the vectors are orthogonal (at a right angle), indicating no similarity.

So, the logic is to calculate how much the vectors representing two book titles align with each other. The more aligned they are, the higher the cosine similarity score, suggesting higher similarity in terms of the words they contain. This score helps identify books that are conceptually closer or share common themes based on the words in their titles.

In [33]:
from sklearn.feature_extraction.text import CountVectorizer

# List of book titles
book_titles = [
    "Introduction to Python Programming",
    "Python Crash Course",
    "Advanced Python Concepts",
    "Data Science with Python",
    "Machine Learning Essentials",
    "Deep Learning Basics",
    "Web Development with Django",
    "JavaScript for Beginners",
    "HTML and CSS Fundamentals",
    "Algorithms and Data Structures"
]

# Create an instance of CountVectorizer
vectorizer = CountVectorizer()

# Transform the titles into vectors
title_vectors = vectorizer.fit_transform(book_titles)

# Get the feature names (words) from the vectorizer
feature_names = vectorizer.get_feature_names_out()

# Display the vectors as a DataFrame for better visualization
import pandas as pd
df = pd.DataFrame(title_vectors.toarray(), columns=feature_names)
df.index = book_titles

# Display the DataFrame
print(df)


                                    advanced  algorithms  and  basics  \
Introduction to Python Programming         0           0    0       0   
Python Crash Course                        0           0    0       0   
Advanced Python Concepts                   1           0    0       0   
Data Science with Python                   0           0    0       0   
Machine Learning Essentials                0           0    0       0   
Deep Learning Basics                       0           0    0       1   
Web Development with Django                0           0    0       0   
JavaScript for Beginners                   0           0    0       0   
HTML and CSS Fundamentals                  0           0    1       0   
Algorithms and Data Structures             0           1    1       0   

                                    beginners  concepts  course  crash  css  \
Introduction to Python Programming          0         0       0      0    0   
Python Crash Course                   

In [34]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity between vectors
cosine_sim_matrix = cosine_similarity(title_vectors)

# Display the cosine similarity matrix as a DataFrame
cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=book_titles, columns=book_titles)
print(cosine_sim_df)


                                    Introduction to Python Programming  \
Introduction to Python Programming                            1.000000   
Python Crash Course                                           0.288675   
Advanced Python Concepts                                      0.288675   
Data Science with Python                                      0.250000   
Machine Learning Essentials                                   0.000000   
Deep Learning Basics                                          0.000000   
Web Development with Django                                   0.000000   
JavaScript for Beginners                                      0.000000   
HTML and CSS Fundamentals                                     0.000000   
Algorithms and Data Structures                                0.000000   

                                    Python Crash Course  \
Introduction to Python Programming             0.288675   
Python Crash Course                            1.000000   
Advanced

Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space. In the context of vectorized text data, like the vectors we obtained from the book titles using CountVectorizer, cosine similarity measures the cosine of the angle between two vectors. The formula for cosine similarity between vectors \(A\) and \(B\) is given by:

$[ \text{cosine\_similarity}(A, B) = \frac{A \cdot B}{\|A\| \cdot \|B\|}]$

Here:
- $(A \cdot B)$ is the dot product of vectors \(A\) and \(B\),
- $(\|A\|)$ is the Euclidean norm (magnitude) of vector \(A\), and
- $(\|B\|)$ is the Euclidean norm (magnitude) of vector \(B\).

In the case of book titles, each book is represented as a vector, and the cosine similarity between two book vectors is computed using this formula. The result is a similarity score ranging from -1 to 1, where 1 indicates perfect similarity, 0 indicates no similarity, and -1 indicates perfect dissimilarity.

Above code calculates the cosine similarity between the vectors representing the book titles using the Python `cosine_similarity` function from the `sklearn.metrics.pairwise` module


This matrix will show the cosine similarity between each pair of book titles. Higher values indicate greater similarity.

In [22]:
len(cv.get_feature_names())

AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'

In [23]:
from sklearn.metrics.pairwise import cosine_similarity


In [24]:
similarity = cosine_similarity(vectors)


In a book recommendation system, cosine similarity can be employed to measure the similarity between books based on their content or features. Here's how cosine similarity can be utilized in a book recommendation system:

1. **Feature Representation:**
   - Represent each book as a vector in a high-dimensional space, where each dimension corresponds to a feature or attribute of the book.
   - Features may include terms from the book's description, author information, genre, or any other relevant metadata.

2. **Vectorization:**
   - Use techniques like TF-IDF (Term Frequency-Inverse Document Frequency) or CountVectorizer to convert textual data (e.g., book descriptions) into numerical vectors.
   - The resulting vectors represent the content of each book in the feature space.

3. **Cosine Similarity Calculation:**
   - Calculate the cosine similarity between pairs of book vectors using the cosine similarity formula.
   - The higher the cosine similarity between two books, the more similar they are in terms of their content.

4. **Recommendation Generation:**
   - Given a user's preferences or the books they have liked, identify books with high cosine similarity to the user's preferences.
   - Recommend books that are similar to the ones the user has shown interest in.

5. **Personalization:**
   - Provide personalized recommendations by considering the cosine similarity scores for a user's entire set of preferences.
   - Users with similar tastes will receive similar book recommendations.

6. **Cold Start Problem:**
   - Address the "cold start" problem (new books or users with limited data) by recommending books based on content similarity until there is enough user interaction data to make collaborative recommendations.

Cosine similarity helps in capturing the semantic similarity between books, allowing the recommendation system to suggest books that are contextually similar to the ones a user has shown interest in. It is particularly useful when dealing with a large catalog of books and when the recommendation is based on textual content or features.

In [25]:
sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])[1:6]


[(39, 0.7715167498104596),
 (240, 0.6666666666666669),
 (417, 0.6666666666666669),
 (418, 0.6172133998483676),
 (657, 0.6172133998483676)]

In [26]:
def recommend(course):
    # let's featch the index
    course_index = data[data['course_title']==course].index[0]
    distances = similarity[course_index]
    courses_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]
    for i in courses_list:
        print(data.iloc[i[0]]['course_title'])

In [27]:
recommend("know HTML Learn HTML Basics")


WordPress Development Beginners
Wordpress Theme Development Beginners
Wordpress beginners Build Websites Fast Coding
Website Coding WordPress  Web Skills
Kids Coding  Beginners CSS


In [28]:
data.iloc[39]['course_title']


'Complete Investment Banking Course 2017'

In [29]:
import pickle


In [30]:
#pickle.dump(data.to_dict(),open('course_dict.pkl','wb'))
pickle.dump(data,open('course_dict.pkl','wb'))

In [31]:

pickle.dump(similarity,open('similarity.pkl','wb'))

In [32]:
import tkinter as tk
from tkinter import ttk, messagebox
import pandas as pd

# Assume 'data', 'similarity' are defined elsewhere

# Define the popularity-based recommendation function
def popularity_based_recommendation(df, top_n=5):
    df['popularity_score'] = 0.6 * df['num_subscribers'] + 0.4 * df['num_reviews']
    df_sorted = df.sort_values(by='popularity_score', ascending=False)
    recommended_courses = df_sorted[['course_title', 'popularity_score']].head(top_n)
    return recommended_courses

# Define the recommend function
def recommend(course):
    try:
        course_index = data[data['course_title'] == course].index[0]
        distances = similarity[course_index]
        courses_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
        recommended_courses = [data.iloc[i[0]]['course_title'] for i in courses_list]
        return recommended_courses
    except IndexError:
        messagebox.showerror("Error", f"Course '{course}' not found.")

# Event handler for the "Recommend" button
def recommend_button_click():
    course_title = course_var.get()
    recommended_courses = recommend(course_title)
    if recommended_courses:
        popularity_label.pack_forget()
        result_label.config(text="Recommended Courses:\n" + '\n'.join(recommended_courses))

# Create the main application window
root = tk.Tk()
root.title("Course Recommender")
root.geometry("400x300")

# Change font and color
font_style = ("Arial", 12)
label_color = "blue"
heading_color="red"
button_color = "green"
result_label_color = "black"

# Create and place GUI elements
label = tk.Label(root, text="Select Course:", font=font_style, fg=label_color)
label.pack(pady=10)

course_titles = data['course_title'].tolist()
course_var = tk.StringVar(value=course_titles[0])
course_dropdown = ttk.Combobox(root, textvariable=course_var, values=course_titles, width=40, font=font_style)
course_dropdown.pack(pady=5)

popularity_recommendations = popularity_based_recommendation(data, top_n=5)
popularity_label = tk.Label(root, text="Popularity-based Recommendations:\n" + popularity_recommendations.to_string(index=False),
                             font=font_style, fg=label_color)
popularity_label.pack()

recommend_button = tk.Button(root, text="Recommend", command=recommend_button_click, width=20, font=font_style, fg=button_color)
recommend_button.pack(pady=10)

result_label = tk.Label(root, text="", wraplength=350, font=font_style, fg=result_label_color)
result_label.pack()

root.mainloop()