In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split

In [None]:
# next step: load the dataset
# merge the data
# choose the necessary columns, as in the movie recommendation example where we kept only 3 columns: id, title, overview
# now convert the text column into numerical features using TfidfVectorizer

> TF-IDF stands for Term Frequency–Inverse Document Frequency. The number of features it creates equals the number of distinct words used in the overview column. Each value grows with the number of times a particular word is used in a movie's overview and shrinks with the number of documents (movies here) in which the word is used. A word is therefore penalized when it is common to many movies, even if it occurs many times in a single movie's overview; such words are not very helpful in telling different movies apart.

    tfidf = TfidfVectorizer(stop_words='english')
    movies['overview'] = movies['overview'].fillna('')
    
    #Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
    overview_matrix = tfidf.fit_transform(movies['overview'])
    
    #Output the shape of tfidf_matrix
    overview_matrix.shape
    #Output
    (45466, 75827)

    Now we have a TF-IDF feature matrix for all the movies. Every movie has 75827 features (words). To find the similarity between the movies, we will use cosine similarity. Because TfidfVectorizer L2-normalizes each row by default, the plain dot product computed by the linear_kernel function equals the cosine similarity, so linear_kernel will compute it for us.

    Cosine similarity is a measure of the similarity between two vectors: the cosine of the angle between them. Here, we have 75827 features (TF-IDF values) for each movie. Let us now compute the similarity matrix using the linear_kernel function:

    similarity_matrix = linear_kernel(overview_matrix,overview_matrix)
    similarity_matrix

    Now, let us create a series that maps the index of the matrix to movie names to make it easy for us to just feed in movie names and get the recommendation.
    
    #movies index mapping
    mapping = pd.Series(movies.index,index = movies['title'])
    mapping
    
    def recommend_movies_based_on_plot(movie_input, top_n=15):
        """Return the titles of the movies whose overview is most similar to ``movie_input``.

        Relies on three notebook-level objects built in the preceding steps:
        ``mapping`` (title -> row index), ``similarity_matrix`` (pairwise
        overview similarity) and ``movies`` (the source DataFrame).

        Parameters
        ----------
        movie_input : str
            Title to look up in ``mapping``. A missing title raises ``KeyError``.
        top_n : int, default 15
            Number of recommendations to return.

        Returns
        -------
        pandas.Series
            Titles of the ``top_n`` most similar movies, most similar first.
        """
        movie_index = mapping[movie_input]
        # When several movies share a title the lookup yields a Series rather
        # than a scalar; use the first match so the row lookup below stays 1-D.
        if hasattr(movie_index, 'iloc'):
            movie_index = movie_index.iloc[0]

        # Pair every movie position with its similarity to the queried movie.
        # NOTE(review): the index value is used as a row position here and in
        # .iloc below - assumes `movies` has a default 0..n-1 index; confirm.
        similarity_score = [
            (position, score)
            for position, score in enumerate(similarity_matrix[movie_index])
            # Exclude the queried movie explicitly: with tied scores (or an
            # empty overview, whose self-similarity is 0) it is not guaranteed
            # to be the first entry after sorting.
            if position != movie_index
        ]

        # Sort by similarity to the queried movie, highest first.
        similarity_score = sorted(similarity_score, key=lambda pair: pair[1], reverse=True)

        # Keep exactly top_n entries (the previous [1:15] slice returned only 14).
        similarity_score = similarity_score[:top_n]

        # Translate the row positions back into movie titles.
        movie_indices = [position for position, _ in similarity_score]
        return movies['title'].iloc[movie_indices]
    
    Let's now try to get a recommendation for the movie 'Life Begins for Andy Hardy' from the recommendation function above and see what it outputs.
    
    recommend_movies_based_on_plot('Life Begins for Andy Hardy')

In [1]:
# 

In [3]:
# Kaggle input directory that holds the four CSV files read below; change this
# single constant to run the notebook against a different location.
DATA_DIR = '/kaggle/input/learning-equality-curriculum-recommendations'

# Load each CSV into its own DataFrame.
sample = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
topics = pd.read_csv(f'{DATA_DIR}/topics.csv')
correlations = pd.read_csv(f'{DATA_DIR}/correlations.csv')
content = pd.read_csv(f'{DATA_DIR}/content.csv')

In [4]:
# Preview the first rows of the sample submission table.
sample.head()

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
4,t_4054df11a74e,c_3695c5dc1df6 c_f2d184a98231


In [5]:
# (rows, columns) of the sample submission table.
sample.shape

(5, 2)

In [12]:
# Preview the first rows of the correlations table.
correlations.head()

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
4,t_0008768bdee6,c_34e1424229b4 c_7d1a964d66d5 c_aab93ee667f4


In [13]:
# (rows, columns) of the correlations table.
correlations.shape

(61517, 2)

In [6]:
# Preview the first rows of the topics table.
topics.head()

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,,b3f329,aligned,2,en,t_aa32fb6252dc,False
2,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True
3,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True
4,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True


In [7]:
# (rows, columns) of the topics table.
topics.shape

(76972, 9)

In [10]:
# Preview the first rows of the content table.
content.head()

Unnamed: 0,id,title,description,kind,text,language,copyright_holder,license
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,930","Suma 48,029+233,930 mediante el algoritmo está...",video,,es,,
1,c_000087304a9e,Trovare i fattori di un numero,Sal trova i fattori di 120.\n\n,video,,it,,
2,c_0000ad142ddb,Sumar curvas de demanda,Cómo añadir curvas de demanda\n\n,video,,es,,
3,c_0000c03adc8d,Nado de aproximação,Neste vídeo você vai aprender o nado de aproxi...,document,\nNado de aproximação\nSaber nadar nas ondas ...,pt,Sikana Education,CC BY-NC-ND
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdf,geometry-m3-topic-a-overview.pdf,document,Estándares Comunes del Estado de Nueva York\n\...,es,Engage NY,CC BY-NC-SA


In [11]:
# (rows, columns) of the content table.
content.shape

(154047, 8)

In [14]:
# Report the total number of missing cells in each loaded table.
null_report_targets = (
    ('Sample Submission', sample),
    ('Correlations', correlations),
    ('Topics', topics),
    ('Content', content),
)

for table_label, table in null_report_targets:
    # First .sum() counts nulls per column, the second adds the columns up.
    missing_total = table.isnull().sum().sum()
    print(f'Null values in {table_label}: ', missing_total)

Null values in Sample Submission:  0
Null values in Correlations:  0
Null values in Topics:  42192
Null values in Content:  306850


In [15]:
# Let's explore Topics & Content further, since they are the only tables with nulls.
# Missing-value count for each column of the content table.

content.isnull().sum()

id                      0
title                   9
description         64591
kind                    0
text                80012
language                0
copyright_holder    82226
license             80012
dtype: int64

In [16]:
# Missing-value count for each column of the topics table.
topics.isnull().sum()

id                 0
title              2
description    42019
channel            0
category           0
level              0
language           0
parent           171
has_content        0
dtype: int64