In [1]:
%pwd


'x:\\CampusX\\projects\\movie_recommendation_project\\research'

In [2]:
import os

# Later cells import from the top-level `src` package, so the working
# directory has to be the project root rather than the notebook's folder.
# Only step up when `src` is not already visible: this keeps the cell
# idempotent, so re-running it does not climb one directory further each time.
if not os.path.isdir("src"):
    os.chdir("../")

In [3]:
%pwd

'x:\\CampusX\\projects\\movie_recommendation_project'

In [4]:
from dataclasses import dataclass
from pathlib import Path

In [5]:
@dataclass(frozen=True)
class ModelBuildingConfig:
    """Immutable settings for the model-building stage.

    Instances are produced by ``ConfigurationManager.get_model_building_config``
    from the ``model_building`` section of the loaded configuration.
    """
    # Stage directory; passed to ``create_directories`` before this object is built.
    root_dir: Path
    # CSV file that ``ModelBuilding`` loads with ``pd.read_csv``.
    data_path: Path
    # Destination file for the pickled similarity matrix.
    model_path: Path

In [6]:
from src.movie_recommendation_project.constants import *
from src.movie_recommendation_project.utils.common import read_yaml,create_directories

In [7]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects."""

    def __init__(
            self,
            config_filepath=CONFIG_FILE_PATH,
            params_filepath=PARAMS_FILE_PATH):
        """Read the config and params YAML files and create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML file.
            params_filepath: Path of the parameters YAML file.
        """
        self.config=read_yaml(config_filepath)
        self.params=read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_building_config(self)->ModelBuildingConfig:
        """Create the stage directory and return the model-building settings.

        Returns:
            ModelBuildingConfig: Settings taken from the ``model_building``
            section of the loaded configuration, with every location
            converted to ``pathlib.Path``.
        """
        config=self.config.model_building
        create_directories([config.root_dir])

        # ModelBuildingConfig declares its fields as Path, but the values read
        # from YAML are presumably plain strings; coerce them so the dataclass
        # contract actually holds for downstream code.
        model_building_config=ModelBuildingConfig(
            root_dir=Path(config.root_dir),
            data_path=Path(config.data_path),
            model_path=Path(config.model_path),
        )
        return model_building_config



In [14]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import pickle

In [17]:
class ModelBuilding:
    """Build a movie-to-movie cosine-similarity matrix and pickle it to disk."""

    def __init__(self,config: ModelBuildingConfig):
        """Keep the stage settings and load the dataset from ``config.data_path``.

        Args:
            config: Model-building settings; ``data_path`` is read as CSV.
        """
        self.config=config
        self.df=pd.read_csv(config.data_path)

    def model_builder(self):
        """Compute the similarity matrix and write it to ``config.model_path``.

        Features, concatenated column-wise in this order:
          * mean Word2Vec embedding of the whitespace-split ``plot`` text,
          * bag-of-words counts for ``genre``, ``director``, ``star`` and
            ``writer`` (one ``CountVectorizer`` per column),
          * min-max scaled ``year``.

        The pairwise cosine similarity of these feature rows is pickled to
        ``self.config.model_path``. Nothing is returned.
        """
        vector_size = 100

        # Word2Vec embedding of each plot: average of its word vectors.
        plot_sentences = [plot.split() for plot in self.df['plot']]
        word2vec_model = Word2Vec(sentences=plot_sentences, vector_size=vector_size, window=5, min_count=1)

        # A plot with no tokens would make np.mean([]) return a scalar NaN and
        # break the (n_rows, vector_size) matrix, so fall back to a zero vector.
        empty_plot_vector = np.zeros(vector_size, dtype=np.float32)
        plot_vectors = np.array([
            np.mean([word2vec_model.wv[word] for word in words], axis=0) if words else empty_plot_vector
            for words in plot_sentences
        ])

        # 1. CountVectorizer on genre, director, star and writer
        # Initialize separate CountVectorizer instances for each column
        genre_vectorizer = CountVectorizer()
        director_vectorizer = CountVectorizer()
        star_vectorizer = CountVectorizer()
        writer_vectorizer = CountVectorizer()
        scaler=MinMaxScaler()

        # Vectorize each categorical column separately
        genre_vectors = genre_vectorizer.fit_transform(self.df['genre']).toarray()
        director_vectors = director_vectorizer.fit_transform(self.df['director']).toarray()
        star_vectors = star_vectorizer.fit_transform(self.df['star']).toarray()
        writer_vectors = writer_vectorizer.fit_transform(self.df['writer']).toarray()

        # Scale the numerical 'year' column
        year_scaled = scaler.fit_transform(self.df[['year']])

        combined_vectors = np.hstack((plot_vectors,genre_vectors, director_vectors, star_vectors, writer_vectors, year_scaled))
        similarity = cosine_similarity(combined_vectors)

        # Use a context manager so the file handle is flushed and closed even
        # if pickling fails part-way through.
        with open(self.config.model_path,"wb") as model_file:
            pickle.dump(similarity,model_file)



In [19]:
# Run the model-building stage end to end. Exceptions propagate as-is; the
# previous `except Exception as e: raise e` wrapper handled nothing and only
# added an extra frame to the traceback.
config = ConfigurationManager()
model_building_config = config.get_model_building_config()
model_building = ModelBuilding(config=model_building_config)
model_building.model_builder()

[2024-08-30 13:31:01,772: INFO :common :yaml file: config\config.yaml loaded successfully]
[2024-08-30 13:31:01,774: INFO :common :yaml file: params.yaml loaded successfully]
[2024-08-30 13:31:01,774: INFO :common :created directory at: artifacts]
[2024-08-30 13:31:01,786: INFO :common :created directory at: artifacts/model_building]
[2024-08-30 13:31:02,176: INFO :word2vec :collecting all words and their counts]
[2024-08-30 13:31:02,176: INFO :word2vec :PROGRESS: at sentence #0, processed 0 words, keeping 0 word types]
[2024-08-30 13:31:02,241: INFO :word2vec :collected 19492 word types from a corpus of 262447 raw words and 9800 sentences]
[2024-08-30 13:31:02,255: INFO :word2vec :Creating a fresh vocabulary]
[2024-08-30 13:31:02,342: INFO :utils :Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 19492 unique words (100.00% of original 19492, drops 0)', 'datetime': '2024-08-30T13:31:02.342669', 'gensim': '4.3.3', 'python': '3.12.4 | packaged by Anaconda, Inc. | (main, Ju