In [1]:
%pwd

'x:\\CampusX\\projects\\movie_recommendation_project\\research'

In [3]:
import os
os.chdir("../")

In [4]:
%pwd

'x:\\CampusX\\projects\\movie_recommendation_project'

In [5]:
from dataclasses import dataclass
from pathlib import Path

In [8]:
@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    transformed_data_path: Path

In [9]:
from src.movie_recommendation_project.constants import *
from src.movie_recommendation_project.utils.common import read_yaml,create_directories

In [15]:
class ConfigurationManager:
    def __init__(
            self,
            config_filepath=CONFIG_FILE_PATH,
            params_filepath=PARAMS_FILE_PATH):
        self.config=read_yaml(config_filepath)
        self.params=read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self)->DataTransformationConfig:
        config=self.config.data_transformation
        create_directories([config.root_dir])
        data_transformation_config=DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            transformed_data_path=config.transformed_data_path

        )
        return data_transformation_config



In [16]:
import pandas as pd
import ast

In [48]:
class DataTransformation:
    def __init__(self,config: DataTransformationConfig):
        self.config=config
        self.df=pd.read_csv(self.config.data_path)


    def str_lst(self,x):
        return ast.literal_eval(x)
    
    def prepare_list(self,x):
        l=[]
        for i in x:
            l.append(i.lower().replace(" ",""))
        return " ".join(l)
    
    def puntuation_remover(self,x):
        punc = '''!()-[]}{;:'"\,<>./?@#$%^&*_~'''
        for ele in x:
            if ele in punc:
                x = x.replace(ele, " ")
        return x
    
    def prepare_words(self,x):
        l=[]
        ls=x.split(" ")
        for i in ls:
            l.append(i.lower())
        return " ".join(l)
    
    def transform_data(self):
        self.df.drop("ID",axis=1,inplace=True)
        new_names={
                    "Movie Name":"title",
                    "Rating":"rating",
                    "Votes":"vote",
                    "Directors":"director",
                    "Stars":"star",
                    "Metascore":"metascore",
                    "Genre":"genre",
                    "Plot":"plot",
                    "Runtime":"runtime",
                    "Gross":"gross",
                    "Link":"link"

                }
        self.df.rename(columns=new_names,inplace=True)

        self.df["runtime"]=self.df["runtime"].str.replace(" min","")
        self.df["genre"]=self.df["genre"].str.split(",")

        self.df["director"]=self.df["director"].apply(self.str_lst)
        self.df["star"]=self.df["star"].apply(self.str_lst)
        self.df["director"]=[i[0] for i in self.df["director"]]
        self.df["star"][287]=["Sara Cushman", "Don Hertzfeldt"]

        self.df.drop([13,27],inplace=True)
        self.df.reset_index(drop=True,inplace=True)

        self.df["year"]=self.df["year"].astype(str)
        self.df=self.df[self.df["year"].str.isdigit()]
        self.df["year"]=self.df["year"].astype(int)
        self.df=self.df[self.df["year"]>1800]

        #self.df["genre"]=self.df["genre"].apply(self.str_lst)
        self.df["writer"]=self.df["writer"].apply(self.str_lst)

        for col in ["genre","star","writer"]:
            self.df[col]=self.df[col].apply(self.prepare_list)

        self.df["plot"]=self.df["plot"].apply(self.puntuation_remover)

        self.df["director"]=self.df["director"].str.replace(" ","").str.lower()

        self.df["plot"]=self.df["plot"].apply(self.prepare_words)

        self.df.to_csv(self.config.transformed_data_path,index=False)


  punc = '''!()-[]}{;:'"\,<>./?@#$%^&*_~'''


In [49]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.transform_data()
except Exception as e:
    raise e

[2024-08-30 12:55:14,694: INFO :common :yaml file: config\config.yaml loaded successfully]
[2024-08-30 12:55:14,698: INFO :common :yaml file: params.yaml loaded successfully]
[2024-08-30 12:55:14,698: INFO :common :created directory at: artifacts]
[2024-08-30 12:55:14,703: INFO :common :created directory at: artifacts/data_transformation]


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  self.df["star"][287]=["Sara Cushman", "Don Hertzfeldt"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df