In [1]:
import pandas as pd

In [2]:
file_path = '/content/imdb_top_1000.csv'
movies_df = pd.read_csv(file_path)

In [3]:
print(movies_df.info())
print(movies_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   object 
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 125.1+ KB
None
                                

In [5]:
relevant_columns = ['Series_Title', 'Genre', 'Director', 'IMDB_Rating']
movies_df = movies_df[relevant_columns]

In [6]:
print(movies_df.head())

               Series_Title                 Genre              Director  \
0  The Shawshank Redemption                 Drama        Frank Darabont   
1             The Godfather          Crime, Drama  Francis Ford Coppola   
2           The Dark Knight  Action, Crime, Drama     Christopher Nolan   
3    The Godfather: Part II          Crime, Drama  Francis Ford Coppola   
4              12 Angry Men          Crime, Drama          Sidney Lumet   

   IMDB_Rating  
0          9.3  
1          9.2  
2          9.0  
3          9.0  
4          9.0  


In [7]:
movies_df['Genre'].fillna('Unknown', inplace=True)
movies_df['Director'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies_df['Genre'].fillna('Unknown', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['Genre'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method

In [10]:
movies_df.drop_duplicates(subset=['Series_Title'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df.drop_duplicates(subset=['Series_Title'], inplace=True)


In [11]:
print(movies_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 999 entries, 0 to 999
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Series_Title  999 non-null    object 
 1   Genre         999 non-null    object 
 2   Director      999 non-null    object 
 3   IMDB_Rating   999 non-null    float64
dtypes: float64(1), object(3)
memory usage: 39.0+ KB
None


In [12]:
import numpy as np

In [13]:
top_100_movies = movies_df.sample(100, random_state=42)

In [14]:
user_ratings = {
    'User_1': np.random.randint(1, 6, size=100),
    'User_2': np.random.randint(1, 6, size=100),
    'User_3': np.random.randint(1, 6, size=100),
}

In [17]:
ratings_df = pd.DataFrame(user_ratings)
ratings_df['Series_Title'] = top_100_movies['Series_Title'].values

In [19]:
rated_movies = pd.merge(ratings_df, movies_df, on='Series_Title')

In [20]:
print(rated_movies.head())

   User_1  User_2  User_3                 Series_Title  \
0       1       4       1  The Best Years of Our Lives   
1       1       1       5    Hedwig and the Angry Inch   
2       1       2       3                    Gone Girl   
3       1       3       1                The Red Shoes   
4       1       3       1              Le Petit Prince   

                         Genre               Director  IMDB_Rating  
0          Drama, Romance, War          William Wyler          8.0  
1         Comedy, Drama, Music  John Cameron Mitchell          7.7  
2     Drama, Mystery, Thriller          David Fincher          8.1  
3        Drama, Music, Romance         Michael Powell          8.1  
4  Animation, Adventure, Drama           Mark Osborne          7.7  


In [21]:
movies_df.to_csv('preprocessed_movies.csv', index=False)
rated_movies.to_csv('user_rated_movies.csv', index=False)

In [22]:
print("Preprocessed data saved successfully!")

Preprocessed data saved successfully!
