In [4]:
import pandas as pd

# 1. Load dataset
df = pd.read_csv('imdb_top_1000.csv')

# 2. Select relevant features
selected_features = ['Series_Title', 'Genre', 'Director', 
                    'Overview', 'IMDB_Rating']

movies_df = df[selected_features].copy()

# 3. Handle missing values
# Drop rows with missing values in critical fields
movies_df.dropna(subset=['Series_Title', 'Genre', 'Director', 
                         'Overview', 'IMDB_Rating'], 
                inplace=True)

# 4. Clean text data
def clean_text(text):
    if isinstance(text, str):
        return text.lower().strip()
    return text

# Apply cleaning to text columns
text_columns = ['Series_Title', 'Genre', 'Director','Overview']

for col in text_columns:
    movies_df[col] = movies_df[col].apply(clean_text)

# Select final columns
final_columns = ['Series_Title', 'Genre', 'Director', 
                'Overview', 'IMDB_Rating']

movies_df = movies_df[final_columns]



In [5]:
def movies_you_might_like(title) -> dict:
    if title and len(title) > 4:
        title_filter =movies_df['Series_Title'].str.contains(title, case=False,na=False)
        matched_movies = movies_df[title_filter]
        if matched_movies.empty:
            return {'error': 'No movies found with that title try to use the movie genre'}
        #getting genres from the first matching movie
        input_movie = matched_movies.iloc[0]
        input_genre = set(input_movie['Genre'].split(', '))
        
        other_movies = movies_df[~title_filter].copy()
        other_movies['genres_overlap']= other_movies['Genre'].apply(
            lambda x:len(set(x.split(', ')) & input_genre))
        recommendations = other_movies[other_movies['genres_overlap'] >0]
        # recommendations = recommendations.sort_values(
        #     by=['genres_overlap', 'IMDB_Rating'],
        #     ascending=[False, False]
        # )
        return{
            'movies you might like': recommendations[['Series_Title', 'Genre',]].to_dict("records")
        }
movies_you_might_like( 'the godfather')         

{'movies you might like': [{'Series_Title': 'the shawshank redemption',
   'Genre': 'drama'},
  {'Series_Title': 'the dark knight', 'Genre': 'action, crime, drama'},
  {'Series_Title': '12 angry men', 'Genre': 'crime, drama'},
  {'Series_Title': 'the lord of the rings: the return of the king',
   'Genre': 'action, adventure, drama'},
  {'Series_Title': 'pulp fiction', 'Genre': 'crime, drama'},
  {'Series_Title': "schindler's list", 'Genre': 'biography, drama, history'},
  {'Series_Title': 'fight club', 'Genre': 'drama'},
  {'Series_Title': 'the lord of the rings: the fellowship of the ring',
   'Genre': 'action, adventure, drama'},
  {'Series_Title': 'forrest gump', 'Genre': 'drama, romance'},
  {'Series_Title': 'the lord of the rings: the two towers',
   'Genre': 'action, adventure, drama'},
  {'Series_Title': 'goodfellas', 'Genre': 'biography, crime, drama'},
  {'Series_Title': "one flew over the cuckoo's nest", 'Genre': 'drama'},
  {'Series_Title': 'hamilton', 'Genre': 'biography, d

# Data Preparation Practice Tasks  

## 1. **Data Inspection & Profiling**  
- Load the dataset and display the first 10 rows  
- Check data types of all columns using `.dtypes`  
- Generate summary statistics with `.describe(include='all')`  
- Identify columns with missing values using `.isnull().sum()`  

## 2. **Handling Duplicates**  
- Find duplicate rows using `.duplicated().sum()`  
- Remove duplicates while keeping the first occurrence  
- Verify the dataset size before/after deduplication  

## 3. **Missing Value Treatment**  
- Fill missing directors with "Unknown"  
- Drop rows with missing runtime values  
- Impute missing genres using the mode  
- Fill missing Meta_scores with column median  

## 4. **Outlier Detection & Correction**  
- Identify movies with IMDB_Rating > 10 or < 0  
- Handle invalid runtimes (e.g., "999 min", "10hrs")  
- Use boxplots to detect outliers in `No_of_Votes`  
- Correct negative/invalid Gross values  

## 5. **Format Standardization**  
### Runtime:  
- Extract numeric values (e.g., "100 mins" → 100)  
- Convert to integer and standardize to minutes  

### Year:  
- Remove "(re-release)" and other suffixes  
- Convert to 4-digit integer (e.g., "1998 Original" → 1998)  

### Votes:  
- Remove commas and "K" abbreviations (e.g., "1.5K" → 1500)  
- Convert to integer  

### Meta_score:  
- Remove "/10" suffix and convert to numeric  

### Gross:  
- Remove "$", "USD", and commas  
- Convert to numeric (handle "million" as 1,000,000)  

## 6. **Categorical Data Handling**  
- Normalize genre capitalization (e.g., "drama" → "Drama")  
- Correct typos in genres (e.g., "Dramaa" → "Drama")  
- Standardize certificates (e.g., "PG13" → "PG-13")  
- One-hot encode certificate categories  

## 7. **Data Type Conversion**  
- Convert Runtime, Year, Votes, Meta_score, and Gross to numeric types  
- Ensure Genre and Certificate are categorical types  

## 8. **Complex Pattern Handling**  
- Parse mixed-format runtimes (e.g., "2h30m" → 150)  
- Handle currency conversions (e.g., "€2.5M" → 2,500,000)  

## 9. **Final Validation**  
- Check for remaining missing values  
- Verify no invalid ratings (0 ≤ IMDB_Rating ≤ 10)  
- Ensure all runtime values are reasonable (30 ≤ Runtime ≤ 300)  
- Confirm standardized date formats  
 

4. Outlier Detection & Correction
Identify movies with IMDB_Rating > 10 or < 0
Handle invalid runtimes (e.g., "999 min", "10hrs")
Use boxplots to detect outliers in No_of_Votes
Correct negative/invalid Gross values

In [6]:
import pandas as pd

In [7]:
modified_csv = pd.read_csv('modified_imdb_top_1000.csv')
#below here type your code...
#1 data inspection and profilling-> loading
modified_csv.head(10)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,Adults,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0/10,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,"28,341,469 million"
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972 Original,Adults,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,1000.0%,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367 views,"€134,966,411"
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,U/A,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0/10,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534.858.444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,Year: 1974,Adults,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,900.0%,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952 votes,"USD 57,300,000"
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0/10,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,"€4,360,000"
5,https://m.media-amazon.com/images/M/MV5BNzA5ZD...,The Lord of the Rings: The Return of the King,2003,U,201 min,"Action, Adventure, Drama",8.9,Gandalf and Aragorn lead the World of Men agai...,Score: 94.0,Peter Jackson,Elijah Wood,Viggo Mortensen,Ian McKellen,Orlando Bloom,1642758 views,"€377,845,905"
6,https://m.media-amazon.com/images/M/MV5BNGNhMD...,Pulp Fiction,1994,Adults,154 min,"Crime, Drama",8.9,"The lives of two mob hitmen, a boxer, a gangst...",Score: 94.0,Quentin Tarantino,John Travolta,Uma Thurman,Samuel L. Jackson,Bruce Willis,1826188 votes,107.928.762
7,https://m.media-amazon.com/images/M/MV5BNDE4OT...,Schindler's List,1993,Adults,195 min,"Biography, Drama, History",8.9,"In German-occupied Poland during World War II,...",Score: 94.0,Steven Spielberg,Liam Neeson,Ralph Fiennes,Ben Kingsley,Caroline Goodall,1213505 votes,"USD 96,898,818"
8,https://m.media-amazon.com/images/M/MV5BMjAxMz...,Inception,2010,U/A,148 min,"Action, Adventure, Sci-Fi",8.8,A thief who steals corporate secrets through t...,Score: 74.0,Christopher Nolan,Leonardo DiCaprio,Joseph Gordon-Levitt,Elliot Page,Ken Watanabe,2067042 votes,292.576.195
9,https://m.media-amazon.com/images/M/MV5BMmEzNT...,Fight Club,1999,Adults,139 min,Drama,8.8,An insomniac office worker and a devil-may-car...,660.0%,David Fincher,Brad Pitt,Edward Norton,Meat Loaf,Zach Grenier,1854K,"USD 37,030,102"


In [8]:
#checking data ytpes of columns
modified_csv['Poster_Link'].dtypes
modified_csv['Series_Title'].dtypes
modified_csv['Released_Year'].dtypes
modified_csv['Certificate'].dtypes
modified_csv['Runtime'].dtypes


dtype('O')

In [9]:
modified_csv.describe(include='all')

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
count,1006,1006,1006.0,905,1006,1006,1006.0,1006,843,1006,1006,1006,1006,1006,1006,1006
unique,996,995,303.0,18,142,203,,996,201,542,659,838,887,935,995,833
top,https://m.media-amazon.com/images/M/MV5BYmJhZm...,Drishyam,2014.0,U,101 min,Drama,,Desperate measures are taken by a man who trie...,800.0%,Alfred Hitchcock,Tom Hanks,Emma Watson,Samuel L. Jackson,Michael Caine,31K,unknown
freq,2,3,23.0,233,25,91,,2,12,14,12,7,5,5,3,169
mean,,,,,,,7.977137,,,,,,,,,
std,,,,,,,0.857449,,,,,,,,,
min,,,,,,,-1.0,,,,,,,,,
25%,,,,,,,7.7,,,,,,,,,
50%,,,,,,,7.9,,,,,,,,,
75%,,,,,,,8.1,,,,,,,,,


In [10]:
1# detecting missing values
modified_csv.isnull().sum()

Poster_Link        0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       163
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross              0
dtype: int64

2#handling duplicates


In [11]:
modified_csv= pd.read_csv("modified_imdb_top_1000.csv")
modified_csv.duplicated().value_counts()

False    1006
Name: count, dtype: int64

In [12]:
import pandas as pd

# Load the CSV file
modified = pd.read_csv("modified_imdb_top_1000.csv")

# Check for duplicate rows
modified= modified[modified.duplicated()]

# Display duplicates
print(modified)

Empty DataFrame
Columns: [Poster_Link, Series_Title, Released_Year, Certificate, Runtime, Genre, IMDB_Rating, Overview, Meta_score, Director, Star1, Star2, Star3, Star4, No_of_Votes, Gross]
Index: []


In [39]:
#removing duplicate on specified column
df.drop_duplicates(subset=["column_name"], inplace=True)


KeyError: Index(['column_name'], dtype='object')

In [None]:
#removing duplicate columns
modified = pd.read_csv("modified_imdb_top_1000.csv")
modified=modified.drop_duplicates()

In [None]:
#2_finding duplicate
m=modified_csv.duplicated().sum().sum()
# m.head()
# .sum()
m

In [None]:
#checking the size of csv
modified_csv.size

In [None]:
#removing duplicate
modified_csv.drop_duplicates(subset='Series_Title',keep='first')
modified_csv.to_csv('modified_imdb_top_1000.csv',index=False)


In [None]:
#checking the size again
modified_csv.size

#3
3. Missing Value Treatment
Fill missing directors with "Unknown"
Drop rows with missing runtime values
Impute missing genres using the mode
Fill missing Meta_scores with column median

In [14]:
#checking missing values
m=modified_csv.isnull().sum()
m

Poster_Link        0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       163
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross              0
dtype: int64

In [15]:
modified_csv= pd.read_csv('modified_imdb_top_1000.csv')
modified_csv["Director"]=modified_csv["Director"].fillna('unk')
# (value="known", inplace=True)
# df.method({"Director": "unknown"}, inplace=True)
# df.method({"Director": value}, inplace=True)
# df["Director"].fillna(value="known", inplace=True.....its working
df.to_csv("modified_imdb_top_1000.csv", index=False)

In [16]:
modified_csv['Director'].isnull().sum()

np.int64(0)

In [17]:
df= pd.read_csv('modified_imdb_top_1000.csv')
modified_csv["Gross"]=modified_csv["Gross"].fillna('unknown')
# df.to_csv("modified_imdb_top_1000.csv", index=False)

In [19]:
modified = pd.read_csv("modified_imdb_top_1000.csv")
modified.sample()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
472,https://m.media-amazon.com/images/M/MV5BMjEzOD...,Sing Street,2016,PG-13,106 min,"Comedy, Drama, Music",7.9,A boy growing up in Dublin during the 1980s es...,79.0,John Carney,Ferdia Walsh-Peelo,Aidan Gillen,Maria Doyle Kennedy,Jack Reynor,85109,3237118


In [20]:
modified = pd.read_csv("modified_imdb_top_1000.csv") 
modified.shape

(1000, 16)

In [21]:
#checking empty runtime rows
modified_csv['Runtime'].isnull().sum()

np.int64(0)

In [22]:
#removing nun runtime rows
modified_csv=pd.read_csv("modified_imdb_top_1000.csv") 
modified_csv = modified_csv.dropna(subset=["Runtime"])
modified_csv.to_csv("modified_imdb_top_1000.csv",index=False) 


In [23]:
#checking empty genre
modified_csv['Genre'].isnull().sum()

np.int64(0)

In [24]:
#imputing empty genre 
# df["column_name"] = df["column_name"].fillna(df["column_name"].mean())  # Mean
# df["column_name"] = df["column_name"].fillna(df["column_name"].median())  # Median
modified_csv["Genre"] = modified_csv["Genre"].fillna(modified_csv["Genre"].mode()[0])  # Mode
modified_csv.to_csv("modified_imdb_top_1000.csv",index=False) 

In [25]:
#proving the filling with fillna
modified_csv['Genre'].isnull().sum()

np.int64(0)

In [26]:
# empty meta scores
modified_csv['Meta_score'].isnull().sum()	

np.int64(157)

In [38]:
#proving the filling with filln
modified_csv['Meta_score'].isnull().sum()	

np.int64(0)

In [29]:
import pandas as pd

# Load the CSV file
modified_csv = pd.read_csv("modified_imdb_top_1000.csv")

# Clean Meta_score column (remove non-numeric characters)
modified_csv['Meta_score'] = modified_csv['Meta_score'].replace(r'[^0-9.]', '', regex=True)

# Convert to numeric type
modified_csv['Meta_score'] = pd.to_numeric(modified_csv['Meta_score'], errors='coerce')

# Compute median separately
median_value = modified_csv['Meta_score'].median()

# Fill missing values with median
modified_csv['Meta_score'] = modified_csv['Meta_score'].fillna(median_value)

print("Missing values imputed successfully!")


Missing values imputed successfully!


In [34]:
4. Outlier Detection & Correction
Identify movies with IMDB_Rating > 10 or < 0
Handle invalid runtimes (e.g., "999 min", "10hrs")
Use boxplots to detect outliers in No_of_Votes
Correct negative/invalid Gross values

SyntaxError: invalid syntax (2164481362.py, line 1)

In [42]:
modified_csv['IMDB_Rating'] = pd.to_numeric(modified_csv['IMDB_Rating'], errors='coerce')
modified_csv[modified_csv['IMDB_Rating']>10]

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross


In [43]:
modified_csv['IMDB_Rating'] = pd.to_numeric(modified_csv['IMDB_Rating'], errors='coerce')
modified_csv[(modified_csv['IMDB_Rating'] > 10) | (modified_csv['IMDB_Rating'] < 0)]

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross


In [44]:
# Handle invalid runtimes (e.g., "999 min", "10hrs")
modified_csv['Runtime'] = pd.to_numeric(modified_csv['Runtime'], errors='coerce')