In [4]:
import pandas as pd

# 1. Load dataset
df = pd.read_csv('imdb_top_1000.csv')

# 2. Select relevant features
selected_features = ['Series_Title', 'Genre', 'Director', 
                    'Overview', 'IMDB_Rating']

movies_df = df[selected_features].copy()

# 3. Handle missing values
# Drop rows with missing values in critical fields
movies_df.dropna(subset=['Series_Title', 'Genre', 'Director', 
                         'Overview', 'IMDB_Rating'], 
                inplace=True)

# 4. Clean text data
def clean_text(text):
    if isinstance(text, str):
        return text.lower().strip()
    return text

# Apply cleaning to text columns
text_columns = ['Series_Title', 'Genre', 'Director','Overview']

for col in text_columns:
    movies_df[col] = movies_df[col].apply(clean_text)

# Select final columns
final_columns = ['Series_Title', 'Genre', 'Director', 
                'Overview', 'IMDB_Rating']

movies_df = movies_df[final_columns]



In [5]:
def movies_you_might_like(title) -> dict:
    if title and len(title) > 4:
        title_filter =movies_df['Series_Title'].str.contains(title, case=False,na=False)
        matched_movies = movies_df[title_filter]
        if matched_movies.empty:
            return {'error': 'No movies found with that title try to use the movie genre'}
        #getting genres from the first matching movie
        input_movie = matched_movies.iloc[0]
        input_genre = set(input_movie['Genre'].split(', '))
        
        other_movies = movies_df[~title_filter].copy()
        other_movies['genres_overlap']= other_movies['Genre'].apply(
            lambda x:len(set(x.split(', ')) & input_genre))
        recommendations = other_movies[other_movies['genres_overlap'] >0]
        # recommendations = recommendations.sort_values(
        #     by=['genres_overlap', 'IMDB_Rating'],
        #     ascending=[False, False]
        # )
        return{
            'movies you might like': recommendations[['Series_Title', 'Genre',]].to_dict("records")
        }
movies_you_might_like( 'the godfather')         

{'movies you might like': [{'Series_Title': 'the shawshank redemption',
   'Genre': 'drama'},
  {'Series_Title': 'the dark knight', 'Genre': 'action, crime, drama'},
  {'Series_Title': '12 angry men', 'Genre': 'crime, drama'},
  {'Series_Title': 'the lord of the rings: the return of the king',
   'Genre': 'action, adventure, drama'},
  {'Series_Title': 'pulp fiction', 'Genre': 'crime, drama'},
  {'Series_Title': "schindler's list", 'Genre': 'biography, drama, history'},
  {'Series_Title': 'fight club', 'Genre': 'drama'},
  {'Series_Title': 'the lord of the rings: the fellowship of the ring',
   'Genre': 'action, adventure, drama'},
  {'Series_Title': 'forrest gump', 'Genre': 'drama, romance'},
  {'Series_Title': 'the lord of the rings: the two towers',
   'Genre': 'action, adventure, drama'},
  {'Series_Title': 'goodfellas', 'Genre': 'biography, crime, drama'},
  {'Series_Title': "one flew over the cuckoo's nest", 'Genre': 'drama'},
  {'Series_Title': 'hamilton', 'Genre': 'biography, d

# Data Preparation Practice Tasks  

## 1. **Data Inspection & Profiling**  
- Load the dataset and display the first 10 rows  
- Check data types of all columns using `.dtypes`  
- Generate summary statistics with `.describe(include='all')`  
- Identify columns with missing values using `.isnull().sum()`  

## 2. **Handling Duplicates**  
- Find duplicate rows using `.duplicated().sum()`  
- Remove duplicates while keeping the first occurrence  
- Verify the dataset size before/after deduplication  

## 3. **Missing Value Treatment**  
- Fill missing directors with "Unknown"  
- Drop rows with missing runtime values  
- Impute missing genres using the mode  
- Fill missing Meta_scores with column median  

## 4. **Outlier Detection & Correction**  
- Identify movies with IMDB_Rating > 10 or < 0  
- Handle invalid runtimes (e.g., "999 min", "10hrs")  
- Use boxplots to detect outliers in `No_of_Votes`  
- Correct negative/invalid Gross values  

## 5. **Format Standardization**  
### Runtime:  
- Extract numeric values (e.g., "100 mins" → 100)  
- Convert to integer and standardize to minutes  

### Year:  
- Remove "(re-release)" and other suffixes  
- Convert to 4-digit integer (e.g., "1998 Original" → 1998)  

### Votes:  
- Remove commas and "K" abbreviations (e.g., "1.5K" → 1500)  
- Convert to integer  

### Meta_score:  
- Remove "/10" suffix and convert to numeric  

### Gross:  
- Remove "$", "USD", and commas  
- Convert to numeric (handle "million" as 1,000,000)  

## 6. **Categorical Data Handling**  
- Normalize genre capitalization (e.g., "drama" → "Drama")  
- Correct typos in genres (e.g., "Dramaa" → "Drama")  
- Standardize certificates (e.g., "PG13" → "PG-13")  
- One-hot encode certificate categories  

## 7. **Data Type Conversion**  
- Convert Runtime, Year, Votes, Meta_score, and Gross to numeric types  
- Ensure Genre and Certificate are categorical types  

## 8. **Complex Pattern Handling**  
- Parse mixed-format runtimes (e.g., "2h30m" → 150)  
- Handle currency conversions (e.g., "€2.5M" → 2,500,000)  

## 9. **Final Validation**  
- Check for remaining missing values  
- Verify no invalid ratings (0 ≤ IMDB_Rating ≤ 10)  
- Ensure all runtime values are reasonable (30 ≤ Runtime ≤ 300)  
- Confirm standardized date formats  
 

4. Outlier Detection & Correction
Identify movies with IMDB_Rating > 10 or < 0
Handle invalid runtimes (e.g., "999 min", "10hrs")
Use boxplots to detect outliers in No_of_Votes
Correct negative/invalid Gross values

In [1]:
import pandas as pd

In [12]:
modified_csv = pd.read_csv('modified_imdb_top_1000.csv')
#below here type your code...
#1 data inspection and profilling-> loading
print(modified_csv.head(10))

                                         Poster_Link  \
0  https://m.media-amazon.com/images/M/MV5BMDFkYT...   
1  https://m.media-amazon.com/images/M/MV5BM2MyNj...   
2  https://m.media-amazon.com/images/M/MV5BMTMxNT...   
3  https://m.media-amazon.com/images/M/MV5BMWMwMG...   
4  https://m.media-amazon.com/images/M/MV5BMWU4N2...   
5  https://m.media-amazon.com/images/M/MV5BNzA5ZD...   
6  https://m.media-amazon.com/images/M/MV5BNGNhMD...   
7  https://m.media-amazon.com/images/M/MV5BNDE4OT...   
8  https://m.media-amazon.com/images/M/MV5BMjAxMz...   
9  https://m.media-amazon.com/images/M/MV5BMmEzNT...   

                                    Series_Title Released_Year Certificate  \
0                       The Shawshank Redemption          1994           A   
1                                  The Godfather          1972           A   
2                                The Dark Knight          2008          UA   
3                         The Godfather: Part II          1974         

In [6]:
#checking data ytpes of columns
modified_csv['Poster_Link'].dtypes
modified_csv['Series_Title'].dtypes
modified_csv['Released_Year'].dtypes
modified_csv['Certificate'].dtypes
modified_csv['Runtime'].dtypes


dtype('O')

In [13]:
#checking data ytpes of columns
modified_csv.dtypes

Poster_Link       object
Series_Title      object
Released_Year     object
Certificate       object
Runtime           object
Genre             object
IMDB_Rating      float64
Overview          object
Meta_score       float64
Director          object
Star1             object
Star2             object
Star3             object
Star4             object
No_of_Votes        int64
Gross             object
dtype: object

In [14]:
print('/n== summery statistics==')
print(modified_csv.describe(include='all'))

/n== summery statistics==
                                              Poster_Link Series_Title  \
count                                                1000         1000   
unique                                               1000          999   
top     https://m.media-amazon.com/images/M/MV5BMTY5OD...     Drishyam   
freq                                                    1            2   
mean                                                  NaN          NaN   
std                                                   NaN          NaN   
min                                                   NaN          NaN   
25%                                                   NaN          NaN   
50%                                                   NaN          NaN   
75%                                                   NaN          NaN   
max                                                   NaN          NaN   

       Released_Year Certificate  Runtime  Genre  IMDB_Rating  \
count           1000

In [16]:
1# detecting missing values
print('==============')
print(modified_csv.isnull().sum())

Poster_Link        0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            169
dtype: int64


2#handling duplicates


In [22]:
modified_csv= pd.read_csv("modified_imdb_top_1000.csv")
modified_csv.duplicated().sum()

np.int64(0)

In [21]:
print('===initial duplicate===')
modified_csv=modified_csv.drop_duplicates(keep='first').reset_index(drop=True)
print(f'final_duplicates:{modified.duplicated().sum()}')
print(f'Dataset size after duplication :{len(modified_csv)}')
#print(modified_csv.duplicated().sum())

===initial duplicate===
final_duplicates:0
Dataset size after duplication :1000


In [12]:
import pandas as pd

# Load the CSV file
modified = pd.read_csv("modified_imdb_top_1000.csv")

# Check for duplicate rows
modified= modified[modified.duplicated()]

# Display duplicates
print(modified)

Empty DataFrame
Columns: [Poster_Link, Series_Title, Released_Year, Certificate, Runtime, Genre, IMDB_Rating, Overview, Meta_score, Director, Star1, Star2, Star3, Star4, No_of_Votes, Gross]
Index: []


In [None]:
#removing duplicate columns
modified = pd.read_csv("modified_imdb_top_1000.csv")
modified=modified.drop_duplicates()

In [23]:
#2_finding duplicate
m=modified_csv.duplicated().sum().sum()
# m.head()
# .sum()
m

np.int64(0)

In [24]:
#checking the size of csv
modified_csv.size

16000

In [None]:
#removing duplicate
modified_csv.drop_duplicates(subset='Series_Title',keep='first')
modified_csv.to_csv('modified_imdb_top_1000.csv',index=False)


In [None]:
#checking the size again
modified_csv.size

#3
3. Missing Value Treatment
Fill missing directors with "Unknown"
Drop rows with missing runtime values
Impute missing genres using the mode
Fill missing Meta_scores with column median

In [14]:
#checking missing values
m=modified_csv.isnull().sum()
m

Poster_Link        0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       163
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross              0
dtype: int64

In [None]:
modified_csv["Director"]=modified_csv["Director"].fillna('unknown')

In [15]:
modified_csv= pd.read_csv('modified_imdb_top_1000.csv')
modified_csv["Director"]=modified_csv["Director"].fillna('unk')
# (value="known", inplace=True)
# df.method({"Director": "unknown"}, inplace=True)
# df.method({"Director": value}, inplace=True)
# df["Director"].fillna(value="known", inplace=True.....its working
df.to_csv("modified_imdb_top_1000.csv", index=False)

In [16]:
modified_csv['Director'].isnull().sum()

np.int64(0)

In [17]:
df= pd.read_csv('modified_imdb_top_1000.csv')
modified_csv["Gross"]=modified_csv["Gross"].fillna('unknown')
# df.to_csv("modified_imdb_top_1000.csv", index=False)

In [19]:
modified = pd.read_csv("modified_imdb_top_1000.csv")
modified.sample()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
472,https://m.media-amazon.com/images/M/MV5BMjEzOD...,Sing Street,2016,PG-13,106 min,"Comedy, Drama, Music",7.9,A boy growing up in Dublin during the 1980s es...,79.0,John Carney,Ferdia Walsh-Peelo,Aidan Gillen,Maria Doyle Kennedy,Jack Reynor,85109,3237118


In [9]:
modified = pd.read_csv("modified_imdb_top_1000.csv") 
modified.shape

(1000, 16)

In [10]:
#checking empty runtime rows
modified_csv['Runtime'].isnull().sum()

np.int64(0)

In [11]:
#removing nun runtime rows
modified_csv=pd.read_csv("modified_imdb_top_1000.csv") 
modified_csv = modified_csv.dropna(subset=["Runtime"]) 
modified_csv.to_csv("modified_imdb_top_1000.csv",index=False) 


In [23]:
#checking empty genre
modified_csv['Genre'].isnull().sum()

np.int64(0)

In [24]:
#imputing empty genre 
# df["column_name"] = df["column_name"].fillna(df["column_name"].mean())  # Mean
# df["column_name"] = df["column_name"].fillna(df["column_name"].median())  # Median
modified_csv["Genre"] = modified_csv["Genre"].fillna(modified_csv["Genre"].mode()[0])  # Mode
modified_csv.to_csv("modified_imdb_top_1000.csv",index=False) 

In [25]:
#proving the filling with fillna
modified_csv['Genre'].isnull().sum()

np.int64(0)

In [26]:
# empty meta scores
modified_csv['Meta_score'].isnull().sum()	

np.int64(157)

In [38]:
#proving the filling with filln
modified_csv['Meta_score'].isnull().sum()	

np.int64(0)

In [29]:
import pandas as pd

# Load the CSV file
modified_csv = pd.read_csv("modified_imdb_top_1000.csv")

# Clean Meta_score column (remove non-numeric characters)
modified_csv['Meta_score'] = modified_csv['Meta_score'].replace(r'[^0-9.]', '', regex=True)

# Convert to numeric type
modified_csv['Meta_score'] = pd.to_numeric(modified_csv['Meta_score'], errors='coerce')

# Compute median separately
median_value = modified_csv['Meta_score'].median()

# Fill missing values with median
modified_csv['Meta_score'] = modified_csv['Meta_score'].fillna(median_value)

print("Missing values imputed successfully!")


Missing values imputed successfully!


4. Outlier Detection & Correction
Identify movies with IMDB_Rating > 10 or < 0
Handle invalid runtimes (e.g., "999 min", "10hrs")
Use boxplots to detect outliers in No_of_Votes
Correct negative/invalid Gross values

In [42]:
modified_csv['IMDB_Rating'] = pd.to_numeric(modified_csv['IMDB_Rating'], errors='coerce')
modified_csv[modified_csv['IMDB_Rating']>10]

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross


In [43]:
modified_csv['IMDB_Rating'] = pd.to_numeric(modified_csv['IMDB_Rating'], errors='coerce')
modified_csv[(modified_csv['IMDB_Rating'] > 10) | (modified_csv['IMDB_Rating'] < 0)]

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross


In [44]:
# Handle invalid runtimes (e.g., "999 min", "10hrs")
modified_csv['Runtime'] = pd.to_numeric(modified_csv['Runtime'], errors='coerce')

5. Format Standardization
Runtime:
Extract numeric values (e.g., "100 mins" → 100)
Convert to integer and standardize to minutes
Year:
Remove "(re-release)" and other suffixes
Convert to 4-digit integer (e.g., "1998 Original" → 1998)
Votes:
Remove commas and "K" abbreviations (e.g., "1.5K" → 1500)
Convert to integer
Meta_score:
Remove "/10" suffix and convert to numeric
Gross:
Remove "$", "USD", and commas
Convert to numeric (handle "million" as 1,000,000)

In [90]:
modified_csv['Gross']=pd.to_numeric(modified_csv['Gross'],errors='coerce')
modified_csv.loc[0:100,'Gross']

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
       ..
96    NaN
97    NaN
98    NaN
99    NaN
100   NaN
Name: Gross, Length: 101, dtype: float64

In [35]:
import re
modified_csv['Released_Year']=modified_csv['Released_Year'].str.extract(r'(\d{4})')
modified_csv['Released_Year']

0      1994
1      1972
2      2008
3      1974
4      1957
       ... 
995    1961
996    1956
997    1953
998    1944
999    1935
Name: Released_Year, Length: 1000, dtype: object

In [37]:
modified_csv['Released_Year']=pd.to_numeric(modified_csv['Released_Year'],errors='coerce')
modified_csv['Released_Year']

0      1994.0
1      1972.0
2      2008.0
3      1974.0
4      1957.0
        ...  
995    1961.0
996    1956.0
997    1953.0
998    1944.0
999    1935.0
Name: Released_Year, Length: 1000, dtype: float64

In [44]:
modified_csv['No_of_Votes']=modified_csv['No_of_Votes'].str.replace('K','000')
# removing words from votes

AttributeError: Can only use .str accessor with string values!

In [46]:
modified_csv['No_of_Votes']=modified_csv['No_of_Votes'].str.replace(',','',regex=False)
# removing  commas and replacing space from votes

AttributeError: Can only use .str accessor with string values!

In [54]:
modified_csv['No_of_Votes']

0      2343110
1      1620367
2      2303232
3      1129952
4       689845
        ...   
995     166544
996      34075
997      43374
998      26471
999      51853
Name: No_of_Votes, Length: 1000, dtype: int64

In [43]:
#coverting to interger
modified_csv['No_of_Votes']=pd.to_numeric(modified_csv['No_of_Votes'],errors='coerce')

In [76]:
modified_csv['Genre']=modified_csv['Genre'].str.title().replace('','')
modified_csv['Genre']=modified_csv['Genre'].replace({'Drama':'Crime','Dr':'Drama','Comedyy':'Comedy','Action':'kelvin'
    })

In [77]:
modified_csv.loc[0:20,'Genre']

0                          Crime
1                   Crime, Drama
2           Action, Crime, Drama
3                   Crime, Drama
4                   Crime, Drama
5       Action, Adventure, Drama
6                   Crime, Drama
7      Biography, Drama, History
8      Action, Adventure, Sci-Fi
9                          Crime
10      Action, Adventure, Drama
11                Drama, Romance
12                       Western
13      Action, Adventure, Drama
14                Action, Sci-Fi
15       Biography, Crime, Drama
16    Action, Adventure, Fantasy
17                         Crime
18     Biography, Drama, History
19       Comedy, Drama, Thriller
20                         Crime
Name: Genre, dtype: object

In [95]:
modified_csv['Certificate']=modified_csv['Certificate'].str.title().replace('','')
modified_csv['Certificate']=modified_csv['Certificate'].replace({'A':'PG18'
                                                                 ,'U':'unknown',
                                                                 'Ua':'PG13',
                                                                 ' ':'unknownn'
    })
modified_csv.to_csv('modified_imdb_top_1000.csv',index=False)

In [96]:
modified_csv.loc[0:700,'Certificate']

0         Pg18
1         Pg18
2         Pg13
3         Pg18
4      Unknown
        ...   
696         Pg
697       Pg18
698    Unknown
699       Pg18
700    Unknown
Name: Certificate, Length: 701, dtype: object

In [None]:
8. Complex Pattern Handling
Parse mixed-format runtimes (e.g., "2h30m" → 150)
Handle currency conversions (e.g., "€2.5M" → 2,500,000)

In [93]:
modified_csv['Runtime']=modified_csv['Runtime'].str.replace('M','000000')

In [94]:
modified_csv.loc[0:1000,'Runtime']

0      142 min
1      175 min
2      152 min
3      202 min
4       96 min
        ...   
995    115 min
996    201 min
997    118 min
998     97 min
999     86 min
Name: Runtime, Length: 1000, dtype: object

Year:
Remove "(re-release)" and other suffixes
Convert to 4-digit integer (e.g., "1998 Original" → 1998)
Votes:
Remove commas and "K" abbreviations (e.g., "1.5K" → 1500)
Convert to integer
Meta_score: