# Objective 
Analyze and manipulate a real-world dataset using Pandas to perform data exploration, cleaning, 
and advanced analysis

## Step 1 : Data Exploration

In [17]:
import pandas as pd

# Load the movies.csv, ratings.csv, and tags.csv using Pandas
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
tags = pd.read_csv("tags.csv")

# Display first 5 rows of each dataset to understand their structure
print("Movies Data:")
display(movies.head(5))  

print("\nRatings Data:")
display(ratings.head(5))

print("\nTags Data:")
display(tags.head(5))

Movies Data:


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy



Ratings Data:


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580



Tags Data:


Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


In [18]:
# Check the shape of each dataset
print("===== Dataset Shapes =====")
print(f"Movies Shape    : {movies.shape}")
print(f"Ratings Shape   : {ratings.shape}")
print(f"Tags Shape      : {tags.shape}\n")

# Identify missing values
print("===== Missing Values Check =====")
print("Movies Missing Values:\n", movies.isnull().any(), "\n")
print("Ratings Missing Values:\n", ratings.isnull().any(), "\n")
print("Tags Missing Values:\n", tags.isnull().any(), "\n")

===== Dataset Shapes =====
Movies Shape    : (27278, 3)
Ratings Shape   : (20000263, 4)
Tags Shape      : (465564, 4)

===== Missing Values Check =====
Movies Missing Values:
 movieId    False
title      False
genres     False
dtype: bool 

Ratings Missing Values:
 userId       False
movieId      False
rating       False
timestamp    False
dtype: bool 

Tags Missing Values:
 userId       False
movieId      False
tag           True
timestamp    False
dtype: bool 



In [19]:
# Tags contains missing values, so we drop them
tags.dropna

# confirm no missing values
print(tags.isnull().any())

userId       False
movieId      False
tag           True
timestamp    False
dtype: bool


 ## Step 2: Data Cleaning

In [21]:
# Transform the genres Column
# split it into multiple columns using str.split('|',expand=true)
movie_genres= movies['genres'].str.split('|',expand=True)
print(movie_genres.head())

# add a new column iscomedy that flags movies with the genre comedy, use str.contains()
movies['IsComedy']= movies['genres'].str.contains('Comedy').map({True: 'Yes', False: 'No'})

print(movies[['title','genres', 'IsComedy']].head())

           0          1         2       3        4     5     6     7     8  \
0  Adventure  Animation  Children  Comedy  Fantasy  None  None  None  None   
1  Adventure   Children   Fantasy    None     None  None  None  None  None   
2     Comedy    Romance      None    None     None  None  None  None  None   
3     Comedy      Drama   Romance    None     None  None  None  None  None   
4     Comedy       None      None    None     None  None  None  None  None   

      9  
0  None  
1  None  
2  None  
3  None  
4  None  
                                title  \
0                    Toy Story (1995)   
1                      Jumanji (1995)   
2             Grumpier Old Men (1995)   
3            Waiting to Exhale (1995)   
4  Father of the Bride Part II (1995)   

                                        genres IsComedy  
0  Adventure|Animation|Children|Comedy|Fantasy      Yes  
1                   Adventure|Children|Fantasy       No  
2                               Comedy|Romance    

In [23]:
# Extract release year from the title using a regex. Create a new column for year
movies['ReleaseYear']= movies['title'].str.extract(r'\((\d{4})\)')

print(movies[['title','ReleaseYear']].head())

                                title ReleaseYear
0                    Toy Story (1995)        1995
1                      Jumanji (1995)        1995
2             Grumpier Old Men (1995)        1995
3            Waiting to Exhale (1995)        1995
4  Father of the Bride Part II (1995)        1995


## Step 3: Data Analysis

1. Descriptive statistics
   Generate summary statistics for the ratings dataset using .describe().    Identify the most frequent rating using .mode().
2. Analyze ratings
  Calculate the average rating for each movie using groupby() and .mean().  o Filter and display movies with an average rating of 5.0. 
3. Time-series analys
   Convert the timestamp column in the ratings dataset to a datetime format using    
pd.to_datetime().    
o Filter rows where parsed_time is after February 1, 2015    
o Sort the data by parsed_time in ascending order.is

In [26]:
## 1) descriptive statistics

#generate summary statistics using describe
print(ratings.describe())

print(ratings.mode()) #most frequent rating is 4.0

             userId       movieId        rating     timestamp
count  2.000026e+07  2.000026e+07  2.000026e+07  2.000026e+07
mean   6.904587e+04  9.041567e+03  3.525529e+00  1.100918e+09
std    4.003863e+04  1.978948e+04  1.051989e+00  1.621694e+08
min    1.000000e+00  1.000000e+00  5.000000e-01  7.896520e+08
25%    3.439500e+04  9.020000e+02  3.000000e+00  9.667977e+08
50%    6.914100e+04  2.167000e+03  3.500000e+00  1.103556e+09
75%    1.036370e+05  4.770000e+03  4.000000e+00  1.225642e+09
max    1.384930e+05  1.312620e+05  5.000000e+00  1.427784e+09
   userId  movieId  rating  timestamp
0  118205      296     4.0  825638400


In [27]:
## 2) Analyze ratings
#calculate avg rating for each movie
avg_ratings= ratings.groupby('movieId')['rating'].mean()

# Merge the avg ratings back with the movies DataFrame
movies_with_avg_rating = movies.merge(avg_ratings, on='movieId', how='left')

# Filter and display movies with an average rating of 5
movies_with_avg_rating_5 = movies_with_avg_rating[movies_with_avg_rating['rating'] == 5]

# Display the result
print(movies_with_avg_rating_5[['title', 'rating']])

                                                   title  rating
9007       Life On A String (Bian chang Bian Zou) (1991)     5.0
9561   Hijacking Catastrophe: 9/11, Fear & the Sellin...     5.0
9862             Snow Queen, The (Lumikuningatar) (1986)     5.0
10567                                Al otro lado (2004)     5.0
12015                                  Sierra, La (2005)     5.0
...                                                  ...     ...
27091                        Bill Hicks: Sane Man (1989)     5.0
27093                           The Wrecking Crew (2008)     5.0
27155  The Garden of Sinners - Chapter 5: Paradox Par...     5.0
27189                         The Beautiful Story (1992)     5.0
27201  Stargate SG-1 Children of the Gods - Final Cut...     5.0

[113 rows x 2 columns]


In [28]:
## 3) time series analysis

# Convert the timestamp column to datetime in ratings DataFrame
ratings['parsed_time'] = pd.to_datetime(ratings['timestamp'], unit='s')

# Filter rows where parsed_time is after February 1, 2015
filtered_ratings = ratings[ratings['parsed_time'] > '2015-02-01']

# Sort the data by parsed_time in ascending order
sorted_ratings = filtered_ratings.sort_values(by='parsed_time', ascending=True)

# Display the first few rows to verify
print(sorted_ratings[['movieId', 'rating', 'parsed_time']].head())

          movieId  rating         parsed_time
12703376   102481     2.5 2015-02-01 00:00:25
5912263      2428     3.0 2015-02-01 00:03:12
12703402   116823     4.0 2015-02-01 00:03:15
5912451      8910     3.0 2015-02-01 00:03:31
10041669   115617     3.0 2015-02-01 00:04:00


## Step 4: Data Merging
1) Combine datasets
     Merge movies and tags datasets on movieId.     Merge movies with the average ratings calculated in Step 3 to create a box_offic 
datase2
3) GenreSpecific analys
    Filter movies in the box_office dataset that belong to the genre "Animation".     Identify highly rated comedy movies (rating ≥ 4.0 and IsComedy is True).is

In [30]:
# 1) combine data sets
# merge movies and tags datasets on movieid
movie_tag= movies.merge(tags, on='movieId', how='left')
print(movie_tag.head())

# merge movies with avg ratings calculated in step 3 to create a  box_office dataset
box_office = movies.merge(avg_ratings, on='movieId', how='left')
print(box_office.head())

   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
3        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
4        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   

  IsComedy ReleaseYear  userId                                     tag  \
0      Yes        1995  1644.0                                 Watched   
1      Yes        1995  1741.0                      computer animation   
2      Yes        1995  1741.0                 Disney animated feature   
3      Yes        1995  1741.0                         Pixar animation   
4      Yes        1995  1741.0  TÃ©a Leoni does not star in this movie   

      timestamp  
0  1.417737e+09  
1  1.183903e+09  
2  1.183933e+09  
3  1.183935e+09  
4 

In [31]:
# 2) genre specific analysis
#filter movies in box_office data set that belong to animation genre
animated_movies = box_office[box_office['genres'].str.contains('Animation', case=False, na=False)]
print(animated_movies[['movieId', 'title', 'genres']].head())

#identify highly rated comedy movies  >=4 and iscomedy true
comedies=box_office[(box_office['IsComedy']== True) &(box_office['rating'] >= 4)]
print(comedies[['movieId', 'title', 'genres', 'rating', 'IsComedy']].head())

     movieId                    title  \
0          1         Toy Story (1995)   
12        13             Balto (1995)   
47        48        Pocahontas (1995)   
236      239    Goofy Movie, A (1995)   
241      244  Gumby: The Movie (1995)   

                                          genres  
0    Adventure|Animation|Children|Comedy|Fantasy  
12                  Adventure|Animation|Children  
47      Animation|Children|Drama|Musical|Romance  
236            Animation|Children|Comedy|Romance  
241                           Animation|Children  
Empty DataFrame
Columns: [movieId, title, genres, rating, IsComedy]
Index: []


## Step 5: Correlation Analysis

In [32]:
# analyze corerlation between release year and avg rating using .corr
correlation = box_office[['ReleaseYear', 'rating']].corr()

# Display the correlation matrix
print(correlation)

             ReleaseYear    rating
ReleaseYear     1.000000 -0.051535
rating         -0.051535  1.000000


In [33]:
# calculate yearly avg rating for all movies using groupby and mean
yearly_avg_rating = box_office.groupby('ReleaseYear')['rating'].mean()

# Display the yearly average ratings
print(yearly_avg_rating)

ReleaseYear
1891    3.000000
1893    3.375000
1894    3.071429
1895    3.125000
1896    3.183036
          ...   
2011    3.162567
2012    3.104176
2013    3.145056
2014    3.122809
2015    2.813146
Name: rating, Length: 118, dtype: float64
