# Business Understanding


# Data Understanding

In [4]:
import os
import pandas as pd

# Locate the MovieLens CSV files. The default is the "movierec" folder on the
# current user's Desktop; set the MOVIEREC_DATA_DIR environment variable to
# point the notebook at a different folder without editing this cell.
folder_path = os.environ.get(
    "MOVIEREC_DATA_DIR",
    os.path.join(os.path.expanduser("~"), "Desktop", "movierec"),
)

# Map each table name to the CSV file that holds it.
file_paths = {
    name: os.path.join(folder_path, f"{name}.csv")
    for name in ("movies", "links", "ratings", "tags")
}

# Fail early, naming every missing file at once, instead of stopping at the
# first pd.read_csv call that cannot find its input.
missing_paths = [p for p in file_paths.values() if not os.path.isfile(p)]
if missing_paths:
    raise FileNotFoundError(
        "Missing input file(s): " + ", ".join(missing_paths)
    )

# Read every CSV into a DataFrame, keyed by table name.
dfs = {name: pd.read_csv(csv_path) for name, csv_path in file_paths.items()}

# Report the row count and preview the first rows (with column names) of each table.
for key, df in dfs.items():
    print(f"DataFrame: {key}, Number of Rows: {df.shape[0]}")
    print(df.head())
    print("\n")


DataFrame: movies, Number of Rows: 9742
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


DataFrame: links, Number of Rows: 9742
   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0


DataFrame: ratings, Number of Rows: 100836
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  9649812

In [5]:
# Build one wide table keyed on 'movieId': movies -> ratings -> tags -> links.
#
# `validate` makes pandas raise a MergeError if 'movieId' is duplicated in
# `movies` or `links`, so a dirty lookup table cannot silently multiply rows.
#
# NOTE(review): the tags join is many-to-many on 'movieId' alone, so every
# rating of a movie is paired with every tag of that movie (hence the
# userId_x/userId_y and timestamp_x/timestamp_y columns), and the inner join
# drops movies that have no tags. Confirm this is intended before computing
# rating statistics on merged_df.
merged_df = (
    dfs['movies']
    .merge(dfs['ratings'], on='movieId', how='inner', validate='one_to_many')
    .merge(dfs['tags'], on='movieId', how='inner')
    .merge(dfs['links'], on='movieId', how='inner', validate='many_to_one')
)

# Display the first few rows of the merged DataFrame
print(merged_df.head())


   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
3        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
4        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   

   userId_x  rating  timestamp_x  userId_y    tag  timestamp_y  imdbId  tmdbId  
0         1     4.0    964982703       336  pixar   1139045764  114709   862.0  
1         1     4.0    964982703       474  pixar   1137206825  114709   862.0  
2         1     4.0    964982703       567    fun   1525286013  114709   862.0  
3         5     4.0    847434962       336  pixar   1139045764  114709   862.0  
4         5     4.0    847434962       474  pixar   1137206825  114709   862.0  


In [6]:
# Print all the columns of the merged DataFrame
print(merged_df.columns)


Index(['movieId', 'title', 'genres', 'userId_x', 'rating', 'timestamp_x',
       'userId_y', 'tag', 'timestamp_y', 'imdbId', 'tmdbId'],
      dtype='object')


| Column                | Description                                                    |
|-----------------------|----------------------------------------------------------------|
| movieId               | Unique identifier for each movie                               |
| title                 | Title of the movie along with the release year                 |
| genres                | Genres associated with the movie, separated by '\|'            |
| userId_x              | User ID of the user who provided the rating                    |
| rating                | Rating given to the movie by the user                          |
| timestamp_x           | Timestamp when the rating was given by the user                |
| userId_y              | User ID of the user who applied the tag                        |
| tag                   | Tag applied to the movie by the user                           |
| timestamp_y           | Timestamp when the tag was applied by the user                 |
| imdbId                | IMDb ID of the movie                                            |
| tmdbId                | TMDb ID of the movie                                            |


# Data Preparation

# Exploratory Data Analysis

# Modeling

# Conclusions

# Recommendations