In [None]:
# Importing necessary libraries and functions
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import sys

warnings.filterwarnings("ignore", category=FutureWarning)
sys.path.append('../src')

from data_preprocessing import (
    calculate_user_stats,
    identify_inconsistent_users,
    calculate_user_weights,
    merge_user_weights,
    calculate_user_weighted_ratings,
    get_genre_columns,
    calculate_genre_avg_ratings,
    calculate_user_genre_interactions
)

from utils import (
    load_visualization_data, 
    plot_age_distribution, 
    plot_gender_genre_popularity, 
    plot_genre_popularity, 
    plot_movie_avg_ratings, 
    plot_user_rating_behavior, 
    plot_most_popular_movies, 
    plot_rating_density_by_year, 
    plot_yearly_genre_popularity, 
    plot_correlation_heatmap, 
    plot_age_group_genre_popularity, 
    plot_gender_age_rating, 
    plot_genre_rating_correlation, 
    plot_user_rating_consistency
)

# Where the original MovieLens files live and where the converted CSVs are written
input_path = "../data/raw/"
output_path = "../data/raw/"

# --- u.data (tab-separated) -> ratings.csv ---
ratings_columns = ["user_id", "item_id", "rating", "timestamp"]
ratings = pd.read_csv(
    f"{input_path}u.data",
    sep="\t",
    names=ratings_columns,
    encoding="latin-1",
)
ratings.to_csv(f"{output_path}ratings.csv", index=False)
# Added after the export above, so this column exists only on the in-memory frame
# and is not part of ratings.csv.
ratings["rating_year"] = pd.to_datetime(ratings["timestamp"], unit="s").dt.year

# --- u.item (pipe-separated) -> movies.csv ---
movies_columns = [
    # descriptive fields
    "movie_id", "title", "release_date", "video_release_date", "IMDb_URL",
    # genre columns
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
movies = pd.read_csv(
    f"{input_path}u.item",
    sep="|",
    names=movies_columns,
    encoding="latin-1",
)
movies.to_csv(f"{output_path}movies.csv", index=False)

# --- u.user (pipe-separated) -> users.csv ---
users_columns = ["user_id", "age", "gender", "occupation", "zip_code"]
users = pd.read_csv(
    f"{input_path}u.user",
    sep="|",
    names=users_columns,
    encoding="latin-1",
)
users.to_csv(f"{output_path}users.csv", index=False)

print("Files have been successfully converted to CSV format and saved in the ../data/raw folder!")

In [None]:
# Folder holding the CSV files written by the conversion cell
data_path = "../data/raw/"

# Load ratings data (read_csv already returns a fresh frame, so no extra copy is needed)
ratings = pd.read_csv(data_path + "ratings.csv")

# ratings.csv is written before `rating_year` is derived in the conversion cell, so the
# frame reloaded here does not contain that column. Derive it again from the Unix
# timestamp so the year is available to the cells below.
ratings['rating_year'] = pd.to_datetime(ratings['timestamp'], unit='s').dt.year

# Load movies data
movies = pd.read_csv(data_path + "movies.csv")

# Load users data
users = pd.read_csv(data_path + "users.csv")

print("Datasets have been successfully loaded!")

In [None]:
# Step 1: Work on copies of the loaded frames so the plotting helpers cannot alter the
# `users`, `movies` and `ratings` frames used by the preprocessing cells below.
users_data = users.copy()
movies_data = movies.copy()
ratings_data = ratings.copy()

# Genres shown in the genre plots, defined once instead of being repeated per call.
# The "unknown" column of the movies table is intentionally not in this list.
VIS_GENRE_COLUMNS = (
    'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
)

# Step 2: Generate and save all visualizations
# Each call receives its own list built from the tuple above, exactly as when the
# literal was written out per call, so no helper can affect another call's genres.

# Plot age distribution of users
plot_age_distribution(users_data)

# Gender-based genre popularity
plot_gender_genre_popularity(users_data, movies_data, genre_columns=list(VIS_GENRE_COLUMNS))

# Genre popularity
plot_genre_popularity(movies_data, genre_columns=list(VIS_GENRE_COLUMNS))

# Rating behavior of users
plot_user_rating_behavior(ratings_data)

# Most popular movies based on rating count
plot_most_popular_movies(ratings_data)

# Rating density over years
plot_rating_density_by_year(ratings_data)

# Genre popularity by year
plot_yearly_genre_popularity(ratings_data, movies_data, genre_columns=list(VIS_GENRE_COLUMNS))

# Correlation heatmap between numerical features
plot_correlation_heatmap(ratings_data)

# User rating consistency
plot_user_rating_consistency(ratings_data)

In [None]:
# Step 1: Calculate user statistics
# Computed from the ratings frame; the result is the only input passed to
# identify_inconsistent_users() in Step 2.
user_stats = calculate_user_stats(ratings)

In [None]:
# Step 2: Identify inconsistent and consistent users
# The helper's result is unpacked as a pair: (inconsistent_users, consistent_users).
# NOTE(review): neither name is referenced again in the cells visible here — confirm
# whether they are meant to filter `ratings` or are informational only.
inconsistent_users, consistent_users = identify_inconsistent_users(user_stats)

In [None]:
# Step 3: Calculate user weights
# Weights are computed from `ratings` directly (not from `user_stats`) and are merged
# back into `ratings` in Step 4.
user_weights = calculate_user_weights(ratings)

In [None]:
# Step 4: Merge user weights into the ratings dataset
# `ratings` is rebound to the merged result, so every later cell works on the merged frame.
# NOTE(review): re-running this cell without reloading `ratings` passes the
# already-merged frame back into merge_user_weights() — confirm the helper tolerates that.
ratings = merge_user_weights(ratings, user_weights)

In [None]:
# Step 5: Get genre columns from movies dataset
# `genre_columns` is reused below for the weighted ratings, the genre averages and the
# movie merge performed before export.
genre_columns = get_genre_columns(movies)

In [None]:
# Step 6: Bucket users by age, then compute the weighted ratings

# Age bin edges. Intervals are closed on their upper edge so each label matches the
# ages it names: up to 18 -> '0-18', 19..25 -> '19-25', 26..35 -> '26-35',
# 36..50 -> '36-50', above 50 -> '50+'.
# The last edge is open-ended so ages of 100 and above fall into '50+' instead of NaN.
bins = [0, 18, 25, 35, 50, float("inf")]
labels = ['0-18', '19-25', '26-35', '36-50', '50+']  # Age group labels

# Group users' ages.
# right=True keeps boundary ages in the group named after them (with right=False an
# 18-year-old was labelled '19-25'); include_lowest=True keeps age 0 in the first bin.
users['age_group'] = pd.cut(
    users['age'], bins=bins, labels=labels, right=True, include_lowest=True
)

# With `age_group` in place on `users`, compute the weighted ratings
weighted_ratings = calculate_user_weighted_ratings(ratings, users, movies, genre_columns)

In [None]:
# Step 7: Calculate genre-specific average ratings
# Built from the weighted ratings of the previous cell; the result is later written to
# genre_avg_ratings.csv with its index kept.
genre_avg_ratings = calculate_genre_avg_ratings(movies, weighted_ratings, genre_columns)

In [None]:
# Step 8: Calculate user-genre interactions
# NOTE(review): unlike the two previous steps, `genre_columns` is not passed here —
# presumably the helper derives the genre columns from `movies` itself; confirm.
user_genre_interactions = calculate_user_genre_interactions(weighted_ratings, movies)

print("Data preprocessing completed successfully!")

In [None]:
# Make sure the folder for processed outputs is present before anything is written
output_dir = "../data/processed"
os.makedirs(output_dir, exist_ok=True)

# Step 1: Attach age group and gender to every rating row (left join on user_id)
_user_attributes = users[['user_id', 'age_group', 'gender']]
ratings_with_user_info = ratings.merge(_user_attributes, on='user_id', how='left')

# Step 2: Attach title and genre columns to the weighted ratings.
# The join key is named 'item_id' on the ratings side and 'movie_id' on the movies side.
_movie_attributes = movies[['movie_id', 'title'] + genre_columns]
weighted_ratings_with_movies = weighted_ratings.merge(
    _movie_attributes,
    left_on='item_id',
    right_on='movie_id',
    how='left',
)

# Steps 3-5: Write every result into output_dir, in this order.
# Only genre_avg_ratings is written together with its index.
_exports = (
    ('ratings_with_user_info.csv', ratings_with_user_info, False),
    ('weighted_ratings_with_movies.csv', weighted_ratings_with_movies, False),
    ('genre_avg_ratings.csv', genre_avg_ratings, True),
    ('user_genre_interactions.csv', user_genre_interactions, False),
)
for _filename, _frame, _keep_index in _exports:
    _frame.to_csv(os.path.join(output_dir, _filename), index=_keep_index)

print(f"Files saved to {output_dir}")