<a href="https://colab.research.google.com/github/jeevapriyagp/Data_Mining_Movie_Data/blob/main/DM_Movie_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
# Install a global "ignore" filter so no warning of any category is shown for
# the rest of the session.
# NOTE(review): this also hides any deprecation or data-handling warnings that
# later cells would raise — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")  # suppress all warnings

In [None]:
!git clone https://github.com/jeevapriyagp/Data_Mining_Movie_Data.git

In [None]:
%cd Data_Mining_Movie_Data/

/content/Data_Mining_Movie_Data


In [None]:
!git config --global user.email "jeevapriyapoopathi@gmail.com"
!git config --global user.name "jeevapriyagp"

In [None]:
!pip install gradio



In [10]:
import pandas as pd
import gradio as gr
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.ensemble import IsolationForest

# Load the movie catalogue (Excel) and the per-user watchlists (CSV).
file_path_movies = "/content/Data_Mining_Movie_Data/movies_with_images.xlsx"
df_movies = pd.read_excel(file_path_movies, sheet_name="Sheet1")

file_path_watchlist = "/content/Data_Mining_Movie_Data/user_watchlist_grouped.csv"
df_watchlist = pd.read_csv(file_path_watchlist)
# Normalise every watchlist to a single ", "-separated string: list values are
# joined, anything else is passed through unchanged.
df_watchlist["Movies_Watched"] = df_watchlist["Movies_Watched"].apply(
    lambda x: ", ".join(x) if isinstance(x, list) else x
)

# Extract relevant columns for genre and director clustering.
# .copy() gives an independent frame, so the column assignments below never
# write into (or warn about) a derived view of df_movies.
df_cluster = df_movies[['Title', 'Genre', 'Director']].dropna().copy()

# Encode genres numerically (one integer code per distinct genre string)
label_encoder = LabelEncoder()
df_cluster['Genre_Encoded'] = label_encoder.fit_transform(df_cluster['Genre'])

# Perform genre clustering on the single encoded-genre feature
num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
df_cluster['Cluster'] = kmeans.fit_predict(df_cluster[['Genre_Encoded']])

# Map clusters to genres: each cluster is labelled with the genre of its first
# row, so every movie in the cluster shares that label.
genre_mapping = df_cluster.groupby('Cluster')['Genre'].first().to_dict()
df_cluster['Genre_Label'] = df_cluster['Cluster'].map(genre_mapping)

# Get unique genres and directors (sorted, used as dropdown choices in the UI)
genres = sorted(df_cluster['Genre_Label'].unique())
directors = sorted(df_cluster['Director'].unique())


# Function to get statistics about movie-watching habits
def movie_watch_stats():
    """Summarise the watchlist data as a short multi-line text report.

    Reads the module-level ``df_watchlist`` and ``movie_counts`` at call time.

    Returns:
        str: Total users, total unique movies, and the average number of
        movies per user (formatted to two decimals).
    """
    total_users = len(df_watchlist)
    total_movies = len(movie_counts)
    total_watched = sum(len(movies.split(', ')) for movies in df_watchlist['Movies_Watched'])
    # An empty watchlist would otherwise raise ZeroDivisionError.
    avg_movies_per_user = total_watched / total_users if total_users else 0.0
    return f"Movie Watchlist Statistics\n**Total Users:** {total_users}\n**Total Unique Movies:** {total_movies}\n**Average Movies Watched Per User:** {avg_movies_per_user:.2f}"


# Function to plot top N movies
def plot_top_movies(n=5):
    """Render a horizontal bar chart of the ``n`` most-watched movies.

    Args:
        n: How many movies to show. Numeric values are truncated to ``int``
            because UI number inputs may deliver floats; ``None`` keeps
            ``Counter.most_common``'s "all entries" behaviour.

    Returns:
        str | None: Path of the saved PNG, or ``None`` when there is nothing
        to plot.
    """
    if n is not None:
        n = int(n)  # Counter.most_common rejects float counts
    top_n_movies = movie_counts.most_common(n)
    if not top_n_movies:
        # zip(*[]) cannot be unpacked into two names; nothing to draw anyway.
        return None
    movies, counts = zip(*top_n_movies)

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.barh(movies, counts, color='royalblue')
    ax.set_xlabel("Number of Users")
    ax.set_ylabel("Movies")
    ax.set_title(f"Top {n} Most Watched Movies")
    ax.invert_yaxis()  # most-watched movie at the top
    fig.tight_layout()
    fig.savefig("top_movies_plot.png")
    # Release the figure; otherwise every button click leaves one more open.
    plt.close(fig)
    return "top_movies_plot.png"

# Function to get movies by selected genre
def get_movies_by_genre(selected_genre):
    """List the titles whose cluster-derived genre label equals ``selected_genre``.

    Args:
        selected_genre: A value from ``df_cluster['Genre_Label']``.

    Returns:
        str: Newline-separated titles, or a fallback message when none match.
    """
    movies = df_cluster[df_cluster['Genre_Label'] == selected_genre]['Title'].tolist()
    # Titles are coerced to str so that non-string cells (e.g. a purely numeric
    # title read from the spreadsheet) cannot make str.join raise TypeError.
    return "\n".join(map(str, movies)) if movies else "No movies found for this genre."

# Function to get movies by selected director
def get_movies_by_director(selected_director):
    """List the titles in ``df_cluster`` directed by ``selected_director``.

    Args:
        selected_director: A value from ``df_cluster['Director']``.

    Returns:
        str: Newline-separated titles, or a fallback message when none match.
    """
    movies = df_cluster[df_cluster['Director'] == selected_director]['Title'].tolist()
    # Titles are coerced to str so that non-string cells (e.g. a purely numeric
    # title read from the spreadsheet) cannot make str.join raise TypeError.
    return "\n".join(map(str, movies)) if movies else "No movies found for this director."

# Process movie data for Apriori
# Split each watchlist once and strip every title, so the watch counts, the
# transaction baskets and the rule item names all use identical strings.
movie_lists = [
    [movie.strip() for movie in movies]
    for movies in df_watchlist["Movies_Watched"].str.split(', ')
]
movie_counts = Counter(movie for movies in movie_lists for movie in movies)

# Prepare data for Apriori: one boolean column per movie, one row per user
te = TransactionEncoder()
te_ary = te.fit(movie_lists).transform(movie_lists)
basket_df = pd.DataFrame(te_ary, columns=te.columns_)
frequent_itemsets = apriori(basket_df, min_support=0.01, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.01)

# Function to recommend movies
def recommend_movie(selected_movie):
    """Recommend movies that the association rules link to ``selected_movie``.

    Every rule whose antecedent set contains ``selected_movie`` contributes
    its consequents to the recommendation.

    Args:
        selected_movie: Movie title to look up in the rule antecedents.

    Returns:
        list[str]: Recommended titles in alphabetical order (so the output is
        stable between runs), or ``["No Movies to Recommend"]`` when no rule
        matches.
    """
    related_rules = rules[rules["antecedents"].apply(lambda x: selected_movie in x)]
    recommended = set()
    for consequents in related_rules["consequents"]:
        recommended.update(consequents)
    # Set iteration order is arbitrary; sort for a deterministic result.
    return sorted(recommended) if recommended else ["No Movies to Recommend"]

# User Clustering using KMeans
# Each user becomes a bag-of-movies vector: the custom tokenizer splits the
# watchlist string on ", ". token_pattern=None states explicitly that the
# default regex pattern is not used when a tokenizer is supplied.
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(', '), token_pattern=None)
X = vectorizer.fit_transform(df_watchlist['Movies_Watched'].astype(str))
num_user_clusters = 10
# n_init is pinned (matching the genre KMeans) so the number of restarts does
# not depend on the installed scikit-learn version's default.
kmeans_users = KMeans(n_clusters=num_user_clusters, random_state=42, n_init=10)
df_watchlist['Cluster'] = kmeans_users.fit_predict(X)

# Function to plot user clustering
def plot_user_clusters():
    """Render a bar chart of how many users fall into each user cluster.

    Reads ``df_watchlist['Cluster']`` at call time.

    Returns:
        str: Path of the saved PNG.
    """
    # Count once and reuse for both the x positions and the bar heights.
    cluster_sizes = df_watchlist['Cluster'].value_counts()

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(cluster_sizes.index, cluster_sizes.values, color='royalblue', edgecolor='black')
    ax.set_xlabel("Cluster")
    ax.set_ylabel("Number of Users")
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_title("User Clustering Based on Movie Preferences")
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    fig.savefig("user_cluster_distribution.png")
    # Release the figure; otherwise every button click leaves one more open.
    plt.close(fig)
    return "user_cluster_distribution.png"

# Function to get top N most-watched movies
def top_movies(n=5):
    """Return the ``n`` most-watched movies with their watch counts.

    Args:
        n: How many movies to return. Numeric values are truncated to ``int``
            because UI number inputs may deliver floats, which
            ``Counter.most_common`` rejects; ``None`` returns every movie.

    Returns:
        dict: Movie title -> watch count, most-watched first.
    """
    if n is not None:
        n = int(n)
    return dict(movie_counts.most_common(n))

# Function to get top N users who watched the most movies
def top_users(n=5):
    """Return the ``n`` users with the longest watchlists.

    Args:
        n: How many users to return; truncated to ``int`` because UI number
            inputs may deliver floats.

    Returns:
        dict: ``User_ID`` -> number of movies watched, largest first.
    """
    n = int(n)
    # Number of movies = number of ", " separators + 1.
    user_movie_counts = df_watchlist.set_index("User_ID")["Movies_Watched"].str.count(', ') + 1
    return user_movie_counts.nlargest(n).to_dict()

# Function to plot top N users who watched the most movies
def plot_top_users(n=5):
    """Render a horizontal bar chart of the ``n`` users with the longest watchlists.

    Args:
        n: How many users to show; truncated to ``int`` because UI number
            inputs may deliver floats.

    Returns:
        str: Path of the saved PNG.
    """
    n = int(n)
    # Number of movies = number of ", " separators + 1.
    user_movie_counts = df_watchlist.set_index("User_ID")["Movies_Watched"].str.count(', ') + 1
    # Named so it does not shadow the module-level top_users() function.
    heaviest_users = user_movie_counts.nlargest(n)

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.barh(heaviest_users.index.astype(str), heaviest_users.values, color='royalblue')
    ax.set_xlabel("Number of Movies Watched")
    ax.set_ylabel("Users")
    ax.set_title(f"Top {n} Users with Most Watched Movies")
    ax.invert_yaxis()  # heaviest watcher at the top
    fig.tight_layout()
    fig.savefig("top_users_plot.png")
    # Release the figure; otherwise every button click leaves one more open.
    plt.close(fig)
    return "top_users_plot.png"

# Function to detect outliers (rarely watched movies)
def detect_rare_movies():
    """List every movie in ``movie_counts`` that was watched fewer than five times.

    Returns:
        str: Newline-separated titles, or a fallback message when no movie is
        below the threshold.
    """
    rare_titles = []
    for title, watch_count in movie_counts.items():
        if watch_count < 5:  # rarity threshold
            rare_titles.append(title)
    if not rare_titles:
        return "No rare movies found."
    return "\n".join(rare_titles)

# Function to visualize rarely watched movies
def plot_rare_movies():
    """Render a horizontal bar chart of rarely watched movies.

    A movie is rare when its count in ``movie_counts`` is below five. At most
    the 50 least-watched rare movies are drawn.

    Returns:
        str | None: Path of the saved PNG, or ``None`` when there are no rare
        movies. ``None`` is returned rather than a message because the result
        feeds an image output, where a plain string would be read as a file
        path.
    """
    rare_movie_counts = {movie: count for movie, count in movie_counts.items() if count < 5}
    if not rare_movie_counts:
        return None
    # Ascending by watch count, capped at 50 bars to keep the chart readable.
    sorted_rare_movies = sorted(rare_movie_counts.items(), key=lambda x: x[1])[:50]
    titles = [title for title, _ in sorted_rare_movies]
    watch_counts = [count for _, count in sorted_rare_movies]

    fig, ax = plt.subplots(figsize=(12, 15))
    ax.barh(titles, watch_counts, color='red', edgecolor='black')
    ax.set_xlabel("Watch Count")
    ax.set_ylabel("Movies")
    ax.set_title("Rarely Watched Movies")
    ax.invert_yaxis()  # least-watched movie at the top
    ax.grid(axis='x', linestyle='--', alpha=0.7)
    fig.tight_layout()
    fig.savefig("rare_movies_plot.png")
    # Release the figure; otherwise every button click leaves one more open.
    plt.close(fig)
    return "rare_movies_plot.png"

# Create Gradio interface: one tab per analysis, each wiring a button to the
# corresponding function defined above.
with gr.Blocks() as demo:
    gr.Markdown("## 🎬Movie Dashboard")

    with gr.Tab("Statistics"):
        gr.Markdown("User Watchlist Statistics")
        stats_result = gr.Textbox()
        stats_btn = gr.Button("Get Statistics")
        stats_btn.click(movie_watch_stats, outputs=stats_result)

    # precision=0 on every "Top N" input makes Gradio hand the callbacks an
    # int; the counting helpers (Counter.most_common / Series.nlargest) need
    # an integer count.
    with gr.Tab("Top N Movies"):
        gr.Markdown("🔥 Top N Movies")
        top_n = gr.Number(value=5, label="Top N Movies", precision=0)
        top_result = gr.Textbox()
        top_btn = gr.Button("Get Top Movies")
        top_btn.click(top_movies, inputs=top_n, outputs=top_result)

    with gr.Tab("Top N Movies Visualization"):
        gr.Markdown("📈 Top N Movies Visualization")
        top_n_plot = gr.Number(value=5, label="Top N Movies for Plot", precision=0)
        plot_result = gr.Image()
        plot_btn = gr.Button("Show Top Movies Chart")
        plot_btn.click(plot_top_movies, inputs=top_n_plot, outputs=plot_result)

    with gr.Tab("Top N Users"):
        gr.Markdown("🏆 Top N Users")
        top_users_n = gr.Number(value=5, label="Top N Users", precision=0)
        top_users_result = gr.Textbox()
        top_users_btn = gr.Button("Get Top Users")
        top_users_btn.click(top_users, inputs=top_users_n, outputs=top_users_result)

    with gr.Tab("Top N Users Visualization"):
        gr.Markdown("📊 Top N Users Visualization")
        top_users_plot_n = gr.Number(value=5, label="Top N Users for Plot", precision=0)
        plot_users_result = gr.Image()
        plot_users_btn = gr.Button("Show Top Users Chart")
        plot_users_btn.click(plot_top_users, inputs=top_users_plot_n, outputs=plot_users_result)

    with gr.Tab("Genre Clustering"):
        gr.Markdown("🎭 Movie Genre Clustering")
        genre_dropdown = gr.Dropdown(choices=genres, label="Select Genre")
        genre_result = gr.Textbox()
        genre_btn = gr.Button("Get Movies by Genre")
        genre_btn.click(get_movies_by_genre, inputs=genre_dropdown, outputs=genre_result)

    with gr.Tab("Movie Director Filtering"):
        gr.Markdown("Director-based movie Filtering")
        director_dropdown = gr.Dropdown(choices=directors, label="Select Director")
        director_result = gr.Textbox()
        director_btn = gr.Button("Get Movies by Director")
        director_btn.click(get_movies_by_director, inputs=director_dropdown, outputs=director_result)

    with gr.Tab("Movie Recommendations"):
        gr.Markdown("🎥 Movie Recommendations (Apriori)")
        # Counter keys are already unique, so sorting the mapping is enough.
        movie_dropdown = gr.Dropdown(choices=sorted(movie_counts), label="Select a Movie")
        recommend_result = gr.Textbox()
        recommend_btn = gr.Button("Get Recommendations")
        recommend_btn.click(recommend_movie, inputs=movie_dropdown, outputs=recommend_result)

    with gr.Tab("User Clustering"):
        gr.Markdown("👥 User Clustering based on movie preferences")
        user_cluster_plot = gr.Image()
        user_cluster_btn = gr.Button("Show User Cluster Distribution")
        user_cluster_btn.click(plot_user_clusters, outputs=user_cluster_plot)

    with gr.Tab("Rare movies(Outlier)"):
        gr.Markdown("🎭 Rarely Watched Movies using outlier analysis")
        rare_movies_result = gr.Textbox()
        rare_movies_btn = gr.Button("Get Rare Movies")
        rare_movies_btn.click(detect_rare_movies, outputs=rare_movies_result)
        rare_movies_plot_result = gr.Image()
        rare_movies_plot_btn = gr.Button("Show Rare Movies Chart")
        rare_movies_plot_btn.click(plot_rare_movies, outputs=rare_movies_plot_result)

# Launch the app
demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7e6ccaf9a2e0eb316a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


