In [1]:
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from pymongo import MongoClient
import os

# Page-level settings are issued first, ahead of every other Streamlit call below.
st.set_page_config(page_title="AFI Top 100 Movies Analysis", page_icon="🎬", layout="wide")

# Heading followed by a one-sentence description of the app.
st.title("🎬 AFI's 100 Years...100 Movies Analysis")
st.markdown(
    "\n"
    "An interactive exploration of the American Film Institute's list of the 100 greatest American movies.\n"
)


# Connect to MongoDB with fallback to CSV
@st.cache_data
def get_data():
    """Load the AFI top-100 dataset, preferring MongoDB over the local CSV.

    The function first queries the ``afi_top_100`` collection of the
    ``movie_database`` database on ``localhost:27017``. If that fails for any
    reason (server unreachable, query error, or an empty collection) it falls
    back to ``afi_top_100_cleaned.csv`` in the current working directory.

    The result is cached with ``st.cache_data`` rather than
    ``st.cache_resource``: the caller cleans and extends the frame in place,
    and a data cache hands out a copy instead of one shared mutable object.

    Returns:
        pandas.DataFrame with the movie records, or ``None`` when neither
        MongoDB nor the CSV file is available.
    """
    try:
        # The context manager closes the client even when ping/find raises,
        # which the previous explicit client.close() did not.
        with MongoClient('mongodb://localhost:27017/', serverSelectionTimeoutMS=5000) as client:
            client.admin.command('ping')
            collection = client['movie_database']['afi_top_100']
            records = list(collection.find({}, {'_id': 0}))
        if not records:
            # An empty collection would yield a column-less DataFrame that the
            # rest of the script cannot use, so treat it like a failed load.
            raise ValueError("collection 'afi_top_100' returned no documents")
        df = pd.DataFrame(records)
    except Exception as e:
        # Deliberately broad: any MongoDB problem should trigger the CSV fallback.
        st.warning(f"MongoDB connection failed: {e}. Trying CSV fallback...")
        if not os.path.exists('afi_top_100_cleaned.csv'):
            st.error("""
            No data source available. Please:
            1. Ensure MongoDB is running on localhost:27017.
            2. Run the data processing script to generate afi_top_100_cleaned.csv.
            """)
            return None
        df = pd.read_csv('afi_top_100_cleaned.csv')
        st.success("Loaded data from afi_top_100_cleaned.csv")
    else:
        st.success("Loaded data from MongoDB")
    return df


def _render_figure(fig):
    """Display a Matplotlib figure in Streamlit, then close it.

    Closing removes the figure from pyplot's global registry so figures do not
    accumulate each time Streamlit reruns the script.
    """
    st.pyplot(fig)
    plt.close(fig)


df = get_data()

if df is not None:
    # Work on a private copy so the in-place cleaning below never touches an
    # object that may be held by the Streamlit cache.
    df = df.copy()

    # Convert year to numeric; values that cannot be parsed become 0 and will
    # therefore show up under decade 0.
    df['Release year'] = pd.to_numeric(df['Release year'], errors='coerce').fillna(0).astype(int)

    # Ensure Title_Sentiment is numeric (column is optional)
    if 'Title_Sentiment' in df.columns:
        df['Title_Sentiment'] = pd.to_numeric(df['Title_Sentiment'], errors='coerce').fillna(0.0)

    # Sidebar filters
    st.sidebar.header("Filter Options")
    decade_options = sorted((df['Release year'] // 10 * 10).unique())
    selected_decades = st.sidebar.multiselect(
        "Select Decades",
        options=decade_options,
        default=decade_options
    )

    selected_directors = st.sidebar.multiselect(
        "Select Directors",
        # dropna(): a missing director is a float NaN, and sorting it together
        # with strings raises TypeError.
        options=sorted(df['Director'].dropna().unique()),
        default=[]
    )

    search_term = st.sidebar.text_input("Search by Film or Director", "")

    # Apply filters; each step builds a new frame, df itself is left untouched.
    filtered_df = df
    if selected_decades:
        filtered_df = filtered_df[(filtered_df['Release year'] // 10 * 10).isin(selected_decades)]
    if selected_directors:
        filtered_df = filtered_df[filtered_df['Director'].isin(selected_directors)]
    if search_term:
        # regex=False: the search box is a plain substring search. Interpreting
        # user input as a regular expression crashes on input such as "(" and
        # mis-matches queries containing "*", "." or "?".
        filtered_df = filtered_df[
            filtered_df['Film'].str.contains(search_term, case=False, na=False, regex=False) |
            filtered_df['Director'].str.contains(search_term, case=False, na=False, regex=False)
        ]

    # Main content tabs
    tab1, tab2, tab3, tab4 = st.tabs(["Dataset", "Visualizations", "Statistics", "Word Cloud"])

    with tab1:
        st.header("Movie Dataset")
        st.dataframe(filtered_df.sort_values('Release year'), use_container_width=True)

        st.download_button(
            label="Download Data as CSV",
            data=filtered_df.to_csv(index=False).encode('utf-8'),
            file_name='afi_top_100_movies.csv',
            mime='text/csv'
        )

    with tab2:
        st.header("Data Visualizations")

        if filtered_df.empty:
            # Nothing to plot; avoid handing empty data to the chart calls.
            st.info("No movies match the current filters.")
        else:
            col1, col2 = st.columns(2)

            with col1:
                st.subheader("Movies by Decade")
                # assign() returns a new frame, so the Decade helper column is
                # neither written into a filtered slice nor leaked into the
                # dataset table / CSV download.
                decade_df = filtered_df.assign(Decade=(filtered_df['Release year'] // 10) * 10)
                fig1, ax1 = plt.subplots(figsize=(10, 6))
                sns.countplot(data=decade_df, x='Decade', hue='Decade', palette='viridis', ax=ax1, legend=False)
                # Rotate on the explicit axes instead of pyplot's "current" axes.
                ax1.tick_params(axis='x', rotation=45)
                _render_figure(fig1)

            with col2:
                st.subheader("Top Directors")
                fig2, ax2 = plt.subplots(figsize=(10, 6))
                filtered_df['Director'].value_counts().head(10).plot(kind='barh', color='darkred', ax=ax2)
                _render_figure(fig2)

            st.subheader("Movies by Year")
            fig3, ax3 = plt.subplots(figsize=(12, 4))
            sns.histplot(data=filtered_df, x='Release year', bins=30, kde=True, ax=ax3)
            _render_figure(fig3)

            if 'Title_Sentiment' in filtered_df.columns:
                st.subheader("Sentiment Distribution")
                fig4, ax4 = plt.subplots(figsize=(12, 4))
                sns.histplot(data=filtered_df, x='Title_Sentiment', bins=20, color='purple', ax=ax4)
                ax4.set_title('Distribution of Movie Title Sentiment')
                ax4.set_xlabel('Sentiment Score (-1 to 1)')
                ax4.set_ylabel('Count')
                _render_figure(fig4)

    with tab3:
        st.header("Key Statistics")

        if filtered_df.empty:
            # min/max/idxmax on an empty selection are meaningless and idxmax/idxmin raise.
            st.info("No movies match the current filters.")
        else:
            col1, col2 = st.columns(2)

            # Computed once and reused by both columns; value_counts() ignores NaN.
            director_counts = filtered_df['Director'].value_counts()

            with col1:
                st.subheader("Time Range")
                st.metric("Earliest Film", int(filtered_df['Release year'].min()))
                st.metric("Latest Film", int(filtered_df['Release year'].max()))

                st.subheader("Most Frequent Director")
                if director_counts.empty:
                    # Every director in the selection is missing.
                    top_director, top_count = 'N/A', 0
                else:
                    top_director = filtered_df['Director'].mode()[0]
                    top_count = director_counts.max()
                st.metric("Director with most films", f"{top_director} ({top_count} films)")

                if 'Title_Sentiment' in filtered_df.columns:
                    sentiment = filtered_df['Title_Sentiment']
                    st.subheader("Sentiment Statistics")
                    st.metric("Average Sentiment", f"{sentiment.mean():.3f}")
                    st.metric("Most Positive Title",
                              f"{filtered_df.loc[sentiment.idxmax(), 'Film']} ({sentiment.max():.3f})")
                    st.metric("Most Negative Title",
                              f"{filtered_df.loc[sentiment.idxmin(), 'Film']} ({sentiment.min():.3f})")

            with col2:
                st.subheader("Decade Distribution")
                decade_counts = (filtered_df['Release year'] // 10 * 10).value_counts().sort_index()
                st.table(decade_counts)

                st.subheader("Directors with Multiple Films")
                st.table(director_counts[director_counts > 1])

    with tab4:
        st.header("Movie Titles Word Cloud")
        # astype(str) guards against non-string titles; strip() makes an
        # all-missing selection collapse to the empty string.
        title_text = ' '.join(filtered_df['Film'].dropna().astype(str)).strip()
        wordcloud = None
        if title_text:
            try:
                wordcloud = WordCloud(width=800, height=400, background_color='white').generate(title_text)
            except ValueError:
                # NOTE(review): WordCloud appears to raise ValueError when no
                # plottable word remains (e.g. only stop words) - confirm.
                wordcloud = None

        if wordcloud is not None:
            fig, ax = plt.subplots(figsize=(12, 6))
            ax.imshow(wordcloud, interpolation='bilinear')
            ax.axis('off')
            _render_figure(fig)
        else:
            st.warning("No movie titles available for word cloud.")

    # Footer
    st.markdown("---")
    st.markdown("""
    *Data Source*: [Wikipedia - AFI's 100 Years...100 Movies](https://en.wikipedia.org/wiki/AFI%27s_100_Years...100_Movies)  
    *Analysis*: Interactive exploration of the American Film Institute's top 100 films  
    *Troubleshooting*:  
    - Ensure MongoDB is running on localhost:27017 (run mongod in a terminal) or afi_top_100_cleaned.csv is in the project directory.  
    - Install MongoDB: [MongoDB Installation Guide](https://www.mongodb.com/docs/manual/installation/).  
    - Install dependencies: pip install streamlit pandas pymongo matplotlib seaborn wordcloud textblob.
    """)

else:
    st.error("Failed to load data. Please check MongoDB or CSV availability.")

2025-05-05 09:50:58.964 
  Warning: to view this Streamlit app on a browser, run it with the following
  command:

    streamlit run C:\Users\MSI\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-05-05 09:50:59.061 Session state does not function when running a script without `streamlit run`
