In [9]:
import psycopg2
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def load_config():
    return {
        'dbname': 'postgres',
        'user': 'postgres',
        'password': 'Laiba786',
        'host': 'localhost',
        'port': '5432'
    }

def get_user_ratings_boxoffice_data():
    config = load_config()
    try:
        with psycopg2.connect(**config) as conn:
            with conn.cursor() as cur:
                query = """
                SELECT 
                    movie_table.movie_id,
                    movie_table.worldwideboxoffice,
                    AVG(user_reviews.reviewscore) AS avg_reviewscore
                FROM movie_table
                JOIN user_reviews ON movie_table.movie_id = user_reviews.movie_id
                GROUP BY movie_table.movie_id
                """
                cur.execute(query)
                
                rows = cur.fetchall()
                df_ratings_boxoffice = pd.DataFrame(rows, columns=['movie_id', 'worldwide_box_office', 'avg_reviewscore'])

                print("Data before handling missing values:")
                print(df_ratings_boxoffice.head())

                print("Missing values before filling:")
                print(df_ratings_boxoffice.isna().sum())
                
                # Fill missing values for avg_reviewscore with the mean
                df_ratings_boxoffice['avg_reviewscore'] = df_ratings_boxoffice['avg_reviewscore'].fillna(df_ratings_boxoffice['avg_reviewscore'].mean())
                
                # Fill missing values for worldwide_box_office with 0 (or you can choose to fill with the mean as well)
                df_ratings_boxoffice['worldwide_box_office'] = df_ratings_boxoffice['worldwide_box_office'].fillna(0)

                print("Missing values after filling:")
                print(df_ratings_boxoffice.isna().sum())

                return df_ratings_boxoffice

    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        return None

def calculate_correlation(df_ratings_boxoffice):
    correlation = df_ratings_boxoffice['worldwide_box_office'].corr(df_ratings_boxoffice['avg_reviewscore'])
    print(f"Correlation between Average User Ratings and Worldwide Box Office: {correlation}")
    return correlation

def plot_correlation(df_ratings_boxoffice):
    sns.set(style="whitegrid")
    
    plt.figure(figsize=(10, 6))
    sns.regplot(x='avg_reviewscore', y='worldwide_box_office', data=df_ratings_boxoffice, scatter_kws={'s': 10}, line_kws={"color":"red"})
    
    plt.title('Correlation between Average User Ratings and Worldwide Box Office')
    plt.xlabel('Average User Ratings')
    plt.ylabel('Worldwide Box Office ($)')
    
    plt.show()

if __name__ == '__main__':
    df_ratings_boxoffice = get_user_ratings_boxoffice_data()
    if df_ratings_boxoffice is not None:
        print(df_ratings_boxoffice.head())
        
        correlation = calculate_correlation(df_ratings_boxoffice)
        
        plot_correlation(df_ratings_boxoffice)


Data before handling missing values:
   movie_id  worldwide_box_office avg_reviewscore
0     18803           197618160.0            None
1     31297                   NaN            None
2     34261                   NaN            None
3     31789                   NaN            None
4     33957                   NaN            None
Missing values before filling:
movie_id                   0
worldwide_box_office    4642
avg_reviewscore         9509
dtype: int64
name 'df_ratings_boxoffic' is not defined
