## Step 1: Import Required Libraries

In [2]:
# Standard library
import os
# For database storage
import sqlite3

# Third-party
# pyplot (not the top-level matplotlib package) provides figure()/subplots()/show()
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# For API calls
import requests
import seaborn as sns
from wordcloud import WordCloud

## Step 2: Load APIs

In [17]:
# Bring in the .env loader
from dotenv import load_dotenv

# Read the .env file so its variables become visible through os.environ
load_dotenv()

# Pull both API keys out of the environment (None when a variable is unset)
tmdb_api_key = os.environ.get("TMDB_API_KEY")
omdb_api_key = os.environ.get("OMDB_API_KEY")

# Report only whether the keys are present -- never echo the secrets themselves
if not (tmdb_api_key and omdb_api_key):
    print("Error: API keys not found. Check your .env file.")
else:
    print("API keys loaded successfully!")


API keys loaded successfully!


## Step 3: Load Data

In [18]:
# url = "https://api.themoviedb.org/3/discover/movie?include_adult=false&include_video=false&language=en-US&page=1&sort_by=popularity.desc"

# headers = {
#     "accept": "application/json",
#     "Authorization": "Bearer <TMDB_READ_ACCESS_TOKEN>"  # hardcoded token redacted: revoke the exposed token and load a new one from .env
# }

# response = requests.get(url, headers=headers)

# print(response.text)


# url = "https://api.themoviedb.org/3/authentication"

# headers = {
#     "accept": "application/json",
#     "Authorization": "Bearer <TMDB_READ_ACCESS_TOKEN>"  # hardcoded token redacted: revoke the exposed token and load a new one from .env
# }

# response = requests.get(url, headers=headers)

# print(response.text)

# # # Define the TMDb API URL
# tmdb_url = f"https://api.themoviedb.org/3/movie/550?api_key={tmdb_api_key}"

# # # Make the request
# response = requests.get(tmdb_url)

# # # Check the response
# if response.status_code == 200:
#     data = response.json()
#     print("Successfully retrieved data from TMDb API!")
#     print(data["results"][:2])  # Print the first two movie entries
# else:
#     print("Error:", response.status_code, response.text)

# # Define the OMDb API URL
# omdb_url = f"http://www.omdbapi.com/?apikey={omdb_api_key}   
# Define a sample movie title
movie_title = "Inception"

# Define the OMDb API endpoint. The API key and title are passed through
# `params` so that requests URL-encodes them (titles can contain spaces, "&",
# "#", ...) instead of being spliced raw into the URL string.
omdb_url = "https://www.omdbapi.com/"
omdb_params = {"apikey": omdb_api_key, "t": movie_title}

# Make the request; the timeout keeps the cell from hanging on a dead connection
response = requests.get(omdb_url, params=omdb_params, timeout=10)

# Check the response
if response.status_code == 200:
    movie_data = response.json()
    # OMDb can report a failed lookup in the JSON body
    # ({"Response": "False", "Error": "..."}), so the HTTP status alone
    # is not enough to call the request a success.
    if movie_data.get("Response") == "False":
        print("Error:", movie_data.get("Error", "Unknown OMDb error"))
    else:
        print(f"Successfully retrieved data for {movie_title} from OMDb API!")
        print(movie_data)
else:
    print("Error:", response.status_code, response.text)


Error: 401 {"Response":"False","Error":"Invalid API key!"}


## Step 3b: Data Cleaning

In [None]:
# Clean movie metadata
def clean_movie_data(df):
    """
    Handle missing values in the movie dataset and return a cleaned copy.

    Rows missing "title" or "release_year" are dropped. In the remaining rows,
    missing "box_office" values become 0 and missing "runtime" values become
    the median runtime of the remaining rows.

    The input frame is left untouched, so the cell that calls this function
    can be re-run safely. Column names are not renamed here; they are expected
    to already be "title", "release_year", "box_office" and "runtime".

    Parameters
    ----------
    df : pandas.DataFrame
        Raw movie metadata.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with the rows/values cleaned as described above. The
        original index labels of the surviving rows are preserved.

    Raises
    ------
    KeyError
        If "title", "release_year" or "runtime" is not a column of ``df``.
    """
    # dropna without inplace returns a new frame, leaving the caller's data intact
    cleaned = df.dropna(subset=["title", "release_year"])
    # Median is taken after the drop so discarded rows do not influence it
    runtime_median = cleaned["runtime"].median()
    return cleaned.fillna({"box_office": 0, "runtime": runtime_median})

# Clean speech transcripts (stretch goal)
# def preprocess_speech_text(text):
#     """
#     Tokenize and clean Oscar acceptance speech text for word frequency analysis.
#     """
#     nltk.download("stopwords")
#     nltk.download("punkt")
#     tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
#     filtered_words = [word for word in tokens if word.isalnum() and word not in stopwords.words("english")]
#     return " ".join(filtered_words)

# speech_df["cleaned_speech"] = speech_df["speech_text"].apply(preprocess_speech_text)



In [None]:
# Step 4: Store Data in SQLite Database
# Open the SQLite database file, then write each DataFrame to its own table.
# if_exists="replace" overwrites a table left over from an earlier run, and
# index=False keeps the DataFrame index out of the stored columns.
# NOTE(review): awards_df and speech_df are not created in the cells shown
# above -- confirm they are loaded before this cell runs.
conn = sqlite3.connect("academy_awards.db")
awards_df.to_sql(name="awards", con=conn, if_exists="replace", index=False)
speech_df.to_sql(name="speeches", con=conn, if_exists="replace", index=False)



In [None]:
# Step 5: SQL Queries & Analysis
## Query genres of Best Picture winners over decades
# strftime('%Y', ...) yields the 4-digit year as text. Casting it to INTEGER,
# integer-dividing by 10 and multiplying back truncates the year to the start
# of its decade (e.g. 1994 -> 1990), so wins are grouped per decade rather
# than per individual year.
# NOTE(review): assumes award_year is stored in a form SQLite's date functions
# accept (e.g. 'YYYY-MM-DD') -- confirm against the awards table.
query = """
SELECT genre,
       COUNT(*) AS num_wins,
       (CAST(strftime('%Y', award_year) AS INTEGER) / 10) * 10 AS decade
FROM awards
WHERE category = 'Best Picture'
GROUP BY genre, decade
ORDER BY decade ASC;
"""
genre_trends_df = pd.read_sql(query, conn)

## Query word frequency in acceptance speeches
# NOTE(review): the cell that would create the cleaned_speech column is
# commented out above -- confirm the speeches table actually has this column.
query = """
SELECT cleaned_speech FROM speeches;
"""
speech_texts = pd.read_sql(query, conn)



In [None]:
# Step 6: Data Visualization
# Each figure uses an explicit Axes (fig, ax) so titles and labels are applied
# to a known target instead of pyplot's implicit "current" figure.

## Bar Chart - Best Picture Wins by Genre
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x="genre", y="num_wins", hue="decade", data=genre_trends_df, ax=ax)
ax.tick_params(axis="x", labelrotation=45)  # genre names overlap when horizontal
ax.set_title("Best Picture Wins by Genre Over Decades")
ax.set_xlabel("Genre")
ax.set_ylabel("Number of Best Picture wins")
plt.show()

## Scatter Plot - Box Office vs IMDb Ratings
# NOTE(review): assumes awards_df has box_office, imdb_rating and decade
# columns -- confirm against the loaded data.
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(x="box_office", y="imdb_rating", hue="decade", data=awards_df, ax=ax)
ax.set_title("Box Office Revenue vs IMDb Ratings for Oscar Winners")
ax.set_xlabel("Box office revenue")
ax.set_ylabel("IMDb rating")
plt.show()

## Word Cloud - Common Words in Acceptance Speeches
# Drop missing speeches first: str.join raises TypeError on None/NaN entries.
speech_strings = speech_texts["cleaned_speech"].dropna().astype(str)
all_text = " ".join(speech_strings)
if all_text.strip():
    wordcloud = WordCloud(width=800, height=400, background_color="white").generate(all_text)
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.axis("off")
    ax.set_title("Most Common Words in Oscar Acceptance Speeches")
    plt.show()
else:
    # WordCloud.generate cannot build a cloud from text with no words
    print("No speech text available; skipping the word cloud.")



In [None]:
# Step 7: Conclusion & Interpretation
"""
- The bar chart shows which genres have dominated the Best Picture category over time.
- The scatter plot identifies any correlation between box office revenue and audience reception (IMDb ratings).
- The word cloud highlights common themes in Oscar speeches, reflecting industry trends and sentiments. (stretch goal)
"""



### Overview of the Analysis (example text from a different project — replace with this notebook's Oscars findings)
- In this analysis, we explored the relationship between the race of law enforcement officers and the race of the drivers they stop. Our goal was to see if there’s any indication of bias in traffic stops based on the racial identity of the officers. To do this, we used a chi-squared test for independence, which helps us understand whether there’s a meaningful connection between these two groups.

### Results of the Chi-Squared Test
- **Chi-Squared Statistic:** We calculated a chi-squared statistic of 122.92. Judged against the test's degrees of freedom, a value this large indicates a substantial gap between the actual number of stops for different racial groups and what we would expect to see if there were no connection between the officer's race and the driver's race. In other words, the patterns we observe in the data are unlikely to be just a coincidence.

- **P-Value:** The p-value we found was about 8.20e-17, which is extremely low. This tells us that the result is statistically significant since it’s much lower than the usual thresholds (like 0.05 or 0.01). A low p-value means we have strong evidence against the idea that there’s no connection between the officer's race and the driver's race.

### Interpretation of Findings
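- The results show a statistically significant association between the race of the officer and the race of the driver being stopped. This means that the racial mix of drivers who are stopped differs depending on the officer's race, suggesting there might be some bias in how traffic stops are carried out. Note that the chi-squared test only tells us that an association exists; it does not measure how strong the association is or what causes it.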

### Implications
- These findings are important for understanding how race plays a role in law enforcement. They suggest that different racial groups might be treated differently by officers during traffic stops. It's crucial to address these biases to ensure fairness and equality in policing.

### Conclusion
- The strong evidence from the chi-squared statistic and p-value emphasizes the importance of further examining law enforcement practices. Police leaders and community advocacy groups should take these findings into account when reviewing policies and training programs designed to reduce racial bias in policing.