In [7]:
import pandas as pd
import csv
import matplotlib.pyplot as plt

# ===============================
# 1. Wczytywanie danych
# ===============================

movies_path = "tmdb_movies (2).csv"
genres_path = "tmdb_genres (2).csv"

# --- Filmy ---
df_movies = pd.read_csv(
    movies_path,
    sep=",",
    engine="python",
    quoting=csv.QUOTE_MINIMAL,
    on_bad_lines="skip"
)

# Ostatnia kolumna zawiera genre_id – nadajemy nazwę jawnie
df_movies = df_movies.rename(columns={df_movies.columns[-1]: "genre_id"})

# --- Gatunki ---df_genres = pd.read_csv(genres_path, header=None)

df_genres = pd.read_csv(genres_path, header=None)

# jawne nadanie nazw kolumn
df_genres.columns = ["genre_id", "genre"]





# ===============================
# 2. Poprawa typów i czyszczenie
# ===============================

df_movies["genre_id"] = pd.to_numeric(df_movies["genre_id"], errors="coerce")
df_genres["genre_id"] = pd.to_numeric(df_genres["genre_id"], errors="coerce")

df_movies = df_movies.dropna(subset=["genre_id"])
df_genres = df_genres.dropna(subset=["genre_id"])

df_movies["genre_id"] = df_movies["genre_id"].astype(int)
df_genres["genre_id"] = df_genres["genre_id"].astype(int)

# ===============================
# 3. Połączenie tabel
# ===============================

merged = df_movies.merge(df_genres, on="genre_id", how="left")

# ===============================
# 4. Top 10 filmów (vote_count > Q3)
# ===============================

merged["vote_count"] = pd.to_numeric(merged["vote_count"], errors="coerce")
merged["vote_average"] = pd.to_numeric(merged["vote_average"], errors="coerce")

q3 = merged["vote_count"].quantile(0.75)

top10 = (
    merged[merged["vote_count"] > q3]
    .sort_values("vote_average", ascending=False
    .head(10)
)

print("\nTOP 10 FILMÓW:")
print(top10[["title", "vote_average", "vote_count", "genre"]])

# ===============================
# 5. Wykres revenue + budget (2010–2016)
# ===============================

merged["release_year"] = pd.to_numeric(
    merged["release_date"].str[:4],
    errors="coerce"
)

merged["revenue"] = pd.to_numeric(merged["revenue"], errors="coerce")
merged["budget"] = pd.to_numeric(merged["budget"], errors="coerce")

subset = merged[
    (merged["release_year"] >= 2010) &
    (merged["release_year"] <= 2016)
]

grouped = subset.groupby("release_year")[["revenue", "budget"]].mean()

plt.figure(figsize=(10, 6))

plt.bar(grouped.index, grouped["revenue"], label="Średni revenue")
plt.plot(grouped.index, grouped["budget"], linewidth=2, label="Średni budget")

plt.xlabel("Rok")
plt.ylabel("Średnie wartości (USD)")
plt.title("Średni revenue (kolumny) i budget (linia) dla lat 2010–2016")

# Legenda w prawym górnym rogu płótna, poza osiami
plt.legend(
    loc="upper left",
    bbox_to_anchor=(1.02, 1),
    borderaxespad=0
)

plt.tight_layout()
plt.show()

# ===============================
# 6. Najczęstszy gatunek
# ===============================

most_common_genre = merged["genre"].value_counts().head(1)

print("\nNAJCZĘSTSZY GATUNEK:")
print(most_common_genre)

# ===============================
# 7. Gatunek o najdłuższym średnim runtime
# ===============================

merged["runtime"] = pd.to_numeric(merged["runtime"], errors="coerce")

avg_runtime = (
    merged
    .dropna(subset=["genre", "runtime"])
    .groupby("genre")["runtime"]
    .mean()
    .sort_values(ascending=False)
)

longest_genre = avg_runtime.index[0]

print("\nGATUNEK O NAJDŁUŻSZYM ŚREDNIM CZASIE TRWANIA:")
print(f"{longest_genre} — {avg_runtime.iloc[0]:.2f} minut")

# ===============================
# 8. Histogram czasu trwania
# ===============================

hist_data = merged[
    merged["genre"] == longest_genre
]["runtime"].dropna()

plt.figure(figsize=(8, 5))
plt.hist(hist_data, bins=20)
plt.xlabel("Runtime (min)")
plt.ylabel("Frequency")
plt.title(f"Histogram czasu trwania filmów – gatunek: {longest_genre}")
plt.tight_layout()
plt.show()

SyntaxError: '(' was never closed (980854964.py, line 63)