# Act 2 Import / Export Basics

## Daten importieren

- Verbindung zu Drive herstellen
- Nötige Libraries importieren

In [None]:
# Data handling (pandas) and plotting (matplotlib) libraries.
import pandas as pd
import matplotlib.pyplot as plt

# Mount Google Drive at /content/gdrive so the paths used below resolve.
from google.colab import drive
drive.mount('/content/gdrive')

- Pfad zum File definieren
- CSV importieren mittels Pandas

In [None]:
# Location of the movie CSV on the mounted Drive. The literal is split only
# for readability; adjacent string literals are joined into a single path.
file_path = (
    "/content/gdrive/MyDrive/Colab Notebooks/"
    "Act_2_Python_Intro/data/movies_imdb.csv"
)

# Read the CSV into a pandas DataFrame.
data = pd.read_csv(file_path)

## Datenstruktur verstehen

- Spalten-Namen ausgeben
- Die ersten 5 Einträge ausgeben
- Einen spezifischen Eintrag an Index-Position `n` ausgeben
- Eine spezifische Spalte ausgeben

In [None]:
# Show the column labels of the DataFrame (not its contents)
data.columns

In [None]:
# Preview the first five rows (5 is also what head() uses by default).
data.head(n=5)

In [None]:
# Select a single row by integer position (iloc counts from 0)
data.iloc[10]

In [None]:
# Select the single column "title"
data["title"]

### Daten sortieren

- Daten nach einer Spalte alphabetisch sortieren
- Daten nach einer Spalte nach Werten sortieren

In [None]:
# Order the rows by the "title" column; ascending is the default direction.
sorted_title_az = data.sort_values("title")
sorted_title_az.head()

In [None]:
# Order the rows by the "imdbRating" column, highest value first.
sorted_rating = data.sort_values("imdbRating", ascending=False)
sorted_rating.head()

## Daten filtern

- Filtere die Daten nach verschiedenen Werten wie dem Jahr oder dem Land.
- Finde alle Filme mit einem Rating von 9 oder höher
- Finde alle Filme zwischen 1990 und 1999
- Finde alle Filme mit dem Genre "Animation"

In [None]:
# Keep the rows whose 'year' equals the string "1989".
# NOTE(review): the comparison value is text, so this presumably relies on
# 'year' being stored as strings -- confirm against the CSV.
movies_1989 = data[data['year'].eq("1989")]
movies_1989.head()

In [None]:
# Rows whose 'country' value is exactly "Japan" (nothing else in the field).
movies_japan = data[data["country"].eq("Japan")]
# Show how many rows matched; use movies_japan.head() to preview them instead.
len(movies_japan)

In [None]:
# Rows whose 'country' value contains "Japan" anywhere in the text;
# na=False treats missing country values as "no match".
all_movies_japan = data[
    data["country"].str.contains("Japan", na=False)
]
# Show how many rows matched; use all_movies_japan.head() to preview them.
len(all_movies_japan)

In [None]:
# Rows with a rating of 9 or above (the bound itself is included).
best_movies = data[data["imdbRating"].ge(9)]
# Highest-rated rows first.
best_sorted = best_movies.sort_values("imdbRating", ascending=False)
best_sorted.head()

In [None]:
# Parse 'year' into a numeric helper column; values that cannot be parsed
# become NaN instead of raising.
data['year_numeric'] = pd.to_numeric(data['year'], errors='coerce')

# Keep 1990 through 1999. between() includes both bounds by default and
# evaluates to False for NaN, so unparseable years are dropped.
movies_90s = data[data['year_numeric'].between(1990, 1999)]
movies_90s.head()


In [None]:
# Rows whose 'genre' text contains "Animation"; missing genres never match.
all_animation = data[
    data["genre"].str.contains("Animation", na=False)
]
all_animation.head()

In [None]:
# Narrow the 1990s subset to rows whose 'genre' text contains "Action".
all_90s_action = movies_90s[movies_90s["genre"].str.contains("Action", na=False)]
all_90s_action.head()

## Basic Statistics

- Berechne den Mean und Median der Ratings. Der Mittelwert (Mean) ist der Durchschnitt, der berechnet wird, indem alle Werte addiert und durch die Anzahl der Werte geteilt werden. Der Median hingegen ist der mittlere Wert in einer sortierten Datenreihe, der die Daten in zwei Hälften teilt – unabhängig von extremen Ausreißern.
- Berechne den häufigsten Wert (Mode)
- Finde den niedrigsten und höchsten Wert.

In [None]:
# Arithmetic mean and median of the 'imdbRating' column.
mean_rating = data['imdbRating'].mean()
median_rating = data['imdbRating'].median()

# Report both values, one per line.
print(
    f"Mean IMDb Rating: {mean_rating}",
    f"Median IMDb Rating: {median_rating}",
    sep="\n",
)

In [None]:
# The most frequent value(s); mode() returns a Series because values can tie
data['imdbRating'].mode()

In [None]:
# Smallest and largest value in the 'imdbRating' column.
# Both labels use a single colon so the two output lines are formatted alike.
print(f"Lowest Rating: {data['imdbRating'].min()}")
print(f"Highest Rating: {data['imdbRating'].max()}")

## Export

- Exportiere einen der gefilterten Datensätze als neues CSV.
- Exportiere denselben Datensatz auch als JSON und Excel-File

In [None]:
import os

# Target file for the CSV export on the mounted Drive.
export_path = '/content/gdrive/MyDrive/Colab Notebooks/Act_2_Python_Intro/export/movies_90s_action.csv'

# pandas does not create missing folders, so make sure the export directory
# exists before writing (does nothing if it is already there).
os.makedirs(os.path.dirname(export_path), exist_ok=True)

# Write the filtered rows; index=False leaves out the DataFrame index column.
all_90s_action.to_csv(export_path, index=False)

In [None]:
# Write the same rows as JSON; orient='records' produces a list with one
# object per row.
all_90s_action.to_json(
    '/content/gdrive/MyDrive/Colab Notebooks/Act_2_Python_Intro/export/movies_90s_action.json',
    orient='records',
)


In [None]:
# Write the same rows as an Excel workbook; index=False leaves out the
# DataFrame index column.
all_90s_action.to_excel(
    '/content/gdrive/MyDrive/Colab Notebooks/Act_2_Python_Intro/export/movies_90s_action.xlsx',
    index=False,
)


## Visualize Movies per Year

In [None]:
# Number of rows per distinct 'year' value, ordered by the year label
# rather than by frequency.
movies_per_year = (
    data['year']
    .value_counts()
    .sort_index()
)


In [None]:
# Draw the per-year counts as a bar chart on a wide figure.
plt.figure(figsize=(20, 6))
movies_per_year.plot.bar()

# Title and axis labels.
plt.title('Number of Movies Per Year')
plt.xlabel('Year')
plt.ylabel('Number of Movies')

# Vertical, small tick labels on the x-axis.
plt.xticks(rotation=90, fontsize=8)

# Light dashed horizontal guide lines.
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

## Visualize Movies per Country

In [None]:
# Count how often each distinct 'country' value occurs (most frequent first)
movies_per_country = data['country'].value_counts()
print(movies_per_country)

In [None]:
# Keep only the country values that occur in more than 10 rows.
filtered_countries = movies_per_country[movies_per_country.gt(10)]
print(filtered_countries)

# Bar chart of the remaining country values on a wide figure.
plt.figure(figsize=(20, 6))
filtered_countries.plot.bar(color='skyblue', edgecolor='black')

# Title, axis labels and tick styling.
plt.title('Number of Movies Per Country (More than 10 Movies)', fontsize=16)
plt.xlabel('Country', fontsize=12)
plt.ylabel('Number of Movies', fontsize=12)
plt.xticks(rotation=45, fontsize=10)
plt.yticks(fontsize=10)

# Light dashed horizontal guide lines.
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Fit all elements inside the figure, then render it.
plt.tight_layout()
plt.show()

In [None]:
# Case-insensitive substring match on 'country'; rows without a country
# value are excluded (na=False).
switzerland_movies = data[
    data['country'].str.contains('Switzerland', case=False, na=False)
]

# Leave the DataFrame as the last expression so the notebook renders it.
switzerland_movies

In [None]:
# Number of Switzerland-related rows per 'year' value, ordered by year label.
switzerland_count = switzerland_movies['year'].value_counts().sort_index()

# Bar chart of the per-year counts on a wide figure.
plt.figure(figsize=(20, 6))
switzerland_count.plot(kind='bar', color='skyblue', edgecolor='black')

# Title and labels describe what is actually plotted: the x-axis holds years,
# not countries, and no "more than 10" threshold is applied here.
plt.title('Number of Movies from Switzerland Per Year', fontsize=16)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Number of Movies', fontsize=12)
plt.xticks(rotation=45, fontsize=10)
plt.yticks(fontsize=10)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Fit all elements inside the figure, then render it.
plt.tight_layout()
plt.show()