In [3]:
import pandas as pd
import numpy as np
import json
import ast
import csv
import seaborn as sns
import matplotlib.pyplot as plt
from tabulate import tabulate

In [4]:
#Helper functions
def parse_json_safe(x):
    """Decode a JSON-like cell value, falling back to an empty list.

    Parameters
    ----------
    x : Any
        A raw cell value: a JSON string, a string holding a Python literal
        (e.g. single-quoted dicts/lists), an already-parsed object, or a
        missing value.

    Returns
    -------
    object
        The decoded value for strings that parse; ``x`` unchanged for
        non-string, non-missing input; ``[]`` for empty strings, missing
        scalars (``None``/``NaN``/``pd.NA``) and strings that cannot be parsed.
    """
    if isinstance(x, str):
        if x == '':
            return []
        try:
            return json.loads(x)
        except Exception:
            # Not strict JSON (for example single-quoted keys); try Python
            # literal syntax before giving up.
            try:
                return ast.literal_eval(x)
            except Exception:
                # Deliberate best-effort: an unparseable cell counts as "no items".
                return []

    # pd.isna() answers element-wise (returns an array) for list-like input,
    # which cannot be used in a boolean test, so only ask it about scalars.
    if pd.api.types.is_scalar(x) and pd.isna(x):
        return []

    # Already-parsed containers and other non-string values pass through as-is.
    return x

def pretty_print(df, title=None, n=5):
    """Print the first ``n`` rows of ``df`` as a ``fancy_grid`` table.

    Parameters
    ----------
    df : DataFrame-like
        Object providing ``head(n)`` and ``len()``.
    title : str, optional
        When truthy, a ``===== title =====`` banner is printed above the table.
    n : int, default 5
        Number of leading rows to show.

    Returns
    -------
    None
        Output is written to stdout only.
    """
    if title:
        banner = f"\n===== {title} ====="
        print(banner)

    preview = df.head(n)
    rendered = tabulate(preview, headers="keys", tablefmt="fancy_grid")
    print(rendered)

    # Tell the reader how many rows were left out of the preview.
    hidden = len(df) - n
    if hidden > 0:
        print(f"... ({hidden} more rows)")


In [5]:


# One collector per input file; each holds the rows handed to its callback below.
bad_movie_lines, bad_credit_lines, bad_keyword_lines = [], [], []
bad_links_lines, bad_ratings_lines = [], []


def _record_bad_line(collector, line):
    """Append ``line`` to ``collector`` and return None."""
    collector.append(line)
    return None


def log_bad_movie(line):
    """Store ``line`` in ``bad_movie_lines``; returns None."""
    return _record_bad_line(bad_movie_lines, line)


def log_bad_credit(line):
    """Store ``line`` in ``bad_credit_lines``; returns None."""
    return _record_bad_line(bad_credit_lines, line)


def log_bad_keyword(line):
    """Store ``line`` in ``bad_keyword_lines``; returns None."""
    return _record_bad_line(bad_keyword_lines, line)


def log_bad_links(line):
    """Store ``line`` in ``bad_links_lines``; returns None."""
    return _record_bad_line(bad_links_lines, line)


def log_bad_ratings(line):
    """Store ``line`` in ``bad_ratings_lines``; returns None."""
    return _record_bad_line(bad_ratings_lines, line)

def _read_csv_logged(path, on_bad_line):
    """Read ``path`` with the shared parser options, routing bad rows to a callback.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    on_bad_line : callable
        Called by pandas with each malformed row; a callable ``on_bad_lines``
        is only supported by the python engine, hence ``engine="python"``.

    Returns
    -------
    pandas.DataFrame
    """
    return pd.read_csv(
        path,
        engine="python",
        sep=",",
        quotechar='"',
        on_bad_lines=on_bad_line,
    )


# Each file reports its malformed rows to its OWN collector. links_small and
# ratings_small previously logged into the movie collector, which hid their
# skipped rows and inflated the movie count.
movies = _read_csv_logged("../movies/movies_metadata.csv", log_bad_movie)
movie_credits = _read_csv_logged("../movies/credits.csv", log_bad_credit)
keywords = _read_csv_logged("../movies/keywords.csv", log_bad_keyword)
links_small = _read_csv_logged("../movies/links_small.csv", log_bad_links)
ratings_small = _read_csv_logged("../movies/ratings_small.csv", log_bad_ratings)

# Load summary: frame shapes, number of skipped rows, and one example skipped
# row per dataset (only when something was actually skipped).
load_report = [
    # (label, frame, skipped rows, header printed above the example row)
    ("movies", movies, bad_movie_lines, "example bad movie line:"),
    ("credits", movie_credits, bad_credit_lines, "example bad credit line:"),
    ("keywords", keywords, bad_keyword_lines, "example bad keyword line:"),
    ("links", links_small, bad_links_lines, "example bad links small:"),
    ("ratings", ratings_small, bad_ratings_lines, "example bad ratings small:"),
]

print("Datasets loaded.")
for label, frame, skipped, example_header in load_report:
    print(f"{label}: {frame.shape}")

for label, frame, skipped, example_header in load_report:
    print(f"{label} skipped lines: {len(skipped)}")

for label, frame, skipped, example_header in load_report:
    if not skipped:
        continue
    first_bad = skipped[0]
    # pandas hands a callable on_bad_lines each row as a list of field strings,
    # so slicing that list would keep up to 200 *fields*. Join the fields back
    # into text first so the preview really is capped at 200 characters.
    if isinstance(first_bad, str):
        preview = first_bad
    else:
        preview = ",".join(map(str, first_bad))
    print(example_header)
    print(preview[:200])


Datasets loaded.
movies: (2803, 24)
credits: (365, 3)
keywords: (46419, 2)
links: (9125, 3)
ratings: (100004, 4)
movies skipped lines: 0
credits skipped lines: 0
keywords skipped lines: 0
links skipped lines: 0
ratings skipped lines: 0


In [6]:
# Prints a fixed literal string.
# NOTE(review): looks like a leftover scratch cell — confirm and consider removing.
print("hæstkommode")

hæstkommode


In [7]:
# Prints a fixed literal string.
# NOTE(review): looks like a leftover scratch cell — confirm and consider removing.
print('hestebanan')

hestebanan


In [8]:
# Prints a fixed literal string.
# NOTE(review): looks like a leftover scratch cell — confirm and consider removing.
print("oppdaterforjensmartingjahle")

oppdaterforjensmartingjahle
