-
Notifications
You must be signed in to change notification settings - Fork 0
/
populate_db.py
89 lines (72 loc) · 2.49 KB
/
populate_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import json
from sqlalchemy import (Table, Column, Integer,
Unicode, MetaData,
ForeignKey, create_engine)
from application import ENGINE_URL
metadata = MetaData()
actors = Table('actors', metadata,
Column('actor_id', Integer, primary_key=True),
Column('name', Unicode(100))
)
movies = Table('movies', metadata,
Column('movie_id', Integer, primary_key=True),
Column('name', Unicode(100))
)
acts = Table('acts', metadata,
Column('actor_id', Integer, ForeignKey('actors.actor_id')),
Column('movie_id', Integer, ForeignKey('movies.movie_id'))
)
engine = create_engine(ENGINE_URL, echo=True)
def create_all():
metadata.create_all(engine)
def get_json(filename):
records = []
with open('TMDB/{}'.format(filename)) as data_file:
record_str = ""
depth = 0
while True:
char = data_file.read(1)
if not char:
break
if char == "{":
depth += 1
elif char == "}":
depth -= 1
record_str += char
if depth == 0:
if record_str.isspace():
continue
print record_str
record = json.loads(record_str)
records.append(record)
record_str = ""
return records
def chunks(l, n):
"""Yield successive n-sized chunks from l."""
for i in xrange(0, len(l), n):
yield l[i:i+n]
def insert_in_chunks(table, items_to_insert):
conn = engine.connect()
for chunk in chunks(items_to_insert, 5000):
conn.execute(table.insert(), chunk)
def insert_people():
people = get_json("TMDBPersonInfo")
to_insert = []
for person in people:
name = person["name"]
person_id = person["personId"]
to_insert.append({"actor_id": person_id, "name": name})
insert_in_chunks(actors, to_insert)
def insert_movies_and_acts():
movies_json = get_json("TMDBMovieInfo")
to_insert_movies = []
to_insert_acts = []
for movie in movies_json:
name = movie["title"]
movie_id = movie["id"]
to_insert_movies.append({"movie_id": movie_id, "name": name})
for actor in movie["cast"]:
actor_id = actor["personId"]
to_insert_acts.append({"movie_id": movie_id, "actor_id": actor_id})
insert_in_chunks(movies, to_insert_movies)
insert_in_chunks(acts, to_insert_acts)