-
Notifications
You must be signed in to change notification settings - Fork 0
/
rating.py
executable file
·211 lines (177 loc) · 6.61 KB
/
rating.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#!/usr/bin/env python3
import csv
from enum import Enum
import json
import random
from typing import Dict, List, Set, Tuple
def invmap(d: Dict) -> Dict:
res = {v: k for (k, v) in d.items()}
if len(d) != len(d):
raise RuntimeError("Dictionary is not bijective!")
return res
class Movie:
def __init__(self, title: str, rating: str) -> None:
self.title = title
self.rating = rating
class Edge(Enum):
WORSE = "worse" # <
BETTER = "better" # >
SAME = "same" # = presumably, movies in same category, but use it any time you know you just can't choose which one is best
INCOMPARABLE = "incomparable" # ? e.g. for movies in completely different categories (e.g) documentary vs action
IGNORE = "ignore" # use ignore if you don't really remember the movie, e.g. for later handling
sym2edge = {
'<': Edge.WORSE,
'>': Edge.BETTER,
'=': Edge.SAME,
'?': Edge.INCOMPARABLE,
'i': Edge.IGNORE,
}
edge2sym = invmap(sym2edge)
class RatingGraph:
def __init__(self, id2movie: Dict[str, Movie], graph: Dict[str, List[str]]) -> None:
self.id2movie = id2movie
self.graph = graph
@classmethod
def load(cls, imdb_fname: str, ratings_fname: str):
id2movie = {}
with open(imdb_fname, 'r') as fo:
reader = csv.DictReader(fo)
for i, line in enumerate(reader):
title = line['Title']
rating = line['You rated']
id_ = "n" + str(i)
id2movie[id_] = Movie(title, rating)
title2id = invmap({k: v.title for k, v in id2movie.items()})
graph = {id_: [] for id_ in id2movie} # type: Dict[str, List[str]]
with open(ratings_fname, 'r') as fo:
for line in fo:
[st, a, b] = line.split(';')
st = st.strip()
a = a.strip()
b = b.strip()
ida = title2id[a]
idb = title2id[b]
edge = None
if st not in sym2edge:
print("State for {}; {} is invalid!".format(a, b))
else:
edge = sym2edge[st]
if edge == Edge.WORSE:
graph[ida].append(idb)
elif edge == Edge.BETTER:
graph[idb].append(ida)
else:
pass # can't do mych at this point?
return cls(id2movie, graph)
# TODO tred: transitive reduction
# dot -Tpng graph.dot -o graph.png
# TODO update dynamically, you can see the nodes moving and laying out.
def plot(self, dot_fname: str) -> None:
for id_ in sorted(self.id2movie.keys(), key=lambda s: int(s[1:])):
print(id_, self.id2movie[id_].title)
dot_contents = ""
def app(s):
nonlocal dot_contents
dot_contents += s + "\n"
app('digraph test {')
app('rankdir=BT;')
app('size="20, 5";')
app('dpi="500";')
app('ratio="fill";')
edges = []
for from_, tos in sorted(self.graph.items()):
for to in tos:
edges.append(' {} -> {};'.format(from_, to))
for e in edges:
app(e)
# dsu = Dsu(mmap.values())
# elif st == Edge.SAME:
# ma = mmap[a]
# mb = mmap[b]
# # dsu.merge(ma, mb)
vcolor = {}
# groups = dsu.get_groups()
# groups = {k: v for (k, v) in groups.items() if len(v) > 1}
# colors = ['yellow', 'red', 'green', 'blue', 'magenta']
# for col, g in zip(colors, groups.values()):
# for v in g:
# vcolor[v] = col
rat2color = {
"10": "red",
"9": "green",
"8": "blue",
"7": "yellow",
}
for id_, movie in self.id2movie.items():
vcolor[id_] = rat2color.get(movie.rating, 'white')
for id_ in self.graph:
# label = name
# rating = ratings[id2name[id_]]
label = id_
fillc = vcolor.get(id_, 'white')
app(' {} [shape=circle, fillcolor={} style=filled label = "{}"];'.format(id_, fillc, label))
app('}')
with open(dot_fname, 'w') as fo:
fo.write(dot_contents)
state_fname = 'state.txt'
ratings_fname = 'ratings.csv'
graph = RatingGraph.load(ratings_fname, state_fname)
class Dsu:
def __init__(self, items: List) -> None:
self.parents = {i: i for i in items}
def merge(self, i, j):
pi = self.get_parent(i)
pj = self.get_parent(j)
if pi == pj:
return
else:
self.parents[pj] = pi
def get_parent(self, i):
pp = self.parents[i]
if pp == i:
return i
else:
return self.get_parent(pp)
def get_groups(self):
groups = {}
for k in self.parents:
rp = self.get_parent(k)
l = groups.get(rp, [])
l.append(k)
groups[rp] = l
return groups
def add_more(seed: int):
stats = {k: len(v) for k, v in graph.graph.items()}
for from_, tos in graph.graph.items():
for to in tos:
stats[to] += 1
stats_by_count = list(p[0] for p in sorted(stats.items(), key = lambda k: (k[1], k[0])))
sample = stats_by_count[:15]
print("Lowest stats:", sample)
gen = random.Random(x=seed)
# sample = gen.sample(all_movies, 7)
# import itertools
# tuples = set((a, b) for (a, b) in itertools.product(sample, sample) if a < b)
# tsample = gen.sample(tuples, 10)
tuples = set()
for a in sample:
b = gen.choice(list(graph.id2movie.keys()))
if a == b or b in graph.graph[a]:
print("COLLISION, SKIPPING {} {}".format(a, b))
continue
ta = graph.id2movie[a].title
tb = graph.id2movie[b].title # TODO get_title
tp = (ta, tb) if ta < tb else (tb, ta)
tuples.add(tp)
with open(state_fname, 'a') as fo:
for a, b in tuples:
fo.write(" ;{};{}\n".format(a, b)) # TODO separator?
graph.plot('graph.dot')
# add_more(4362376737)
# TODO right now, the best movies get priority since they dont have out
# TODO Fake edges to enforce lower bound on marks
# TODO even better strategy: connect unconnected components to build a spanning tree?
# TODO compare with IMDB ratings afterwards. correlation?
# TODO split marks uniformly by weight? So 10% of the movies get 1 point
# TODO I didn't watch some of the movies for a while, so some movies rated 10 end up having low rating.
# perhaps some sort of 'confidence'? Depending on how long ago you watched the movie