In [1]:
import setup_django
setup_django.init()


In [2]:
import pandas as pd
from django.conf import settings

ratings_path = settings.DATA_DIR / "ratings_small.csv"
ratings_path.exists()

True

In [3]:
df = pd.read_csv(ratings_path)

In [4]:
df.head(n=10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [5]:
from django.contrib.auth import get_user_model

User = get_user_model()

In [15]:
current_users = User.objects.all().values_list('id', flat=True)
rating_users = df['userId'].tolist()

In [16]:
missing_user_ids = set(rating_users) - set(current_users)
missing_user_ids

set()

In [13]:
for uid in missing_user_ids:
    User.objects.create(
        id=uid,
        username=f"missing-user-{uid}"
    )

In [20]:
import math
from decimal import Decimal

In [22]:
df['value'] = df['rating'].apply(lambda x: math.ceil(Decimal(x)))
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,value
0,1,31,2.5,1260759144,3
1,1,1029,3.0,1260759179,3
2,1,1061,3.0,1260759182,3
3,1,1129,2.0,1260759185,2
4,1,1172,4.0,1260759205,4


In [24]:
df['user_id'] = df['userId']
df['object_id'] = df['movieId']


In [25]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,value,user_id,object_id
0,1,31,2.5,1260759144,3,1,31
1,1,1029,3.0,1260759179,3,1,1029
2,1,1061,3.0,1260759182,3,1,1061
3,1,1129,2.0,1260759185,2,1,1129
4,1,1172,4.0,1260759205,4,1,1172


In [26]:
cols = ['user_id', 'value', 'object_id']
transformed_df = df.copy()[cols]

In [27]:
transformed_df.head()

Unnamed: 0,user_id,value,object_id
0,1,3,31
1,1,3,1029
2,1,3,1061
3,1,2,1129
4,1,4,1172


In [28]:
transformed_df.to_dict('records')

[{'user_id': 1, 'value': 3, 'object_id': 31},
 {'user_id': 1, 'value': 3, 'object_id': 1029},
 {'user_id': 1, 'value': 3, 'object_id': 1061},
 {'user_id': 1, 'value': 2, 'object_id': 1129},
 {'user_id': 1, 'value': 4, 'object_id': 1172},
 {'user_id': 1, 'value': 2, 'object_id': 1263},
 {'user_id': 1, 'value': 2, 'object_id': 1287},
 {'user_id': 1, 'value': 2, 'object_id': 1293},
 {'user_id': 1, 'value': 4, 'object_id': 1339},
 {'user_id': 1, 'value': 2, 'object_id': 1343},
 {'user_id': 1, 'value': 3, 'object_id': 1371},
 {'user_id': 1, 'value': 1, 'object_id': 1405},
 {'user_id': 1, 'value': 4, 'object_id': 1953},
 {'user_id': 1, 'value': 4, 'object_id': 2105},
 {'user_id': 1, 'value': 3, 'object_id': 2150},
 {'user_id': 1, 'value': 2, 'object_id': 2193},
 {'user_id': 1, 'value': 2, 'object_id': 2294},
 {'user_id': 1, 'value': 3, 'object_id': 2455},
 {'user_id': 1, 'value': 1, 'object_id': 2968},
 {'user_id': 1, 'value': 3, 'object_id': 3671},
 {'user_id': 2, 'value': 4, 'object_id': 1

In [38]:
rating_records = transformed_df.to_dict('records')

In [35]:
from ratings.models import Rating
qs = Rating.objects.all()
qs.delete()

(0, {})

In [36]:
from django.contrib.contenttypes.models import ContentType

ctype = ContentType.objects.get(app_label='movies', model='movie')

In [41]:
new_ratings = []
for r in rating_records:
    r['content_type'] = ctype
    new_ratings.append(
        Rating(**r)
    )
    
Rating.objects.bulk_create(new_ratings, ignore_conflicts=True, batch_size=1000)

[<Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: Rating object (None)>,
 <Rating: 

In [42]:
from ratings.tasks import task_update_movie_ratings

task_update_movie_ratings()

Rating update took 0:00:23 (23.109932899475098s)
