In [2]:
import pandas as pd
import shared_functions.pickling as pickling

## Read Data

Below, I read in the review topic probability distribution in `doc_topics` and as well as the `topic_labels`, which will
be in the same order.

I then read in all lemmatized reviews.

In [7]:
doc_topics, topic_labels = pickling.get_pickle('glda')
topic_columns = [t.replace(' ', '_') for t in topic_labels]

number = 5933
lemmatized_reviews = pd.read_csv(f'../data/reviews/lemmatized_reviews/lemmatized_reviews_{number}.csv', index_col=0)
lemmatized_reviews.head()


In [9]:
def get_rating(rating_string):
    """
    Returns the integer rating.

    Parameters:
    rating_string (string): string in the form "<rating>/10"

    Returns:
    integer value of rating

   """
    return int(rating_string.split('/')[0])

def get_rating_weights(review_row, topic_dist):
    """
    Appends the topic probabilities for each review to a dictionary containing a user's rating for a movie.

    Parameters:
    review_row (Series): row containing one user review/rating for a movie
    topic_dist (numpy.ndarray): array containing the topic probability distribution. sums to 1.0.

    Returns:
    dictionary in the form {'tconst: <str>, 'username': <str>, 'rating': <int>, acting: '<float>', ... (and so on for all topics)}

   """
    tconst = review_row.tconst
    username = review_row.username
    rating = get_rating(review_row.rating)

    rating_weight_row = {'tconst': tconst, 'username': username, 'rating': rating}
    rating_weight_row.update(zip(topic_columns, topic_dist))
    return rating_weight_row

weighted_ratings = pd.DataFrame([get_rating_weights(row, doc_topics[0]) for _, row in lemmatized_reviews.iterrows()])
weighted_ratings.head()

Unnamed: 0,tconst,username,rating,acting,attraction,cinematography,dialogue,directing,editing_effects,it_factor,plot,sound_music,theme
0,tt0000574,David-240,10,0.055154,0.036124,0.32075,0.000323,0.004124,0.001065,0.000813,0.006387,0.000359,0.5749
1,tt0000574,F Gwynplaine MacIntyre,10,0.055154,0.036124,0.32075,0.000323,0.004124,0.001065,0.000813,0.006387,0.000359,0.5749
2,tt0000574,ackstasis,9,0.055154,0.036124,0.32075,0.000323,0.004124,0.001065,0.000813,0.006387,0.000359,0.5749
3,tt0000574,Ziggy5446,10,0.055154,0.036124,0.32075,0.000323,0.004124,0.001065,0.000813,0.006387,0.000359,0.5749
4,tt0000574,Fella_shibby,8,0.055154,0.036124,0.32075,0.000323,0.004124,0.001065,0.000813,0.006387,0.000359,0.5749


In [13]:
weighted_ratings.to_csv(f'../data/ratings/weighted_ratings_{number}.csv')
