In [1]:
import os
import pandas as pd
import numpy as np
from math import sqrt
from tqdm import tqdm_notebook as tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [14]:
# Directory that holds the Rotten Tomatoes data files, relative to this notebook.
path = './../../data/rotten_tomato/'
# List the directory contents to see which input files are available.
os.listdir(path)

['emotion_analysis_data',
 'final_rating(sentiment,emotion).csv',
 'model.pth',
 'model_new.pth',
 'model_rotten.pth',
 'model_save',
 'movie1_neg.txt',
 'movie1_pos.txt',
 'movie2_neg.txt',
 'movie2_pos.txt',
 'movie3_neg.txt',
 'movie3_pos.txt',
 'rotten_movie_table.csv',
 'rotten_rating_review_emotion_table.csv',
 'rotten_rating_review_emotion_table.xlsx',
 'rotten_rating_review_sentiment_table.csv',
 'rotten_rating_review_sentiment_table.xlsx',
 'rotten_rating_review_table.csv',
 'rotten_review_scaled_clean.csv',
 'rotten_review_scaled_label(pos,neu,neg).csv',
 'rotten_review_scaled_label.csv',
 'rotten_review_scaled_new.csv',
 'rotten_tomatoes_critic_reviews.csv',
 'rotten_tomatoes_movies.csv',
 'sentiment_analysis_data']

In [15]:
# Load the ratings table that already carries the sentiment/emotion labels.
# The directory and the file name are handed to os.path.join as separate
# components so the path is assembled correctly whether or not `path` ends
# with a separator (string concatenation inside the call made join a no-op).
ratings_df = pd.read_csv(os.path.join(path, 'final_rating(sentiment,emotion).csv'), encoding='utf-8')

In [16]:
# Preview the first rows of the loaded ratings table.
ratings_df.head()

Unnamed: 0,user_id,movie_id,review_score,review_content,review_type,review_date,critic_name,top_critic,publisher_name,sentiment,emotion
0,943,0,0.7,Whether audiences will get behind The Lightnin...,Fresh,2010-02-09,Ben McEachen,False,Sunday Mail (Australia),2,2
1,7242,0,0.25,Harry Potter knockoffs don't come more transpa...,Rotten,2010-02-10,Nick Schager,False,Slant Magazine,0,2
2,1046,0,0.7,"Percy Jackson isn't a great movie, but it's a ...",Fresh,2010-02-10,Bill Goodykoontz,True,Arizona Republic,3,2
3,4895,0,0.7,"Fun, brisk and imaginative",Fresh,2010-02-10,Jordan Hoffman,False,UGO,4,2
4,4517,0,0.6,"Crammed with dragons, set-destroying fights an...",Fresh,2010-02-10,Jim Schembri,True,The Age (Australia),1,2


In [17]:
# Print the (rows, columns) shape of the loaded ratings table.
print(ratings_df.shape)

(752664, 11)


In [8]:
# Number of distinct users in the raw ratings table.
# Series.nunique() counts inside pandas instead of building a Python set from
# every row; missing ids, if there are any, are not counted as a user.
ratings_df['user_id'].nunique()

9820

In [9]:
# Number of distinct movies in the raw ratings table.
# Series.nunique() counts inside pandas instead of building a Python set from
# every row; missing ids, if there are any, are not counted as a movie.
ratings_df['movie_id'].nunique()

17614

In [10]:
# Total number of rating rows before any cleaning.
len(ratings_df)

752664

## 1. 전처리

- 문제점: 동일한 (user_id, movie_id) 쌍에 대해 리뷰가 중복되는 경우 (79,267개, 약 10%)
- 사용자가 동일 영화를 여러번 보고 중복 리뷰를 남김
- 해결: 마지막 리뷰, 평점만 남기고 제거함.

In [11]:
# Keep a single row per (user_id, movie_id) pair. keep='last' retains the row
# that appears last in ratings_df's current row order.
# NOTE(review): presumably the rows are ordered by review_date so that "last"
# means the most recent review - confirm, or sort by review_date first.
clean_ratings_df = ratings_df.drop_duplicates(subset=['user_id', 'movie_id'], keep='last')

In [12]:
# Number of rating rows left after removing duplicate (user_id, movie_id) pairs.
len(clean_ratings_df)

673397

In [13]:
# Number of rows that were dropped as duplicates.
print(len(ratings_df) - len(clean_ratings_df))

79267


In [14]:
# Hold out 20% of the de-duplicated ratings as the test set. The fixed
# random_state makes the split repeatable from run to run.
train_df, test_df = train_test_split(
    clean_ratings_df,
    test_size=0.2,
    random_state=1234,
)

# Show the (rows, columns) shape of each split, one per line.
print(train_df.shape, test_df.shape, sep='\n')

(538717, 10)
(134680, 10)


In [15]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,user_id,movie_id,review_score,review_content,review_type,review_date,critic_name,top_critic,publisher_name,sentiment
75856,1527,2496,0.6,"For reasons too spoilery to give away, Fassben...",Fresh,2017-05-06,Cath Clarke,True,Time Out,3
669438,8085,15663,0.4,The Armenian people deserve a better film brin...,Rotten,2017-12-05,Robert Kojder,False,Flickering Myth,2
73967,3223,2457,0.75,Hollywood has finally realized academic compet...,Fresh,2006-05-19,Forrest Hartman,False,Reno Gazette-Journal,3
511025,8010,12984,1.0,An intelligently crafted celebration of intros...,Fresh,2007-04-09,Rob Gonsalves,False,eFilmCritic.com,4
245694,7646,6996,0.2,"Watching ""G.I. Joe"" is like being slapped acro...",Rotten,2009-08-07,Peter Sobczynski,False,eFilmCritic.com,0


In [16]:
# Sorted lists of the distinct user ids and movie ids that occur in the
# training split. Series.unique() de-duplicates inside pandas, which avoids
# hashing every row in a Python-level set before sorting.
user_ids = sorted(train_df['user_id'].unique())
movie_ids = sorted(train_df['movie_id'].unique())

# Report how many users and movies the training split covers.
print(f'유저 수: {len(user_ids)}, 영화 수: {len(movie_ids)}')

유저 수: 9186, 영화 수: 17499


In [17]:
# Group the training split by movie_id.
grouped = train_df.groupby('movie_id')
# Count the user_id entries in each group, i.e. the number of training
# ratings available for every movie.
grouped['user_id'].count()

movie_id
0         91
1         72
2          9
3         28
4          8
        ... 
17706     45
17708    172
17709      4
17710      4
17711      2
Name: user_id, Length: 17499, dtype: int64

## 2. 데이터셋 분할
- Time기준으로 8:2 (주의: 위 코드의 `train_test_split`은 시간 기준이 아니라 `random_state=1234`로 고정된 무작위 8:2 분할임)
- test에는 존재하지만, train에는 없는 영화, 사용자 비율

In [18]:
# Build the movie x user rating matrix from the training split:
# rows are movie_id, columns are user_id, cells hold review_score and stay NaN
# where a user has not rated a movie.
# DataFrame.pivot does this reshaping in one vectorised step instead of
# running a Python lambda once per movie group and unstacking the result.
# It requires each (movie_id, user_id) pair to occur at most once, which the
# earlier drop_duplicates step on those two columns provides.
sparse_matrix = train_df.pivot(index='movie_id', columns='user_id', values='review_score')
sparse_matrix.index.name = 'movie_id'

sparse_matrix

user_id,0,1,2,3,4,5,6,7,8,9,...,9811,9812,9813,9814,9815,9816,9817,9818,9819,9820
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,0.9,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17706,,,,,,,,,,,...,,,,,,,,,,
17708,,,,,,,,,,,...,,,,1.0,,,,,,
17709,,,,,,,,,,,...,,,,,,,,,,
17710,,,,,,,,,,,...,,,,,,,,,,
