# Full Analysis

In [1]:
import pandas as pd

# Data Loading

In [13]:
# Load each CSV table from ../data and report its (rows, columns) shape.

# Genome relevance scores: 32-bit dtypes are requested at parse time.
genome_score_dtypes = {'movieId': 'int32', 'tagId': 'int32', 'relevance': 'float32'}
genome_scores = pd.read_csv('../data/genome_scores.csv', dtype=genome_score_dtypes)
print(f"Genome Scores loaded: {genome_scores.shape}")

# Genome tag table.
genome_tags = pd.read_csv('../data/genome_tags.csv')
print(f"Genome Tags loaded: {genome_tags.shape}")

# Link table.
link = pd.read_csv('../data/link.csv')
print(f"Links loaded: {link.shape}")

# Movie metadata.
movie = pd.read_csv('../data/movie.csv')
print(f"Movie metadata loaded: {movie.shape}")

# Ratings: the `timestamp` column is parsed into datetimes while reading.
rating = pd.read_csv('../data/rating.csv', parse_dates=['timestamp'])
print(f"Ratings loaded: {rating.shape}")

# User-applied tags: `timestamp` is parsed into datetimes here as well.
tag = pd.read_csv('../data/tag.csv', parse_dates=['timestamp'])
print(f"User tags loaded: {tag.shape}")

Genome Scores loaded: (11709768, 3)
Genome Tags loaded: (1128, 2)
Links loaded: (27278, 3)
Movie metadata loaded: (27278, 3)
Ratings loaded: (20000263, 4)
User tags loaded: (465564, 4)


# Data Preprocessing

In [15]:
# Attach the movie metadata columns to every rating row. `validate` makes the
# merge raise a MergeError if a movieId appears more than once in `movie`,
# which would otherwise silently duplicate rating rows.
df = rating.merge(movie, on='movieId', how='left', validate='many_to_one')

# Binary label: 1 when the rating is 4 or higher, 0 otherwise. It is computed
# from the merged frame's own `rating` column (not from `rating['rating']`) so
# the result never depends on implicit index alignment between two frames.
df['target'] = (df['rating'] >= 4).astype(int)

df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,target
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy,0
1,1,29,3.5,2005-04-02 23:31:16,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi,0
2,1,32,3.5,2005-04-02 23:33:39,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,0
3,1,47,3.5,2005-04-02 23:32:07,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,0
4,1,50,3.5,2005-04-02 23:29:40,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,0


In [16]:
# Summarise the column dtypes and memory usage of the merged ratings frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 7 columns):
 #   Column     Dtype         
---  ------     -----         
 0   userId     int64         
 1   movieId    int64         
 2   rating     float64       
 3   timestamp  datetime64[ns]
 4   title      object        
 5   genres     object        
 6   target     int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(2)
memory usage: 1.0+ GB


In [26]:
# Share of each target class (normalize=True yields proportions, not raw counts)
class_shares = df['target'].value_counts(normalize=True)
print("Class Balance:")
print(class_shares)

Class Balance:
target
0    0.500236
1    0.499764
Name: proportion, dtype: float64


In [27]:
# Turn the pipe-delimited `genres` string into a list of genres per row.
# Guarded so the cell can be re-run: dropping `genres` unconditionally means a
# second execution would raise KeyError when reading the missing column.
if 'genre_list' not in df.columns:
    df['genre_list'] = df['genres'].str.split('|')

# The raw string column is redundant once the list exists; errors='ignore'
# keeps a repeated run from failing when it has already been removed.
df = df.drop(columns='genres', errors='ignore')

In [28]:
# Preview the first rows now that `genres` has been replaced by `genre_list`.
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,target,genre_list
0,28507,1176,4.0,1995-01-09 11:46:44,"Double Life of Veronique, The (Double Vie de V...",1,"[Drama, Fantasy, Romance]"
1,131160,1079,3.0,1995-01-09 11:46:49,"Fish Called Wanda, A (1988)",0,"[Comedy, Crime]"
2,131160,47,5.0,1995-01-09 11:46:49,Seven (a.k.a. Se7en) (1995),1,"[Mystery, Thriller]"
3,131160,21,3.0,1995-01-09 11:46:49,Get Shorty (1995),0,"[Comedy, Crime, Thriller]"
4,99851,31,5.0,1996-01-29 00:00:00,Dangerous Minds (1995),1,[Drama]


# Data Splitting

In [29]:
# 1. Order ratings chronologically. userId and movieId act as tie-breakers so
#    rows sharing a timestamp always land in the same order: sorting on
#    timestamp alone uses an unstable algorithm by default, so ties can be
#    shuffled between runs and the split is then not reproducible.
df = df.sort_values(['timestamp', 'userId', 'movieId']).reset_index(drop=True)

# 2. Pick a cutoff: the earliest 70% of rows are used for training and the
#    remaining, most recent rows for testing.
train_fraction = 0.7
split_idx = int(len(df) * train_fraction)

train_df = df.iloc[:split_idx]
test_df = df.iloc[split_idx:]

# Report the time span covered by each partition as a sanity check that the
# test period starts where the training period ends.
print(f"Training data date range: {train_df['timestamp'].min()} to {train_df['timestamp'].max()}")
print(f"Testing data date range: {test_df['timestamp'].min()} to {test_df['timestamp'].max()}")

Training data date range: 1995-01-09 11:46:44 to 2007-12-08 01:20:29
Testing data date range: 2007-12-08 01:20:38 to 2015-03-31 06:40:02


In [30]:
# Preview the first rows of the training partition.
train_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,target,genre_list
0,28507,1176,4.0,1995-01-09 11:46:44,"Double Life of Veronique, The (Double Vie de V...",1,"[Drama, Fantasy, Romance]"
1,131160,1079,3.0,1995-01-09 11:46:49,"Fish Called Wanda, A (1988)",0,"[Comedy, Crime]"
2,131160,47,5.0,1995-01-09 11:46:49,Seven (a.k.a. Se7en) (1995),1,"[Mystery, Thriller]"
3,131160,21,3.0,1995-01-09 11:46:49,Get Shorty (1995),0,"[Comedy, Crime, Thriller]"
4,85252,22,4.0,1996-01-29 00:00:00,Copycat (1995),1,"[Crime, Drama, Horror, Mystery, Thriller]"


In [31]:
# Separate the label column from the remaining columns of each partition.
# NOTE(review): only `target` is removed, so `rating` (which `target` is
# derived from) is still present in X_train / X_test. Confirm it is excluded
# from the model inputs before fitting.
y_train = train_df['target']
X_train = train_df.drop(columns='target')

y_test = test_df['target']
X_test = test_df.drop(columns='target')

# Feature Engineering

There are three features I want to engineer:
1. **Average rating of this movie from other users**: how well the movie is rated in general, excluding the current user's own rating
2. **Average rating by this user**: people use the rating scale differently, so the same score can mean different things (e.g. on a 10-point scale, some people average 5 and others average 7)
3. **Genre affinity**: how many times has this user rated a movie of this genre "High" (i.e. `target == 1`, a rating of 4 or above)?

Note: Beware of data leakage. Each feature may only use ratings that occurred before the row being predicted; information from the future must never be used to predict the past.