In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Raise pandas' display limits: show up to 500 columns and allow a display
# width of 1000 characters when frames are printed.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## 1. Data Processing
Reading, cleaning, and merging the data.

### 1.1 Data Read

In [2]:
# Load the movie metadata and the user ratings from CSV files that are
# resolved relative to the current working directory.
movies_csv, ratings_csv = 'movies.csv', 'ratings.csv'
data_movies = pd.read_csv(movies_csv)
data_ratings = pd.read_csv(ratings_csv)

In [3]:
data_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
data_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
data_movies.describe()

Unnamed: 0,movieId
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


In [6]:
data_ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [7]:
# data_movies.movieId.value_counts(),print('Number of duplicated unique ids are: ',data_movies.movieId.duplicated().sum())

In [8]:
# data_ratings.movieId.value_counts()

### 1.2 Merge

In [9]:
# Attach every rating to its movie's metadata. The inner join keeps only
# movieIds that appear in both tables.
merge = pd.merge(data_movies, data_ratings, how='inner', on='movieId')

In [10]:
merge

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,1537109082
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,1537109545
100833,193585,Flint (2017),Drama,184,3.5,1537109805
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,1537110021


In [11]:
# Keep only the identifiers and the rating. The result is rebound instead of
# dropped in place, and errors='ignore' makes the cell safe to re-run: a second
# run finds the columns already gone and is a no-op rather than a KeyError.
merge = merge.drop(columns=['title', 'genres', 'timestamp'], errors='ignore')

In [12]:
merge

Unnamed: 0,movieId,userId,rating
0,1,1,4.0
1,1,5,4.0
2,1,7,4.5
3,1,15,2.5
4,1,17,4.5
...,...,...,...
100831,193581,184,4.0
100832,193583,184,3.5
100833,193585,184,3.5
100834,193587,184,3.5


In [13]:
# Column order wanted for the ratings table: user first, then movie, then rating.
cols = ['userId', 'movieId', 'rating']

In [14]:
# Reorder the columns to the order listed in `cols`.
merge = merge.loc[:, cols]

In [15]:
merge.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,5,1,4.0
2,7,1,4.5
3,15,1,2.5
4,17,1,4.5


In [16]:
merge.describe()

Unnamed: 0,userId,movieId,rating
count,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557
std,182.618491,35530.987199,1.042529
min,1.0,1.0,0.5
25%,177.0,1199.0,3.0
50%,325.0,2991.0,3.5
75%,477.0,8122.0,4.0
max,610.0,193609.0,5.0


In [17]:
# len(merge.userId.unique()) 

In [18]:
# merge_list = merge.groupby(by = ["userId"])["title"].apply(list).reset_index()

In [19]:
# merge_list.head()

### 1.3 Rename DataFrame columns

In [20]:
# Switch the column labels to the capitalised names used by the rest of the
# notebook (UserID / MovieID / Rating).
column_renames = {
    'userId': 'UserID',
    'movieId': 'MovieID',
    'rating': 'Rating',
}
merge = merge.rename(columns=column_renames)

In [21]:
merge.head()

Unnamed: 0,UserID,MovieID,Rating
0,1,1,4.0
1,5,1,4.0
2,7,1,4.5
3,15,1,2.5
4,17,1,4.5


### 1.4 Data Visualization

In [22]:
import sweetviz as sviz
import datetime

In [23]:
# Profile the cleaned ratings table with Sweetviz.
analyze_report = sviz.analyze(merge)

# Create the output folder up front so that writing the report does not depend
# on it already existing (e.g. on a fresh checkout).
os.makedirs('datasetviz', exist_ok=True)

# Put the current date and time in the file name so that repeated runs do not
# overwrite earlier reports.
now = datetime.datetime.now()
f_name = f'datasetviz/dataset-1_{now.year}_{now.month}_{now.day}_{now.hour}_{now.minute}_{now.second}.html'

# Save the report as HTML without opening a browser tab.
analyze_report.show_html(f_name, open_browser=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, layout=Layout(flex='2'), max=4.0), HTML(value='')), la…


Report datasetviz/dataset-1_2021_8_2_3_15_21.html was generated.


### 1.4 Data Selection - Define hyperparameters
Tunable values such as `min_movie_rating` and `no_of_data_row`.

In [63]:

## Rating threshold (ratings in this dataset run from 0.5 to 5.0).
## A rating counts as "favorable" only when it is strictly greater than this value.
min_movie_rating = 3

## Dataset-size knob: limits which UserIDs are kept when the ratings are
## filtered in the selection step below (one user's movies form one transaction).
no_of_data_row = 20


## Minimum support, range [0.0, 1.0]; passed as `min_support` to the
## frequent-itemset miners (apriori / fpgrowth) further down.
min_support_ratio = 0.75

## Minimum confidence, range [0.0, 1.0]; not used in the cells shown here --
## presumably consumed by a later rule-generation step (TODO confirm).
min_confidence_ratio = 0.5

In [64]:
## Work on a deep copy so the merged table itself is left untouched, and flag
## every rating strictly above the threshold as a 'Favorable Rating'.
all_ratings = merge.copy(deep=True)
all_ratings["Favorable"] = all_ratings["Rating"] > min_movie_rating

## Dataset-size knob: keep the `no_of_data_row` users with the smallest UserIDs.
## UserIDs start at 1, so the earlier `range(no_of_data_row)` filter matched
## IDs 0..no_of_data_row-1 and silently kept one user fewer than requested.
## Taking the IDs from the data also works when they are not contiguous.
selected_user_ids = np.sort(all_ratings["UserID"].unique())[:no_of_data_row]
ratings = all_ratings[all_ratings["UserID"].isin(selected_user_ids)]

## Only the favorable ratings of the selected users feed the itemsets.
favorable_ratings = ratings[ratings["Favorable"]]

# print("all_ratings len: ", len(all_ratings))
# print("ratings len: ", len(ratings))
# print("favorable_ratings len: ", len(favorable_ratings))

### 1.5 All Movies Group by User -- Itemset
#movie ids grouped by userid. this is the itemset

# One group of MovieIDs per user; the same grouping feeds all three views.
movies_by_user = favorable_ratings.groupby("UserID")["MovieID"]

# UserID -> frozenset of the MovieIDs that user rated favorably.
favorable_reviews_by_users = {
    user_id: frozenset(movie_ids.values) for user_id, movie_ids in movies_by_user
}

# The same transactions as plain lists and as tuples, in group order.
data_list = [list(movie_ids.values) for _, movie_ids in movies_by_user]

data_tuple = [tuple(movie_ids.values) for _, movie_ids in movies_by_user]

# print(data_list)
# print(data_tuple)

### 1.6 Getting Movie Names from MovieID

# Build the MovieID -> Title lookup table.
# This is the same file that is read with pandas' default UTF-8 decoding at the
# top of the notebook, so it is decoded as UTF-8 here too; decoding it as
# "mac-roman" would turn every non-ASCII character in a title into mojibake.
movie_name_filename = 'movies.csv'
movie_name_data = (
    pd.read_csv(movie_name_filename, encoding="utf-8")
    .drop(columns=['genres'])  # genres are not needed for title lookups
    .rename(columns={'movieId': 'MovieID', 'title': 'Title'})
)


movie_name_data.head()

def get_movie_name(movie_id):
    """Return the title of the movie whose ``MovieID`` equals ``movie_id``.

    The lookup runs against the module-level ``movie_name_data`` frame and
    returns the ``Title`` of the first matching row.

    Args:
        movie_id: Value compared against the ``MovieID`` column.

    Returns:
        The ``Title`` value of the first row that matches.

    Raises:
        IndexError: If no row has the requested ``MovieID``.
    """
    # Single .loc call (row mask + column label) instead of chained indexing.
    matching_titles = movie_name_data.loc[movie_name_data["MovieID"] == movie_id, "Title"]
    if matching_titles.empty:
        # Same exception type the bare ``.values[0]`` lookup raised, but with
        # a message that names the ID that was not found.
        raise IndexError(f"No movie found with MovieID={movie_id!r}")
    return matching_titles.values[0]

get_movie_name(4)

#---
#---

## 2. Association Implementation

### 2.1 Define Hyperparameter for Support and Confidence

# ## Min Support range [0.0, 1.0]
# min_support_ratio = 0.2

# ## Confidence range [0.0, 1.0]
# min_confidence_ratio = 0.5

#---
#---

### 2.2 MLxtend -- for timing and cross-checking purposes

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

# One-hot encode the per-user movie lists: fit the encoder on the
# transactions, then transform those same transactions into an array.
te = TransactionEncoder()
te.fit(data_list)
te_ary = te.transform(data_list)

# Wrap the array in a DataFrame whose columns are the labels the encoder learned.
mlxtnd_df = pd.DataFrame(te_ary, columns=te.columns_)

te_ary

from mlxtend.frequent_patterns import apriori

# Time mlxtend's Apriori on the one-hot frame: 10 runs of 10 loops each.
%timeit -n 10 -r 10 apriori(mlxtnd_df, min_support=min_support_ratio)

# Run it once more outside %timeit so the result is kept in `ap` for inspection.
# NOTE(review): use_colnames is left at its default, so the itemsets presumably
# hold column indices of mlxtnd_df rather than MovieIDs -- confirm before
# mapping them to titles.
ap = apriori(mlxtnd_df, min_support=min_support_ratio)
# ap

from mlxtend.frequent_patterns import fpgrowth

# Time mlxtend's FP-Growth with the same settings as the Apriori timing
# (10 runs of 10 loops each) so the two results can be compared.
%timeit -n 10 -r 10 fpgrowth(mlxtnd_df, min_support=min_support_ratio)

# Run it once more outside %timeit so the result is kept in `fp` for inspection.
# NOTE(review): as with the Apriori call, use_colnames is left at its default --
# confirm whether the itemsets hold column indices or MovieIDs.
fp = fpgrowth(mlxtnd_df, min_support=min_support_ratio)
# fp

#---
#---

2.5 ms ± 65.3 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)
2.3 ms ± 119 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)
