<a href="https://colab.research.google.com/github/jeki15/Recommender-System-using-CBF-and-NCF/blob/main/cbf_recsys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sistem Rekomendasi Destinasi Wisata di Kota Surabaya Menggunakan Metode *Content-Based Filtering*

In [None]:
import pandas as pd
import numpy as np
import re
import string

## Open File

In [None]:
# Load the destination catalogue workbook into `df`.
# NOTE(review): absolute Colab path; the workbook has to be uploaded to /content before this cell runs.
df=pd.read_excel("/content/data cbf drop kolom.xlsx")

In [None]:
df.head()

Unnamed: 0,Place_Id,Place_Name,Description,Category
0,1,Ekowisata Mangrove Wonorejo,Hutan Wisata Mangrove Surabaya merupakan wisat...,Cagar Alam
1,2,Taman Harmoni Keputih,Tempat tersebut ialah Taman Hatmoni Keputih Su...,Cagar Alam
2,3,Air Mancur Menari,Jembatan Kenjeran dengan air mancur menarinya ...,Taman Hiburan
3,4,Taman Prestasi,Taman Prestasi Surabaya merupakan salah satu t...,Taman Hiburan
4,5,Monumen Kapal Selam,"Monumen Kapal Selam, atau disingkat Monkasel, ...",Budaya


In [None]:
# Load the user ratings workbook into `rating`.
# NOTE(review): absolute Colab path; the workbook has to be uploaded to /content before this cell runs.
rating=pd.read_excel('/content/data rating cbf & ncf.xlsx')

In [None]:
rating.head()

Unnamed: 0,User_Id,Place_Id,Place_Ratings
0,1,393,5
1,1,405,5
2,1,428,3
3,2,413,3
4,2,437,5


In [None]:
from sklearn.model_selection import train_test_split

# Re-index Place_Id to consecutive integers 1..num_places, numbered in order of
# first appearance in the ratings table.
# pd.factorize does this in a single vectorised pass. It replaces a row-by-row
# iterrows() loop that also bound a module-level variable named `id`, shadowing
# the builtin `id()` for every later cell of the notebook.
# NOTE: rating['Place_Id'] is overwritten in place. Re-running this cell leaves
# the column unchanged but turns Place_Id_to_new_id into an identity mapping.
place_codes, original_place_ids = pd.factorize(rating['Place_Id'])

# old Place_Id -> new consecutive id (kept under its original name for later use)
Place_Id_to_new_id = {
    original_id: new_id
    for new_id, original_id in enumerate(original_place_ids, start=1)
}
rating['Place_Id'] = place_codes + 1  # factorize codes are 0-based

num_users = rating['User_Id'].nunique()
num_places = rating['Place_Id'].nunique()

In [None]:
print(num_users)
print(num_places)

293
46


In [None]:
rating.head()

Unnamed: 0,User_Id,Place_Id,Place_Ratings
0,1,1,5
1,1,2,5
2,1,3,3
3,2,4,3
4,2,5,5


In [None]:
rating=rating.drop_duplicates()

In [None]:
rating[rating.duplicated(subset=['User_Id', 'Place_Id'], keep=False)]

Unnamed: 0,User_Id,Place_Id,Place_Ratings
4,2,5,5
6,2,5,4
55,17,17,3
56,17,17,4
72,20,35,4
...,...,...,...
1005,288,5,4
1028,297,31,5
1036,297,31,2
1042,299,6,1


In [None]:
# Resolve conflicting ratings: when a user rated the same place several times,
# keep the highest rating. `rating` consists only of the two key columns and the
# rating itself, so a grouped max yields the same rows (sorted by user and
# place, with a fresh 0..n-1 index) as picking the idxmax row of every group.
rating = rating.groupby(['User_Id', 'Place_Id'], as_index=False)['Place_Ratings'].max()
# Sanity check: no (user, place) pair may remain duplicated, so this displays an empty frame.
rating[rating.duplicated(subset=['User_Id', 'Place_Id'], keep=False)]

Unnamed: 0,User_Id,Place_Id,Place_Ratings


In [None]:
rating.duplicated().sum()

0

In [None]:
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 998 entries, 0 to 997
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   User_Id        998 non-null    int64
 1   Place_Id       998 non-null    int64
 2   Place_Ratings  998 non-null    int64
dtypes: int64(3)
memory usage: 23.5 KB


In [None]:
# Count how many ratings each user has submitted.
value_counts = rating['User_Id'].value_counts()

# Users with more than 4 ratings (i.e. at least 5) are kept.
values_to_keep = value_counts.index[value_counts.to_numpy() > 4]

# Drop every rating that belongs to a less active user.
rating = rating.loc[rating['User_Id'].isin(values_to_keep)]

In [None]:
rating.info()

<class 'pandas.core.frame.DataFrame'>
Index: 402 entries, 10 to 993
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   User_Id        402 non-null    int64
 1   Place_Id       402 non-null    int64
 2   Place_Ratings  402 non-null    int64
dtypes: int64(3)
memory usage: 12.6 KB


In [None]:
# rating.to_excel('data ratinig surabaya.xlsx')

## Preprocessing Text

### Penggabungan Atribut yang Digunakan dari Destinasi Wisata

In [None]:
# Build one free-text "Metadata" field per destination by joining its name,
# description and category with single spaces.
df["Metadata"] = df[["Place_Name","Description","Category"]].apply(lambda x: " ".join(x),axis=1)
# Take an explicit copy: the preprocessing cells add and overwrite columns on
# df_content, and doing that on a column slice of df triggers pandas'
# SettingWithCopyWarning on every assignment.
df_content=df[["Place_Id", "Place_Name", "Metadata"]].copy()

In [None]:
df_content.tail()

Unnamed: 0,Place_Id,Place_Name,Metadata
33,40,Taman Hiburan Rakyat,Taman Hiburan Rakyat Taman Hiburan Rakyat atau...
34,41,Taman Mundu,Taman Mundu Taman Mundu merupakan salah satu t...
35,42,Museum Mpu Tantular,Museum Mpu Tantular Museum Negeri Mpu Tantular...
36,45,Taman Flora Bratang Surabaya,Taman Flora Bratang Surabaya Taman Flora adala...
37,46,Gereja Perawan Maria Tak Berdosa Surabaya,Gereja Perawan Maria Tak Berdosa Surabaya Gere...


### *Case Folding*

In [None]:
# Case folding: store a lower-cased copy of the metadata text in "Metadata_Clean";
# the original "Metadata" column is left untouched.
df_content["Metadata_Clean"] = df_content['Metadata'].str.lower()
df_content.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_content["Metadata_Clean"] = df_content['Metadata'].str.lower()


Unnamed: 0,Place_Id,Place_Name,Metadata,Metadata_Clean
0,1,Ekowisata Mangrove Wonorejo,Ekowisata Mangrove Wonorejo Hutan Wisata Mangr...,ekowisata mangrove wonorejo hutan wisata mangr...
1,2,Taman Harmoni Keputih,Taman Harmoni Keputih Tempat tersebut ialah Ta...,taman harmoni keputih tempat tersebut ialah ta...
2,3,Air Mancur Menari,Air Mancur Menari Jembatan Kenjeran dengan air...,air mancur menari jembatan kenjeran dengan air...
3,4,Taman Prestasi,Taman Prestasi Taman Prestasi Surabaya merupak...,taman prestasi taman prestasi surabaya merupak...
4,5,Monumen Kapal Selam,"Monumen Kapal Selam Monumen Kapal Selam, atau ...","monumen kapal selam monumen kapal selam, atau ..."


### *Remove Punctuation*

In [None]:
# Lookup table used by replace_word() to normalise the cleaned, lower-cased
# metadata text: abbreviations are expanded and spelling variants/typos are
# mapped to one canonical form.
# NOTE(review): the keys that contain a space ('paku buwono', 'tri dharma',
# 'hi tech') can only match if the lookup is phrase-aware; a lookup on
# whitespace-split tokens never sees them.
# NOTE(review): the values 'pakubowono' and 'doctor' look like possible typos — confirm.
word_mapping = {
    'hatmoni': 'harmoni',
    'thp' : 'terhadap',
    'kaltim' : 'kalimantan timur',
    'kri' : 'kapal perang republik indonesia',
    'kkm' : 'kepala kamar mesin',
    'ksal' : 'kepala staf angkatan laut',
    'kbs' : 'kebun binatang surabaya',
    'arekan' : 'arek',
    'no' : 'nomor',
    'rt' : 'rukun tetangga',
    'rw' : 'rukun warga',
    'hoo' : 'ho',
    'm' : 'meter',
    'sespuh' : 'sesepuh',
    'piti' : 'persatuan islam tionghoa indonesia',
    'areal' : 'area',
    'h' : 'haji',
    'kh' : 'kyai haji',
    'tni' : 'tentara nasional indonesia',
    'kalimas' : 'kali mas',
    'paku buwono' : 'pakubowono',
    'tri dharma' : 'tridharma',
    'klenteng' : 'kelenteng',
    'dr' : 'doctor',
    'mph' : 'magister kesehatan masyarakat',
    'puslitbang' : 'pusat penelitian pengembangan',
    'yantekkes' : 'pelayanan teknologi kesehatan',
    'tehnologi' : 'teknologi',
    'jl' : 'jalan',
    'monjaya' : 'monumen jalesveva jayamahe',
    'pdu' : 'pakaian dinas upacara',
    'suro' : 'sura',
    'boyo' : 'baya',
    'propinsi' : 'provinsi',
    'kenpark' : 'kenjeran park',
    'rp' : 'rupiah',
    'kuatir' : 'khawatir',
    'penpres' : 'penetapan presiden',
    'muri' : 'museum rekor dunia indonesia',
    'ken' : 'kenjeran',
    'thr' : 'taman hiburan rakyat',
    'trs' : 'taman remaja surabaya',
    'hi tech' : 'hi-tech',
    'november' : 'nopember',
    'pemkot' : 'pemerintah kota',
    'sma' : 'sekolah menengah atas',
    'ri' : 'republik indonesia'
}

In [None]:
def cleaning_text(text):
    """Strip punctuation, mojibake residue and digits from one document string.

    Full stops, newlines and hyphens become spaces first so the words they
    separate stay apart. The remaining ASCII punctuation is deleted, a fixed
    set of mis-decoded characters is replaced by spaces, every run of digits
    is removed, and the result is trimmed at both ends. Inner runs of spaces
    are NOT collapsed here.
    """
    # Separators that must turn into a space rather than vanish.
    for separator in ('.', '\n', '-'):
        text = text.replace(separator, ' ')

    # Delete the remaining ASCII punctuation outright.
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Left-overs of wrongly decoded characters found in the source text.
    for residue in ('â', '€', 'œ', '“', 'ã—', 'š'):
        text = text.replace(residue, ' ')

    without_digits = re.sub(r'[0-9]+', '', text)
    return without_digits.strip()

def remove_multispace(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) into one space.

    Leading and trailing whitespace is collapsed too, not stripped.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

def replace_word(text, mapping=None):
    """Replace every whitespace-separated token of ``text`` found in a lookup table.

    Parameters
    ----------
    text : str
        Document to normalise. It is split on whitespace, so the result is
        always joined with single spaces.
    mapping : dict[str, str], optional
        Token -> replacement table. Defaults to the module-level
        ``word_mapping``, so ``replace_word(text)`` behaves as before.

    Returns
    -------
    str
        The tokens, each replaced by its mapped value when present, joined
        with single spaces.

    Notes
    -----
    The lookup is per token: a key that itself contains a space can never
    match.
    """
    if mapping is None:
        # Resolved at call time so the table may be defined or edited after this function.
        mapping = word_mapping
    return ' '.join(mapping.get(word, word) for word in text.split())

# Run the three normalisation passes on every document, in this order:
# character clean-up -> whitespace collapsing -> dictionary-based word replacement.
df_content["Metadata_Clean"] = (
    df_content["Metadata_Clean"]
    .apply(cleaning_text)
    .apply(remove_multispace)
    .apply(replace_word)
)
df_content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_content["Metadata_Clean"] = df_content["Metadata_Clean"].apply(cleaning_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_content["Metadata_Clean"] = df_content["Metadata_Clean"].apply(remove_multispace)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_content["Metadata_Clean"] = df_conte

Unnamed: 0,Place_Id,Place_Name,Metadata,Metadata_Clean
0,1,Ekowisata Mangrove Wonorejo,Ekowisata Mangrove Wonorejo Hutan Wisata Mangr...,ekowisata mangrove wonorejo hutan wisata mangr...
1,2,Taman Harmoni Keputih,Taman Harmoni Keputih Tempat tersebut ialah Ta...,taman harmoni keputih tempat tersebut ialah ta...
2,3,Air Mancur Menari,Air Mancur Menari Jembatan Kenjeran dengan air...,air mancur menari jembatan kenjeran dengan air...
3,4,Taman Prestasi,Taman Prestasi Taman Prestasi Surabaya merupak...,taman prestasi taman prestasi surabaya merupak...
4,5,Monumen Kapal Selam,"Monumen Kapal Selam Monumen Kapal Selam, atau ...",monumen kapal selam monumen kapal selam atau d...
5,6,Taman Kunang-Kunang,Taman Kunang-Kunang Taman Kunang â€“ Kunang di...,taman kunang kunang taman kunang kunang di sur...
6,7,Taman Buah Surabaya,Taman Buah Surabaya Wisata Taman Buah Undaan d...,taman buah surabaya wisata taman buah undaan d...
7,8,Taman Pelangi,Taman Pelangi Kalau pelangi biasanya ada di si...,taman pelangi kalau pelangi biasanya ada di si...
8,10,Taman Keputran,"Taman Keputran Ntah, mengapa nama taman ini di...",taman keputran ntah mengapa nama taman ini dis...
9,11,Food Junction Grand Pakuwon,Food Junction Grand Pakuwon Food Junction Gran...,food junction grand pakuwon food junction gran...


### Tokenizing

In [None]:
pip install nltk



In [None]:
from nltk.tokenize import word_tokenize

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def tokenizing(text):
 """Split one cleaned document string into a list of tokens with NLTK's word_tokenize."""
 return word_tokenize(text)
# From here on "Metadata_Clean" holds a list of tokens per destination instead of a string.
df_content["Metadata_Clean"] = df_content["Metadata_Clean"].apply(tokenizing)
df_content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_content["Metadata_Clean"] = df_content["Metadata_Clean"].apply(tokenizing)


Unnamed: 0,Place_Id,Place_Name,Metadata,Metadata_Clean
0,1,Ekowisata Mangrove Wonorejo,Ekowisata Mangrove Wonorejo Hutan Wisata Mangr...,"[ekowisata, mangrove, wonorejo, hutan, wisata,..."
1,2,Taman Harmoni Keputih,Taman Harmoni Keputih Tempat tersebut ialah Ta...,"[taman, harmoni, keputih, tempat, tersebut, ia..."
2,3,Air Mancur Menari,Air Mancur Menari Jembatan Kenjeran dengan air...,"[air, mancur, menari, jembatan, kenjeran, deng..."
3,4,Taman Prestasi,Taman Prestasi Taman Prestasi Surabaya merupak...,"[taman, prestasi, taman, prestasi, surabaya, m..."
4,5,Monumen Kapal Selam,"Monumen Kapal Selam Monumen Kapal Selam, atau ...","[monumen, kapal, selam, monumen, kapal, selam,..."
5,6,Taman Kunang-Kunang,Taman Kunang-Kunang Taman Kunang â€“ Kunang di...,"[taman, kunang, kunang, taman, kunang, kunang,..."
6,7,Taman Buah Surabaya,Taman Buah Surabaya Wisata Taman Buah Undaan d...,"[taman, buah, surabaya, wisata, taman, buah, u..."
7,8,Taman Pelangi,Taman Pelangi Kalau pelangi biasanya ada di si...,"[taman, pelangi, kalau, pelangi, biasanya, ada..."
8,10,Taman Keputran,"Taman Keputran Ntah, mengapa nama taman ini di...","[taman, keputran, ntah, mengapa, nama, taman, ..."
9,11,Food Junction Grand Pakuwon,Food Junction Grand Pakuwon Food Junction Gran...,"[food, junction, grand, pakuwon, food, junctio..."


### Filtering

In [None]:
# Load the stop-word list: the first whitespace-separated token of each line is
# taken as a stop word.
# - The context manager closes the file even when reading fails.
# - Blank lines are skipped; indexing their empty token list used to raise IndexError.
# - The words are collected directly into a set for O(1) membership tests.
with open("/content/stopwords-id.txt", "r", encoding="utf-8") as f:
    stopword_list = {line.split()[0] for line in f if line.strip()}


def stopwords_removal(text):
    """Return the tokens of ``text`` (a list of words) that are not stop words, in order."""
    return [word for word in text if word not in stopword_list]


df_content["Metadata_Clean"] = df_content["Metadata_Clean"].apply(stopwords_removal)
df_content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_content["Metadata_Clean"] = df_content["Metadata_Clean"].apply(stopwords_removal)


Unnamed: 0,Place_Id,Place_Name,Metadata,Metadata_Clean
0,1,Ekowisata Mangrove Wonorejo,Ekowisata Mangrove Wonorejo Hutan Wisata Mangr...,"[ekowisata, mangrove, wonorejo, hutan, wisata,..."
1,2,Taman Harmoni Keputih,Taman Harmoni Keputih Tempat tersebut ialah Ta...,"[taman, harmoni, keputih, taman, harmoni, kepu..."
2,3,Air Mancur Menari,Air Mancur Menari Jembatan Kenjeran dengan air...,"[air, mancur, menari, jembatan, kenjeran, air,..."
3,4,Taman Prestasi,Taman Prestasi Taman Prestasi Surabaya merupak...,"[taman, prestasi, taman, prestasi, surabaya, t..."
4,5,Monumen Kapal Selam,"Monumen Kapal Selam Monumen Kapal Selam, atau ...","[monumen, kapal, selam, monumen, kapal, selam,..."
5,6,Taman Kunang-Kunang,Taman Kunang-Kunang Taman Kunang â€“ Kunang di...,"[taman, kunang, kunang, taman, kunang, kunang,..."
6,7,Taman Buah Surabaya,Taman Buah Surabaya Wisata Taman Buah Undaan d...,"[taman, buah, surabaya, wisata, taman, buah, u..."
7,8,Taman Pelangi,Taman Pelangi Kalau pelangi biasanya ada di si...,"[taman, pelangi, pelangi, siang, pasca, hujan,..."
8,10,Taman Keputran,"Taman Keputran Ntah, mengapa nama taman ini di...","[taman, keputran, nama, taman, taman, keputran..."
9,11,Food Junction Grand Pakuwon,Food Junction Grand Pakuwon Food Junction Gran...,"[food, junction, grand, pakuwon, food, junctio..."


### Stemming

In [None]:
pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [None]:
pip install swifter

Collecting swifter
  Downloading swifter-1.4.0.tar.gz (1.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.2/1.2 MB[0m [31m44.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dask-expr<1.2,>=1.1 (from dask[dataframe]>=2.10.0->swifter)
  Downloading dask_expr-1.1.9-py3-none-any.whl.metadata (2.5 kB)
Downloading dask_expr-1.1.9-py3-none-any.whl (241 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m241.9/241.9 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: swifter
  Building wheel for swifter (setup.py) ... [?25l[?25hdone
  Created wheel for swifter: filename=swifter-1.4.0-py3-none-any.whl size=16505 sha256=d674e

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter

# Build the Sastrawi stemmer.
factory = StemmerFactory()
stemmer = factory.create_stemmer()


def stemmed_wrapper(term):
    """Return the stem of a single token."""
    return stemmer.stem(term)


# Stem every distinct token exactly once and cache the result; dict.fromkeys
# de-duplicates while keeping the order in which tokens are first seen.
term_dict = {
    token: stemmed_wrapper(token)
    for token in dict.fromkeys(
        word for tokens in df_content["Metadata_Clean"] for word in tokens
    )
}


def get_stemmed_term(document):
    """Map each token of ``document`` to its cached stem."""
    return [term_dict[token] for token in document]


df_content["Metadata_Clean"] = df_content["Metadata_Clean"].swifter.apply(get_stemmed_term)
df_content.tail()

Pandas Apply:   0%|          | 0/38 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_content["Metadata_Clean"] = df_content["Metadata_Clean"].swifter.apply(get_stemmed_term)


Unnamed: 0,Place_Id,Place_Name,Metadata,Metadata_Clean
33,40,Taman Hiburan Rakyat,Taman Hiburan Rakyat Taman Hiburan Rakyat atau...,"[taman, hibur, rakyat, taman, hibur, rakyat, t..."
34,41,Taman Mundu,Taman Mundu Taman Mundu merupakan salah satu t...,"[taman, mundu, taman, mundu, taman, ruang, buk..."
35,42,Museum Mpu Tantular,Museum Mpu Tantular Museum Negeri Mpu Tantular...,"[museum, mpu, tantular, museum, negeri, mpu, t..."
36,45,Taman Flora Bratang Surabaya,Taman Flora Bratang Surabaya Taman Flora adala...,"[taman, flora, bratang, surabaya, taman, flora..."
37,46,Gereja Perawan Maria Tak Berdosa Surabaya,Gereja Perawan Maria Tak Berdosa Surabaya Gere...,"[gereja, perawan, maria, dosa, surabaya, gerej..."


In [None]:
# Second stop-word pass, applied to the stemmed tokens (presumably because a
# stem can itself be a stop word — confirm).
# `stopword_list` and `stopwords_removal` are already defined by the Filtering
# section earlier in this notebook; re-reading the file and redefining the
# function here was a verbatim duplicate, so the existing helper is reused.
df_content["Metadata_Clean"] = df_content["Metadata_Clean"].apply(stopwords_removal)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_content["Metadata_Clean"] = df_content["Metadata_Clean"].apply(stopwords_removal)


In [None]:
df_content

Unnamed: 0,Place_Id,Place_Name,Metadata,Metadata_Clean
0,1,Ekowisata Mangrove Wonorejo,Ekowisata Mangrove Wonorejo Hutan Wisata Mangr...,"[ekowisata, mangrove, wonorejo, hutan, wisata,..."
1,2,Taman Harmoni Keputih,Taman Harmoni Keputih Tempat tersebut ialah Ta...,"[taman, harmoni, putih, taman, harmoni, putih,..."
2,3,Air Mancur Menari,Air Mancur Menari Jembatan Kenjeran dengan air...,"[air, mancur, tari, jembatan, kenjeran, air, m..."
3,4,Taman Prestasi,Taman Prestasi Taman Prestasi Surabaya merupak...,"[taman, prestasi, taman, prestasi, surabaya, t..."
4,5,Monumen Kapal Selam,"Monumen Kapal Selam Monumen Kapal Selam, atau ...","[monumen, kapal, selam, monumen, kapal, selam,..."
5,6,Taman Kunang-Kunang,Taman Kunang-Kunang Taman Kunang â€“ Kunang di...,"[taman, nang, nang, taman, nang, nang, surabay..."
6,7,Taman Buah Surabaya,Taman Buah Surabaya Wisata Taman Buah Undaan d...,"[taman, buah, surabaya, wisata, taman, buah, u..."
7,8,Taman Pelangi,Taman Pelangi Kalau pelangi biasanya ada di si...,"[taman, pelangi, pelangi, siang, pasca, hujan,..."
8,10,Taman Keputran,"Taman Keputran Ntah, mengapa nama taman ini di...","[taman, keputran, nama, taman, taman, keputran..."
9,11,Food Junction Grand Pakuwon,Food Junction Grand Pakuwon Food Junction Gran...,"[food, junction, grand, pakuwon, food, junctio..."


## Metode CBF

### Matriks Item-Features

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
def dummy(tokens):
    """Identity function: documents are already token lists, so the vectorizer
    must neither preprocess nor tokenize them again."""
    return tokens


# Term counts per destination (rows) and vocabulary term (columns).
# NOTE: despite its name, `tfidf` holds raw CountVectorizer counts, not TF-IDF weights.
vectorizer = CountVectorizer(tokenizer=dummy, preprocessor=dummy)
tfidf = vectorizer.fit_transform(df_content["Metadata_Clean"].values)
features1 = pd.DataFrame(tfidf.toarray(), columns=vectorizer.get_feature_names_out())

# Binary item-feature matrix: 1 where a term occurs in a destination, else 0.
# NOTE(review): the first vocabulary column is dropped without explanation — confirm which term it is and why.
itemft = (
    features1
    .ne(0)
    .astype(tfidf.dtype)
    .drop(columns=features1.columns[0])
)



In [None]:
itemft.columns

Index(['abadi', 'adem', 'adil', 'agama', 'agung', 'ahmad', 'air', 'ajak',
       'akabri', 'akibat',
       ...
       'wilayah', 'wisata', 'wisatawan', 'wonorejo', 'wujud', 'yani',
       'yayasan', 'yogyakarta', 'zaman', 'zeven'],
      dtype='object', length=899)

In [None]:
pd.set_option('display.max_columns', 20)
itemft

Unnamed: 0,abadi,adem,adil,agama,agung,ahmad,air,ajak,akabri,akibat,...,wilayah,wisata,wisatawan,wonorejo,wujud,yani,yayasan,yogyakarta,zaman,zeven
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Matriks User-Item

In [None]:
# Load the pre-built user-item rating matrix from a workbook.
# NOTE(review): presumably one row per user and one column per place — confirm.
# NOTE(review): this matrix is read from a separate file rather than derived from
# `rating` in this notebook (the pivot that would do so is commented out in the
# next cell) — confirm the file is in sync with the rating preprocessing above.
dtf_users=pd.read_excel("/content/data drop kolom.xlsx")

In [None]:
# content_merge = rating.merge(df_content, on = ["Place_Id"])
# content_merge = content_merge.drop(columns=["Place_Name"])

# tmp = content_merge.copy()
# dtf_users = tmp.pivot_table(index="User_Id", columns="Place_Id", values="Place_Ratings").fillna(0)

In [None]:
dtf_users

Unnamed: 0,1,2,3,4,5,6,7,8,10,11,...,34,35,36,37,39,40,41,42,45,46
0,5,5,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,3,5,5,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,3,0,4,0,5,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,0,0,0,0,0,3,0,0,4,0,...,0,0,0,5,0,0,0,0,0,0
284,0,0,0,0,4,0,0,0,0,0,...,0,0,0,0,0,0,5,0,2,0
285,0,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
286,0,0,0,0,0,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Relabel places (columns) and users (rows) with consecutive 1-based ids.
# The ranges come from the frame's own shape: the former literals
# range(1,39) / range(1,289) raise a length-mismatch error as soon as the
# workbook gains or loses a user or place.
dtf_users.columns = range(1, dtf_users.shape[1] + 1)
dtf_users.index = range(1, dtf_users.shape[0] + 1)
dtf_users

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,29,30,31,32,33,34,35,36,37,38
1,5,5,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,3,5,5,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,3,0,4,0,5,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,0,0,0,0,0,3,0,0,4,0,...,0,0,0,5,0,0,0,0,0,0
285,0,0,0,0,4,0,0,0,0,0,...,0,0,0,0,0,0,5,0,2,0
286,0,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
287,0,0,0,0,0,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Normalisasi

In [None]:
# Normalized
from sklearn.preprocessing import MinMaxScaler
# Column-wise min-max scaling: each place's ratings are mapped onto [0, 1] using
# that column's own minimum and maximum. fit_transform learns the per-column
# statistics and applies them in one call; the result is a NumPy array.
scaler = MinMaxScaler()
dtf_users_transform = scaler.fit_transform(dtf_users)
dtf_users_transform

array([[1. , 1. , 0.6, ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0.2, 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [None]:
# Wrap the scaled array in a DataFrame and give it consecutive 1-based user
# (row) and place (column) labels. The ranges come from the array's own shape
# instead of the literals range(1,39) / range(1,289), which break when the
# number of users or places changes.
dtf_users_transform = pd.DataFrame(dtf_users_transform)
dtf_users_transform.columns = range(1, dtf_users_transform.shape[1] + 1)
dtf_users_transform.index = range(1, dtf_users_transform.shape[0] + 1)
dtf_users_transform

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,29,30,31,32,33,34,35,36,37,38
1,1.0,1.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.6,1.0,1.0,0.8,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.8,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.8,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
285,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.4,0.0
286,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0
287,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Matriks Predict Rating

In [None]:
#Split Data
# Column cut-off for a nominal 70/30 split: places are split, every user stays in both parts.
split = int(0.70*dtf_users_transform.shape[1])
#Train
# NOTE(review): .loc slices by label and includes both end points. With the
# 1-based column labels assigned in the previous cell, `:split-1` selects the
# labels 1..split-1, i.e. split-1 columns — one fewer than int(0.70 * n_columns).
# Confirm this is intended before changing it; constants used further down were
# presumably tuned to the current split.
dtf_train = dtf_users_transform.loc[:,:split-1]
#Testing
# Remaining columns, from label `split` to the last one.
dtf_test = dtf_users_transform.loc[:, split:]

In [None]:
# # Acak kolom
# shuffled_columns = dtf_users_transform.columns.to_list()
# np.random.seed(42)  # Untuk mendapatkan hasil yang sama setiap kali
# np.random.shuffle(shuffled_columns)

# # Split kolom menjadi 70% untuk train dan 30% untuk test
# split = int(0.80 * len(shuffled_columns))
# train_columns = shuffled_columns[:split]
# test_columns = shuffled_columns[split:]

# # Bagi DataFrame berdasarkan kolom yang diacak
# dtf_train = dtf_users_transform[train_columns]
# dtf_test = dtf_users_transform[test_columns]

In [None]:
dtf_train

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,16,17,18,19,20,21,22,23,24,25
1,1.0,1.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.6,1.0,1.0,0.8,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.8,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.8,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
285,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.4,0.0,1.0,0.8,0.0,0.0,0.0
286,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
287,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.6,0.0,0.0,0.4,0.0,0.0,0.0,0.0


In [None]:
# Content-based rating prediction, computed for all users at once.
#
# For every user:
#   1. user profile   = known ratings (held-out places count as 0) x item-feature matrix
#   2. feature weight = profile / number of known (training) places * RATING_WEIGHT_SCALE
#   3. predicted rating of a place = sum of the weights of the features present
#      in that place / number of features of that place
#
# This replaces a per-user Python loop that produced the same numbers (up to
# floating-point rounding) but
#   - rebuilt the loop-invariant list of (item, feature) pairs on every
#     iteration using one scalar lookup per matrix cell,
#   - hard-coded the matrix sizes (288 users, 38 places, 899 features),
#   - indexed a label-indexed Series with integers (deprecated positional fallback),
#   - grew the result with pd.concat inside the loop and returned an object-dtype array.

# NOTE(review): empirical multiplier applied to the feature weights; its origin
# is not documented in this notebook — confirm.
RATING_WEIGHT_SCALE = 4.375


def _predict_ratings(known_ratings, item_features, scale):
    """Predict a rating for every (user, place) pair from the known ratings.

    Parameters
    ----------
    known_ratings : np.ndarray, shape (n_users, n_places)
        Ratings per user; NaN marks a held-out place. Column order must match
        the row order of ``item_features``.
    item_features : np.ndarray, shape (n_places, n_features)
        Item-feature matrix; a non-zero entry means the feature is present.
    scale : float
        Multiplier applied to the feature weights.

    Returns
    -------
    np.ndarray, shape (n_users, n_places)
        Predicted ratings; NaN for a place that has no feature at all.

    Raises
    ------
    ValueError
        If the number of places differs between the two inputs.
    """
    if known_ratings.shape[1] != item_features.shape[0]:
        raise ValueError(
            "rating matrix has %d places but the item-feature matrix has %d rows"
            % (known_ratings.shape[1], item_features.shape[0])
        )

    is_known = ~np.isnan(known_ratings)
    # Denominator: how many places were available for training, per user.
    n_known = is_known.sum(axis=1, keepdims=True)
    profile = np.where(is_known, known_ratings, 0.0) @ item_features

    has_feature = (item_features != 0).astype(float)
    features_per_item = (item_features > 0).sum(axis=1)

    # Division by zero (no known place / no feature) yields inf or NaN, as in the
    # original pandas arithmetic; only the NumPy warning is suppressed.
    with np.errstate(divide="ignore", invalid="ignore"):
        weights = profile / n_known * scale
        return (weights @ has_feature.T) / features_per_item


# Training columns keep their ratings, test columns are masked as unknown.
# NOTE(review): this assumes the columns of dtf_train followed by those of
# dtf_test are in the same place order as the rows of itemft — confirm.
matrix_rating_new = _predict_ratings(
    np.hstack([dtf_train.to_numpy(dtype=float), np.full(dtf_test.shape, np.nan)]),
    itemft.to_numpy(dtype=float),
    RATING_WEIGHT_SCALE,
)

# Kept for backward compatibility with cells that read `matrix_rating`: one
# "Predict_Rating" row per user. The helper "index" rows that the old loop
# interleaved (and then dropped) are no longer produced.
matrix_rating = pd.DataFrame(
    matrix_rating_new, index=["Predict_Rating"] * len(matrix_rating_new)
)

In [None]:
matrix_rating_new.max()

0.3088235294117648

In [None]:
# Wrap the prediction array in a DataFrame with consecutive 1-based user (row)
# and place (column) labels. The ranges come from the array's own shape
# instead of the literals range(1,39) / range(1,289), which break when the
# number of users or places changes.
matrix_rating_new=pd.DataFrame(matrix_rating_new)
matrix_rating_new.columns = range(1, matrix_rating_new.shape[1] + 1)
matrix_rating_new.index = range(1, matrix_rating_new.shape[0] + 1)
matrix_rating_new

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,29,30,31,32,33,34,35,36,37,38
1,0.232581,0.212333,0.137971,0.068654,0.03,0.068444,0.080405,0.056,0.055851,0.057037,...,0.085556,0.022727,0.010426,0.057791,0.064776,0.0525,0.089833,0.031111,0.128333,0.073043
2,0.100484,0.056778,0.076594,0.218077,0.198636,0.231,0.213784,0.063,0.069255,0.098519,...,0.089444,0.059545,0.04766,0.091977,0.080448,0.13125,0.154,0.057037,0.141944,0.124783
3,0.006774,0.007,0.007609,0.012115,0.002727,0.004667,0.011351,0.105,0.013404,0.0,...,0.011667,0.002727,0.002234,0.004884,0.007836,0.013125,0.0175,0.0,0.023333,0.0
4,0.016935,0.009333,0.015217,0.020192,0.006818,0.011667,0.019865,0.0252,0.105,0.011667,...,0.011667,0.006818,0.0,0.012209,0.009403,0.019688,0.021,0.007778,0.0175,0.013696
5,0.162581,0.119778,0.128333,0.212692,0.063182,0.248889,0.192973,0.259,0.11617,0.274815,...,0.195417,0.065,0.065532,0.112326,0.155149,0.242813,0.253167,0.075185,0.2625,0.164348
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,0.042903,0.024111,0.037029,0.059231,0.014545,0.120556,0.046351,0.042,0.15117,0.035,...,0.041806,0.02,0.008936,0.028488,0.031343,0.059063,0.077,0.025926,0.058333,0.045652
285,0.051935,0.039667,0.043623,0.078077,0.176818,0.064556,0.070946,0.0322,0.046915,0.071296,...,0.050556,0.061364,0.044681,0.078953,0.062687,0.085313,0.068833,0.066111,0.060278,0.108043
286,0.020323,0.007,0.010652,0.016154,0.105,0.009333,0.014189,0.0084,0.01117,0.023333,...,0.00875,0.015,0.01117,0.021977,0.012537,0.013125,0.014,0.007778,0.0175,0.018261
287,0.080161,0.045111,0.055797,0.100962,0.026364,0.216222,0.07473,0.042,0.040957,0.063519,...,0.098194,0.035909,0.024574,0.051279,0.062687,0.115938,0.133,0.040185,0.105,0.092826


In [None]:
# De-normalisation: undo the column-wise min-max scaling by multiplying each
# place's predictions by that place's original rating range and adding back its
# original minimum. The Series are aligned with the columns of the prediction frame.
data_min = dtf_users.min()
data_max = dtf_users.max()

d_matrix_rating_new = data_min + matrix_rating_new * (data_max - data_min)
d_matrix_rating_new

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,29,30,31,32,33,34,35,36,37,38
1,1.162903,1.061667,0.689855,0.343269,0.15,0.342222,0.402027,0.28,0.279255,0.285185,...,0.427778,0.113636,0.052128,0.288953,0.323881,0.2625,0.449167,0.155556,0.641667,0.365217
2,0.502419,0.283889,0.382971,1.090385,0.993182,1.155,1.068919,0.315,0.346277,0.492593,...,0.447222,0.297727,0.238298,0.459884,0.402239,0.65625,0.77,0.285185,0.709722,0.623913
3,0.033871,0.035,0.038043,0.060577,0.013636,0.023333,0.056757,0.525,0.067021,0.0,...,0.058333,0.013636,0.01117,0.024419,0.039179,0.065625,0.0875,0.0,0.116667,0.0
4,0.084677,0.046667,0.076087,0.100962,0.034091,0.058333,0.099324,0.126,0.525,0.058333,...,0.058333,0.034091,0.0,0.061047,0.047015,0.098438,0.105,0.038889,0.0875,0.068478
5,0.812903,0.598889,0.641667,1.063462,0.315909,1.244444,0.964865,1.295,0.580851,1.374074,...,0.977083,0.325,0.32766,0.561628,0.775746,1.214063,1.265833,0.375926,1.3125,0.821739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,0.214516,0.120556,0.185145,0.296154,0.072727,0.602778,0.231757,0.21,0.755851,0.175,...,0.209028,0.1,0.044681,0.142442,0.156716,0.295313,0.385,0.12963,0.291667,0.228261
285,0.259677,0.198333,0.218116,0.390385,0.884091,0.322778,0.35473,0.161,0.234574,0.356481,...,0.252778,0.306818,0.223404,0.394767,0.313433,0.426563,0.344167,0.330556,0.301389,0.540217
286,0.101613,0.035,0.053261,0.080769,0.525,0.046667,0.070946,0.042,0.055851,0.116667,...,0.04375,0.075,0.055851,0.109884,0.062687,0.065625,0.07,0.038889,0.0875,0.091304
287,0.400806,0.225556,0.278986,0.504808,0.131818,1.081111,0.373649,0.21,0.204787,0.317593,...,0.490972,0.179545,0.122872,0.256395,0.313433,0.579688,0.665,0.200926,0.525,0.46413


In [None]:
d_matrix_rating_new.max()

Unnamed: 0,0
1,1.394355
2,1.061667
3,1.278261
4,1.117308
5,1.0
6,1.337778
7,1.253378
8,1.295
9,1.020213
10,1.374074


#### Hasil Predict

In [None]:
# Hasil Predict
def rescale_data(data, old_min, old_max, new_min, new_max):
    """Linearly map ``data`` from the range [old_min, old_max] to [new_min, new_max].

    Args:
        data: Scalar or array-like supporting arithmetic (e.g. a pandas
            Series, which is what ``DataFrame.apply`` passes per column).
        old_min: Lower bound of the source range (scalar).
        old_max: Upper bound of the source range (scalar).
        new_min: Lower bound of the target range.
        new_max: Upper bound of the target range.

    Returns:
        ``data`` rescaled to the target range, same shape/type as the input
        arithmetic produces. Values outside the source range are extrapolated,
        not clipped.

    Raises:
        ValueError: If ``old_min == old_max``; the mapping is undefined and
            would otherwise raise ZeroDivisionError (scalars) or silently
            yield inf/NaN (pandas/NumPy inputs).
    """
    old_span = old_max - old_min
    if old_span == 0:
        raise ValueError("old_min and old_max must differ to rescale data")
    return ((data - old_min) / old_span) * (new_max - new_min) + new_min

# Source range of the raw scores and target range of the rating scale.
old_min = 0
# Take the upper bound from the data itself. The previous hard-coded value
# (1.544118) was a rounded copy of one run's maximum, so it goes stale whenever
# d_matrix_rating_new is recomputed and then no longer maps the maximum to 5.
old_max = float(d_matrix_rating_new.max().max())
new_min = 0
new_max = 5
pd.set_option('display.max_columns', None)
# Rescale every column of the score matrix to the [new_min, new_max] rating range.
d_matrix = d_matrix_rating_new.apply(rescale_data, old_min=old_min, old_max=old_max, new_min=new_min, new_max=new_max)
d_matrix

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38
1,3.765591,3.437777,2.233816,1.111538,0.485714,1.108148,1.301802,0.906666,0.904255,0.923457,1.200793,0.891034,1.1,1.363541,1.060684,0.779166,0.588461,1.570476,0.406397,0.124859,0.462778,0.449425,0.708333,0.751701,0.335802,0.309091,0.8,1.248437,1.385185,0.367965,0.168794,0.935659,1.048756,0.85,1.454444,0.503704,2.077777,1.182608
2,1.626881,0.919259,1.240096,3.530768,3.216017,3.739999,3.46126,1.02,1.121276,1.595061,1.915873,1.148965,1.616666,1.4875,1.423931,1.0625,0.904487,1.813333,0.881481,0.672316,0.963333,1.309195,1.204166,1.202721,1.238271,1.308485,1.533333,1.469791,1.448148,0.964069,0.771631,1.489147,1.302487,2.125,2.493333,0.923457,2.298148,2.020289
3,0.109677,0.113333,0.123188,0.196154,0.044156,0.075556,0.183784,1.7,0.217021,0.0,0.202381,0.117241,0.15,0.265625,0.130769,0.0,0.065385,0.194286,0.051515,0.0,0.028333,0.058621,0.0,0.069388,0.0,0.0,0.0,0.159375,0.188889,0.044156,0.03617,0.07907,0.126866,0.2125,0.283333,0.0,0.377778,0.0
4,0.274193,0.151111,0.246377,0.326923,0.11039,0.188889,0.321622,0.408,1.7,0.188889,0.323809,0.164138,0.15,0.31875,0.174359,0.141667,0.130769,0.194286,0.085859,0.057627,0.141667,0.117241,0.159375,0.104082,0.062963,0.030909,0.2,0.10625,0.188889,0.11039,0.0,0.197674,0.152239,0.31875,0.34,0.125926,0.283333,0.221739
5,2.632257,1.939259,2.077777,3.443589,1.022943,4.029629,3.124324,4.193332,1.880851,4.449382,3.912698,3.626666,4.999999,4.303124,3.763247,1.440277,1.264102,2.881904,1.20202,0.393785,1.105,1.133333,1.345833,1.873469,1.07037,1.287878,2.733333,2.567708,3.163888,1.052381,1.060993,1.818604,2.51194,3.931249,4.098888,1.217284,4.249999,2.660869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,0.694623,0.39037,0.599517,0.958974,0.235498,1.951851,0.75045,0.68,2.447517,0.566667,0.91746,0.476782,0.45,0.74375,0.581196,0.401389,0.370513,0.69619,0.372054,0.10565,0.330555,0.332184,0.53125,0.381633,0.304321,0.257576,0.466667,0.486979,0.676852,0.323809,0.144681,0.46124,0.507463,0.95625,1.246666,0.419753,0.944444,0.73913
285,0.84086,0.642222,0.70628,1.264102,2.86277,1.045185,1.148648,0.521333,0.759574,1.154321,0.890476,0.726896,0.866666,0.672917,0.813675,1.086111,0.751923,0.663809,1.717171,0.979661,3.513333,3.399999,1.186458,1.040816,1.175308,1.205454,1.566666,0.903125,0.818518,0.993506,0.723404,1.278294,1.014925,1.38125,1.114444,1.07037,0.975926,1.749275
286,0.329032,0.113333,0.172464,0.261538,1.7,0.151111,0.22973,0.136,0.180851,0.377778,0.202381,0.14069,0.25,0.159375,0.174359,0.2125,0.228846,0.194286,0.10303,0.288136,0.255,0.351724,0.2125,0.277551,0.314815,0.34,0.3,0.185937,0.141667,0.242857,0.180851,0.355814,0.202985,0.2125,0.226667,0.125926,0.283333,0.295652
287,1.297849,0.73037,0.903381,1.634615,0.42684,3.50074,1.20991,0.68,0.66312,1.028395,1.470635,0.812873,1.033333,0.95625,2.048717,0.731944,0.544872,2.622857,0.715488,0.268926,1.473333,0.644827,0.885416,0.77483,0.661111,0.669697,0.866666,1.080208,1.589814,0.581385,0.397872,0.830232,1.014925,1.877083,2.153333,0.650617,1.7,1.502898


In [None]:
# Column-wise maxima of the rescaled matrix d_matrix (one value per column).
d_matrix.max()

Unnamed: 0,0
1,4.515053
2,3.437777
3,4.139129
4,3.617948
5,3.238094
6,4.331851
7,4.058558
8,4.193332
9,3.303545
10,4.449382


### Evaluasi Model

#### Split 80%

In [None]:
# Training-side error: predictions vs. known ratings in the training columns.
# NOTE(review): .loc slices by column *label* and includes both ends; confirm
# that `:split-1` selects the intended number of columns for the labels in use.
pred_train = pd.DataFrame(d_matrix).loc[:, :split-1]
dtf_train = dtf_users.fillna(0).loc[:, :split-1]
pred = np.array(pred_train)
trainset = np.array(dtf_train)
# Cells equal to 0 (including NaNs filled with 0 above) are excluded from the error.
rated_mask = trainset != 0
jum = int(rated_mask.sum())
if jum == 0:
    raise ValueError("no rated entries in the training columns")
# Sum of squared errors over the scored cells, vectorised instead of nested loops.
total = ((pred - trainset)[rated_mask] ** 2).sum()
mse = total / jum
print('MSE Training = ', mse)
rmse = mse**0.5
print('RMSE Training = ', rmse)

MSE Training =  1.6143948975759592
RMSE Training =  1.2705884060449943


In [None]:
# Test-side error: predictions vs. known ratings in the held-out columns.
# NOTE(review): .loc slices by column *label*; confirm that `split:` selects
# the intended number of columns for the labels in use.
pred_test = pd.DataFrame(d_matrix).loc[:, split:]
dtf_test = dtf_users.fillna(0).loc[:, split:]
pred = np.array(pred_test)
testset = np.array(dtf_test)
# Cells equal to 0 (including NaNs filled with 0 above) are excluded from the error.
rated_mask = testset != 0
jum = int(rated_mask.sum())
if jum == 0:
    raise ValueError("no rated entries in the test columns")
# Sum of squared errors over the scored cells, vectorised instead of nested loops.
total = ((pred - testset)[rated_mask] ** 2).sum()
mse = total / jum
print('MSE Testing = ', mse)
rmse = mse**0.5
print('RMSE Testing = ', rmse)

MSE Testing =  8.907331625224158
RMSE Testing =  2.9845153082576337


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

def calculate_confusion_matrix(actual_ratings, predicted_ratings, threshold):
    """Binarise ratings at ``threshold`` and score the predictions as a classifier.

    A value ``>= threshold`` becomes label 1, anything else (including 0 and
    NaN, which compare False) becomes label 0.
    NOTE(review): if 0 marks "not rated" in the rating matrix, unrated cells
    are counted as actual negatives here and will dominate accuracy — confirm
    that this is intended.

    Args:
        actual_ratings: Array-like of true ratings (ndarray, DataFrame, nested list).
        predicted_ratings: Array-like of predicted ratings with the same number of elements.
        threshold: Cut-off separating the positive class (>=) from the negative class.

    Returns:
        Tuple ``(cm, accuracy, precision, recall, f1)`` where ``cm`` is the
        2x2 confusion matrix with rows = actual and columns = predicted, both
        ordered as [0, 1].
    """
    # np.asarray(...).ravel() accepts DataFrames and lists too, not only ndarrays.
    actual_labels = (np.asarray(actual_ratings) >= threshold).astype(int).ravel()
    predicted_labels = (np.asarray(predicted_ratings) >= threshold).astype(int).ravel()
    # labels=[0, 1] keeps the matrix 2x2 even when only one class is present.
    cm = confusion_matrix(actual_labels, predicted_labels, labels=[0, 1])
    accuracy = accuracy_score(actual_labels, predicted_labels)
    # zero_division=0 returns the same 0.0 as the default but without the
    # UndefinedMetricWarning when a denominator is zero.
    precision = precision_score(actual_labels, predicted_labels, zero_division=0)
    recall = recall_score(actual_labels, predicted_labels, zero_division=0)
    f1 = f1_score(actual_labels, predicted_labels, zero_division=0)
    return cm, accuracy, precision, recall, f1

# Usage: thresholded classification metrics over the whole prediction matrix
# (d_matrix) against the whole rating matrix (dtf_users).
predicted_ratings = np.array(d_matrix)
actual_ratings = np.array(dtf_users)
# Ratings at or above this value are treated as the positive class.
threshold = 2.5
cm, accuracy, precision, recall, f1 = calculate_confusion_matrix(actual_ratings, predicted_ratings, threshold)
print("Confusion Matrix:")
print(cm)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Confusion Matrix:
[[10356    27]
 [  321   240]]
Accuracy: 0.9682017543859649
Precision: 0.898876404494382
Recall: 0.42780748663101603
F1-score: 0.5797101449275363


#### Split 75%

In [None]:
# Training-side error: predictions vs. known ratings in the training columns.
# NOTE(review): .loc slices by column *label* and includes both ends; confirm
# that `:split-1` selects the intended number of columns for the labels in use.
pred_train = pd.DataFrame(d_matrix).loc[:, :split-1]
dtf_train = dtf_users.fillna(0).loc[:, :split-1]
pred = np.array(pred_train)
trainset = np.array(dtf_train)
# Cells equal to 0 (including NaNs filled with 0 above) are excluded from the error.
rated_mask = trainset != 0
jum = int(rated_mask.sum())
if jum == 0:
    raise ValueError("no rated entries in the training columns")
# Sum of squared errors over the scored cells, vectorised instead of nested loops.
total = ((pred - trainset)[rated_mask] ** 2).sum()
mse = total / jum
print('MSE Training = ', mse)
rmse = mse**0.5
print('RMSE Training = ', rmse)

MSE Training =  1.0155487897815973
RMSE Training =  1.0077444069711314


In [None]:
# Test-side error: predictions vs. known ratings in the held-out columns.
# NOTE(review): .loc slices by column *label*; confirm that `split:` selects
# the intended number of columns for the labels in use.
pred_test = pd.DataFrame(d_matrix).loc[:, split:]
dtf_test = dtf_users.fillna(0).loc[:, split:]
pred = np.array(pred_test)
testset = np.array(dtf_test)
# Cells equal to 0 (including NaNs filled with 0 above) are excluded from the error.
rated_mask = testset != 0
jum = int(rated_mask.sum())
if jum == 0:
    raise ValueError("no rated entries in the test columns")
# Sum of squared errors over the scored cells, vectorised instead of nested loops.
total = ((pred - testset)[rated_mask] ** 2).sum()
mse = total / jum
print('MSE Testing = ', mse)
rmse = mse**0.5
print('RMSE Testing = ', rmse)

MSE Testing =  8.488211579548585
RMSE Testing =  2.9134535485482833


#### Split 70%

In [None]:
# Training-side error: predictions vs. known ratings in the training columns.
# NOTE(review): .loc slices by column *label* and includes both ends; confirm
# that `:split-1` selects the intended number of columns for the labels in use.
pred_train = pd.DataFrame(d_matrix).loc[:, :split-1]
dtf_train = dtf_users.fillna(0).loc[:, :split-1]
pred = np.array(pred_train)
trainset = np.array(dtf_train)
# Cells equal to 0 (including NaNs filled with 0 above) are excluded from the error.
rated_mask = trainset != 0
jum = int(rated_mask.sum())
if jum == 0:
    raise ValueError("no rated entries in the training columns")
# Sum of squared errors over the scored cells, vectorised instead of nested loops.
total = ((pred - trainset)[rated_mask] ** 2).sum()
mse = total / jum
print('MSE Training = ', mse)
rmse = mse**0.5
print('RMSE Training = ', rmse)

MSE Training =  1.5583652734190878
RMSE Training =  1.2483450137758743


In [None]:
# Test-side error: predictions vs. known ratings in the held-out columns.
# NOTE(review): .loc slices by column *label*; confirm that `split:` selects
# the intended number of columns for the labels in use.
pred_test = pd.DataFrame(d_matrix).loc[:, split:]
dtf_test = dtf_users.fillna(0).loc[:, split:]
pred = np.array(pred_test)
testset = np.array(dtf_test)
# Cells equal to 0 (including NaNs filled with 0 above) are excluded from the error.
rated_mask = testset != 0
jum = int(rated_mask.sum())
if jum == 0:
    raise ValueError("no rated entries in the test columns")
# Sum of squared errors over the scored cells, vectorised instead of nested loops.
total = ((pred - testset)[rated_mask] ** 2).sum()
mse = total / jum
print('MSE Testing = ', mse)
rmse = mse**0.5
print('RMSE Testing = ', rmse)

MSE Testing =  8.59077820832724
RMSE Testing =  2.931002935571242


#### Split 65%

In [None]:
# Training-side error: predictions vs. known ratings in the training columns.
# NOTE(review): .loc slices by column *label* and includes both ends; confirm
# that `:split-1` selects the intended number of columns for the labels in use.
pred_train = pd.DataFrame(d_matrix).loc[:, :split-1]
dtf_train = dtf_users.fillna(0).loc[:, :split-1]
pred = np.array(pred_train)
trainset = np.array(dtf_train)
# Cells equal to 0 (including NaNs filled with 0 above) are excluded from the error.
rated_mask = trainset != 0
jum = int(rated_mask.sum())
if jum == 0:
    raise ValueError("no rated entries in the training columns")
# Sum of squared errors over the scored cells, vectorised instead of nested loops.
total = ((pred - trainset)[rated_mask] ** 2).sum()
mse = total / jum
print('MSE Training = ', mse)
rmse = mse**0.5
print('RMSE Training = ', rmse)

MSE Training =  1.5858040958438708
RMSE Training =  1.2592871379649166


In [None]:
# Test-side error: predictions vs. known ratings in the held-out columns.
# NOTE(review): .loc slices by column *label*; confirm that `split:` selects
# the intended number of columns for the labels in use.
pred_test = pd.DataFrame(d_matrix).loc[:, split:]
dtf_test = dtf_users.fillna(0).loc[:, split:]
pred = np.array(pred_test)
testset = np.array(dtf_test)
# Cells equal to 0 (including NaNs filled with 0 above) are excluded from the error.
rated_mask = testset != 0
jum = int(rated_mask.sum())
if jum == 0:
    raise ValueError("no rated entries in the test columns")
# Sum of squared errors over the scored cells, vectorised instead of nested loops.
total = ((pred - testset)[rated_mask] ** 2).sum()
mse = total / jum
print('MSE Testing = ', mse)
rmse = mse**0.5
print('RMSE Testing = ', rmse)

MSE Testing =  9.021866918229465
RMSE Testing =  3.003642275343298


#### Split 60%

In [None]:
# Training-side error: predictions vs. known ratings in the training columns.
# NOTE(review): .loc slices by column *label* and includes both ends; confirm
# that `:split-1` selects the intended number of columns for the labels in use.
pred_train = pd.DataFrame(d_matrix).loc[:, :split-1]
dtf_train = dtf_users.fillna(0).loc[:, :split-1]
pred = np.array(pred_train)
trainset = np.array(dtf_train)
# Cells equal to 0 (including NaNs filled with 0 above) are excluded from the error.
rated_mask = trainset != 0
jum = int(rated_mask.sum())
if jum == 0:
    raise ValueError("no rated entries in the training columns")
# Sum of squared errors over the scored cells, vectorised instead of nested loops.
total = ((pred - trainset)[rated_mask] ** 2).sum()
mse = total / jum
print('MSE Training = ', mse)
rmse = mse**0.5
print('RMSE Training = ', rmse)

MSE Training =  1.5931025962199932
RMSE Training =  1.2621816811457822


In [None]:
# Test-side error: predictions vs. known ratings in the held-out columns.
# NOTE(review): .loc slices by column *label*; confirm that `split:` selects
# the intended number of columns for the labels in use.
pred_test = pd.DataFrame(d_matrix).loc[:, split:]
dtf_test = dtf_users.fillna(0).loc[:, split:]
pred = np.array(pred_test)
testset = np.array(dtf_test)
# Cells equal to 0 (including NaNs filled with 0 above) are excluded from the error.
rated_mask = testset != 0
jum = int(rated_mask.sum())
if jum == 0:
    raise ValueError("no rated entries in the test columns")
# Sum of squared errors over the scored cells, vectorised instead of nested loops.
total = ((pred - testset)[rated_mask] ** 2).sum()
mse = total / jum
print('MSE Testing = ', mse)
rmse = mse**0.5
print('RMSE Testing = ', rmse)

MSE Testing =  9.449166894644994
RMSE Testing =  3.0739497222051297


#### Confusion Matrix

In [None]:
# Split point: 70% of the number of columns in dtf_users, truncated to an int.
split = int(0.70*dtf_users.shape[1])
# Training part: columns labelled up to and including split-1
# (.loc slices by label and includes both end points).
dtf_train = dtf_users.loc[:,:split-1]
# Test part: columns labelled split and above.
dtf_test = dtf_users.loc[:, split:]

In [None]:
# Display the test-side rating matrix. (A dangling `pd.` left in this cell was
# a SyntaxError and prevented it from running.)
dtf_test

Unnamed: 0,26,27,28,29,30,31,32,33,34,35,36,37,38
1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,0,0,0,0,0,0,5,0,0,0,0,0,0
285,5,0,0,0,0,0,0,0,0,5,0,2,0
286,0,0,0,0,0,0,0,0,0,0,0,1,0
287,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Reshape the wide test matrix (one column per place) into long rows of
# (User_Id, Place_Id, Place_Ratings).
dtf_test_df = dtf_test.reset_index()
original_format = (
    dtf_test_df
    .melt(id_vars=['index'], var_name='Place_Id', value_name='Place_Ratings')
    # the former row index becomes the User_Id column
    .rename(columns={'index': 'User_Id'})
    # keep only cells whose rating is not 0
    .loc[lambda frame: frame['Place_Ratings'] != 0]
    # default reset_index() keeps the pre-filter row position as an 'index' column
    .reset_index()
)
original_format

Unnamed: 0,index,User_Id,Place_Id,Place_Ratings
0,17,18,26,4
1,27,28,26,4
2,30,31,26,2
3,46,47,26,2
4,52,53,26,1
5,103,104,26,1
6,106,107,26,1
7,122,123,26,2
8,125,126,26,4
9,126,127,26,4


In [None]:
# Split point: 70% of the number of columns in d_matrix, truncated to an int.
split = int(0.70*d_matrix.shape[1])
# Training part of the predictions: columns labelled up to and including split-1
# (.loc slices by label and includes both end points).
dm_train = d_matrix.loc[:,:split-1]
# Test part of the predictions: columns labelled split and above.
dm_test = d_matrix.loc[:, split:]

In [None]:
# Lift pandas' display limits so every column and every row is rendered.
# NOTE: set_option is global, so it also affects every later display in this session.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Show the test-side slice of the rescaled prediction matrix.
dm_test

Unnamed: 0,26,27,28,29,30,31,32,33,34,35,36,37,38
1,0.309091,0.8,1.248437,1.385185,0.367965,0.168794,0.935659,1.048756,0.85,1.454444,0.503704,2.077777,1.182608
2,1.308485,1.533333,1.469791,1.448148,0.964069,0.771631,1.489147,1.302487,2.125,2.493333,0.923457,2.298148,2.020289
3,0.0,0.0,0.159375,0.188889,0.044156,0.03617,0.07907,0.126866,0.2125,0.283333,0.0,0.377778,0.0
4,1.287878,2.733333,2.567708,3.163888,1.052381,1.060993,1.818604,2.51194,3.931249,4.098888,1.217284,4.249999,2.660869
5,1.287878,2.733333,2.567708,3.163888,1.052381,1.060993,1.818604,2.51194,3.931249,4.098888,1.217284,4.249999,2.660869
6,0.391515,1.3,0.575521,0.519444,0.375325,0.349645,0.843411,0.541293,0.991666,0.85,0.335802,0.787037,0.788406
7,0.288485,1.2,1.106771,1.117592,0.485714,0.277305,1.054263,0.879602,1.133333,2.077777,0.503704,1.951851,1.182608
8,1.236363,1.6,1.151041,1.794444,1.000866,0.59078,1.396899,1.395522,2.443749,2.247777,1.259259,1.92037,2.020289
9,0.030909,0.1,0.185937,0.330555,0.044156,0.0,0.158139,0.228358,0.2125,0.226667,0.062963,0.566667,0.147826
10,0.195758,0.566667,0.354167,0.440741,0.301732,0.07234,0.54031,0.40597,0.74375,0.623333,0.35679,0.472222,0.665217


In [None]:
# Column-wise maxima of the test-side predictions (one value per column).
dm_test.max()

Unnamed: 0,0
26,1.370303
27,2.733333
28,2.567708
29,3.163888
30,1.052381
31,1.085106
32,1.818604
33,2.51194
34,3.931249
35,4.098888


#### Split 80%

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# calculate_confusion_matrix is already defined in the first "Split 80%" cell
# of the "Evaluasi Model" section; it is reused here instead of being redefined.

# Thresholded classification metrics on the training columns
# (pred_train / dtf_train are set by the split and MSE cells run earlier).
predicted_ratings = np.array(pred_train)
actual_ratings = np.array(dtf_train)
# Ratings at or above this value are treated as the positive class.
threshold = 2.5
cm, accuracy, precision, recall, f1 = calculate_confusion_matrix(actual_ratings, predicted_ratings, threshold)
print("Confusion Matrix:")
print(cm)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Confusion Matrix:
[[7900   16]
 [ 196  240]]
Accuracy: 0.9746168582375478
Precision: 0.9375
Recall: 0.5504587155963303
F1-score: 0.6936416184971099


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# calculate_confusion_matrix is already defined in the first "Split 80%" cell
# of the "Evaluasi Model" section; it is reused here instead of being redefined.

# Thresholded classification metrics on the held-out columns
# (pred_test / dtf_test are set by the split and MSE cells run earlier).
predicted_ratings = np.array(pred_test)
actual_ratings = np.array(dtf_test)
# Ratings at or above this value are treated as the positive class.
threshold = 2.5
cm, accuracy, precision, recall, f1 = calculate_confusion_matrix(actual_ratings, predicted_ratings, threshold)
print("Confusion Matrix:")
print(cm)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Confusion Matrix:
[[2456   11]
 [ 125    0]]
Accuracy: 0.9475308641975309
Precision: 0.0
Recall: 0.0
F1-score: 0.0


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# calculate_confusion_matrix is already defined in the first "Split 80%" cell
# of the "Evaluasi Model" section; it is reused here instead of being redefined.

# Thresholded classification metrics over the whole prediction matrix
# (d_matrix) against the whole rating matrix (dtf_users).
predicted_ratings = np.array(d_matrix)
actual_ratings = np.array(dtf_users)
# Ratings at or above this value are treated as the positive class.
threshold = 2.5
cm, accuracy, precision, recall, f1 = calculate_confusion_matrix(actual_ratings, predicted_ratings, threshold)
print("Confusion Matrix:")
print(cm)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Confusion Matrix:
[[10356    27]
 [  321   240]]
Accuracy: 0.9682017543859649
Precision: 0.898876404494382
Recall: 0.42780748663101603
F1-score: 0.5797101449275363


#### Split 75%

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# calculate_confusion_matrix is already defined in the first "Split 80%" cell
# of the "Evaluasi Model" section; it is reused here instead of being redefined.

# Thresholded classification metrics on the training columns
# (pred_train / dtf_train are set by the split and MSE cells run earlier).
predicted_ratings = np.array(pred_train)
actual_ratings = np.array(dtf_train)
# Ratings at or above this value are treated as the positive class.
threshold = 2.5
cm, accuracy, precision, recall, f1 = calculate_confusion_matrix(actual_ratings, predicted_ratings, threshold)
print("Confusion Matrix:")
print(cm)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Confusion Matrix:
[[7346   18]
 [ 112  300]]
Accuracy: 0.9832818930041153
Precision: 0.9433962264150944
Recall: 0.7281553398058253
F1-score: 0.8219178082191781


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# calculate_confusion_matrix is already defined in the first "Split 80%" cell
# of the "Evaluasi Model" section; it is reused here instead of being redefined.

# Thresholded classification metrics on the held-out columns
# (pred_test / dtf_test are set by the split and MSE cells run earlier).
predicted_ratings = np.array(pred_test)
actual_ratings = np.array(dtf_test)
# Ratings at or above this value are treated as the positive class.
threshold = 2.5
cm, accuracy, precision, recall, f1 = calculate_confusion_matrix(actual_ratings, predicted_ratings, threshold)
print("Confusion Matrix:")
print(cm)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Confusion Matrix:
[[3000   19]
 [ 148    1]]
Accuracy: 0.9472853535353535
Precision: 0.05
Recall: 0.006711409395973154
F1-score: 0.01183431952662722


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# calculate_confusion_matrix is already defined in the first "Split 80%" cell
# of the "Evaluasi Model" section; it is reused here instead of being redefined.

# Thresholded classification metrics over the whole prediction matrix
# (d_matrix) against the whole rating matrix (dtf_users).
predicted_ratings = np.array(d_matrix)
actual_ratings = np.array(dtf_users)
# Ratings at or above this value are treated as the positive class.
threshold = 2.5
cm, accuracy, precision, recall, f1 = calculate_confusion_matrix(actual_ratings, predicted_ratings, threshold)
print("Confusion Matrix:")
print(cm)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Confusion Matrix:
[[10346    37]
 [  260   301]]
Accuracy: 0.9728618421052632
Precision: 0.8905325443786982
Recall: 0.5365418894830659
F1-score: 0.6696329254727476


#### Split 70%

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# calculate_confusion_matrix is already defined in the first "Split 80%" cell
# of the "Evaluasi Model" section; it is reused here instead of being redefined.

# Thresholded classification metrics on the training columns
# (pred_train / dtf_train are set by the split and MSE cells run earlier).
predicted_ratings = np.array(pred_train)
actual_ratings = np.array(dtf_train)
# Ratings at or above this value are treated as the positive class.
threshold = 2.5
cm, accuracy, precision, recall, f1 = calculate_confusion_matrix(actual_ratings, predicted_ratings, threshold)
print("Confusion Matrix:")
print(cm)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Confusion Matrix:
[[6802   10]
 [ 169  219]]
Accuracy: 0.9751388888888889
Precision: 0.9563318777292577
Recall: 0.5644329896907216
F1-score: 0.7098865478119935


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# calculate_confusion_matrix is already defined in the first "Split 80%" cell
# of the "Evaluasi Model" section; it is reused here instead of being redefined.

# Thresholded classification metrics on the held-out columns
# (pred_test / dtf_test are set by the split and MSE cells run earlier).
predicted_ratings = np.array(pred_test)
actual_ratings = np.array(dtf_test)
# Ratings at or above this value are treated as the positive class.
threshold = 2.5
cm, accuracy, precision, recall, f1 = calculate_confusion_matrix(actual_ratings, predicted_ratings, threshold)
print("Confusion Matrix:")
print(cm)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Confusion Matrix:
[[3560   11]
 [ 172    1]]
Accuracy: 0.9511217948717948
Precision: 0.08333333333333333
Recall: 0.005780346820809248
F1-score: 0.01081081081081081


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# calculate_confusion_matrix is already defined in the first "Split 80%" cell
# of the "Evaluasi Model" section; it is reused here instead of being redefined.

# Thresholded classification metrics over the whole prediction matrix
# (d_matrix) against the whole rating matrix (dtf_users).
predicted_ratings = np.array(d_matrix)
actual_ratings = np.array(dtf_users)
# Ratings at or above this value are treated as the positive class.
threshold = 2.5
cm, accuracy, precision, recall, f1 = calculate_confusion_matrix(actual_ratings, predicted_ratings, threshold)
print("Confusion Matrix:")
print(cm)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Confusion Matrix:
[[10362    21]
 [  341   220]]
Accuracy: 0.966922514619883
Precision: 0.9128630705394191
Recall: 0.39215686274509803
F1-score: 0.5486284289276808


#### Split 65%

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# calculate_confusion_matrix is already defined in the first "Split 80%" cell
# of the "Evaluasi Model" section; it is reused here instead of being redefined.

# Thresholded classification metrics on the training columns
# (pred_train / dtf_train are set by the split and MSE cells run earlier).
predicted_ratings = np.array(pred_train)
actual_ratings = np.array(dtf_train)
# Ratings at or above this value are treated as the positive class.
threshold = 2.5
cm, accuracy, precision, recall, f1 = calculate_confusion_matrix(actual_ratings, predicted_ratings, threshold)
print("Confusion Matrix:")
print(cm)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Confusion Matrix:
[[6251   10]
 [ 162  201]]
Accuracy: 0.9740338164251208
Precision: 0.95260663507109
Recall: 0.5537190082644629
F1-score: 0.7003484320557491


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# calculate_confusion_matrix is already defined in the first "Split 80%" cell
# of the "Evaluasi Model" section; it is reused here instead of being redefined.

# Thresholded classification metrics on the held-out columns
# (pred_test / dtf_test are set by the split and MSE cells run earlier).
predicted_ratings = np.array(pred_test)
actual_ratings = np.array(dtf_test)
# Ratings at or above this value are treated as the positive class.
threshold = 2.5
cm, accuracy, precision, recall, f1 = calculate_confusion_matrix(actual_ratings, predicted_ratings, threshold)
print("Confusion Matrix:")
print(cm)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Confusion Matrix:
[[4111   11]
 [ 197    1]]
Accuracy: 0.9518518518518518
Precision: 0.08333333333333333
Recall: 0.005050505050505051
F1-score: 0.009523809523809525


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# calculate_confusion_matrix is already defined in the first "Split 80%" cell
# of the "Evaluasi Model" section; it is reused here instead of being redefined.

# Thresholded classification metrics over the whole prediction matrix
# (d_matrix) against the whole rating matrix (dtf_users).
predicted_ratings = np.array(d_matrix)
actual_ratings = np.array(dtf_users)
# Ratings at or above this value are treated as the positive class.
threshold = 2.5
cm, accuracy, precision, recall, f1 = calculate_confusion_matrix(actual_ratings, predicted_ratings, threshold)
print("Confusion Matrix:")
print(cm)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Confusion Matrix:
[[10362    21]
 [  359   202]]
Accuracy: 0.9652777777777778
Precision: 0.905829596412556
Recall: 0.3600713012477718
F1-score: 0.5153061224489796


#### Split 60%

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# calculate_confusion_matrix is already defined in the first "Split 80%" cell
# of the "Evaluasi Model" section; it is reused here instead of being redefined.

# Thresholded classification metrics on the training columns
# (pred_train / dtf_train are set by the split and MSE cells run earlier).
predicted_ratings = np.array(pred_train)
actual_ratings = np.array(dtf_train)
# Ratings at or above this value are treated as the positive class.
threshold = 2.5
cm, accuracy, precision, recall, f1 = calculate_confusion_matrix(actual_ratings, predicted_ratings, threshold)
print("Confusion Matrix:")
print(cm)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Confusion Matrix:
[[5705    9]
 [ 153  181]]
Accuracy: 0.9732142857142857
Precision: 0.9526315789473684
Recall: 0.5419161676646707
F1-score: 0.6908396946564884


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

def calculate_confusion_matrix(actual_ratings, predicted_ratings, threshold):
    """Binarise ratings at ``threshold`` and score the predictions.

    A rating counts as positive (label 1) when it is ``>= threshold`` and as
    negative (label 0) otherwise. Both inputs are flattened before scoring, so
    they must hold the same number of elements in the same order.

    Args:
        actual_ratings: Array-like of reference ratings.
        predicted_ratings: Array-like of predicted ratings.
        threshold: Cut-off separating positive from negative ratings.

    Returns:
        Tuple ``(cm, accuracy, precision, recall, f1)``. ``cm`` is always a
        2x2 confusion matrix (rows = actual, columns = predicted, label order
        0 then 1); the remaining values are the scores for the positive class.
    """
    # np.asarray lets callers pass DataFrames or nested lists, not only ndarrays.
    y_true = (np.asarray(actual_ratings) >= threshold).astype(int).ravel()
    y_pred = (np.asarray(predicted_ratings) >= threshold).astype(int).ravel()
    # Pin the label order so the matrix stays 2x2 even when the data contains
    # a single class; without `labels` sklearn collapses it to 1x1.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    accuracy = accuracy_score(y_true, y_pred)
    # zero_division=0 keeps the 0.0 that sklearn already returns for an
    # undefined metric but no longer raises UndefinedMetricWarning.
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    return cm, accuracy, precision, recall, f1

# Evaluate `pred_test` against `dtf_test`, binarising both at a 2.5 cut-off.
threshold = 2.5
actual_ratings = np.array(dtf_test)
predicted_ratings = np.array(pred_test)
cm, accuracy, precision, recall, f1 = calculate_confusion_matrix(
    actual_ratings, predicted_ratings, threshold
)
print("Confusion Matrix:", cm, sep="\n")
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(metric_label, metric_value)

Confusion Matrix:
[[4658   11]
 [ 227    0]]
Accuracy: 0.9513888888888888
Precision: 0.0
Recall: 0.0
F1-score: 0.0


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

def calculate_confusion_matrix(actual_ratings, predicted_ratings, threshold):
    """Binarise ratings at ``threshold`` and score the predictions.

    A rating counts as positive (label 1) when it is ``>= threshold`` and as
    negative (label 0) otherwise. Both inputs are flattened before scoring, so
    they must hold the same number of elements in the same order.

    Args:
        actual_ratings: Array-like of reference ratings.
        predicted_ratings: Array-like of predicted ratings.
        threshold: Cut-off separating positive from negative ratings.

    Returns:
        Tuple ``(cm, accuracy, precision, recall, f1)``. ``cm`` is always a
        2x2 confusion matrix (rows = actual, columns = predicted, label order
        0 then 1); the remaining values are the scores for the positive class.
    """
    # np.asarray lets callers pass DataFrames or nested lists, not only ndarrays.
    y_true = (np.asarray(actual_ratings) >= threshold).astype(int).ravel()
    y_pred = (np.asarray(predicted_ratings) >= threshold).astype(int).ravel()
    # Pin the label order so the matrix stays 2x2 even when the data contains
    # a single class; without `labels` sklearn collapses it to 1x1.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    accuracy = accuracy_score(y_true, y_pred)
    # zero_division=0 keeps the 0.0 that sklearn already returns for an
    # undefined metric but no longer raises UndefinedMetricWarning.
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    return cm, accuracy, precision, recall, f1

# Evaluate `d_matrix` against `dtf_users`, binarising both at a 2.5 cut-off.
threshold = 2.5
actual_ratings = np.array(dtf_users)
predicted_ratings = np.array(d_matrix)
cm, accuracy, precision, recall, f1 = calculate_confusion_matrix(
    actual_ratings, predicted_ratings, threshold
)
print("Confusion Matrix:", cm, sep="\n")
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(metric_label, metric_value)

Confusion Matrix:
[[10363    20]
 [  380   181]]
Accuracy: 0.9634502923976608
Precision: 0.900497512437811
Recall: 0.3226381461675579
F1-score: 0.4750656167979002


### Hasil Rekomendasi

In [None]:
# Rank the scores stored in the fifth row of `d_matrix` (position 4), highest first.
# The ranking is only displayed, never written back: `d_matrix.loc[4, :] = ...`
# selects the row *labelled* 4 (not necessarily position 4) and aligns the
# assigned Series on column labels, so it cannot reorder the row and may
# overwrite a different row's scores.
row_to_sort = d_matrix.iloc[4]
sorted_row = row_to_sort.sort_values(ascending=False)
sorted_row

Unnamed: 0,5
14,3.839944
15,3.185731
10,1.921552
34,1.663976
37,1.47909
4,1.444082
35,1.388069
1,1.387335
27,1.285001
11,1.21903


In [None]:
# Rank the scores stored in the fifth row of `d_matrix` (position 4), highest first.
# The ranking is only displayed, never written back: `d_matrix.loc[4, :] = ...`
# selects the row *labelled* 4 (not necessarily position 4) and aligns the
# assigned Series on column labels, so it cannot reorder the row and may
# overwrite a different row's scores.
row_to_sort = d_matrix.iloc[4]
sorted_row = row_to_sort.sort_values(ascending=False)
sorted_row

Unnamed: 0,5
13,4.999999
10,4.449382
14,4.303124
37,4.249999
8,4.193332
35,4.098888
6,4.029629
34,3.931249
11,3.912698
15,3.763247
