# n-gram 언어모델
단어와 문장 대신 영화 아이템(유저별 영화 목록)을 넣어서 평가

In [6]:
#코드 참조:
#데이터 사이언스 스쿨: 확률론적 언어 모형(https://datascienceschool.net/view-notebook/a0c848e1e2d343d685e6077c35c4203b/)
from nltk import bigrams, word_tokenize
from nltk.util import ngrams
import nltk
nltk.download("book", quiet=True)
from nltk.book import *

import pandas as pd
import numpy as np
from nltk import ConditionalFreqDist
import pickle
import random

In [2]:
# Load the movie metadata and the user ratings tables from the
# ml-latest-small directory (paths are relative to the working directory).
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

In [8]:
# Flag a rating as "liked" (1) when it is 4 or higher, otherwise 0.
ratings['liked'] = np.where(ratings['rating'].ge(4), 1, 0)
# Movie ids are turned into strings so they can serve as n-gram tokens.
ratings['movieId'] = ratings['movieId'].astype('str')
# One group per (liked, userId) pair.
gp_user_like = ratings.groupby(by=['liked', 'userId'])

In [9]:
# Movies that user n liked => positive example.
# Each user contributes one group of disliked movies and one group of liked
# movies; every group becomes a list of movie-id strings.
splitted_movies = [
    gp_user_like.get_group(group_key)['movieId'].tolist()
    for group_key in gp_user_like.groups
]

## 아이템셋 토큰화
- window 사이즈 2인 n-gram 모형
- SS: 문장의 처음
- SE: 문장의 끝

In [47]:
# Persist the per-group movie-id lists with pickle. The file is opened in
# binary mode: despite the .txt name its content is a pickle, not text.
with open('splitted_movies.txt', 'wb') as f:
    pickle.dump(splitted_movies, f)

In [12]:
# 파일 읽어오기
#with open('splitted_movies.txt', 'rb') as f:
 #   splitted_movies = pickle.load(f)

FileNotFoundError: [Errno 2] No such file or directory: 'splitted_movies.txt'

In [48]:
# Report groups holding fewer than two items (position, then contents).
# Nothing is removed from splitted_movies here; the loop only prints.
for i, group_items in enumerate(splitted_movies):
    if len(group_items) >= 2:
        continue
    print(i)
    print(group_items)

In [49]:
# Flatten every group's padded bigrams into a single list of
# (previous_token, next_token) pairs. "SS" marks the start of a sequence and
# "SE" marks its end.
sentences = []
for tokens in splitted_movies:
    bigram = ngrams(
        tokens,
        2,
        pad_left=True,
        pad_right=True,
        left_pad_symbol="SS",
        right_pad_symbol="SE",
    )
    sentences.extend(bigram)

In [50]:
# Peek at the first 150 bigrams to sanity-check the padding and ordering.
print(sentences[:150])

[('SS', '70'), ('70', '223'), ('223', '296'), ('296', '316'), ('316', '423'), ('423', '500'), ('500', '648'), ('648', '673'), ('673', '736'), ('736', '780'), ('780', '1009'), ('1009', '1030'), ('1030', '1219'), ('1219', '1258'), ('1258', '1377'), ('1377', '1396'), ('1396', '1408'), ('1408', '1445'), ('1445', '1580'), ('1580', '1644'), ('1644', '1676'), ('1676', '2093'), ('2093', '2253'), ('2253', '2338'), ('2338', '2389'), ('2389', '2414'), ('2414', '2528'), ('2528', '2617'), ('2617', '2657'), ('2657', '3176'), ('3176', '3243'), ('3243', '3247'), ('3247', 'SE'), ('SS', '318'), ('318', '8798'), ('8798', '71535'), ('71535', '77455'), ('77455', '91529'), ('91529', '91658'), ('91658', '99114'), ('99114', '109487'), ('109487', '114060'), ('114060', '115713'), ('115713', 'SE'), ('SS', '31'), ('31', '527'), ('527', '647'), ('647', '688'), ('688', '720'), ('720', '914'), ('914', '1093'), ('1093', '1124'), ('1124', '1263'), ('1263', '1272'), ('1272', '1275'), ('1275', '1302'), ('1302', '1371'),

In [51]:
# Lookup tables between movie ids and titles, built from the movies table.
# When a key occurs more than once the last row wins.
movieId_to_name = movies.set_index('movieId')['title'].to_dict()
name_to_movieId = movies.set_index('title')['movieId'].to_dict()

In [52]:
# For each preceding token (condition), count how often each following token
# occurs across all bigrams.
cfd = ConditionalFreqDist(sentences)

In [53]:
from nltk.probability import ConditionalProbDist, MLEProbDist
# Convert the bigram counts into maximum-likelihood conditional probability
# distributions, one per preceding token.
cpd = ConditionalProbDist(cfd, MLEProbDist)

In [54]:
def sentence_score(s):
    """Return the bigram-model probability of the token sequence ``s``.

    Every adjacent pair ``(prev, next)`` contributes ``cpd[prev].prob(next)``.
    Machine epsilon is added to each probability before the logarithm is
    taken, so a zero probability does not produce ``-inf``. The
    log-probabilities are summed and exponentiated once at the end.

    A sequence with fewer than two tokens has no pairs and scores 1.0.
    """
    eps = np.finfo(float).eps
    log_prob = 0.0
    for prev_token, next_token in zip(s, s[1:]):
        log_prob += np.log(cpd[prev_token].prob(next_token) + eps)
    return np.exp(log_prob)

In [55]:
def generate_sentence(seed=None, start_word="SS"):
    """Sample a movie-id sequence from the bigram model and look up titles.

    Starting from ``start_word``, the next token is repeatedly drawn from
    ``cpd[current].generate()`` until the end symbol ``"SE"`` is drawn or the
    current token has no distribution in ``cpd``.

    Args:
        seed: Optional seed passed to ``random.seed`` before sampling so the
            draw can be reproduced. ``None`` leaves the RNG state untouched.
        start_word: Token the walk starts from. A movie-id string is kept as
            the first element of the result. The padding symbol ``"SS"``
            (the default) is used only as the starting context and is not
            part of the result.

    Returns:
        A ``(sentence, moviename)`` tuple: the sampled movie-id strings and
        the matching titles from ``movieId_to_name``, in the same order.

    Raises:
        ValueError: If a sampled token is not an integer-like string.
        KeyError: If a sampled movie id is missing from ``movieId_to_name``.
    """
    if seed is not None:
        import random
        random.seed(seed)

    pad_symbols = ("SS", "SE")

    c = start_word
    # Padding symbols are not movie ids. The original appended "SS" to the
    # result and then crashed on int("SS") during the title lookup, so the
    # default call never worked. Keep padding out of the result instead.
    sentence = [] if c in pad_symbols else [c]

    # Walk the chain until it ends or reaches a token with no known successor.
    while c in cpd:
        w = cpd[c].generate()
        if w == "SE":
            break
        sentence.append(w)
        c = w

    # movieId_to_name is keyed by integer ids, the tokens are strings.
    moviename = [movieId_to_name[int(token)] for token in sentence]

    return sentence, moviename

In [22]:
# Generate one sequence that starts from movie id "260".
sentence, movie = generate_sentence(start_word="260")

In [67]:
# Show the first token of the most recently generated sequence.
sentence[0]

'1'

In [63]:
# Build one candidate row per entry of splitted_movies (exported to CSV later).
# NOTE(review): the 'userId' column stores the positional index into
# splitted_movies, which holds one entry per (liked, userId) group. It is not
# the original userId from the ratings table. Confirm this is intended.
df = pd.DataFrame(columns = ['userId', 'movie_candidates'])
for i, group_items in enumerate(splitted_movies):
    # Start generation from the first movie id of the group.
    sentence, movie = generate_sentence(start_word=group_items[0])
    df.loc[i] = [i, sentence]

In [66]:
# Write the candidates table to CSV. With pandas' default index=True the
# DataFrame index is written as an extra, unnamed first column.
df.to_csv('ngram_candidates_small.csv')

In [68]:
# Display the candidates table.
df

Unnamed: 0,userId,movie_candidates
0,0,"[70, 158, 180, 296, 318, 371, 456, 480, 542, 5..."
1,1,"[318, 353, 356, 428, 429, 432, 434, 442, 500, ..."
2,2,"[31, 32, 36, 58, 95, 165, 260, 318, 348, 356, ..."
3,3,"[21, 32, 34, 47, 253, 260, 292, 333, 344, 350,..."
4,4,"[39, 48, 57, 85, 95, 648, 673, 707, 1500, 1517..."
...,...,...
1208,1208,"[17, 25, 29, 32, 34, 62, 65, 71, 95, 141, 145,..."
1209,1209,"[1, 5, 7, 9, 79, 86, 110, 150, 230, 235, 236, ..."
1210,1210,"[10, 47, 150, 153, 185, 222, 260, 288, 293, 34..."
1211,1211,"[10, 21, 34, 480, 920, 922, 923, 924, 968, 112..."


## 개선해야할 점
- 조건부 확률은 아이템의 순서에 의존하지만, 실제 아이템셋에서는 순서가 상관없기 때문에 이 점을 개선해야 함
- 출발 아이템을 frequent한 것으로 바꿔주기

In [24]:
def find_usr_with_id(mid, generated=None, baskets=None):
    """Find the item lists containing ``mid`` and score their overlap.

    Args:
        mid: Movie-id token to look for (same type as the list elements,
            i.e. a string such as ``"3"``).
        generated: Generated sequence to compare against. Defaults to the
            module-level ``sentence``. Pass it explicitly to avoid depending
            on whichever sequence was generated last.
        baskets: Iterable of per-group movie-id lists to search. Defaults to
            the module-level ``splitted_movies``.

    Returns:
        A ``(usr, compare)`` tuple. ``usr`` holds every list from ``baskets``
        that contains ``mid``. ``compare`` holds, for each of those lists,
        the percentage ``100 * |unique shared ids| / len(generated)``.

    Raises:
        ZeroDivisionError: If ``generated`` is empty and at least one list
            contains ``mid``.
    """
    if generated is None:
        generated = sentence
    if baskets is None:
        baskets = splitted_movies

    usr = [basket for basket in baskets if mid in basket]

    # Build the lookup set once instead of once per matching list.
    generated_ids = set(generated)
    # The denominator is the raw sequence length, duplicates included.
    total = len(generated)
    compare = [
        (len(generated_ids.intersection(basket)) / total) * 100
        for basket in usr
    ]

    return usr, compare

In [25]:
# Collect every item list containing movie id "3" together with its overlap
# percentage against the generated sequence.
usr, compare = find_usr_with_id("3")

In [26]:
# Position of the first item list with the highest overlap percentage.
compare.index(max(compare))

6

In [27]:
# Display the overlap percentages for all matching item lists.
compare

[54.54545454545454,
 18.181818181818183,
 0.0,
 18.181818181818183,
 36.36363636363637,
 63.63636363636363,
 72.72727272727273,
 18.181818181818183,
 9.090909090909092,
 54.54545454545454,
 0.0,
 0.0,
 45.45454545454545,
 27.27272727272727,
 0.0,
 0.0,
 9.090909090909092,
 9.090909090909092,
 45.45454545454545,
 18.181818181818183,
 27.27272727272727,
 36.36363636363637,
 36.36363636363637,
 36.36363636363637,
 0.0,
 27.27272727272727,
 27.27272727272727,
 45.45454545454545,
 0.0,
 18.181818181818183,
 36.36363636363637,
 45.45454545454545,
 72.72727272727273,
 54.54545454545454,
 9.090909090909092,
 54.54545454545454,
 36.36363636363637,
 45.45454545454545,
 18.181818181818183,
 18.181818181818183,
 9.090909090909092,
 36.36363636363637,
 18.181818181818183,
 0.0,
 18.181818181818183,
 18.181818181818183,
 0.0,
 54.54545454545454,
 9.090909090909092,
 0.0,
 54.54545454545454,
 27.27272727272727]

In [23]:
# support 실제 얼마나 나오는지
# 출발도 frequent한 아이템을 가지고 real support 구하기
# association rule로 구하는 것 보다 이게 리즈너블 한가?

In [None]:
# 순서 정렬해서 돌리기
# association에서는 실제로 