# Movie Title Generation

In [1]:
import jieba
# import gensim
import pickle
import numpy as np
import os
import json
import pprint as pp
from scipy import spatial
from TranslationTool.langconv import *
from hanziconv import HanziConv
from pycorenlp import StanfordCoreNLP

### Loading Models
**Don't run this block twice!!**

In [2]:
def loading():
    # load stopwords
    with open("stopwords.txt", encoding='utf8') as fp:
        dat = fp.read()
    global stop_words
    stop_words = dat.split('\n')

    # load word2vec model
    global model
    # model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary = True, unicode_errors = 'ignore')
    model = pickle.load(open("model.pkl", "rb"))

    # load idf list
    global idf
    with open('idf.txt', encoding='utf8') as fp:
        dat = fp.read()
    lines = dat.split('\n')
    del lines[-1]
    idf = [(l.split()[0], float(l.split()[1])) for l in lines]
    
    # connect to StanfordCoreNLP
    global nlp
    nlp = StanfordCoreNLP('http://140.113.193.76:9000')
    
    # feature database of evaluation part
    global f1_dict, mv_list_vec, f3_dict
    f1_dict = pickle.load(open("f1_dict.pkl", "rb"))
    mv_list_vec = pickle.load(open("f2_dict.pkl", "rb"))
    f3_dict = pickle.load(open("f3_dict.pkl", "rb"))
    
    return

# global variable with empty value as initialization
model = None
# model_path = "cna_asbc_cbow_d300_w10_n10_hs0_i15.vectors.bin"
nlp = None

stop_words = []
idf = []

f1_dict = {}
mv_list_vec = []
f3_dict = {}

# loading
loading()



### TFIDF Computing

In [3]:
def tfGen(path):
    
    with open(path, encoding='utf8') as fp:
        dat = fp.read()
    
    lines = dat.split('\n')
    del lines[-1]
    
    # segmentation
    segs = []
    for line in lines:
        tmp = [Converter('zh-hant').convert(w) for w in line.split('/')[:-1]]
        segs.extend(tmp)
        # jieba.cut(line, cut_all=False)
    
    # compute tf (word count and frequency)
    words, counts = np.unique(segs, return_counts=True) # default: axis=None
    frequency = counts / len(segs)
    tf = list(zip(words, frequency))
    tf = sorted(tf, key=lambda x: x[1], reverse=True)
    
    return tf

def tfidfGen(tf):
    
    idf_dict = {word: value for word, value in idf}
    
    words = []
    values = []
    for word, value in tf:
        words.append(word)
        values.append(value * idf_dict[word])

    tfidf = list(zip(words, values))
    tfidf = sorted(tfidf, key=lambda x: x[1], reverse=True)
    
    return tfidf

### Keyword Generation

In [4]:
def keywordExt(tfidf):
    
    global model, stop_words, num_keywords
    
    i = 0
    word_ls = []
    for word, count in tfidf:
        if i == num_keywords:
            break
        if word in stop_words:
            continue
        if word not in model:
            continue
        word_ls.append(word)
        i += 1
    
    return word_ls

def keywordSel(word_ls):
    
    # remove NR
    global nlp
    
    new_word_ls = []
    for word in word_ls:
        output = nlp.annotate(word, properties={
            'annotators': 'pos',
            'outputFormat': 'json'
        })
        pos = output['sentences'][0]['tokens'][0]['pos']
        if pos != 'NR':
            new_word_ls.append((word, pos))
    
    return new_word_ls

### Genre Classification (feature base, which is not used now)
 - feature generation
 - SVM classification model

In [5]:
# global variable with predefined value
num_keywords = 100

def featureGen(word_ls):
    
    # add feature from word2vec
    
    feature = list(np.zeros(300))
    for word in word_ls:
        feature += model.word_vec(word)
    
    return feature

def featureGen2(tfidf):
    
    global word_ls
    
    tfidf_dict = {word: value for word, value in tfidf}
    feature = []
    for word in word_ls:
        if word in tfidf_dict:
            feature.append(tfidf_dict[word])
        else:
            feature.append(0)
    
    
    return feature

### Genre Classification
 - rule base
 - with corresponding to keywords

In [6]:
# global variable with predefined value
# keyword definition: [[keyword, ...], [key alphabet, ...]]
script_keyword = {
    'action': [['計畫', '特工', '殺手'], ['警', '賭']],
    'comedy': [['嘿咻'], ['裸', '妓', '屁']],
    'crime': [['暴力', '受害者', '罪犯', '犯罪'], ['毒']],
    'drama': [[], []],
    'fantasy': [[], ['獸']],
    'horror': [[], ['魔', '怪', '屍']],
    'romance': [[], ['愛']],
    'sci_fi': [['星球', '星際', '太空', '時空', '星艦'], []],
    'war': [['坦克', '地雷'], ['軍', '戰']],
}


def ruleBaseClassify(word_ls):
    # print(word_ls)
    for key, content in script_keyword.items():
        if content[0] == []:
            continue
        for sw in content[0]:
            if sw in word_ls:
                return key
    
    word_ls_str = ''.join(word_ls)
    for key, content in script_keyword.items():
        if content[1] == []:
            continue
        for sw in content[1]:
            if sw in word_ls_str:
                return key
    
    return 'drama'

### Title Candidate Generation

In [7]:
# global variable with predefined value
# rule definition: [POS, SPECIAL_WORD, ReverseOrNot]
special_rule = {
    'action': [['NN', '玩命', True], ['NN', '啟動', False], ['NN', '神鬼', True], ['NN', '遊戲', True]],
    'comedy': [['NN', '行不行', False]],
    'crime': [['NN', '檔案', False], ['NN', '風暴', False], ['NN', '風雲', False]],
    'drama': [],
    'fantasy': [],
    'horror': [['NN', '絕命', True], ['NN', '失控', True], ['VV', '鬼', True]],
    'romance': [['NN', '真愛', True]],
    'sci_fi': [['NN', '星際', True], ['NN', '世界', False]],
    'war': [['NN', '重生', False]],
}

def block(s1, s2):
    
    # too similar
    if model.wv.similarity(s1, s2) > 0.5:
        # print('Too similar: %s, %s' %(s1, s2))
        return True
    
    # contain the same alphabet
    for ele in s1:
        if ele in s2:
            # print('Contain the same alphanet: %s, %s' %(s1, s2))
            return True
        
    return False

def titleCanGen(genre, word_pos_ls):
    
    global special_rule
    
    pos_dict = {}
    for word, pos in word_pos_ls:
        try:
            pos_dict[pos].append(word)
        except:
            pos_dict[pos] = [word]
    
    candidates = []
    
    for rule in special_rule[genre]:
        pos = rule[0]
        for word in pos_dict[pos]:
            if block(rule[1], word):
                continue
            if rule[2]:
                candidates.append(rule[1]+word)
            else:
                candidates.append(word+rule[1])
    
    return candidates

### Title Evaluation

In [8]:
# global variable with predefined value
special_word = [HanziConv.toSimplified(rule[1]) for key, content in special_rule.items() for rule in content]
pos_simpify_dic = {'NN': 'N', 'NR': 'N', 'NT': 'N', 'VE': 'V', 'VV': 'V'}

############################## parse.py ##############################

def Simpify(pos):
    global pos_simpify_dic
    if pos in pos_simpify_dic.keys():
        return pos_simpify_dic[pos]
    return pos

def Parse_String(mvname):
    parse_results = []
    # convert to simple chinese
    text = HanziConv.toSimplified(mvname)
    # parse by core nlp
    global nlp
    output = nlp.annotate(text, properties={'annotators': 'tokenize, ssplit, pos', 'outputFormat': 'json'})
    parse_results.append(output)
    return parse_results

def Get_Parse_Result(parse_results):
    # print(parse_results[0]['sentences'][0]['parse'])
    l = []
    for i in range(len(parse_results)):
        for word in parse_results[i]['sentences'][0]['tokens']:
            global special_word
            if word['word'] in special_word:
                l.append((word['word'], 'N'))
            else:
                l.append((word['word'], Simpify(word['pos'])))
    return l

def POStag(name):
    parse_results = Parse_String(name)
    l = Get_Parse_Result(parse_results)
    return l

############################## feature2.py ##############################

def cosine(v1, v2):
    res = 1 - spatial.distance.cosine(v1, v2)
    return res

def Create_mv_vector(mv_name):
    # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # model = models.Word2Vec.load('med250.model.bin')
    global model
    
    sum_vec = [0] * 250
    for word in mv_name:
        try:
            v = model.wv[word]
            sum_vec = sum_vec + v
        except KeyError:
            sum_vec = sum_vec
    
    return sum_vec

def Most_similar(mv_name):
    mv_name = [HanziConv.toTraditional(x) for x in mv_name]
    mv_vec = Create_mv_vector(mv_name)

    # caculate mv_name_vector
    global mv_list_vec
    # mv_list_vec = pickle.load(open("f2_dict.pkl", "rb"))
    
    # Similarity with those movies
    cosine_list = []
    for mv in mv_list_vec:
        cosine_list.append(cosine(mv_vec, mv))
        # print(cosine(mv_vec, mv))
    return max(cosine_list)

############################## main.py ##############################

def evaluation(mv_name):

    # print(mv_name)
    mv_name = HanziConv.toSimplified(mv_name)
    
    global f1_dict, f3_dict, model
    
    # POS tag
    parse_result = POStag(mv_name)
    # print(parse_result)

    # f1 score
    mv_words = [x[0] for x in parse_result]
    pos_form = [x[1] for x in parse_result]
    pos_form = tuple(pos_form)
    
    if pos_form in f1_dict:
        f1 = f1_dict[pos_form]
    else:
        f1 = 0

    # f2 score
    f2 = Most_similar(mv_words)
    if np.isnan(f2):
        f2 = 0

    # f3 score
    # words = [x[0] for x in parse_result]
    f3 = 0
    for word in mv_words:
        if word in f3_dict:
            f3 = f3 + 0.2 + f3_dict[word]
        else:
            f3 = 0

    # print("f1: %f, f2: %f, f3: %f" %(f1, f2, f3))
    score = f1 + f2 + f3
    # print("score = ", score)
    
    return score

### Title Generation
 - main process of title generation
 - input: a file path
 - output: a sorted title list

In [9]:
def titleGen(path):
    # tfidf
    tf = tfGen(path)
    tfidf = tfidfGen(tf)

    # keyword
    word_ls = keywordExt(tfidf)
    word_pos_ls = keywordSel(word_ls)
    word_ls = [word for word, pos in word_pos_ls]
    # pp.pprint(word_ls)

    # get genre
    genre = ruleBaseClassify(word_ls)
    # print(genre)

    # title candidate generation
    title_candidates = titleCanGen(genre, word_pos_ls)
    # pp.pprint(title_candidates)

    # evaluation
    title_score = [(title, evaluation(title)) for title in title_candidates]
    title_score = sorted(title_score, key=lambda x: x[1], reverse=True)

    return title_score

### Task 1 Main Process
Put all the testing script file in one folder, and then find the best title for each movie.

In [10]:
# folder path
folder = "test"
files = os.listdir(folder)

for file in files:
    path = folder + '\\' + file
    title_score = titleGen(path)
    print(file, title_score[0])

  if sys.path[0] == '':
  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))
  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


Afanda.txt ('外星人世界', 1.4552664329468497)
La La Land.txt ('失控鼓手', 1.2002845092955547)
