#### Buffalo's word2vec supports only the skip-gram algorithm with negative sampling (no hierarchical softmax)

In [2]:
from buffalo import W2V, W2VOption
from buffalo import StreamOptions
from buffalo import aux, log
log.set_log_level(1)  # low verbosity; set the log level to 3 or higher to see more information

In [3]:
# Start from the default word2vec options and override a handful of them.
# See buffalo/algo/options.py for the full list of available options.
opt = W2VOption().get_default_option()

# Training schedule / parallelism.
opt.num_workers = 8
opt.num_iters = 15

# Model settings (presumably: `d` is the embedding size and `min_count` the
# minimum item frequency -- confirm against buffalo/algo/options.py).
opt.d = 100
opt.min_count = 2
opt.num_negative_samples = 5

opt  # last expression of the cell: display the resulting option set

{'evaluation_on_learning': False,
 'compute_loss_on_training': True,
 'early_stopping_rounds': 0,
 'save_best': False,
 'evaluation_period': 1,
 'save_period': 10,
 'random_seed': 0,
 'validation': {},
 'num_workers': 8,
 'num_iters': 15,
 'd': 100,
 'window': 5,
 'min_count': 2,
 'sample': 0.001,
 'num_negative_samples': 5,
 'lr': 0.025,
 'min_lr': 0.0001,
 'model_path': '',
 'data_opt': {}}

In [4]:
# Default stream-data options, pointed at the MovieLens-1M files.
# Both paths are relative to the directory the notebook is run from.
data_opt = StreamOptions().get_default_option()
data_opt.input.iid = 'data/ml-1m/iid'
data_opt.input.main = 'data/ml-1m/stream'

In [5]:
# Build the word2vec model from the algorithm options and the data options
# defined in the cells above.
w2v_model = W2V(opt, data_opt=data_opt)
# Must be called before train(); presumably prepares the data and the model
# parameters -- TODO confirm against the buffalo documentation.
w2v_model.initialize()

In [6]:
# Train the model; the value returned by train() is shown as the cell output.
w2v_model.train()

{}

### Similar movies to Lion King

In [7]:
# Look up the 5 nearest neighbours of a single movie and print them as a
# ranked list ("rank. score title").
movie_name = "Lion_King,_The_(1994)"
print('Similar movies to', movie_name)
similar_items = w2v_model.most_similar(movie_name, 5)
# Use a dedicated loop variable: reusing `movie_name` here would overwrite the
# query title with the last result once the loop finishes, leaving misleading
# state behind for any later cell (or a re-run of part of this cell).
for rank, (similar_name, score) in enumerate(similar_items, start=1):
    print(f'{rank:02d}. {score:.3f} {similar_name}')

Similar movies to Lion_King,_The_(1994)
01. 0.792 Hunchback_of_Notre_Dame,_The_(1996)
02. 0.722 Mulan_(1998)
03. 0.721 Beauty_and_the_Beast_(1991)
04. 0.713 Hercules_(1997)
05. 0.688 Anastasia_(1997)


The results are all animated features, nearly all of them from Disney, as you would expect. word2vec works quite well in the recommendation domain.

### Feature vector of SF movies

In [8]:
# Equal-weight combination of three SF titles. The same titles are excluded
# from the printed neighbours below, so they are written down only once.
sf_weights = {
    'Star_Wars:_Episode_IV_-_A_New_Hope_(1977)': 0.3,
    'Stargate_(1994)': 0.3,
    'Starship_Troopers_(1997)': 0.3,
}
movie_names_to_filter = list(sf_weights)
sf_wv = w2v_model.get_weighted_feature(sf_weights)

# `i` is the position in the unfiltered top-10 list, so the printed numbers
# skip the positions that were occupied by the filtered (input) titles.
for i, (name, score) in enumerate(w2v_model.most_similar(sf_wv, 10)):
    if name not in movie_names_to_filter:
        print(i, "movie name:", name, "score:", score)

2 movie name: Star_Trek:_Generations_(1994) score: 0.7971603
3 movie name: Star_Trek_VI:_The_Undiscovered_Country_(1991) score: 0.7936294
4 movie name: Rocketeer,_The_(1991) score: 0.76695454
5 movie name: Star_Trek:_Insurrection_(1998) score: 0.71424824
6 movie name: Superman_II_(1980) score: 0.69101596
7 movie name: Star_Trek:_First_Contact_(1996) score: 0.6870025
8 movie name: Demolition_Man_(1993) score: 0.6859702
9 movie name: Deep_Impact_(1998) score: 0.67796385


Summing the vectors of three SF movies, namely Star Wars, Stargate, and Starship Troopers, gives Star Trek!

### Arithmetic among features

In [9]:
# Animated titles that are combined, each with weight 1, into one feature
# vector representing "animation".
animation_titles = [
    "Bug's_Life,_A_(1998)",
    'Wallace_&_Gromit:_The_Best_of_Aardman_Animation_(1996)',
    'Sleeping_Beauty_(1959)',
    'Toy_Story_(1995)',
    'South_Park:_Bigger,_Longer_and_Uncut_(1999)',
    'Creature_Comforts_(1990)',
    'Lion_King,_The_(1994)',
    'Mulan_(1998)',
]
animation_wv = w2v_model.get_weighted_feature(dict.fromkeys(animation_titles, 1))

First, we sum and normalize the vectors of several animated movies (the cell above).

In [10]:
# Drama titles that are combined, each with weight 0.3, into one feature
# vector representing "drama".
# NOTE(review): `drama_wv` is not referenced by any of the cells visible in
# this notebook -- confirm whether it is still needed.
drama_titles = [
    'Ben-Hur_(1959)',
    'Kolya_(1996)',
    'Shall_We_Dance?_(Shall_We_Dansu?)_(1996)',
]
drama_wv = w2v_model.get_weighted_feature(dict.fromkeys(drama_titles, 0.3))

Next, we generate a weight vector for the drama genre by summing the vectors of several drama movies.

Then we add the animation vector to the SF vector (the drama vector is not used in this combination) to see whether the result points to SF animations.

In [11]:
# Combine the animation vector with the SF vector built in an earlier cell
# (presumably NumPy arrays, so `+` adds them element-wise -- TODO confirm).
wv = animation_wv + sf_wv

In [12]:
# Titles that are hidden from the printed neighbour list.
# NOTE(review): this list names Toy Story and the drama titles, whereas `wv`
# is built from the animation and SF vectors; the titles that went into `wv`
# (e.g. Stargate, Mulan) are therefore not filtered out -- confirm whether
# that is intended.
movie_names_to_filter = [
    'Toy_Story_(1995)',
    'Ben-Hur_(1959)',
    'Kolya_(1996)',
    'Shall_We_Dance?_(Shall_We_Dansu?)_(1996)',
]

# `i` is the position in the unfiltered top-10 list.
neighbours = w2v_model.most_similar(wv, 10)
for i, (name, score) in enumerate(neighbours):
    if name not in movie_names_to_filter:
        print(i, "movie name:", name, "score:", score)

0 movie name: Metisse_(Caf_au_Lait)_(1993) score: 0.70666397
1 movie name: Stargate_(1994) score: 0.68651146
2 movie name: Mulan_(1998) score: 0.6690214
3 movie name: Callejn_de_los_milagros,_El_(1995) score: 0.65673596
4 movie name: Ghost_in_the_Shell_(Kokaku_kidotai)_(1995) score: 0.6557385
5 movie name: Star_Trek_VI:_The_Undiscovered_Country_(1991) score: 0.64872766
6 movie name: Starship_Troopers_(1997) score: 0.6440331
7 movie name: Star_Trek:_Generations_(1994) score: 0.640712
8 movie name: Lion_King,_The_(1994) score: 0.6243777
9 movie name: Rocketeer,_The_(1991) score: 0.6216627


Among the high-scoring items, only "Ghost_in_the_Shell_(Kokaku_kidotai)_(1995)" is an SF animation.
We conjecture that arithmetic operations on item vectors do not work well in the item recommendation domain.