# RTB performance - Part 2

In [46]:
from IPython.display import Math
import pandas as pd
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sample catalogue: three racing games and three match-3 games, one entry per
# app, stored column-wise so it maps directly onto a DataFrame.
data = {
    'app_name': [
        'CSR Racing 2',
        'Nitro Nation 6',
        'MMX Hill dash',
        'Candy Crush Saga',
        'Fruit Block - Puzzle Legend',
        'Bubble Shooter',
    ],
    'app_description': [
        'Compete in races against live players across the world with your custom-built supercars, including LaFerrari, McLaren P1™, Koenigsegg One:1 and many more. Tune and customize your rides for maximum speed and dominate the competition in global crew events. Indulge your passion for the most amazing cars on the planet. Other car games can’t compete! Download the ultimate racing game for free, start your supercar collection and get racing now!',
        'Race, mod and tune dozens of real licensed cars. Start a team, invite your friends, win tournaments. Trade parts with other drag racers online and build your dream car.',
        'Race to the finish line over a multitude of racing tracks with hazards, hill climbs, jumps, loops, bridges and ramps. Go turbo with awesome truck upgrades and try to climb to the top of the leaderboard in this crazy MMX racing game that will test your driving skills to the limit!',
        'Plan your moves by matching 3 or more candies in a row, using boosters wisely in order to overcome those extra sticky levels! Smash the chocolate and collect ingredients across thousands of levels guaranteed to have you craving more!',
        'Fruit Block is a funny and juicy match-3 game. Fruit Block is a new play game with impressive game screen and effects. Welcome to the juicy fruit world! Start your journey with other players!',
        'Classic Candy Bubble Shooter is a free game. Its a ancient puzzle and Match-Three game. As a classic game, Candy Bubble shooter is popular at all over the world.',
    ],
    'category': [
        'racing',
        'racing',
        'racing',
        'match3',
        'match3',
        'match3',
    ],
}

df = pd.DataFrame(data)
df

Unnamed: 0,app_description,app_name,category
0,Compete in races against live players across t...,CSR Racing 2,racing
1,"Race, mod and tune dozens of real licensed car...",Nitro Nation 6,racing
2,Race to the finish line over a multitude of ra...,MMX Hill dash,racing
3,Plan your moves by matching 3 or more candies ...,Candy Crush Saga,match3
4,Fruit Block is a funny and juicy match-3 game....,Fruit Block - Puzzle Legend,match3
5,Classic Candy Bubble Shooter is a free game. I...,Bubble Shooter,match3


In [47]:
def extract_nouns_verbs(text, pos_prefixes=('NN',)):
    """Return the stemmed, lower-cased tokens of ``text`` that carry a wanted POS tag.

    The text is split into sentences, each sentence is word-tokenized, every
    non-letter character is stripped from the tokens (tokens left empty are
    dropped), and the surviving tokens are POS-tagged with ``nltk.pos_tag``.
    Tokens whose tag starts with one of ``pos_prefixes`` are lower-cased and
    stemmed with the Snowball English stemmer.

    NOTE: despite the function name, the default keeps only noun tags
    (``NN``, ``NNS``, ``NNP``, ``NNPS``) -- this is what the function has
    always done. Pass ``pos_prefixes=('NN', 'VB')`` to keep verbs as well.

    Parameters
    ----------
    text : str
        Free text, e.g. an app-store description.
    pos_prefixes : str or iterable of str, optional
        Tag prefixes to keep. Defaults to ``('NN',)``.

    Returns
    -------
    list of str
        Stemmed tokens in order of appearance; duplicates are preserved.

    Requires the NLTK tokenizer and tagger resources to be available.
    """
    # Accept a single prefix given as a plain string as well as any iterable.
    if isinstance(pos_prefixes, str):
        pos_prefixes = (pos_prefixes,)
    else:
        pos_prefixes = tuple(pos_prefixes)

    # Compiled once per call instead of once per sentence.
    non_alphabetic = re.compile('[^a-zA-Z]')

    words_POS = []
    for sentence in nltk.tokenize.sent_tokenize(text):
        # Strip non-letters from every token; drop tokens that end up empty.
        stripped = (non_alphabetic.sub('', word) for word in nltk.word_tokenize(sentence))
        alphabetic_words = [word for word in stripped if word]

        # Tag one sentence at a time and collect the (word, tag) pairs.
        words_POS.extend(nltk.pos_tag(alphabetic_words))

    stemmer = nltk.stem.snowball.EnglishStemmer()
    return [
        stemmer.stem(word.lower())
        for word, tag in words_POS
        if tag.startswith(pos_prefixes)
    ]

# Derive the stemmed feature tokens for every app description
# (the advertiser's app and all candidate apps alike).
df['features'] = df['app_description'].apply(extract_nouns_verbs)
df

Unnamed: 0,app_description,app_name,category,features
0,Compete in races against live players across t...,CSR Racing 2,racing,"[compet, race, player, world, supercar, laferr..."
1,"Race, mod and tune dozens of real licensed car...",Nitro Nation 6,racing,"[race, mod, tune, dozen, car, team, friend, to..."
2,Race to the finish line over a multitude of ra...,MMX Hill dash,racing,"[race, line, multitud, track, hazard, jump, lo..."
3,Plan your moves by matching 3 or more candies ...,Candy Crush Saga,match3,"[move, candi, row, booster, order, level, smas..."
4,Fruit Block is a funny and juicy match-3 game....,Fruit Block - Puzzle Legend,match3,"[fruit, block, juici, match, game, fruit, bloc..."
5,Classic Candy Bubble Shooter is a free game. I...,Bubble Shooter,match3,"[candi, bubbl, shooter, game, puzzl, matchthre..."


## Apply cosine similarity

In [48]:
# Cosine similarity of two vectors: their dot product divided by the product
# of their Euclidean norms. `\cos` typesets the operator upright and `\|`
# renders proper norm bars.
Math(r'\cos(\pmb x, \pmb y) = \frac {\pmb x \cdot \pmb y}{\|\pmb x\| \cdot \|\pmb y\|}')

<IPython.core.display.Math object>

In [50]:
def compute_similarity(df, min_frequency, max_frequency, target_app):
    """Rank every app in ``df`` by TF-IDF cosine similarity to ``target_app``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``app_name`` column and a ``features`` column holding
        one list of tokens per app. ``df`` itself is not modified.
    min_frequency, max_frequency : float or int
        Forwarded unchanged to ``TfidfVectorizer`` as ``min_df`` / ``max_df``.
    target_app : str
        Value of ``app_name`` identifying the app to compare against.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (target app moved to the front) with an added
        ``similarity`` column, sorted by similarity in descending order. The
        leading ``index`` column holds each row's position before sorting
        (0 is the target app).

    Raises
    ------
    KeyError
        If ``target_app`` does not occur in ``df['app_name']``.
    ValueError
        If ``target_app`` occurs more than once, which makes the reference
        row ambiguous.
    """
    # Locate the target by position so duplicate names elsewhere cannot interfere.
    matches = [pos for pos, name in enumerate(df['app_name']) if name == target_app]
    if not matches:
        raise KeyError("target_app {!r} not found in df['app_name']".format(target_app))
    if len(matches) > 1:
        raise ValueError(
            "target_app {!r} occurs {} times in df['app_name']; it must be unique".format(
                target_app, len(matches)))
    target_pos = matches[0]

    # Move the app to compare against to the first row.
    row_order = [target_pos] + [i for i in range(len(df)) if i != target_pos]
    reordered = df.iloc[row_order].reset_index(drop=True)

    # Features are already tokenized lists, so the vectorizer must pass them
    # through untouched (identity preprocessor/tokenizer, no lower-casing).
    tfidf = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, use_idf=True, norm="l2",
                            min_df=min_frequency, max_df=max_frequency, ngram_range=(1, 1), lowercase=False)
    tfidf_matrix = tfidf.fit_transform(reordered['features'])

    # Only the target's row of the similarity matrix is needed: compare row 0
    # against every row instead of building the full N x N matrix.
    similarity_to_target = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix).ravel()

    # Attach the scores positionally; joining on app_name would duplicate rows
    # whenever two apps share a name.
    final_table = reordered.assign(similarity=similarity_to_target)

    output = final_table.sort_values(by='similarity', ascending=False).reset_index()

    return output


# min_frequency / max_frequency bound how often a feature must occur to enter
# the tf-idf matrix (they are handed to the vectorizer as min_df / max_df).
output = compute_similarity(
    df,
    min_frequency=0.3,
    max_frequency=1.0,
    target_app='Nitro Nation 6',
)
output

Unnamed: 0,index,app_description,app_name,category,features,similarity
0,0,"Race, mod and tune dozens of real licensed car...",Nitro Nation 6,racing,"[race, mod, tune, dozen, car, team, friend, to...",1.0
1,1,Compete in races against live players across t...,CSR Racing 2,racing,"[compet, race, player, world, supercar, laferr...",0.7655
2,2,Race to the finish line over a multitude of ra...,MMX Hill dash,racing,"[race, line, multitud, track, hazard, jump, lo...",0.324682
3,3,Plan your moves by matching 3 or more candies ...,Candy Crush Saga,match3,"[move, candi, row, booster, order, level, smas...",0.0
4,4,Fruit Block is a funny and juicy match-3 game....,Fruit Block - Puzzle Legend,match3,"[fruit, block, juici, match, game, fruit, bloc...",0.0
5,5,Classic Candy Bubble Shooter is a free game. I...,Bubble Shooter,match3,"[candi, bubbl, shooter, game, puzzl, matchthre...",0.0


The minimum frequency is an important tuning parameter and needs to be experimented with. Raising it makes the metric less sensitive to rare words and thus more resilient to noise in the data.