# Hands on: Text classification walkthrough

### Input data

In [85]:
from IPython.display import Math
import pandas as pd
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

# create sample data:
data = {'app_name':  ['CSR Racing 2','Nitro Nation 6','MMX Hill dash','Candy Crush Saga','Fruit Block - Puzzle Legend','Bubble Shooter'],
        'app_description': ['Compete in races against live players across the world with your custom-built supercars, including LaFerrari, McLaren P1™, Koenigsegg One:1 and many more. Tune and customize your rides for maximum speed and dominate the competition in global crew events. Indulge your passion for the most amazing cars on the planet. Other car games can’t compete! Download the ultimate racing game for free, start your supercar collection and get racing now!'
                           ,'Race, mod and tune dozens of real licensed cars. Start a team, invite your friends, win tournaments. Trade parts with other drag racers online and build your dream car.'
                           ,'Race to the finish line over a multitude of racing tracks with hazards, hill climbs, jumps, loops, bridges and ramps. Go turbo with awesome truck upgrades and try to climb to the top of the leaderboard in this crazy MMX racing game that will test your driving skills to the limit!'
                           ,'Plan your moves by matching 3 or more candies in a row, using boosters wisely in order to overcome those extra sticky levels! Smash the chocolate and collect ingredients across thousands of levels guaranteed to have you craving more!'
                           ,'Fruit Block is a funny and juicy match-3 game. Fruit Block is a new play game with impressive game screen and effects. Welcome to the juicy fruit world! Start your journey with other players!'
                           ,'Classic Candy Bubble Shooter is a free game. Its a ancient puzzle and Match-Three game. As a classic game, Candy Bubble shooter is popular at all over the world.'],
        'category': ['racing', 'racing','racing','match3','match3','match3']
        }

df = pd.DataFrame(data)
df[['app_name','category','app_description']]

Unnamed: 0,app_name,category,app_description
0,CSR Racing 2,racing,Compete in races against live players across t...
1,Nitro Nation 6,racing,"Race, mod and tune dozens of real licensed car..."
2,MMX Hill dash,racing,Race to the finish line over a multitude of ra...
3,Candy Crush Saga,match3,Plan your moves by matching 3 or more candies ...
4,Fruit Block - Puzzle Legend,match3,Fruit Block is a funny and juicy match-3 game....
5,Bubble Shooter,match3,Classic Candy Bubble Shooter is a free game. I...


### Feature extraction

In [86]:
def extract_features(text):

    words_POS = []
    sentences = nltk.tokenize.sent_tokenize(text)
    
    for sentence in sentences:
        
        words = nltk.word_tokenize(sentence)
        
        # remove non alphabetic characters from each word
        non_alphabetic = re.compile('[^a-zA-Z]')
        alphabetic_words = []
        
        for word in words:
            word = non_alphabetic.sub('', word)
            # remove word if entirely non-alphabetic
            if word != '':    
                alphabetic_words.append(word)
                
        # add the POS tag to each word
        words_POS.extend(nltk.pos_tag(alphabetic_words))
    
    # keep nouns only
    features = [POS_tuple[0].lower() for POS_tuple in words_POS if 'NN' in POS_tuple[1]]
 
    # stem words
    stemmer = nltk.stem.snowball.EnglishStemmer()
    stemmed_features = [stemmer.stem(word) for word in features]
    
    return stemmed_features


# extract features (nouns) for all apps
df['features'] = df['app_description'].apply(lambda x: extract_features(x))
df[['app_name','category','app_description','features']]

Unnamed: 0,app_name,category,app_description,features
0,CSR Racing 2,racing,Compete in races against live players across t...,"[compet, race, player, world, supercar, laferr..."
1,Nitro Nation 6,racing,"Race, mod and tune dozens of real licensed car...","[race, mod, tune, dozen, car, team, friend, to..."
2,MMX Hill dash,racing,Race to the finish line over a multitude of ra...,"[race, line, multitud, track, hazard, jump, lo..."
3,Candy Crush Saga,match3,Plan your moves by matching 3 or more candies ...,"[move, candi, row, booster, order, level, smas..."
4,Fruit Block - Puzzle Legend,match3,Fruit Block is a funny and juicy match-3 game....,"[fruit, block, juici, match, game, fruit, bloc..."
5,Bubble Shooter,match3,Classic Candy Bubble Shooter is a free game. I...,"[candi, bubbl, shooter, game, puzzl, matchthre..."


### Apply cosine similarity

In [87]:
def compute_similarity(df, min_frequency, max_frequency, target_app):

    # move app to compare against to first row re-arranging the df index
    df = df.set_index(keys='app_name', drop=False)
    index_nr = df.index.get_loc(target_app)
    df = df.iloc[[index_nr] + [i for i in range(len(df)) if i != index_nr]]
    df = df.reset_index(drop=True)

    # create tf-idf matrix to compute cosine similarity
    tfidf = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, 
                            use_idf=True, norm="l2", min_df=min_frequency, 
                            max_df=max_frequency, ngram_range=(1, 1), lowercase=False)


    tfidf_matrix = tfidf.fit_transform(df['features'])

    similarity_matrix = cosine_similarity(tfidf_matrix)

    # add result to original df
    app_names = [name for name in df["app_name"]]
    dist_matrix = pd.DataFrame(similarity_matrix, columns=app_names)

    final_table = df.merge(dist_matrix.head(1).transpose(), left_on='app_name', 
                           right_index=True, how='left')
    final_table = final_table.rename(columns={0: "similarity"})

    output = final_table.sort_values(by='similarity', ascending=False).reset_index()
    
    return output


# minimum/maximum frequency a feature has to occur to be considered for tf-idf matrix
df = compute_similarity(df, target_app = 'Nitro Nation 6',
                        min_frequency = 0.3, max_frequency = 1.0)
print('Cosine similarity, comparison to: Nitro Nation 6')
df[['app_name','category','features','similarity']]

Cosine similarity, comparison to: Nitro Nation 6


Unnamed: 0,app_name,category,features,similarity
0,Nitro Nation 6,racing,"[race, mod, tune, dozen, car, team, friend, to...",1.0
1,CSR Racing 2,racing,"[compet, race, player, world, supercar, laferr...",0.7655
2,MMX Hill dash,racing,"[race, line, multitud, track, hazard, jump, lo...",0.324682
3,Candy Crush Saga,match3,"[move, candi, row, booster, order, level, smas...",0.0
4,Fruit Block - Puzzle Legend,match3,"[fruit, block, juici, match, game, fruit, bloc...",0.0
5,Bubble Shooter,match3,"[candi, bubbl, shooter, game, puzzl, matchthre...",0.0


The minimum frequency is an important tuning paramter and needs to be experimented with.

### Text classification
The CountVectorizer and TfidfTransformer methods are doing the minimal feature extraction needed.

In [88]:
def train_classifier(X_train, y_train):
    
    text_clf = Pipeline([ ('vect', CountVectorizer()),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MultinomialNB()) ])

    # Grid search using Stratified K validation:
    parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
                  'tfidf__use_idf': (True, False),
                   'vect__max_df': (1, 9e-1,8e-1,7e-1),
                  'clf__alpha': (1e-1, 1e-2, 1e-3),}
    
    clf = GridSearchCV(text_clf, parameters, cv = 2)

    clf = clf.fit(X_train, y_train)

    for param_name in sorted(parameters.keys()):
        print("%s: %r" % (param_name, clf.best_params_[param_name]))
    print('Resulting in the following score:',clf.best_score_)

    return clf

X_train = df['app_description']
y_train = df['category']

# find best classifier
clf = train_classifier(X_train, y_train)

# save tranformer and count vectorizer for new samples
ngram_range = clf.best_params_['vect__ngram_range']
use_idf = clf.best_params_['tfidf__use_idf']
alpha = clf.best_params_['clf__alpha']

count_vect = CountVectorizer(ngram_range=ngram_range)
count_vect._validate_vocabulary()

X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer(use_idf=use_idf).fit(X_train_counts)

# train classifier using best parameters
X_train_tf = tfidf_transformer.transform(X_train_counts)
best_clf = MultinomialNB(alpha=alpha).fit(X_train_tf, y_train)

clf__alpha: 0.1
tfidf__use_idf: True
vect__max_df: 1
vect__ngram_range: (1, 1)
Resulting in the following score: 0.833333333333


In [89]:
# create new sample:
data = {'app_name':  ['GTR Speed Rivals','Diamond Diaries Saga','Solitaire'],
        'app_description': ['Online drift racing game with ultra graphics! We worked hard to create the perfect drifting physics! Compete with other players for points and speed in multiplayer, level up your car, improve the engine, change the suspension, upgrade the wheels for better road grip, put beautiful rims on them, reduce the weight of your car’s body, and enhance the NITRO TYPE!', 
                            'The makers of Candy Crush Saga and Farm Heroes Saga are back with even bigger Cropsies in Farm Heroes Super Saga! Match 3 to collect Cropsies and help the Farm Heroes in this fun puzzle game!',
                            'Join over 100 million users playing our Solitaire for Android! Our version of Solitaire is free and is the most popular in the Play Store! Take on our Daily Challenges for a new Solitaire experience each and every day.']
        }

new_sample = pd.DataFrame(data)
new_sample[['app_name','app_description']]

Unnamed: 0,app_name,app_description
0,GTR Speed Rivals,Online drift racing game with ultra graphics! ...
1,Diamond Diaries Saga,The makers of Candy Crush Saga and Farm Heroes...
2,Solitaire,Join over 100 million users playing our Solita...


In [90]:
# apply count vectorizer and tfidf transformer to new sample
X_new_counts = count_vect.transform(new_sample['app_description'])
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

# make predictions using best model
predicted = best_clf.predict(X_new_tfidf)
predicted_probs = best_clf.predict_proba(X_new_tfidf)
predicted_probs = pd.DataFrame([max(i) for i in predicted_probs]) 
predictions = pd.concat([pd.Series(predicted), predicted_probs], axis=1)
predictions.columns = ['predicted_app_group', 'probability']
new_sample = pd.concat([new_sample, predictions], axis=1)
new_sample['probability'] = new_sample['probability'].apply(lambda x: round(x * 100))
new_sample[['app_name','app_description','predicted_app_group','probability']]

Unnamed: 0,app_name,app_description,predicted_app_group,probability
0,GTR Speed Rivals,Online drift racing game with ultra graphics! ...,racing,91
1,Diamond Diaries Saga,The makers of Candy Crush Saga and Farm Heroes...,match3,76
2,Solitaire,Join over 100 million users playing our Solita...,match3,62


The app store description texts contain enough information to classify them into content categories!