In [1]:
import json
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
from fuzzywuzzy import process
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
# Make sure the "punkt" tokenizer data is available locally before any
# nltk.word_tokenize call; this does not re-download when the package is
# already up to date.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/karol/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load and label the source data

In [2]:
# Load the catalogue of known vehicles. Each entry is read below through its
# "make" and "model" keys. JSON is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default encoding.
with open("../make_and_model_clean.json", "r", encoding="utf-8") as fp:
    make_and_model = json.load(fp)

# Human-readable "<make> <model>" strings, index-aligned with make_and_model so
# that a numeric label can be turned back into a name by position.
friendly_make_and_model = [
    f"{entry['make']} {entry['model']}" for entry in make_and_model
]

def get_ngrams(phrase):
    """Return the word tokens of ``phrase`` followed by its adjacent-word pairs.

    The phrase is split with ``nltk.word_tokenize``; every pair yielded by
    ``nltk.bigrams`` is joined with a single space.

    Returns:
        A list holding all unigrams first, then all bigrams.
    """
    tokens = nltk.word_tokenize(phrase)
    pairs = []
    for pair in nltk.bigrams(tokens):
        pairs.append(" ".join(pair))
    return tokens + pairs
    
def _normalize_for_matching(value):
    """Lower-case ``value`` and turn "/", "-" and "_" into single spaces."""
    spaced = value.lower().replace("/", " ").replace("-", " ").replace("_", " ")
    return " ".join(spaced.split())


# Catalogue names are the same on every call, so memoise their normalised form.
_normalize_catalogue_name = lru_cache(maxsize=None)(_normalize_for_matching)


def is_make_and_model_in_text(text):
    """Return the indices of ``make_and_model`` entries mentioned in ``text``.

    An entry counts as mentioned when both its make and its model appear among
    the unigrams/bigrams of the normalised text.

    The same normalisation (lower-casing, "/", "-" and "_" replaced by spaces)
    is applied to the text AND to the catalogue names. Previously only the text
    was rewritten, so a make or model that itself contains one of those
    separators could never be found in the separator-free n-grams.

    Args:
        text: Free text to scan (the notebook passes landing-page URLs).

    Returns:
        A list of integer positions into ``make_and_model``, in catalogue
        order; empty when nothing matches.
    """
    # A set gives constant-time membership tests; the n-gram list used to be
    # scanned twice for every catalogue entry.
    text_ngrams = set(get_ngrams(_normalize_for_matching(text)))
    return [
        idx
        for idx, model in enumerate(make_and_model)
        if _normalize_catalogue_name(model["model"]) in text_ngrams
        and _normalize_catalogue_name(model["make"]) in text_ngrams
    ]


# Read the keyword export and tag every row with the indices of the
# make_and_model entries detected in its "URL" column.
search_queries_df = pd.read_csv(
    "autotrader.com-organic.Positions-uk-20200507-2020-05-08T15_39_18Z.csv"
).assign(labels=lambda frame: frame["URL"].apply(is_make_and_model_in_text))
search_queries_df.head()

Unnamed: 0,Keyword,Position,Previous position,Search Volume,Keyword Difficulty,CPC,URL,Traffic,Traffic (%),Traffic Cost,Competition,Number of Results,Trends,Timestamp,SERP Features by Keyword,labels
0,autotrader,4,5,5000000,83.02,0.48,https://www.autotrader.com/used-cars,70000,20.06,33600.0,0.02,19500000,"[100,82,100,82,82,82,82,82,82,67,100,100]",1588888110,"Knowledge panel, Site links, Reviews, Video Ca...",[]
1,auto trader,10,9,2240000,89.93,0.48,https://www.autotrader.com/used-cars,13440,3.85,6451.0,0.02,61,"[37,82,20,24,20,100,30,30,30,24,37,20]",1588844263,"Knowledge panel, Top stories, Image pack, Site...",[]
2,autotrader usa,1,1,5400,82.44,0.31,https://www.autotrader.com/,4320,1.23,1339.0,0.04,58,"[67,67,67,67,100,100,82,82,67,67,82,67]",1588876794,"Image pack, Site links, Reviews, Tweet, FAQ",[]
3,ford mustang 1969,1,1,3600,80.11,0.85,https://classics.autotrader.com/classic-cars-f...,2880,0.82,2448.0,0.4,38600000,"[30,30,44,54,81,81,100,81,81,67,100,81]",1588816238,"Image pack, Site links, Reviews, Video Carouse...",[152]
4,1967 chevy impala,1,1,2900,77.92,0.0,https://classics.autotrader.com/classic-cars-f...,2320,0.66,0.0,0.2,93,"[55,43,55,66,82,100,82,82,66,66,82,82]",1588790230,"Image pack, Site links, Reviews, Video Carouse...",[]


In [3]:
# Rows whose URL matched more than one catalogue entry, with the numeric labels
# translated back into readable "<make> <model>" names for inspection.
#
# The frame is built in one expression with .loc + .assign so the new column is
# added to an independent frame. Assigning a column onto the result of chained
# indexing (df[mask][cols]) triggers pandas' SettingWithCopyWarning.
multi_labelled_df = search_queries_df.loc[
    lambda frame: frame["labels"].apply(len) > 1,
    ["Keyword", "URL", "labels"],
].assign(
    friendly_label=lambda frame: frame["labels"].apply(
        lambda labels: [friendly_make_and_model[i] for i in labels]
    )
)

multi_labelled_df

Unnamed: 0,Keyword,URL,labels,friendly_label
86,ford focus vs fiesta,https://www.autotrader.com/car-reviews/2018-fo...,"[135, 136]","[FORD FIESTA, FORD FOCUS]"
123,kia santa fe,https://www.autotrader.com/car-reviews/2018-ki...,"[793, 836]","[HYUNDAI SANTA FE, KIA SORENTO]"
190,fiesta vs focus,https://www.autotrader.com/car-reviews/2018-fo...,"[135, 136]","[FORD FIESTA, FORD FOCUS]"
276,toyota civic,https://www.autotrader.com/car-reviews/2019-ho...,"[169, 368]","[HONDA CIVIC, TOYOTA COROLLA]"
286,xc40 vs xc60,https://www.autotrader.com/car-reviews/2019-vo...,"[1253, 1254]","[VOLVO XC40, VOLVO XC60]"
...,...,...,...,...
49704,2018 sq5 vs macan s,https://www.autotrader.com/car-reviews/2015-po...,"[24, 353]","[AUDI Q5, PORSCHE MACAN]"
49728,2018 jeep cherokee apple carplay,https://www.autotrader.com/car-reviews/2018-je...,"[818, 821]","[JEEP CHEROKEE, JEEP GRAND CHEROKEE]"
49880,corvette stingray top speed 2017,https://www.autotrader.com/car-reviews/2017-ch...,"[582, 640, 641]","[CHEVROLET CORVETTE, CORVETTE STINGRAY, CORVET..."
49982,bmw 325i dimensions,https://www.autotrader.com/car-reviews/2016-bm...,"[62, 66]","[BMW 3 SERIES, BMW 5 SERIES]"


In [4]:
# Training material: keep only the rows with exactly one matched label, unwrap
# that single-element list into a scalar, and drop any row with a missing value.
classification_df = (
    search_queries_df.loc[
        lambda frame: frame["labels"].apply(len) == 1,
        ["Keyword", "labels"],
    ]
    .explode("labels")
    .dropna(how="any")
)
classification_df

Unnamed: 0,Keyword,labels
3,ford mustang 1969,152
6,1969 mustang,152
9,1967 mustang,152
11,69 mustang,152
12,1968 mustang,152
...,...,...
49987,alfa romeo 4x4 suv,516
49991,eleanor mustang restomod,152
49992,vw durham,464
49993,bmw 330i 2018 specs,62


### Distribution of classes at a glance

In [5]:
# Attach the readable "<make> <model>" name for each numeric label, then count
# how many keywords fall into each class, most frequent class first.
classification_df["friendly_label"] = [
    friendly_make_and_model[label_id] for label_id in classification_df["labels"]
]
distribution_of_classes_df = (
    classification_df.groupby("friendly_label")
    .size()
    .reset_index(name="counts")
    .sort_values(by=["counts"], ascending=False)
)
distribution_of_classes_df

Unnamed: 0,friendly_label,counts
183,FORD MUSTANG,931
100,CHEVROLET CAMARO,499
103,CHEVROLET CORVETTE,378
191,HONDA CIVIC,334
351,PORSCHE 911,317
...,...,...
264,LEXUS LS 430,1
423,VOLKSWAGEN CORRADO,1
203,HYUNDAI I20,1
266,LEXUS LS 600H,1


### Vectorizer for search phrases

In [6]:
# Bag-of-words features over unigrams and bigrams of the (lower-cased) keyword.
vectorizer = CountVectorizer(ngram_range=(1, 2), lowercase=True)

# Fixed random_state keeps the train/test split reproducible between runs.
training_phrases, test_phrases, training_labels, test_labels = train_test_split(
    classification_df["Keyword"].astype('string'),
    classification_df["labels"].astype('int64'),
    random_state=1)

# The vocabulary is learned from the training split only; the test split is
# merely projected onto it.
training_phrases_cv = vectorizer.fit_transform(training_phrases)
test_phrases_cv = vectorizer.transform(test_phrases)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when it exists and fall back for older installations.
feature_names = [
    str(name)
    for name in (
        vectorizer.get_feature_names_out()
        if hasattr(vectorizer, "get_feature_names_out")
        else vectorizer.get_feature_names()
    )
]

# Sum term counts directly on the sparse matrix. The dense
# (documents x vocabulary) intermediate that .toarray() used to build is no
# longer materialised; only the per-term totals are needed here.
term_totals = np.asarray(training_phrases_cv.sum(axis=0)).ravel()
top_words_df = pd.DataFrame(
    pd.Series(term_totals, index=feature_names)
).sort_values(0, ascending=False)
top_words_df

Unnamed: 0,0
for,2182
sale,2177
for sale,2168
ford,913
bmw,827
...,...
edge horn,1
edge norwalk,1
edge packages,1
edge reviews,1


### Train the model using the training set

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.naive_bayes import ComplementNB

# Fit on the training split (fit() returns the estimator itself) and predict
# the held-out phrases.
naive_bayes = ComplementNB(alpha=1, fit_prior=False).fit(
    training_phrases_cv, training_labels
)
predictions = naive_bayes.predict(test_phrases_cv)

# Readable names in the same order as the classifier's classes_ array.
friendly_nb_classes = [
    friendly_make_and_model[class_id] for class_id in naive_bayes.classes_
]

# NOTE: with exactly one label per sample, micro-averaged precision and recall
# are the same quantity as accuracy, so the three figures coincide.
micro_scores = {
    "Accuracy score: ": accuracy_score(test_labels, predictions),
    "Precision score: ": precision_score(test_labels, predictions, average='micro'),
    "Recall score: ": recall_score(test_labels, predictions, average='micro'),
}
for caption, value in micro_scores.items():
    print(caption, value)

Accuracy score:  0.8960043787629994
Precision score:  0.8960043787629994
Recall score:  0.8960043787629994


### See top bigrams and unigrams per label

In [8]:
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when it exists and fall back for older installations.
feature_names = [
    str(name)
    for name in (
        vectorizer.get_feature_names_out()
        if hasattr(vectorizer, "get_feature_names_out")
        else vectorizer.get_feature_names()
    )
]

# One row per class, one column per vocabulary term.
# * feature_log_prob_ replaces coef_, which was removed from the naive Bayes
#   estimators in scikit-learn 1.1.
# * The class names go straight into the index. Adding them as a "model" column
#   and then calling set_index would overwrite (and so drop) the scores of any
#   vocabulary term that is literally "model".
scores = pd.DataFrame(
    naive_bayes.feature_log_prob_,
    index=pd.Index(friendly_nb_classes, name="model"),
    columns=feature_names,
)

# For every class, the five terms with the largest score, comma separated.
top_words_per_label = scores.apply(
    lambda row: ", ".join(row.nlargest(n=5).index),
    axis=1,
)
top_words_per_label.to_frame()

Unnamed: 0_level_0,0
model,Unnamed: 1_level_1
AUDI 100,"100 1975, 100 quattro, audi 100, layton 100, q..."
AUDI A1,"a1, a1 autotrader, a1 for, audi a1, 00"
AUDI A2,"a2, a2 2019, a2 usa, audi a2, 00"
AUDI A3,"2014 vs, 2015 hatchback, 2016 a3, 2017 models, a3"
AUDI A4,"08 audi, 2010 specs, 2013 model, 2014 automati..."
...,...
VOLVO V90,"v90 best, v90 price, v90 t6, 00, 00 ford"
VOLVO XC40,"amazon, amazon blue, avis, avis volvo, blue volvo"
VOLVO XC60,"2010 xc60, 2014 volvo, 2015 xc60, autotrader x..."
VOLVO XC70,"2013 xc70, drive platinum, reliability by, t5 ..."


### See wrongly classified cases in the test set

In [9]:
# Translate predicted and true label ids back into readable names.
testing_predictions = [friendly_make_and_model[predicted_id] for predicted_id in predictions]
actual_labels = [friendly_make_and_model[true_id] for true_id in test_labels]

check_df = pd.DataFrame(
    {
        "actual_label": actual_labels,
        "prediction": testing_predictions,
        "phrase": list(test_phrases),
    }
)
print("test set size", len(check_df.index))

# Keep the misses and attach how many labelled keywords each true class has in
# the whole labelled set (counts come from distribution_of_classes_df).
wrongly_classified = check_df.loc[
    check_df["actual_label"].ne(check_df["prediction"])
].merge(
    distribution_of_classes_df,
    left_on="actual_label",
    right_on="friendly_label",
)

# Misses whose true class has a single labelled keyword overall.
print("unique models", int(wrongly_classified["counts"].eq(1).sum()))
wrongly_classified.sort_values(by="counts", ascending=False)

test set size 3654
unique models 18


Unnamed: 0,actual_label,prediction,phrase,friendly_label,counts
147,FORD MUSTANG,BUICK RIVIERA,2012 roush stage 3 for sale,FORD MUSTANG,931
146,FORD MUSTANG,LEXUS RC F,sn95 track car,FORD MUSTANG,931
145,FORD MUSTANG,BMW X1,cars with manual transmission and adaptive cru...,FORD MUSTANG,931
144,FORD MUSTANG,FORD GT,jeremy clarkson ford gt,FORD MUSTANG,931
250,CHEVROLET CORVETTE,CHEVROLET CAMARO,chevrolet hampshire,CHEVROLET CORVETTE,378
...,...,...,...,...,...
181,MORGAN AERO,MORGAN 3 WHEELER,morgan aero 8 for sale,MORGAN AERO,1
70,LEXUS GS 300,LEXUS LS 400,1999 lexus gs300 review,LEXUS GS 300,1
95,CHRYSLER 300C,BMW 7 SERIES,chrysler 300 rolls royce conversion kit,CHRYSLER 300C,1
41,JENSEN INTERCEPTOR,JAGUAR XK,jensen cars for sale usa,JENSEN INTERCEPTOR,1


### Try out with phrase correction

In [10]:
# Free-text query used to try out phrase correction and the trained classifier.
phrase_to_check = "how much is mustang"

def correct_phrase(phrase, vocabulary=None, min_score=97):
    """Snap each token of ``phrase`` to its closest known vocabulary term.

    Every token from ``nltk.word_tokenize`` is fuzzy-matched against the
    vocabulary with ``process.extractOne``. A token is replaced by its best
    match when that match scores strictly above ``min_score``; otherwise the
    token is dropped.

    Args:
        phrase: Raw query text.
        vocabulary: Optional iterable of candidate terms. Defaults to the
            feature names of the module-level ``vectorizer``.
        min_score: Exclusive lower bound on the match score.

    Returns:
        The retained, corrected tokens joined by single spaces (possibly "").

    Side effects:
        Prints the list of retained tokens.
    """
    if vocabulary is None:
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement when it exists and fall back for older installations.
        vocabulary = (
            vectorizer.get_feature_names_out()
            if hasattr(vectorizer, "get_feature_names_out")
            else vectorizer.get_feature_names()
        )
    # Plain ``str`` items keep the printed list and the joined result free of
    # array scalar types.
    candidates = [str(term) for term in vocabulary]

    corrected_tokens = []
    for token in nltk.word_tokenize(phrase):
        # One fuzzy search per token; it was previously run twice (once for
        # the score check and once for the value).
        best_match = process.extractOne(token, candidates)
        # Treat a None result (no candidate at all) as "no match" rather than
        # failing on the subscript.
        if best_match is not None and best_match[1] > min_score:
            corrected_tokens.append(best_match[0])
    print(corrected_tokens)
    return " ".join(corrected_tokens)

# Correct the query, vectorize it with the training-split vectorizer and rank
# the classes by predicted probability.
corrected_phrase = correct_phrase(phrase_to_check)
phrase_features = vectorizer.transform([corrected_phrase])
probs = naive_bayes.predict_proba(phrase_features)[0]
print(naive_bayes.get_params())
df = pd.DataFrame({"probs": probs, "model": friendly_nb_classes})
df.sort_values("probs", ascending=False).head(5)

['how', 'much', 'is', 'mustang']
{'alpha': 1, 'class_prior': None, 'fit_prior': False, 'norm': False}


Unnamed: 0,probs,model
75,0.503756,FORD MUSTANG
357,0.001482,MCLAREN P1
102,0.001479,NISSAN LEAF
405,0.00142,TESLA MODEL S
261,0.001408,FISKER KARMA


### Train the final model on the entire labelled dataset

In [14]:
# Refit both the vectorizer and the classifier on every labelled keyword (no
# hold-out), using the same hyper-parameters as the evaluated model.
final_phrases = classification_df["Keyword"].astype('string')
final_labels = classification_df["labels"].astype('int64')

vectorizer_final = CountVectorizer(ngram_range=(1, 2), lowercase=True)
all_phrases_cv = vectorizer_final.fit_transform(final_phrases)
naive_bayes_final = ComplementNB(alpha=1, fit_prior=False)
naive_bayes_final.fit(all_phrases_cv, final_labels)

# Readable names in the same order as the final classifier's classes_ array.
friendly_nb_classes_final = [
    friendly_make_and_model[class_id] for class_id in naive_bayes_final.classes_
]

### Try out the final model

In [17]:
phrase_to_check_final = "Dino"
# NOTE(review): correct_phrase() matches tokens against the vocabulary of
# `vectorizer` (fitted on the training split), not `vectorizer_final`; confirm
# that this is intended when scoring with the final model.
corrected_phrase_final = correct_phrase(phrase_to_check_final)
final_phrase_features = vectorizer_final.transform([corrected_phrase_final])
probs_final_check = naive_bayes_final.predict_proba(final_phrase_features)[0]
print(naive_bayes_final.get_params())
final_check_df = pd.DataFrame(
    {"probs": probs_final_check, "model": friendly_nb_classes_final}
)
final_check_df.sort_values("probs", ascending=False).head(5)

['dino']
{'alpha': 1, 'class_prior': None, 'fit_prior': False, 'norm': False}


Unnamed: 0,probs,model
236,0.00439,FERRARI 246
238,0.002924,FERRARI 308
221,0.002195,CORVETTE STINGRAY
354,0.002195,MAYBACH 62
273,0.002195,GMC SYCLONE


### Save the model

In [18]:
from joblib import dump, load

# Persist the final classifier together with the vectorizer it was fitted
# with; the last call's return value (the written file names) is the cell output.
dump(value=naive_bayes_final, filename='../model.joblib')
dump(value=vectorizer_final, filename='../vectorizer.joblib')

['../vectorizer.joblib']