## Import functions

In [1]:
from myfun import *

## Load the 'model'

In [2]:
# Model

granularite = 0.01

dic_of_words = LoadPickle("dic_of_words.p")
WordToRequest = LoadPickle("WordToRequest.p")
df = LoadPickle("df.p")
dic_of_requests = LoadPickle("dic_of_requests.p")

DefaultCategory = df.category.value_counts().index[0]
OverallDisDic = GetOverallCategoryDistribution(df)
Overallp =  DotProductFromDic(OverallDisDic)

PowerDicDis = GetPowerDicDis(Overallp,dic_of_words,granularite)

In [3]:
## Most Powerful words by category

d = AutoVivification()

number_of_words = 0
for word,informations in dic_of_words.items():
    number_of_words = number_of_words + 1
    print("\rWords viewed so far : ",number_of_words,end = "")
    for category,frequence in informations.items():
        d[category][word] = d[category].get(word,0) + frequence
        
Powerful_Words_by_Category = {}
for category,informations in d.items():
    templist = [(word,frequence*PowerDicDis[word]) for word,frequence in informations.items()]
    total_mass = sum([item[1] for item in templist])
    templist = [(item[0],item[1],item[1]/total_mass) for item in templist]
    templist.sort(key = lambda a : -a[1])
    Powerful_Words_by_Category[category] = templist

Words viewed so far :  70172

## Functions used to explore

In [None]:
def list_to_string(b):
    X = []
    for row in b:
        tp = "  |  ".join([str(item) for item in row])
        X.append(tp)
    solution = "\n".join(X)
    return solution

In [4]:
def Explore_Prediction(hyp,IdRequest):

    SelectedRequest, SelectedCategory, SelectedWords = GetInformationsFromIDRequest(IdRequest,df)
    UpdateDic(IdRequest,SelectedRequest, SelectedCategory, SelectedWords,dic_of_words,WordToRequest,True)
    sauvegardePowerDicDis = SavePowerDicDis(SelectedWords,PowerDicDis)
    UpdatePowerDicDis(SelectedWords,dic_of_words,sauvegardePowerDicDis,granularite,Overallp,PowerDicDis,True)

    RequestDic = GetRequestWeight(SelectedRequest,PowerDicDis)
    RequestDic_tuples = [(k,v) for k,v in RequestDic.items()]
    RequestDic_tuples.sort(key=lambda tup: -tup[1])
    ranked_words = [item[0] for item in RequestDic_tuples if item[1]>0.0001]
    ranked_words = Clean_Ranked_Words(ranked_words,WordToRequest)

    PREDICTIONS = ALL_PREDICTIONS_FROM_PARTITIONS(ranked_words,
                                                  RequestDic_tuples,dic_of_requests,WordToRequest,
                                                  hyp,[hyp],DefaultCategory,with_details=True)

    UpdateDic(IdRequest,SelectedRequest, SelectedCategory, SelectedWords,dic_of_words,WordToRequest,False)
    UpdatePowerDicDis(SelectedWords,dic_of_words,sauvegardePowerDicDis,granularite,Overallp,PowerDicDis,False)

    dftoprint = PREDICTIONS[0][2].head(25)
    dftoprint = dftoprint.drop(["score"], axis=1)
    atp = "ID : " + str(IdRequest) 
    btp = "Query : " + str(SelectedRequest)
    ctp = "Prediction : " + str(SelectedCategory==PREDICTIONS[0][1]).upper()
    dtp = "True Category : " + str(SelectedCategory) + " | Predicted Category : " + str(PREDICTIONS[0][1])    
    toprint = atp + "\n" + btp + "\n" + ctp + "\n" + dtp + "\n\nBest Queries Candidates"
    
    return toprint,dftoprint,SelectedCategory,PREDICTIONS[0][1]

In [5]:
def pick_queries_within_category(idcat,nobs=10):
    return df[df.category == idcat].head(nobs) 

## Explore the results

#### **Pick a random query and make a prediction**

In [6]:
idq = np.random.choice(np.arange(len(df)))
#idq = 57906
toprint,dftoprint,SelectedCategory,PredictedCategory = Explore_Prediction(0.85,idq)
print(toprint)
print(dftoprint)

ID : 565519
Query : mtp camo
Prediction : TRUE
True Category : 863 | Predicted Category : 863

Best Queries Candidates
    idrequest                  request  category  score_relatif
0      147267            mtp sas smock       863        0.85186
1       33091              mtp uniform       863        0.85186
2      519813           mtp para smock       863        0.85186
3      452815            mtp chest rig       863        0.85186
4      484499              mtp daysack       863        0.85186
5      537620             mtp day sack      1072        0.85186
6      295575  mtp bergen side pouches      1080        0.85186
7      504218                mtp basha       863        0.85186
8      367066                  mtp kit      1412        0.85186
9      311966               mtp bergen      1080        0.85186
10     104161              mtp webbing      1351        0.85186
11     565412                 mtp army       127        0.85186
12     138852             mtp trousers       863 

#### **List the TOP 15 powerful words of the category the random request belongs to**

In [7]:
print("True Category : ", SelectedCategory)
Powerful_Words_by_Category[SelectedCategory][:15]

True Category :  863


[('army', 13026.752992770767, 0.06495460693975043),
 ('regiment', 9500.71562996453, 0.047372914012659896),
 ('mtp', 6249.50487994793, 0.031161574436111922),
 ('tactical', 4991.637456146344, 0.024889536873053),
 ('military', 4561.173193996981, 0.02274313577332858),
 ('legion', 4134.691959965385, 0.020616595035273952),
 ('tattoo', 3638.9986532719213, 0.018144945813337587),
 ('records', 3628.3896462075195, 0.01809204668457759),
 ('navy', 3407.3464328086234, 0.016989870643395834),
 ('beret', 3251.246814006437, 0.016211519400506084),
 ('artillery', 2905.7180332523785, 0.014488627582974405),
 ('ww2', 2879.4854056460804, 0.014357825224465761),
 ('molle', 2427.6457252713553, 0.012104841011530268),
 ('submarine', 2305.5167476569, 0.011495875773507955),
 ('hms', 2298.8909286149437, 0.011462837803741965)]

#### **Pick some random queries from the category the random request belongs to**

In [8]:
print("True Category : ", SelectedCategory)
pick_queries_within_category(SelectedCategory,nobs=10)

True Category :  863


Unnamed: 0,idrequest,request,category
1600,1600,military gloves,863
2285,2285,army wallpaper,863
2929,2929,military belt,863
3429,3429,military records ww2,863
3775,3775,captain fawcett,863
4339,4339,gas mask bong,863
4412,4412,military surplus,863
6684,6684,ex forces recruitment,863
6994,6994,black water,863
7683,7683,military green,863


#### **List the TOP 15 powerful words of the predicted category**

In [9]:
print("Predicted Category : ", PredictedCategory)
Powerful_Words_by_Category[PredictedCategory][:10]

Predicted Category :  863


[('army', 13026.752992770767, 0.06495460693975043),
 ('regiment', 9500.71562996453, 0.047372914012659896),
 ('mtp', 6249.50487994793, 0.031161574436111922),
 ('tactical', 4991.637456146344, 0.024889536873053),
 ('military', 4561.173193996981, 0.02274313577332858),
 ('legion', 4134.691959965385, 0.020616595035273952),
 ('tattoo', 3638.9986532719213, 0.018144945813337587),
 ('records', 3628.3896462075195, 0.01809204668457759),
 ('navy', 3407.3464328086234, 0.016989870643395834),
 ('beret', 3251.246814006437, 0.016211519400506084)]

#### **Pick some random queries from the category the random request belongs to**

In [10]:
print("Predicted Category : ", PredictedCategory)
pick_queries_within_category(PredictedCategory,nobs=10)

Predicted Category :  863


Unnamed: 0,idrequest,request,category
1600,1600,military gloves,863
2285,2285,army wallpaper,863
2929,2929,military belt,863
3429,3429,military records ww2,863
3775,3775,captain fawcett,863
4339,4339,gas mask bong,863
4412,4412,military surplus,863
6684,6684,ex forces recruitment,863
6994,6994,black water,863
7683,7683,military green,863


### Write some examples of mistakes

In [78]:
ncase = 0
L = []
while ncase<50:
    idq = np.random.choice(np.arange(len(df)))
    toprint,dftoprint,SelectedCategory,PredictedCategory = Explore_Prediction(0.85,idq)
    if SelectedCategory != PredictedCategory:
        ncase = ncase + 1
        
        p2 = "Query : " + dic_of_requests[idq]["request"]
        x = "\n"
        p1 = "Best Candidates of the query"
        p0 = dftoprint.to_string()
        
        a = "Predicted Category : " + str(PredictedCategory)
        b =Powerful_Words_by_Category[PredictedCategory][:15]
        b = list_to_string(b)
        c=pick_queries_within_category(PredictedCategory,nobs=10).to_string()
        
        d = "True Category : " + str(SelectedCategory)
        e =Powerful_Words_by_Category[SelectedCategory][:15]
        e = list_to_string(e)
        f = pick_queries_within_category(SelectedCategory,nobs=10).to_string()
        sepa= "* * * * * * * * * * * * * "
        L.append((p2,x,x,p1,x,p0,x,x,x,x,a,x,x,"POWER WORDS OF THE CATEGORY",x,x,b,x,x,"RANDOM QUERIES OF THE CATEGORY",x,x,c,x,x,x,x,d,x,x,"POWER WORDS OF THE CATEGORY",x,x,e,x,x,"RANDOM QUERIES OF THE CATEGORY",x,x,f,x,x,x,x,x,x,sepa,x))
        
with open("Example_of_mistakes_for_Diagnostics", "w") as fhandle:
    for item in L:
        for element in item:
            fhandle.write(element)