In [1]:
import pandas as pd
import nltk
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from natsort import index_natsorted
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


##### This file is generated from this <a href="02_basic_analysis.ipynb">Notebook</a>

In [19]:
reviews = pd.read_csv("../../data/reviews.csv")
reviews = reviews.drop(columns=["Unnamed: 0"])
reviews.dropna(subset='reviewText', inplace=True)

  reviews = pd.read_csv("../../data/reviews.csv")


In [10]:
reviews.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,vote
0,5.0,False,"07 23, 2008",A8IOST6U6WH9B,615179088,C. Radey,human japanese truly superb introduction outsi...,human japanese,12
1,5.0,False,"06 4, 2008",A1MUV9F35OROS5,615179088,D. Abel,got human japanese demo website . within week ...,best japanese program available,11
2,4.0,False,"04 8, 2008",A27PAMABWVQ892,615179088,piepiepie75,first experience human japanese first version ...,better human japanese 1 ... much changed .,99
3,5.0,False,"03 26, 2008",A3HWWVK0L3JEKF,615179088,K. Grier,first language software purchased love ! way i...,great product,4
4,5.0,False,"02 20, 2008",A3NO2V2JU4Y8UY,615179088,H. Granat,human japanese best pc program learning japane...,love !,2


In [28]:
reviews.shape

(400500, 9)

In [21]:
softwares = pd.read_csv("../../data/softwares.csv")
softwares = softwares.drop(columns=["Unnamed: 0"])

In [29]:
reviews = reviews.merge(softwares, on='asin', how='inner')
reviews = reviews[['overall', 'verified', 'reviewTime', 'reviewerID', 'asin','reviewerName', 'reviewText', 'summary', 'vote']]

In [30]:
reviews.shape

(145784, 9)

In [31]:
softwares.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3698 entries, 0 to 3697
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   category           3698 non-null   object 
 1   tech1              1 non-null      object 
 2   description        3698 non-null   object 
 3   fit                0 non-null      float64
 4   title              3698 non-null   object 
 5   also_buy           3698 non-null   object 
 6   tech2              0 non-null      float64
 7   brand              3685 non-null   object 
 8   feature            3698 non-null   object 
 9   rank               3698 non-null   object 
 10  also_view          3698 non-null   object 
 11  main_cat           3698 non-null   object 
 12  similar_item       0 non-null      float64
 13  date               3505 non-null   object 
 14  price              3698 non-null   float64
 15  asin               3698 non-null   object 
 16  details            3622 

In [14]:
def preprocess_text(text):
    # Tokenize the text
    text = str(text)
    text = text.replace("<div>",'')
    text = text.replace("< div >",'')
    text = text.replace("</div>",'')
    text = text.replace("< /div >",'')
    text = text.replace("< br/ >",'') 
    text = text.replace("< br >",'')
    text = text.replace("< b >",'')
    text = text.replace("< /b >",'')
    text = text.replace("``",'')
    text = text.replace("< strong >",'')
    text = text.replace("< /strong >",'')
    tokens = word_tokenize(text.lower())
    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    # Join the tokens back into a string
    processed_text = ' '.join(lemmatized_tokens)
    return processed_text

In [4]:
softwares['title'] = softwares['title'].apply(preprocess_text)
softwares['summary'] = softwares['summary'].apply(preprocess_text)
softwares['description'] = softwares['description'].apply(preprocess_text)
softwares['category'] = softwares['category'].apply(preprocess_text)

In [2]:
# To ease the computation time, the data was saved intermittently and loaded for subsequent use
#review_meta.to_csv("../../data/review_metadata_with_embeddings.csv")
review_meta = pd.read_csv("../../data/review_metadata_with_embeddings.csv")
review_meta.drop(columns=["Unnamed: 0.1","Unnamed: 0"],inplace=True)
review_meta.head()

  review_meta = pd.read_csv("../../data/review_metadata_with_embeddings.csv")


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,vote,category,...,also_view,main_cat,similar_item,date,price,details,software_category,Licensing_Fee,Implemention_cost,Maintenance_cost
0,5.0,False,"07 23, 2008",A8IOST6U6WH9B,615179088,C. Radey,Human Japanese is a truly superb introduction ...,human japanese,12,"[ 'software ' , 'education & reference ' , 'la...",...,"['B00N5EXLMC', '0976998122', '4789014401', '06...",Software,,</div>,39.94,,Education & Reference,0.008,19.97,3.994
1,5.0,False,"06 4, 2008",A1MUV9F35OROS5,615179088,D. Abel,I got Human Japanese as a demo from its websit...,best japanese program available,11,"[ 'software ' , 'education & reference ' , 'la...",...,"['B00N5EXLMC', '0976998122', '4789014401', '06...",Software,,</div>,39.94,,Education & Reference,0.008,19.97,3.994
2,4.0,False,"04 8, 2008",A27PAMABWVQ892,615179088,piepiepie75,My first experience with Human Japanese was th...,better human japanese 1 ... much changed .,99,"[ 'software ' , 'education & reference ' , 'la...",...,"['B00N5EXLMC', '0976998122', '4789014401', '06...",Software,,</div>,39.94,,Education & Reference,0.008,19.97,3.994
3,5.0,False,"03 26, 2008",A3HWWVK0L3JEKF,615179088,K. Grier,This is the first language software that I hav...,great product,4,"[ 'software ' , 'education & reference ' , 'la...",...,"['B00N5EXLMC', '0976998122', '4789014401', '06...",Software,,</div>,39.94,,Education & Reference,0.008,19.97,3.994
4,5.0,False,"02 20, 2008",A3NO2V2JU4Y8UY,615179088,H. Granat,Human japanese is the best pc program for lear...,love !,2,"[ 'software ' , 'education & reference ' , 'la...",...,"['B00N5EXLMC', '0976998122', '4789014401', '06...",Software,,</div>,39.94,,Education & Reference,0.008,19.97,3.994


### Generate Embeddings for text features

In [32]:
name = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(name)

In [33]:
def get_embeddings(text):
    sentences = list(text)
    return model.encode(sentences)

<br> Realized that the description and asin were unique for each software, hence we have only 650 distinct softwares.

In [34]:
reviews.dropna(subset='reviewText', inplace=True)

In [35]:
from transformers import pipeline
# Output labels
candidate_labels = ["positive", "negative"]
# device=0 for GPU usage
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli", device=0)


def compute_total_sentiment_scores(df):
    postive_score = 0
    negative_score = 0
    for index, row in df.iterrows():
        input_text = row['reviewText']

        model_dict = classifier(input_text, candidate_labels, multi_label=True)

        # Zip results to dict
        result_dict = dict(zip(model_dict.get('labels'), model_dict.get('scores')))
        # get confidence scores
        postive_score += result_dict.get('positive')
        negative_score += result_dict.get('negative')
    nos_records = df.shape[0]
    return postive_score, negative_score, nos_records

In [36]:
unique_ids = reviews.asin.unique().tolist()
scores_dict = {
"asin": [],
"postive_score":[],
"negative_score": [],
"number_reviews": []
}
for id in unique_ids:
    postive_score, negative_score, len_df = compute_total_sentiment_scores(reviews[reviews.asin == id])
    scores_dict['asin'].append(id)
    scores_dict['postive_score'].append(postive_score)
    scores_dict['negative_score'].append(negative_score)
    scores_dict['number_reviews'].append(len_df)
    
scores_df = pd.DataFrame.from_dict(scores_dict)
scores_df.head()



Unnamed: 0,asin,postive_score,negative_score,number_reviews
0,615179088,99.150372,15.194917,118
1,763855553,668.06989,283.893408,1012
2,989614026,27.816113,8.228551,41
3,1413313701,63.86515,26.887662,98
4,1413308171,13.647527,3.149989,20


In [18]:
software_data = review_meta.groupby(['title']).agg({'asin':"first",'description':'first','price':'first','Licensing_Fee':'first','Implemention_cost':'first','Maintenance_cost':'first'}).reset_index()
software_data['description'] = software_data['description'].apply(preprocess_text)
software_data = software_data.merge(scores_df, on='asin', how='inner')

In [19]:
software_data.head()

Unnamed: 0,title,asin,description,price,Licensing_Fee,Implemention_cost,Maintenance_cost,postive_score,negative_score,number_reviews
0,& amp ; 's : lost formula,B00079VRTW,[ 'the lost formula put kid driver\ 's seat en...,55.32,0.704,27.66,5.532,4.763885,2.767745,8
1,& amp ; defrag 16 professional edition [ downl...,B00EIRVO7O,[ 'by combing file fragements scattered across...,29.95,0.0,14.975,2.995,0.000407,0.996757,1
2,& amp ; diskimage 7 professional edition [ dow...,B00EIRWNZG,[ backup restoration made easy . - even comput...,29.95,0.0,14.975,2.995,0.016863,1.990709,2
3,& amp ; partitionmanager 2 professional edition,B001HN6J7C,[ ' & partitionmanager advanced tool allows sp...,24.03,2.904,12.015,2.403,0.716291,0.024232,1
4,& amp ; rescuebox 5 personal edition '' / >,B001HN6J7M,[ 'the new & rescuebox 5 data recovery bundle ...,155.5,2.904,77.75,15.55,0.057977,0.994954,1


In [24]:
software_data['description_embeddings'] = software_data['description'].apply(lambda x: get_embeddings(x))
software_data['title_embeddings'] = software_data['title'].apply(lambda x: get_embeddings(x))

In [25]:
software_data.to_csv("../../data/softwares_only.csv")

In [34]:
software_data = software_data[['asin','description_embeddings']]
software_data.head()

Unnamed: 0,asin,description_embeddings
0,B00079VRTW,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...
1,B00EIRVO7O,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...
2,B00EIRWNZG,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...
3,B001HN6J7C,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...
4,B001HN6J7M,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...


In [41]:
len(review_meta)

145834

In [42]:
review_meta =  review_meta.merge(software_data, on='asin', how='inner')

In [43]:
review_meta.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,vote,category,...,main_cat,similar_item,date,price,details,software_category,Licensing_Fee,Implemention_cost,Maintenance_cost,description_embeddings
0,5.0,False,"07 23, 2008",A8IOST6U6WH9B,615179088,C. Radey,Human Japanese is a truly superb introduction ...,human japanese,12,"[ 'software ' , 'education & reference ' , 'la...",...,Software,,</div>,39.94,,Education & Reference,0.008,19.97,3.994,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...
1,5.0,False,"06 4, 2008",A1MUV9F35OROS5,615179088,D. Abel,I got Human Japanese as a demo from its websit...,best japanese program available,11,"[ 'software ' , 'education & reference ' , 'la...",...,Software,,</div>,39.94,,Education & Reference,0.008,19.97,3.994,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...
2,4.0,False,"04 8, 2008",A27PAMABWVQ892,615179088,piepiepie75,My first experience with Human Japanese was th...,better human japanese 1 ... much changed .,99,"[ 'software ' , 'education & reference ' , 'la...",...,Software,,</div>,39.94,,Education & Reference,0.008,19.97,3.994,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...
3,5.0,False,"03 26, 2008",A3HWWVK0L3JEKF,615179088,K. Grier,This is the first language software that I hav...,great product,4,"[ 'software ' , 'education & reference ' , 'la...",...,Software,,</div>,39.94,,Education & Reference,0.008,19.97,3.994,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...
4,5.0,False,"02 20, 2008",A3NO2V2JU4Y8UY,615179088,H. Granat,Human japanese is the best pc program for lear...,love !,2,"[ 'software ' , 'education & reference ' , 'la...",...,Software,,</div>,39.94,,Education & Reference,0.008,19.97,3.994,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...


In [44]:
len(review_meta)

141783

### 

### Correcting ratings with Review Text
We use kmeans to cluster the tfidvector of all reviewtext to correct the ratings. 

In [195]:
#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(review_meta['reviewText']) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

(15395, 16647)


In [196]:
km = KMeans(5)
km.fit(tfidf_matrix)



In [197]:
ls_val = km.labels_.tolist()

In [198]:
clusters= { 'cluster_1' : ls_val.count(0),
           'cluster_2' : ls_val.count(1),
           'cluster_3' : ls_val.count(2),
           'cluster_4' : ls_val.count(3),
           'cluster_5' : ls_val.count(4),
}

In [199]:
clusters

{'cluster_1': 7575,
 'cluster_2': 448,
 'cluster_3': 310,
 'cluster_4': 6139,
 'cluster_5': 923}

In [200]:
# Add kmeans clusters
review_meta['clusters'] = ls_val

#### For each cluster we count the numbe of positives and negatives in them the use to encode ratings 1 - 5

In [201]:
# Output labels
candidate_labels = ["positive", "negative"]

# device=0 for GPU usage
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli", device=0)


def count_predictions(df):
    predictions = []
    # Iterate over each row in the DataFrame
    for i in range(len(df)):
        input_text = df['reviewText'].iloc[i]

        # Perform zero-shot classification
        model_dict = classifier(input_text, candidate_labels, multi_label=True)

        # Zip results to dict
        result_dict = dict(zip(model_dict['labels'], model_dict['scores']))

        # Assign the highest scoring label as the predicted label
        predicted_label = max(result_dict, key=result_dict.get)
        predictions.append(predicted_label)

    return {"postive":predictions.count('positive'), "negative":predictions.count('negative')}
    

In [202]:
## For cluster 1
count_predictions(review_meta[review_meta.clusters == 0])



{'postive': 5287, 'negative': 2288}

In [203]:
5287 / (5287 + 2288)

0.7003576632666578

In [204]:
## For cluster 2
count_predictions(review_meta[review_meta.clusters == 1])

{'postive': 447, 'negative': 1}

In [206]:
447 / (447 + 1)

0.9977678571428571

In [207]:
## For cluster 3
count_predictions(review_meta[review_meta.clusters == 2])



{'postive': 305, 'negative': 5}

In [208]:
305 / (305 +5)

0.9838709677419355

In [209]:
## For cluster 4
count_predictions(review_meta[review_meta.clusters == 3])

{'postive': 3631, 'negative': 2508}

In [211]:
3631 / (3631 + 2508)

0.5914644078840202

In [210]:
## For cluster 5
count_predictions(review_meta[review_meta.clusters == 4])

{'postive': 870, 'negative': 53}

In [212]:
870 / (870 + 53)

0.942578548212351

##### We assigned ratings based on the group with the realtive highest positive sentiments

In [213]:
rating_map = {0:2,
 1:5,
 2:4,
 3:1,
 4:3}

In [214]:
review_meta['ratings'] = review_meta['clusters'].map(rating_map)

In [215]:
review_meta.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,vote,category,...,software_category,Licensing_Fee,Implemention_cost,Maintenance_cost,summary_embeddings,title_embeddings,description_embeddings_x,clusters,ratings,description_embeddings_y
0,5.0,True,"12 2, 2014",A2Q3KBSVA7F56Z,615179088,Alvin Truthfinder,Very easy product to use. It's clear. Concise....,five star,,"[ 'software ' , 'education & reference ' , 'la...",...,Education & Reference,0.008,19.97,3.994,[[-0.16857387 -0.5855289 2.4885037 ... 0.2...,[[ 0.02127007 -0.17556548 2.008087 ... 0.1...,"[[0.09625992, -0.18804407, 2.2306604, 0.070576...",4,3,"[[0.09625992, -0.18804407, 2.2306604, 0.070576..."
1,5.0,True,"11 14, 2014",A32OSD6FHVIK4I,615179088,Mandy Wells,I love the Human Japanese software. I had a li...,love !,,"[ 'software ' , 'education & reference ' , 'la...",...,Education & Reference,0.008,19.97,3.994,[[-0.21905293 -0.86112505 1.9966931 ... -0.0...,[[ 0.02127007 -0.17556548 2.008087 ... 0.1...,"[[0.09625992, -0.18804407, 2.2306604, 0.070576...",3,1,"[[0.09625992, -0.18804407, 2.2306604, 0.070576..."
2,3.0,True,"11 8, 2014",A37RK6IW147989,615179088,Wendy Lee Riggsbee,"only problem, daughter lost a code and ow cann...",three star,,"[ 'software ' , 'education & reference ' , 'la...",...,Education & Reference,0.008,19.97,3.994,[[ 0.24971181 -0.2821653 2.1901054 ... 0.1...,[[ 0.02127007 -0.17556548 2.008087 ... 0.1...,"[[0.09625992, -0.18804407, 2.2306604, 0.070576...",0,2,"[[0.09625992, -0.18804407, 2.2306604, 0.070576..."
3,4.0,True,"09 22, 2014",A12ZEYNVT3PCAG,615179088,Ean P.,"For an intro to learning Japanese, I found it ...",say satisfied overall presentation,,"[ 'software ' , 'education & reference ' , 'la...",...,Education & Reference,0.008,19.97,3.994,[[-0.05452485 -0.14933898 2.2738338 ... 0.2...,[[ 0.02127007 -0.17556548 2.008087 ... 0.1...,"[[0.09625992, -0.18804407, 2.2306604, 0.070576...",3,1,"[[0.09625992, -0.18804407, 2.2306604, 0.070576..."
4,3.0,True,"09 5, 2014",A3VCAL31C8EBRM,615179088,KG,There is a lot of great information in here bu...,review,,"[ 'software ' , 'education & reference ' , 'la...",...,Education & Reference,0.008,19.97,3.994,[[-0.01089804 -0.38519934 2.1462338 ... 0.0...,[[ 0.02127007 -0.17556548 2.008087 ... 0.1...,"[[0.09625992, -0.18804407, 2.2306604, 0.070576...",0,2,"[[0.09625992, -0.18804407, 2.2306604, 0.070576..."


In [217]:
review_meta.description.head()

0    [ ' < div > human japanese software window pc ...
1    [ ' < div > human japanese software window pc ...
2    [ ' < div > human japanese software window pc ...
3    [ ' < div > human japanese software window pc ...
4    [ ' < div > human japanese software window pc ...
Name: description, dtype: object

In [51]:
software_data = pd.read_csv("../../data/softwares_only.csv")
software_data.drop(columns=['Unnamed: 0'],inplace=True)
software_data.head()

Unnamed: 0,title,asin,description,price,Licensing_Fee,Implemention_cost,Maintenance_cost,postive_score,negative_score,number_reviews,description_embeddings,title_embeddings
0,& amp ; 's : lost formula,B00079VRTW,[ 'the lost formula put kid driver\ 's seat en...,55.32,0.704,27.66,5.532,4.763885,2.767745,8,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.091202 -0.16784424 2.6535537 ... 0.3...
1,& amp ; defrag 16 professional edition [ downl...,B00EIRVO7O,[ 'by combing file fragements scattered across...,29.95,0.0,14.975,2.995,0.000407,0.996757,1,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.091202 -0.16784424 2.6535537 ... 0.3...
2,& amp ; diskimage 7 professional edition [ dow...,B00EIRWNZG,[ backup restoration made easy . - even comput...,29.95,0.0,14.975,2.995,0.016863,1.990709,2,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.091202 -0.16784424 2.6535537 ... 0.3...
3,& amp ; partitionmanager 2 professional edition,B001HN6J7C,[ ' & partitionmanager advanced tool allows sp...,24.03,2.904,12.015,2.403,0.716291,0.024232,1,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.091202 -0.16784424 2.6535537 ... 0.3...
4,& amp ; rescuebox 5 personal edition '' / >,B001HN6J7M,[ 'the new & rescuebox 5 data recovery bundle ...,155.5,2.904,77.75,15.55,0.057977,0.994954,1,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 9.1201998e-02 -1.6784424e-01 2.6535537e+00...


In [103]:
def ranking_algol(df):
    df['rank_score'] = (df['postive_score'] - df['negative_score']) * df['number_reviews']
    df = df.sort_values(by='rank_score', ascending=False)
    return df

In [2]:
def price_ranking(max_price, min_price, max_license_price, min_license_price, max_maintenance_price, min_maintenance_price, ranked_data):
    msg = []
    ranked_price_data = ranked_data[(ranked_data.price >= min_price) & (ranked_data.price <= max_price)]
    if len(ranked_price_data) >= 1:
        ranked_data = ranked_price_data
        # put logging here
    else:
        msg.append("The data available does not have price between the range you specified")

    ranked_license_data = ranked_data[(ranked_data.Licensing_Fee >= min_license_price) & (ranked_data.Licensing_Fee <= max_license_price)]
    if len(ranked_license_data) >= 1:
        ranked_data = ranked_license_data
        # put logging here
    else:
        msg.append("The data available does not have license fee between the range you specified after the price filtering.")

    ranked_maintenance_data = ranked_data[(ranked_data.Licensing_Fee >= min_license_price) & (ranked_data.Licensing_Fee <= max_license_price)]
    if len(ranked_maintenance_data) >= 1:
        ranked_data = ranked_maintenance_data
        # put logging here
    else:
        msg.append("The data available does not have the maintenance fee between the range you specified after license fee filtering")

    return ranked_data , msg




In [6]:

def rec_softwares(software_description, software_data, model_name, max_price = np.inf, min_price = -1, max_license_price = np.inf, min_license_price = -1, max_maintenance_price = np.inf, min_maintenance_price = -1):
    software_data['software_description'] = software_description
    
    if model_name == "TfidfVectorizer":
        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(software_data[['software_description','description']])
        cosine_sim = cosine_similarity(X)
    else:
        try:
            model = SentenceTransformer(model_name)
            X_1 = model.encode(software_data['software_description'])
            X_2 = model.encode(software_data['description'])
            cosine_sim = cosine_similarity(X_1, X_2)
        except Exception as e:
            raise f"{model_name} not a transformer model or TfidVectorizer"

    target_item_index = 0 
    scores = list(enumerate(cosine_sim[target_item_index]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    top_n = 10
    top_recommendations = scores[0:top_n+1]
    index_list = []
    for ind, score in top_recommendations:
        index_list.append(ind)
    ranked_data =  ranking_algol(software_data.iloc[index_list,:])
    price_ranked_data, msg = price_ranking(max_price, min_price, max_license_price, min_license_price, max_maintenance_price, min_maintenance_price, ranked_data)

    if msg:
        for m in msg:
            print(m)
    
    if len(price_ranked_data) > 2:
        return price_ranked_data
    else:
        return price_ranked_data


## Experiments

#### Using TfidVectorizer

In [105]:
data = rec_softwares("HR program for windows and mac for japanese humans",software_data,"TfidfVectorizer")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rank_score'] = (df['postive_score'] - df['negative_score']) * df['number_reviews']


In [106]:
data

Unnamed: 0,title,asin,description,price,Licensing_Fee,Implemention_cost,Maintenance_cost,postive_score,negative_score,number_reviews,description_embeddings,title_embeddings,software_description,rank_score
0,& amp ; 's : lost formula,B00079VRTW,[ 'the lost formula put kid driver\ 's seat en...,55.32,0.704,27.66,5.532,4.763885,2.767745,8,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.091202 -0.16784424 2.6535537 ... 0.3...,HR program for windows and mac for japanese hu...,15.969123
1,& amp ; defrag 16 professional edition [ downl...,B00EIRVO7O,[ 'by combing file fragements scattered across...,29.95,0.0,14.975,2.995,0.000407,0.996757,1,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.091202 -0.16784424 2.6535537 ... 0.3...,HR program for windows and mac for japanese hu...,-0.996349


#### Recommending based on LLM

In [107]:
model_name = 'sentence-transformers/all-mpnet-base-v2'
rec_softwares("HR program for windows and mac for japanese humans",software_data,model_name)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rank_score'] = (df['postive_score'] - df['negative_score']) * df['number_reviews']


Unnamed: 0,title,asin,description,price,Licensing_Fee,Implemention_cost,Maintenance_cost,postive_score,negative_score,number_reviews,description_embeddings,title_embeddings,software_description,rank_score
1280,human japanese,0615179088,[ ' human japanese software window pc present ...,39.94,0.008,19.97,3.994,99.37643,21.094307,118,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.02127007 -0.17556548 2.008087 ... 0.1...,HR program for windows and mac for japanese hu...,9237.29057
934,encore office suite 2.0,B01ABRMTDC,[ 'an easy aordable alternative popular oce pr...,14.27,0.008,7.135,1.427,12.15173,5.451792,20,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[-0.36129296 -0.40343276 2.3724406 ... 0.1...,HR program for windows and mac for japanese hu...,133.99875
1769,lotus smartsuite 97 lotus 1-2-3 5 wordpro 97 a...,B000IPALBA,[ 'this smartsuite give six hardest-working ea...,19.95,0.008,9.975,1.995,11.99212,7.40076,21,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[-0.21905278 -0.8611253 1.9966933 ... -0.0...,HR program for windows and mac for japanese hu...,96.418549
1340,instant immersion asia deployment pack,B0006B07LA,[ 'topics entertainment instant immersion asia...,19.71,0.008,9.855,1.971,4.467621,2.963026,7,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.04209602 -0.11055798 2.5161953 ... 0.3...,HR program for windows and mac for japanese hu...,10.532172
1367,instant immersion japanese v2.0,B0009X6QDI,"[ 'with instant immersion japanese 2.0 easy , ...",8.99,0.008,4.495,0.899,2.813225,0.678475,4,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.04209602 -0.11055798 2.5161953 ... 0.3...,HR program for windows and mac for japanese hu...,8.539
1281,human japanese intermediate,061591067X,[ 'learn japanese long-awaited sequel human ja...,9.98,0.808,4.99,0.998,2.247274,1.078231,4,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.02126986 -0.17556542 2.0080867 ... 0.1...,HR program for windows and mac for japanese hu...,4.676171
3044,staff file 7.0,B0031KQAJQ,[ staff file perfect solution today manager ne...,217.8,0.008,108.9,21.78,0.998256,0.000421,1,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[-0.05452474 -0.14933896 2.2738335 ... 0.2...,HR program for windows and mac for japanese hu...,0.997835
1369,instant immersion japanese-audio,B000IVFTIE,[ 'meetings mito ? holiday hakone ? kobe kyoto...,9.99,0.704,4.995,0.999,0.997034,0.000135,1,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.04209602 -0.11055798 2.5161953 ... 0.3...,HR program for windows and mac for japanese hu...,0.996899
1633,learn japanese - level 2 : absolute beginner a...,B004RL6HIW,[ 'introducing ultimate language learning appl...,9.99,0.0,4.995,0.999,0.993137,0.032995,1,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[-0.21905278 -0.8611253 1.9966933 ... -0.0...,HR program for windows and mac for japanese hu...,0.960143
1634,learn korean - complete audio course mac [ dow...,B004WJ24ZO,[ 'introducing ultimate language learning appl...,49.99,0.0,24.995,4.999,1.005474,1.013237,2,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[-0.21905278 -0.8611253 1.9966933 ... -0.0...,HR program for windows and mac for japanese hu...,-0.015527


In [110]:
model_name = 'sentence-transformers/paraphrase-albert-small-v2'
rec_softwares("HR program for windows and mac for japanese humans",software_data,model_name)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rank_score'] = (df['postive_score'] - df['negative_score']) * df['number_reviews']


Unnamed: 0,title,asin,description,price,Licensing_Fee,Implemention_cost,Maintenance_cost,postive_score,negative_score,number_reviews,description_embeddings,title_embeddings,software_description,rank_score
3409,video pro x2,B003EYU5BI,[ 'magix video pro x2 highly sophisticated vid...,49.99,2.4,24.995,4.999,8.797528,3.269671,13,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.07057133 -0.44443575 2.5772188 ... 0.3...,HR program for windows and mac for japanese hu...,71.862144
1340,instant immersion asia deployment pack,B0006B07LA,[ 'topics entertainment instant immersion asia...,19.71,0.008,9.855,1.971,4.467621,2.963026,7,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.04209602 -0.11055798 2.5161953 ... 0.3...,HR program for windows and mac for japanese hu...,10.532172
1367,instant immersion japanese v2.0,B0009X6QDI,"[ 'with instant immersion japanese 2.0 easy , ...",8.99,0.008,4.495,0.899,2.813225,0.678475,4,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.04209602 -0.11055798 2.5161953 ... 0.3...,HR program for windows and mac for japanese hu...,8.539
307,berlitz chinese & amp ; japanese premier ( win...,B000MFNUBE,[ learn chinese & japanese easy way straightfo...,24.98,0.008,12.49,2.498,3.452474,3.01352,6,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.07393144 -0.05106744 2.1081572 ... 0.6...,HR program for windows and mac for japanese hu...,2.633725
1369,instant immersion japanese-audio,B000IVFTIE,[ 'meetings mito ? holiday hakone ? kobe kyoto...,9.99,0.704,4.995,0.999,0.997034,0.000135,1,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.04209602 -0.11055798 2.5161953 ... 0.3...,HR program for windows and mac for japanese hu...,0.996899
1633,learn japanese - level 2 : absolute beginner a...,B004RL6HIW,[ 'introducing ultimate language learning appl...,9.99,0.0,4.995,0.999,0.993137,0.032995,1,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[-0.21905278 -0.8611253 1.9966933 ... -0.0...,HR program for windows and mac for japanese hu...,0.960143
1454,japanese crash course,B000OZI9UY,[ 'designed pinpoint relevant information need...,9.99,0.008,4.995,0.999,0.95721,0.115697,1,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[-0.2410044 -0.33483353 1.7280155 ... 0.2...,HR program for windows and mac for japanese hu...,0.841513
767,"dragon mac 5.0 , upgrade 4.0 [ download ] [ do...",B014JJSX9W,"[ , dragon mac 5.0 speech recognition software...",150.0,0.008,75.0,15.0,0.608842,1.006341,2,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[-0.02883262 -0.36379305 2.3379571 ... 0.1...,HR program for windows and mac for japanese hu...,-0.794999
1632,learn japanese - level 1 : introduction audio ...,B006IYNDLK,[ 'introducing ultimate language learning appl...,9.99,0.0,4.995,0.999,0.006429,0.998245,1,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[-0.21905278 -0.8611253 1.9966933 ... -0.0...,HR program for windows and mac for japanese hu...,-0.991817
1368,instant immersion japanese v2.0 [ old version ],B0008EWS6W,"[ instant immersion japanese deluxe , 'll get ...",5.95,0.008,2.975,0.595,1.636647,1.990145,4,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.04209602 -0.11055798 2.5161953 ... 0.3...,HR program for windows and mac for japanese hu...,-1.413994


In [109]:
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
rec_softwares("HR program for windows and mac for japanese humans",software_data,model_name)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rank_score'] = (df['postive_score'] - df['negative_score']) * df['number_reviews']


Unnamed: 0,title,asin,description,price,Licensing_Fee,Implemention_cost,Maintenance_cost,postive_score,negative_score,number_reviews,description_embeddings,title_embeddings,software_description,rank_score
1280,human japanese,0615179088,[ ' human japanese software window pc present ...,39.94,0.008,19.97,3.994,99.37643,21.094307,118,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.02127007 -0.17556548 2.008087 ... 0.1...,HR program for windows and mac for japanese hu...,9237.29057
1340,instant immersion asia deployment pack,B0006B07LA,[ 'topics entertainment instant immersion asia...,19.71,0.008,9.855,1.971,4.467621,2.963026,7,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.04209602 -0.11055798 2.5161953 ... 0.3...,HR program for windows and mac for japanese hu...,10.532172
1367,instant immersion japanese v2.0,B0009X6QDI,"[ 'with instant immersion japanese 2.0 easy , ...",8.99,0.008,4.495,0.899,2.813225,0.678475,4,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.04209602 -0.11055798 2.5161953 ... 0.3...,HR program for windows and mac for japanese hu...,8.539
307,berlitz chinese & amp ; japanese premier ( win...,B000MFNUBE,[ learn chinese & japanese easy way straightfo...,24.98,0.008,12.49,2.498,3.452474,3.01352,6,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.07393144 -0.05106744 2.1081572 ... 0.6...,HR program for windows and mac for japanese hu...,2.633725
1369,instant immersion japanese-audio,B000IVFTIE,[ 'meetings mito ? holiday hakone ? kobe kyoto...,9.99,0.704,4.995,0.999,0.997034,0.000135,1,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.04209602 -0.11055798 2.5161953 ... 0.3...,HR program for windows and mac for japanese hu...,0.996899
1633,learn japanese - level 2 : absolute beginner a...,B004RL6HIW,[ 'introducing ultimate language learning appl...,9.99,0.0,4.995,0.999,0.993137,0.032995,1,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[-0.21905278 -0.8611253 1.9966933 ... -0.0...,HR program for windows and mac for japanese hu...,0.960143
1454,japanese crash course,B000OZI9UY,[ 'designed pinpoint relevant information need...,9.99,0.008,4.995,0.999,0.95721,0.115697,1,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[-0.2410044 -0.33483353 1.7280155 ... 0.2...,HR program for windows and mac for japanese hu...,0.841513
1632,learn japanese - level 1 : introduction audio ...,B006IYNDLK,[ 'introducing ultimate language learning appl...,9.99,0.0,4.995,0.999,0.006429,0.998245,1,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[-0.21905278 -0.8611253 1.9966933 ... -0.0...,HR program for windows and mac for japanese hu...,-0.991817
1364,instant immersion japanese deluxe v2.0,B0009X6QFQ,[ 'your passport language fluency ! product in...,9.99,0.008,4.995,0.999,0.845489,1.492663,2,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.04209602 -0.11055798 2.5161953 ... 0.3...,HR program for windows and mac for japanese hu...,-1.29435
1368,instant immersion japanese v2.0 [ old version ],B0008EWS6W,"[ instant immersion japanese deluxe , 'll get ...",5.95,0.008,2.975,0.595,1.636647,1.990145,4,[[ 0.09625992 -0.18804407 2.2306604 ... 0.4...,[[ 0.04209602 -0.11055798 2.5161953 ... 0.3...,HR program for windows and mac for japanese hu...,-1.413994
