In [29]:
import json
import pandas as pd
from pathlib import Path
from scipy.stats import ks_2samp

import plotly.express as px
import plotly.io as pio
# Defaults applied to every static image exported through kaleido in this notebook.
# Width, height and scale are numeric settings, so they are given as integers
# rather than as strings that only work through downstream coercion.
pio.kaleido.scope.default_format = "png"
pio.kaleido.scope.default_width = 800
pio.kaleido.scope.default_height = 400
# Scale multiplies the pixel dimensions of the exported image (10x -> very large files).
pio.kaleido.scope.default_scale = 10

In [30]:
# Per-category summary of the product-level score counts stored in results.json.
# 'S_overall': 1=PASS, 2=WARN, 3=FAIL
# The keys '' and '-1' are removed before the fake rate is computed
# (presumably products without a usable score -- TODO confirm with the producer of results.json).
# NOTE: `data` is pruned in place on purpose; the DataFrame built from it afterwards
# is expected to contain only the remaining class keys.
with open("results.json", "r") as f:
    data = json.load(f)
print('Categories           Percent of product having high fake-review (%)         Number of products')
print('-'*102)
for key, val in data.items():
    # Denominator for the "empty" share includes every key, '' and '-1' as well.
    grand_total = sum(val.values())
    # pop() with a default removes the key whether or not it exists and whatever
    # its count is (a truthiness check would leave a '-1' entry with count 0 behind).
    empty_count = val.pop('', 0)
    val.pop('-1', None)
    empty = empty_count/grand_total*100 if grand_total else 0.0
    total = sum(val.values())
    # A category may have no class-'3' products at all; treat that as a 0 % rate.
    fake_rate = val.get('3', 0)/total*100 if total else 0.0
    # Last (unlabelled) column is the share of '' entries among all products of the category.
    print(f"{key:<30}\t\t{fake_rate:.2f} %\t\t\t\t\t{total}\t\t\t\t{empty:.2f}%")

Categories           Percent of product having high fake-review (%)         Number of products
------------------------------------------------------------------------------------------------------
Pet_Supplies                  		12.06 %					4421				34.95%
Appliances                    		14.46 %					484				39.27%
Home_and_Kitchen              		10.91 %					15353				37.44%
Movies_and_TV                 		17.27 %					1841				78.14%
Software                      		20.55 %					253				52.89%
Automotive                    		11.04 %					5425				38.10%
Cell_Phones_and_Accessories   		8.98 %					4910				56.91%
AMAZON_FASHION                		5.35 %					243				59.70%
Electronics                   		9.88 %					15531				32.72%
Books                         		28.15 %					14809				75.32%


In [31]:
# Build a category x class table of shares from the pruned count dict `data`.
# pd.DataFrame(data) has one column per category; dividing by the column sums
# turns counts into per-category shares, and .T puts categories on the rows.
df = pd.DataFrame(data)
df = (df / df.sum(axis=0)).T
# Sort both axes alphabetically in a single reindex call.
df = df.reindex(index=sorted(df.index), columns=sorted(df.columns))
# Positional rename: relies on the sorted class keys being exactly three,
# in the order matching the names below.
df.columns = ["Real reviews", "Potential fake reviews", "Fake reviews"]
df

Unnamed: 0,Real reviews,Potential fake reviews,Fake reviews
AMAZON_FASHION,0.683128,0.263374,0.053498
Appliances,0.619835,0.235537,0.144628
Automotive,0.672627,0.216959,0.110415
Books,0.424674,0.293875,0.28145
Cell_Phones_and_Accessories,0.572912,0.337271,0.089817
Electronics,0.660292,0.240937,0.09877
Home_and_Kitchen,0.666319,0.224582,0.109099
Movies_and_TV,0.51711,0.310158,0.172732
Pet_Supplies,0.66727,0.212169,0.120561
Software,0.280632,0.513834,0.205534


In [32]:
# Bar chart of the class shares in `df`, one bar group per category (row index).
# NOTE(review): the y axis is labelled "Percentage" but `df` holds fractions
# that sum to 1 per category -- confirm whether the label or the scale should change.
fig = px.bar(
    df,
    x=df.index,
    y=df.columns,
    title="Distribution of fake versus real reviews",
    labels={"value":"Percentage",
            "index":"Categories",
            "variable":"Classes"},
)
fig.show()
# write_image renders and saves in one step; the previous standalone to_image()
# call rendered the same figure and threw the bytes away.
fig.write_image("percent.png")

In [48]:
# For every file in results/: load the records, keep the numeric rating / class /
# count columns, draw a box plot of rating per class and save it under
# images/rate_score/. The cleaned rows of all files are pooled into `df_total`.
out = Path("images").joinpath("rate_score")
out.mkdir(parents=True, exist_ok=True)

score_cols = ['rating','s_overall','count']
frames = []  # cleaned per-file frames; concatenated once after the loop
# sorted() makes the processing order (and the row order of df_total) reproducible.
for file in sorted(Path("results").glob("*")):
    with open(file, "r") as f:
        records = json.load(f)
    # Transposing puts one record per row (assumes the JSON maps an id to a
    # dict of fields -- TODO confirm against the files in results/).
    df_temp = (
        pd.DataFrame(records).T[score_cols]
        # Coerce every column to numbers in one pass; unparsable values become NaN.
        # Building a new frame avoids assigning into a column-subset slice.
        .apply(pd.to_numeric, errors='coerce')
        .dropna()
    )
    # Keep only rows with a positive class code.
    df_temp = df_temp.loc[df_temp['s_overall'] > 0]
    # Two-sample KS test comparing the ratings of class 1 against class 3.
    ks_test = ks_2samp(
        df_temp.loc[df_temp['s_overall'] == 1, 'rating'],
        df_temp.loc[df_temp['s_overall'] == 3, 'rating'],
    )
    # Human-readable category name: text before the first '-', underscores -> spaces.
    name = file.stem.split('-')[0].replace('_', ' ')
    fig = px.box(df_temp, x="s_overall", y="rating",
                 title=f"Distribution rating of different classes in {name} reviews <br>Kolmogorov–Smirnov test between real and fake reviews: p={ks_test.pvalue:.1E}",
                 labels={"s_overall":"Classes"})
    fig.update_layout(
        xaxis = dict(
            tickvals = [1, 2, 3],
            ticktext = ['Real reviews', 'Potential fake reviews', 'Fake reviews']
        )
    )
    # write_image renders and saves in one step; a separate to_image() call whose
    # result is discarded would only repeat the (expensive) render.
    fig.write_image(out.joinpath(f"{file.stem.split('.')[0]}.png"))
    frames.append(df_temp)

# One concat instead of growing a DataFrame inside the loop: linear cost, and the
# numeric dtypes are not affected by an empty object-dtype seed frame.
df_total = (
    pd.concat(frames, ignore_index=True)
    if frames
    else pd.DataFrame(columns=score_cols)
)

In [49]:
# Same box plot as the per-category ones, but over the pooled rows in `df_total`.
# Two-sample KS test comparing the ratings of class 1 against class 3.
ks_test = ks_2samp(
    df_total.loc[df_total['s_overall'] == 1, 'rating'],
    df_total.loc[df_total['s_overall'] == 3, 'rating'],
)
fig = px.box(df_total, x="s_overall", y="rating",
             title=f"Distribution rating of different classes in total reviews <br>Kolmogorov–Smirnov test between real and fake reviews: p={ks_test.pvalue:.1E}",
             labels={"s_overall":"Classes"})
fig.update_layout(
    xaxis = dict(
        tickvals = [1, 2, 3],
        ticktext = ['Real reviews', 'Potential fake reviews', 'Fake reviews']
    )
)
# write_image renders and saves in one step; the previous standalone to_image()
# call rendered the same figure and discarded the result.
fig.write_image(out.joinpath("total.png"))