In [None]:
%matplotlib inline
import pandas as pd
import os
import csv 
import sys
import seaborn
import matplotlib.pyplot as plt

sys.path.append(os.path.join("..", "src"))
import utils
import config 


# Load the preprocessed reviews from the pickle written by the pipeline.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced by a trusted run of this project.
path = os.path.join(config.DIR_PROCESSED, "reviews.pkl")
df = pd.read_pickle(path)

# Row count and column names before any filtering.
print(len(df))
print(df.columns)

# Rows whose "parsed_success" flag is false; only their number is reported.
not_parsable = df[~df["parsed_success"]]
print("Not parsable:", len(not_parsable))
# Project-level default filter; presumably drops unusable rows such as the
# unparsable ones counted above -- TODO confirm in utils.default_df_filter.
df = utils.default_df_filter(df)

# Row count after filtering.
print(len(df))
def unify_title(x):
    """Collapse a raw academic-title value into one of the reporting buckets.

    The value is converted with ``str`` first. "Bachelor", "Magister" and
    "Diplom" are reported as "Master"; "Unbekannt" and "No-Title" are reported
    as "Kein Titel"; any other value is returned in its string form.
    """
    title = str(x)
    if title in ("Bachelor", "Magister", "Diplom"):
        return "Master"
    if title in ("Unbekannt", "No-Title"):
        return "Kein Titel"
    return title

def unity_price_unit(x):
    """Normalise a price unit: the variant "EUR," is returned as "EUR".

    Every other value is returned unchanged.
    """
    return "EUR" if x == "EUR," else x

# Frequency of the raw category codes, printed before they are replaced by labels.
print(df["category"].value_counts())

# Two-letter category code -> display label used in the figures.
category_map = {
    "AH": "Altenhilfe",
    "BH": "Behindertenhilfe",
    "GW": "Gesundheitswesen",
    "JH": "Jugendhilfe",
    "SM": "Sozialmanagement",
    "MG": "Methoden & Grundlagen",
    "SP": "Sozialpolitik und \n-verwaltung",  # TODO: label not final yet
    "SO": "Weitere Arbeitsfelder",  # or "Sonstige" (miscellaneous)?
}


def preprocess_category(x):
    """Translate a category code into its display label.

    Codes present in ``category_map`` are replaced by the mapped label. Any
    other truthy value is passed through unchanged, and falsy values (for
    example ``None`` or an empty string) become "Unkategorisiert". A float NaN
    is truthy and is therefore passed through as-is.
    """
    try:
        return category_map[x]
    except KeyError:
        return x if x else "Unkategorisiert"



# Normalise the categorical columns in place: each column is replaced by the
# result of applying its clean-up function to every value.
for column, transform in (
    ("reviewer_gender", str),
    ("reviewer_highest_title", unify_title),
    ("price_unit", unity_price_unit),
    ("category", preprocess_category),
):
    df[column] = df[column].apply(transform)

# Switch matplotlib to seaborn's default theme for all following figures.
seaborn.set()



In [None]:
# Display all reviews except those of the reviewer with id 34346.
# NOTE(review): the filtered frame is only displayed, not assigned, so the
# following cells still operate on the unfiltered `df` -- confirm this is intended.
df[df["reviewer_id"] != 34346]

In [None]:
# Horizontal bar chart of the 20 most frequent keywords; each review
# contributes one row per keyword after explode().
keyword_counts = df.explode("keywords")["keywords"].value_counts()
# Despite its name, `top_n_keywords` holds the Axes object returned by .plot().
top_n_keywords = keyword_counts[:20].plot(kind="barh", title="Keywords")
plt.xlabel("#")
plt.tight_layout()
keywords_png = os.path.join(config.DIR_REPORT, "top-keywords.png")
plt.savefig(keywords_png, dpi=300)
plt.show()

In [None]:
# Horizontal bar chart of the 20 reviewers with the most reviews.
top_n_reviewer = df["reviewer_name"].value_counts().head(20)
reviewers_png = os.path.join(config.DIR_REPORT, "top-reviewers.png")

plt.barh(top_n_reviewer.index, top_n_reviewer.values)
plt.xlabel("# Reviews")
plt.title("Top Reviewers")
plt.tight_layout()
plt.savefig(reviewers_png, dpi=300)
plt.show()

In [None]:
# Histogram of the page counts, restricted to 0-1500 pages in 30 bins.
df["pages"].plot.hist(bins=30, range=(0,1500))
plt.title("Seitenanzahl rezensierter Bücher")
plt.xlabel("# Seiten")
plt.ylabel("Häufigkeit")
plt.tight_layout()
pages_png = os.path.join(config.DIR_REPORT, "hist-pages.png")
plt.savefig(pages_png, dpi=300)
plt.show()

In [None]:
# Histogram of book prices between 0 and 100 in 20 bins.
# Only rows priced in EUR are used: the x-axis is labelled in euros, and the
# other price analyses in this notebook apply the same price_unit == "EUR"
# filter, so prices in other currencies must not be mixed in here.
eur_prices = df.loc[df["price_unit"] == "EUR", "price"].dropna()
eur_prices.plot(kind="hist", bins=20, range=(0,100))
plt.xlabel("Preis (€)")
plt.ylabel("Häufigkeit")
plt.title("Preis rezensierter Bücher")
plt.savefig(os.path.join(config.DIR_REPORT, "hist-price.png"), dpi=300)
plt.show()

In [None]:
# Pie chart of the number of reviews per category (largest slice first, as
# returned by value_counts()).
category_counts = df["category"].value_counts()

# Radial offset per slice. The offsets were chosen for eight categories, but
# matplotlib requires exactly one offset per slice and raises otherwise. The
# list is therefore padded with its last value (or truncated) to match the
# categories actually present, e.g. when "Unkategorisiert" or an unmapped
# category code shows up in the data.
base_explode = [0.05,0.05,0.05,0.1,0.15,0.2,0.2,0.3]
padding = [base_explode[-1]] * max(0, len(category_counts) - len(base_explode))
explode = (base_explode + padding)[:len(category_counts)]

_ = category_counts.plot(kind="pie", title="Themenbereiche", explode=explode)
plt.ylabel("")
plt.tight_layout()
plt.savefig(os.path.join(config.DIR_REPORT, "categories.pgf"))
plt.show()

In [None]:
# Horizontal bar chart of the 20 publishers with the most reviewed titles.
publisher_counts = df["publisher"].value_counts().head(20)
_ = publisher_counts.plot.barh(figsize=(10,10))
plt.xlabel("# Reviews")
plt.title("Top Publisher")
plt.tight_layout()
publisher_png = os.path.join(config.DIR_REPORT, "publisher.png")
plt.savefig(publisher_png, dpi=300)
plt.show()

In [None]:
def _save_yearly_area(series, title, ylabel, filename):
    """Draw `series` as an area chart, label it, save it under DIR_REPORT and show it."""
    series.plot(kind="area")
    plt.title(title)
    plt.xlabel("")
    plt.ylabel(ylabel)
    plt.savefig(os.path.join(config.DIR_REPORT, filename), dpi=300)
    plt.show()


# Index the reviews by their "date" column so they can be resampled in
# one-year bins ("1Y").
df2 = df.set_index("date")

# Mean review length per year.
_save_yearly_area(
    df2.resample("1Y")["word_count"].mean(),
    "Durchschnittliche Rezensionslänge",
    "Wörter",
    "review-length-ot.png",
)

# Number of reviews per year.
_save_yearly_area(
    df2.resample("1Y")["id"].count(),
    "Veröffentliche Rezensionen (pro Jahr)",
    "Anzahl",
    "published-reviews-ot.png",
)

# Mean price per year, EUR-priced rows only.
_save_yearly_area(
    df2[df2["price_unit"] == "EUR"].resample("1Y")["price"].mean(),
    "Durchschnittlicher Preis rezensierter Publikationen",
    "Preis in €",
    "proce-ot.png",
)

# Mean page count per year.
_save_yearly_area(
    df2.resample("1Y")["pages"].mean(),
    "Durchschnittlicher Umfang rezensierter Publikationen",
    "Seiten",
    "pages-ot.png",
)

In [None]:
# Tabular view of the mean word count per one-year bin (at most the first 100 rows).
df2.resample("1Y")["word_count"].mean().head(n=100)

In [None]:
# Side-by-side pie charts: reviewer gender (left) and highest academic title (right).
fig, axs = plt.subplots(1,2, sharey=True, figsize=(10,5))
df["reviewer_gender"].value_counts().plot(kind="pie", ax=axs[0],  fontsize=15, autopct='%1.1f%%',)

# Colours are looked up by title instead of being assigned by slice position,
# so a title keeps its colour even when the set of titles in the data changes.
# The pairs reproduce the previous positional assignment for the four titles
# produced by unify_title(); any other title falls back to "C4".
title_colors = {"Doktor": "C1", "Kein Titel": "C3", "Master": "C2", "Professor": "C0"}
title_counts = df["reviewer_highest_title"].value_counts().sort_index()
slice_colors = [title_colors.get(title, "C4") for title in title_counts.index]
title_counts.plot(kind="pie", ax=axs[1], colors=slice_colors, fontsize=15, autopct='%1.1f%%')

# Remove the automatic y-label from both pies, not only from the first one.
axs[0].set_ylabel('')
axs[1].set_ylabel('')
plt.tight_layout()
plt.savefig(os.path.join(config.DIR_REPORT, "title-and-gender.png"), bbox_inches='tight', dpi=300)
plt.show()

# Relative gender shares as numbers.
print(df["reviewer_gender"].value_counts(normalize=True))

In [None]:
# One pie chart of the reviewer-gender split per year of the date index.
# The year values are passed to groupby() directly rather than wrapped in a
# list: with a one-element list, newer pandas versions yield 1-tuples as group
# keys, and pandas rejects a list-like value as the title of a single plot.
# The title is passed as a string for the same reason.
for date, group in df2["reviewer_gender"].groupby(by=df2.index.year):
    print(date)
    group.value_counts().plot(kind="pie", title=str(date), figsize=(5,5))
    plt.ylabel('')
    plt.show()
    
# mydf.head().value_counts()

In [None]:
# Scatter plot of review length (x) against book price (y).
_ = df.plot.scatter(x="word_count", y="price", marker="x")
plt.show()

In [None]:
# Debug aid: print(df["price_unit"].unique())
# The 20 reviewers with the highest mean price of reviewed books, EUR-priced rows only.
is_eur = df["price_unit"] == "EUR"
df_reviewers = df[is_eur].set_index("reviewer_name")
mean_price_per_reviewer = df_reviewers.groupby(by="reviewer_name")["price"].mean()
_ = mean_price_per_reviewer.sort_values(ascending=False).head(20).plot(kind="barh", figsize=(10,10))

In [None]:
# Mean review length for female and male reviewers, in that order.
for gender in ("Weiblich", "Männlich"):
    print(df[df["reviewer_gender"] == gender]["word_count"].mean())



# Word-count histogram over all reviews, saved both as PNG and as PGF.
df["word_count"].plot(kind="hist", bins=30, range=(0,4500))
plt.xlabel("Wortanzahl")
plt.ylabel("Häufigkeit")
for filename in ("word_count.png", "word_count.pgf"):
    plt.savefig(os.path.join(config.DIR_REPORT, filename), dpi=300)
plt.show()

# The same histogram per gender; these figures are shown but not saved.
for gender in ("Weiblich", "Männlich"):
    by_gender = df[df["reviewer_gender"] == gender]["word_count"]
    by_gender.plot(kind="hist", bins=30, range=(0,4500), title=gender)
    plt.xlabel("Wortanzahl")
    plt.show()

In [None]:
# Scatter plot of review length against book length, one colour per reviewer title.
# The groups use the labels produced by unify_title(): reviewers without a known
# title are stored as "Kein Titel", so filtering on "Unbekannt" would match no
# rows and silently drop that group from the figure.
title_colors = (
    ("Professor", "C0"),
    ("Doktor", "C1"),
    ("Kein Titel", "C3"),
    ("Master", "C2"),
)

# The first plot call creates the figure (ax is None); later calls draw onto it.
ax = None
for title, color in title_colors:
    subset = df[df["reviewer_highest_title"] == title]
    ax = subset.plot(kind="scatter", x="pages", y="word_count", marker="x", figsize=(15,15), xlim=(0,3000), ylim=(0,5000), ax=ax, c=color, label=title)
plt.savefig(os.path.join(config.DIR_REPORT, "wordcount-vs-pages.png"), dpi=300)
plt.show()

In [None]:
# Mean price of EUR-priced publications reviewed by women and by men.
# The second line reports the male reviewers; it used to repeat the female figure.
eur_only = df["price_unit"] == "EUR"
print(df[(df["reviewer_gender"] == "Weiblich") & eur_only]["price"].mean())
print(df[(df["reviewer_gender"] == "Männlich") & eur_only]["price"].mean())

In [None]:

# Scatter plot of price against page count for EUR-priced books, one colour per
# reviewer title. Titles use the labels produced by unify_title() ("Kein Titel"
# rather than "Unbekannt", which no longer occurs in the column), and the
# colours follow the title/colour pairing of the pages-vs-word-count scatter plot.
title_colors = (
    ("Professor", "C0"),
    ("Doktor", "C1"),
    ("Kein Titel", "C3"),
    ("Master", "C2"),
)
eur_only = df["price_unit"] == "EUR"

# The first plot call creates the figure (ax is None); later calls draw onto it.
ax = None
for title, color in title_colors:
    subset = df[(df["reviewer_highest_title"] == title) & eur_only]
    ax = subset.plot(kind="scatter", x="pages", y="price", marker="x", ax=ax, figsize=(15,15), xlim=(0,800), ylim=(0,150), c=color, label=title)
plt.show()

#df[df["reviewer_highest_title"] == "Doktor"][df["price_unit"] == "EUR"].plot(kind="scatter", x="pages", y="word_count", marker="x", figsize=(15,15), xlim=(0,3000), ylim=(0,5000),  ax=ax, c="C1", label="Doktor")
#df[df["reviewer_highest_title"] == "Diplom"][df["price_unit"] == "EUR"].plot(kind="scatter", x="pages", y="word_count", marker="x", figsize=(15,15), xlim=(0,3000), ylim=(0,5000),  ax=ax, c="C2", label="Diplom")


#color=df[df["price_unit"] == "EUR"]["reviewer_highest_title"]
#df[df["price_unit"] == "EUR"].plot(kind="scatter", x="pages", y="price", marker="x", color=color, figsize=(15,15), xlim=(0,1000), ylim=(0,150))

# [df["price"] == "EUR"]

In [None]:
# The ten rows with the highest price, reduced to the columns needed to inspect them.
df.sort_values(by="price", ascending=False)[["title", "price", "pages", "isbn", "price_unit"]].head(10)

In [None]:
def process_text(x):
    """Split a text into lower-cased tokens.

    The text is lower-cased, a fixed set of punctuation characters is deleted
    (not replaced by a space, so "x-y" becomes "xy"), and the result is split
    on whitespace. Tokens of length one are discarded.

    Returns a list of strings.
    """
    strip_table = str.maketrans("", "", ".!?,–\"',-()[]{}„…")
    cleaned = x.lower().translate(strip_table)
    return [token for token in cleaned.split() if len(token) > 1]

# Debug aid: df["text"].head(20)

# Tokenise every review text, flatten the per-review token lists into a single
# Series and count how often each token occurs (most frequent first).
words = df["text"].apply(process_text).explode().value_counts()

In [None]:
# Bar chart of the 50 most frequent tokens across all review texts.
words[:50].plot(kind="bar", figsize=(10,10))
plt.title("Häufigkeit von Wörter in Reviews")
plt.xlabel("Wort")
plt.ylabel("Häufigkeit")
# Render the figure explicitly, like the other figure cells, so the cell output
# is the chart and not the repr of the last pyplot call.
plt.show()

In [None]:
import numpy as np

# Total number of pages reviewed per reviewer; the ten largest totals are plotted.
pages_per_reviewer = df.groupby(by="reviewer_name")["pages"].sum()
pages_per_reviewer.sort_values(ascending=False).head(10).plot(kind="barh")

# Fixed tick positions with abbreviated labels in thousands.
tick_positions = [0,100000, 200000, 300000,400000]
tick_labels = ["0", "100k", "200k", "300k", "400k"]
plt.xticks(tick_positions, labels=tick_labels)

plt.title("Akkumulierte Seitenanzahl")
plt.xlabel("# Seiten")
plt.ylabel("")
plt.savefig(os.path.join(config.DIR_REPORT, "accumulated-pages.png"), dpi=300)
plt.show()

In [None]:
import numpy as np

# Average number of pages reviewed per day for the ten reviewers with the
# largest page totals, assuming the totals were accumulated over 20 years.
years_covered = 20
days_per_year = 365  # the divisor previously used 356, a digit transposition

top_page_totals = df.groupby(by="reviewer_name")["pages"].sum().sort_values(ascending=False)[:10]
pages_per_day = top_page_totals / (years_covered * days_per_year)
pages_per_day.plot(kind="barh")
plt.ylabel("")
plt.title("Durchschnittliche Seitenanzahl (20 Jahre)")
plt.xlabel("Seiten / Tag")
plt.savefig(os.path.join(config.DIR_REPORT, "mean-pages.png"), dpi=300)
plt.show()

In [None]:
# Box plots of the review length (word count), grouped first by reviewer
# gender and then by highest title. Outliers are hidden (showfliers=False) and
# the automatic "grouped by" super-title is cleared.
for group_column, filename in (
    ('reviewer_gender', "boxplots-review-length-gender.jpg"),
    ('reviewer_highest_title', "boxplots-review-length-title.jpg"),
):
    df.boxplot(by=group_column, column="word_count", showfliers=False)
    plt.suptitle("")
    plt.title("Länge der Rezensionen")
    plt.xlabel("")
    plt.ylabel("Wörter")
    plt.tight_layout()
    plt.savefig(os.path.join(config.DIR_REPORT, filename), dpi=300)
    plt.show()

In [None]:
# Box plots of the price of EUR-priced publications, grouped by reviewer gender
# and by highest title. Outliers are hidden (showfliers=False).
eur_reviews = df[df["price_unit"] == "EUR"]
n = len(eur_reviews)

# By gender; the title carries the number of EUR-priced reviews.
eur_reviews.boxplot(by='reviewer_gender', column="price", showfliers=False)
plt.suptitle("")
plt.title(f"Preis rezensierter Publikationen ({n})")
plt.xlabel("")
plt.ylabel("Preis in €")
plt.tight_layout()
plt.savefig(os.path.join(config.DIR_REPORT, "boxplots-gender.jpg"), dpi=300)
plt.show()

# By highest title.
eur_reviews.boxplot(by='reviewer_highest_title', column="price", showfliers=False)
plt.suptitle("")
plt.title("Preis rezensierter Publikationen")
plt.xlabel("")
plt.ylabel("Preis in €")
plt.tight_layout()
plt.savefig(os.path.join(config.DIR_REPORT, "boxplots-price-title.jpg"), dpi=300)
plt.show()

In [None]:
# Box plots of the page count of reviewed publications, grouped first by
# reviewer gender and then by highest title. Outliers are hidden.
for group_column, filename in (
    ('reviewer_gender', "boxplots-pagesh-gender.jpg"),
    ('reviewer_highest_title', "boxplots-pages-title.jpg"),
):
    df.boxplot(by=group_column, column="pages", showfliers=False)
    plt.suptitle("")
    plt.title("Umfang rezensierter Publikationen")
    plt.xlabel("")
    plt.ylabel("Seiten")
    plt.tight_layout()
    plt.savefig(os.path.join(config.DIR_REPORT, filename), dpi=300)
    plt.show()

In [None]:
#df2 = pd.DataFrame()

#ax  = df[df["reviewer_gender"] == "Männlich"].groupby("reviewer_id")["id"].count().plot(kind="box", showfliers=False)
#df2["Weiblich"] = df[df["reviewer_gender"] == "Weiblich"].groupby("reviewer_id")["id"].count().plot(kind="box", ax=ax, showfliers=False)
#df2["Unbekannt"] = df[df["reviewer_gender"] == "Unbekannt"].groupby("reviewer_id")["id"].count()


# df.boxplot(by='reviewer_gender',  column="word_count", showfliers=False)

#df.boxplot(by='reviewer_name',  column="word_count", showfliers=False)

# df.resample("1Y")["id"].count().plot(kind="area")
# plt.title("Veröffentliche Rezensionen (pro Jahr)")
# plt.xlabel("")
# plt.ylabel("Anzahl")
# plt.savefig(os.path.join(config.DIR_REPORT, "published-reviews-ot.png"), dpi=300)
# plt.show()



# df.boxplot(by='reviewer_gender',  column="word_count", showfliers=False)
# plt.title("Länge der Rezensionen")
# plt.xlabel("")
# plt.ylabel("Wörter")
# plt.suptitle("")
# plt.tight_layout()
# plt.savefig(os.path.join(config.DIR_REPORT, "boxplots-review-length-gender.jpg"), dpi=300)
# plt.show()

# df.boxplot(by='reviewer_highest_title', column="word_count", showfliers=False)
# plt.title("Länge der Rezensionen")
# plt.xlabel("")
# plt.ylabel("Wörter")
# plt.suptitle("")
# plt.tight_layout()
# plt.savefig(os.path.join(config.DIR_REPORT, "boxplots-review-length-title.jpg"), dpi=300)
# plt.show()

In [None]:
# Mean, standard deviation and median of the price per reviewer gender,
# computed on EUR-priced rows only.
price_by_gender = df[df["price_unit"] == "EUR"].groupby(by="reviewer_gender")[["price"]]
print(price_by_gender.mean())
print(price_by_gender.std())
print(price_by_gender.median())

In [None]:
# Share of each academic title within every reviewer-gender group, printed and
# drawn as one pie chart per gender.
# Colours are looked up by title rather than assigned by slice position: a
# gender group that lacks one of the titles would otherwise shift the colours
# of the remaining slices, so the same title would be coloured differently
# from one pie to the next. Titles not listed here fall back to "C4".
title_colors = {"Doktor": "C1", "Kein Titel": "C3", "Master": "C2", "Professor": "C0"}

for gender, titles in df.groupby(by="reviewer_gender")["reviewer_highest_title"]:
    title_shares = titles.value_counts(normalize=True).sort_index()
    print(title_shares)

    colors = [title_colors.get(title, "C4") for title in title_shares.index]
    title_shares.plot(kind="pie", colors=colors)
    plt.title(gender)
    plt.ylabel("")
    plt.show()

In [None]:
# The ten most frequent keywords with their counts (one row per review/keyword pair).
df.explode("keywords")["keywords"].value_counts().head(n=10)

In [None]:
# Number of reviews, followed by the number of distinct keyword values.
n_reviews = len(df)
print(n_reviews)

exploded = df.explode("keywords")
keywords = exploded["keywords"].unique()
n_keywords = len(keywords)
print(n_keywords)

# For each of the first five keywords, print the number of rows per reviewer
# gender. Each printed item is a one-element array; neither the keyword nor
# the gender is printed alongside it.
for keyword in keywords[:5]:
    idxs = exploded["keywords"] == keyword
    sel = exploded.loc[idxs]
    c = sel.groupby(by="reviewer_gender")[["keywords"]].count()

    for p in c.to_numpy():
        print(p)

In [None]:
# NOTE(review): this cell repeats the statements of the preceding cell
# verbatim and recomputes the same values; it looks like a leftover copy and
# can probably be deleted -- confirm before removing.
n_reviews = len(df)
print(n_reviews)
# One row per (review, keyword) pair.
exploded = df.explode("keywords")
keywords = exploded["keywords"].unique()
n_keywords = len(keywords)
print(n_keywords)

# For each of the first five keywords, print the per-gender row counts.
for keyword in keywords[:5]:
    idxs = exploded["keywords"] == keyword
    sel = exploded[idxs]
    c = sel[["reviewer_gender", "keywords"]].groupby(by="reviewer_gender").count()
    
    # Each item of c.values is a one-element array holding one gender's count.
    for p in c.values:
        print(p)
    #print(c)
    #print(keyword, len(sel))