In [1]:
import pandas as pd
from transformers import pipeline
import numpy as np
from langdetect import detect

# conda/mamba install tqdm
from tqdm import tqdm
# to use progress bar / progress_apply() instead of apply()
tqdm.pandas()

In [2]:
# Load the raw reviews. `listing_id` becomes the index (several reviews can share
# one listing, so the index is not unique) and `date` is parsed as a datetime column.
reviews_df = pd.read_csv(
    "munich_reviews.csv.gz",
    index_col="listing_id",
    parse_dates=["date"],
)

In [None]:
# Language detection

def detect_language(review):
    """
    Identify the language of a single review.

    Parameters
    ----------
    review : str
        Raw review text. Anything that is not a non-blank string (e.g. NaN for a
        missing comment) is treated as undetectable.

    Returns
    -------
    str or pd.NA
        The language code returned by `langdetect.detect` (e.g. "en", "de"), or
        `pd.NA` when the language cannot be identified.
    """
    # Missing or blank comments carry no detectable language; skip the detector call.
    if not isinstance(review, str) or not review.strip():
        return pd.NA

    try:
        return detect(review)
    except Exception:
        # Deliberately best-effort: any detection failure becomes a missing value.
        # Catching `Exception` instead of using a bare `except` lets
        # KeyboardInterrupt / SystemExit propagate, so the long-running apply
        # over all reviews can still be aborted.
        return pd.NA

# SUBSECTION: Detect Languages of all Reviews
# Runs `detect_language` once per review with a progress bar. The original author
# measured roughly 20 minutes on their CPU, which is why the result is written to
# disk right away and reloaded from the pickle afterwards.
review_texts = reviews_df["comments"]
language = review_texts.progress_apply(detect_language)

language.to_pickle("munich_review_languages.pkl")

In [3]:
# Reload the cached per-review language codes instead of re-running the slow detection.
languages_path = "munich_review_languages.pkl"
language = pd.read_pickle(languages_path)

In [4]:
# Attach the detected language to every review as a new column.
# NOTE(review): `language` is indexed by the (non-unique) listing_id like `reviews_df`,
# so this relies on the pickle having been produced from exactly this CSV, with the
# same rows in the same order — confirm whenever the data file changes.
reviews_df["language"] = language

In [5]:
# Number of reviews per detected language, most frequent first; reviews whose
# language could not be detected are counted in the <NA> row.
language.value_counts(sort=True, dropna=False)

en       67439
de       31538
fr        2785
es        2123
ko        1140
it        1039
ru         993
zh-cn      887
pt         505
nl         414
<NA>       329
af         277
ca         225
ro         219
ja         194
no         192
da         190
so         160
zh-tw      124
id         115
cs          93
pl          86
sv          54
hu          51
fi          49
tr          45
cy          43
vi          36
et          35
tl          34
ar          27
sk          24
el          21
hr          18
sw          12
sl          11
uk          10
bg           9
he           8
mk           5
lt           5
lv           5
sq           4
th           3
hi           1
ur           1
fa           1
Name: comments, dtype: int64

In [6]:
# Pin the checkpoint explicitly instead of relying on the library default, so the
# results stay reproducible if the default model for "sentiment-analysis" changes.
# This is the same model the pipeline previously fell back to.
# NOTE(review): the checkpoint name indicates an English-only model, so labels for
# non-English reviews are presumably unreliable.
sentiment_analyizer = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# Empty frame that fixes the column layout of the per-review sentiment results.
sentiment_df = pd.DataFrame(
    columns=['listing_id', 'id', 'comment', 'label', 'score', 'language']
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [None]:
# Decide for every review whether it is positive or negative and save the results
# in a new data frame (one row per review, indexed by listing_id).
#
# Rows are collected in a plain list and turned into a DataFrame once at the end:
# appending to a DataFrame inside the loop copies the whole frame on every iteration
# (quadratic runtime), and `DataFrame.append` no longer exists in pandas >= 2.0.
sentiment_records = []
review_rows = reviews_df[["id", "comments", "language"]].itertuples(name=None)

# Each tuple is (index, id, comments, language); the index is the listing_id.
for listing_id, review_id, comment, review_language in tqdm(
    review_rows, total=len(reviews_df)
):
    try:
        result = sentiment_analyizer(comment)[0]
    except Exception:
        # Best-effort: reviews the model cannot handle (e.g. missing text) are kept
        # with placeholder values. The placeholder stays the *string* "None" so the
        # pickle remains compatible with code that already reads it. Catching
        # `Exception` rather than everything keeps the loop interruptible.
        label = "None"
        score = "None"
    else:
        label = result['label']
        score = result['score']

    sentiment_records.append(
        {
            'listing_id': listing_id,
            'id': review_id,
            'comment': comment,
            'label': label,
            'score': score,
            'language': review_language,
        }
    )

sentiment_df = pd.DataFrame(
    sentiment_records,
    columns=['listing_id', 'id', 'comment', 'label', 'score', 'language'],
).set_index('listing_id')

sentiment_df.to_pickle(path="munich_reviews_sentiment.pkl")

In [7]:
# Reload the cached per-review sentiment results instead of re-running the model.
sentiment_path = "munich_reviews_sentiment.pkl"
sentiment_df = pd.read_pickle(sentiment_path)

In [9]:
# SUBSECTION: Drop reviews with unrecognized languages (e.g. only one character long) and add Summary Statistics

language = pd.read_pickle("munich_review_languages.pkl")

# Step 1: one row per review (language, text length in characters, sentiment label).
# Step 2: drop reviews without a detected language and aggregate to one row per listing.
#
# Built-in reductions are referenced by name ("size", "median", "nunique") instead of
# being wrapped in lambdas: pandas then uses its optimised group-by implementations,
# and the median ignores missing lengths instead of turning the whole group into NaN.
#
# `frac_negative` is the share of a listing's reviews labelled "NEGATIVE"; reviews
# whose sentiment could not be computed carry the label "None" and count as not negative.
# NOTE(review): the sentiment model is an English checkpoint, yet this fraction is
# taken over reviews in all languages — consider restricting it to English reviews.
reviews_features = (
    pd.DataFrame(
        data={
            "language": language,
            "review_length": reviews_df["comments"].str.len(),
            "sentiment": sentiment_df["label"],
        }
    )
    .reset_index()
    .dropna(subset=["language"])
    .groupby("listing_id")
    .agg(
        number_reviews=("language", "size"),
        median_review_length=("review_length", "median"),
        number_languages=("language", "nunique"),
        frac_english=("language", lambda x: (x == "en").mean()),
        frac_german=("language", lambda x: (x == "de").mean()),
        language_list=("language", lambda x: x.unique()),
        frac_negative=("sentiment", lambda x: (x == "NEGATIVE").mean()),
    )
)

In [10]:
# Preview the first five listings of the aggregated feature table.
reviews_features.head(n=5)

Unnamed: 0_level_0,number_reviews,median_review_length,number_languages,frac_english,frac_german,language_list,frac_negative
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
97945,129,281.0,7,0.666667,0.271318,"[en, fr, de, ru, it, ko, hr]",0.333333
114695,53,228.0,5,0.773585,0.150943,"[en, de, it, fr, es]",0.226415
127383,101,206.0,8,0.60396,0.267327,"[en, de, nl, ru, es, zh-cn, it, fr]",0.316832
159634,34,243.5,4,0.705882,0.235294,"[en, de, ru, fr]",0.323529
170154,496,300.5,14,0.66129,0.205645,"[en, de, es, fr, ru, nl, zh-cn, it, zh-tw, ko,...",0.280242


In [11]:
# Persist the per-listing review features for later use.
reviews_features.to_pickle("munich_reviews_features.pkl")