In [2]:
import pandas as pd
import plotly.express as px


df = pd.read_csv("reviews.csv")


# sørg for riktige typer
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")
df["helpful"] = pd.to_numeric(df["helpful"], errors="coerce").fillna(0).astype(int)

# dato: du har både mm/dd og dd/mm i eksempelet, så vi prøver begge
d1 = pd.to_datetime(df["review_date"], errors="coerce", dayfirst=False)
d2 = pd.to_datetime(df["review_date"], errors="coerce", dayfirst=True)
df["review_date"] = d1.fillna(d2)

df.head()


  d2 = pd.to_datetime(df["review_date"], errors="coerce", dayfirst=True)


Unnamed: 0,review_id,rating,review_text,review_date,helpful
0,981e465b-d3ba-4632-9c60-25051efac38a,5,It's good,2025-11-22 01:19:00,0
1,964d3555-9429-4c20-8127-ce3c71ce9273,5,WhatsApp not working well always shows offline...,2025-11-24 20:03:00,0
2,6c28859f-1554-4ca1-9aa8-9d66f204be0a,5,"Oppo not corresponding, share with me the offi...",2025-11-25 06:26:00,0
3,a7efafc3-5871-4020-a398-9cc12cb4072a,5,"Excellent app, great communication super conne...",2025-11-25 18:09:00,0
4,de142b31-a5ad-446f-b7c8-51b264728478,4,simply the ɓest for chat and calls.i love it,2025-11-24 01:10:00,1


In [None]:
fig = px.histogram(df, x="rating", nbins=5, title="Fordeling av rating")
fig.show() #Omtrent halve datasettet har 5 i rating

In [None]:
import numpy as np

upper_limit = df["helpful"].quantile(0.99)
df_filtered = df[df["helpful"] <= upper_limit]

fig = px.violin(df, x="rating", y="helpful", box=True, points=False, 
             title="Helpful Votes per Rating")
fig.update_yaxes(type='log')
fig.show()

# denne funket dårlig, viser opp mot over en million og videre selv om det er kun en som har nærmere 250k stemmer. utenom dette er resten under 10 000
# Tror ikke violin funker godt, må evt begrense parameterene og distansen

In [None]:
import numpy as np

upper_limit = df["helpful"].quantile(0.99)
df_filtered = df[df["helpful"] <= upper_limit]

fig = px.box(df_filtered, x="rating", y="helpful",
             title="Helpful Votes per Rating (Outliers Removed)")
fig.show()

# isolerer ut de med flest stemmer, altså blant annet den som har 250k stemmer

In [None]:
median_helpful = df.groupby("rating")["helpful"].median().reset_index()

fig = px.bar(median_helpful, x="rating", y="helpful",
             title="Median Helpful Votes per Rating")
fig.show()

#prøvde median, men denne ble skjevfordelt og flat på 0. Dette skyldes trolig at over halvparten av helpful dataen er 0

In [None]:
mean_helpful = df.groupby("rating")["helpful"].mean().reset_index()

fig = px.bar(mean_helpful, x="rating", y="helpful",
             title="Average Helpful Votes per Rating")
fig.show()

# viser gjenomsnittet av helpful. Blir dratt opp en del av de med mange stemmer. 
# Bruker mean for å fjerne de som fikk 0 i rating sånn at dataen blir mer representativ

In [5]:
# Sorter etter helpful votes synkende og ta topp 20
top_20_helpful = df.nlargest(20, 'helpful')[['rating', 'review_text', 'helpful']]
print(top_20_helpful)

      rating                                        review_text  helpful
4677       5  You are rolling out the reactions to messages ...   248962
1629       5  Please add an "edit status" feature.Also let's...     8696
5195       5  “I want WhatsApp to release an update in which...     4345
813        5  it's the Best messaging app I've used... it's ...     1884
4642       1  There is a major bug and privacy flaw where mu...     1688
4072       1        where is the scroll option for contact list      305
3115       2  5-star rating! WhatsApp is perfect for staying...      191
4082       3  I've noticed that when I start a new chat with...      188
6082       1  Many New features like Account Centre, Add Fac...      121
723        4  Love the app. But there's one bug that's been ...      113
2933       5  Can you add a feature to choose who can OPEN O...      101
703        2  good night i was using WhatsApp and I was bann...       95
4617       1  the about section character limit is 

In [8]:
# Beregn antall ord (enkel måte)
df['word_count'] = df['review_text'].str.split().str.len()

# Fjern ekstreme outliers for bedre visualisering
word_count_99 = df['word_count'].quantile(0.99)
helpful_99 = df['helpful'].quantile(0.99)

df_visual = df[(df['word_count'] <= word_count_99) & (df['helpful'] <= helpful_99)]

# Del ordantall inn i grupper
df_visual['word_group'] = pd.cut(df_visual['word_count'], 
                                 bins=[0, 10, 20, 30, 50, 75, 100, 200, 500],
                                 labels=['0-10', '11-20', '21-30', '31-50', '51-75', '76-100', '101-200', '201+'])

# Beregn gjennomsnittlig helpful per gruppe
avg_helpful_by_words = df_visual.groupby('word_group', observed=True)['helpful'].mean().reset_index()

# Visualiser med begrenset y-akse
fig = px.bar(avg_helpful_by_words, x='word_group', y='helpful',
             title='Gjennomsnittlig helpful score per ordantall',
             labels={'word_group': 'Antall ord i anmeldelsen', 
                    'helpful': 'Gjennomsnittlig helpful votes'})
fig.update_yaxes(range=[0, avg_helpful_by_words['helpful'].max() * 1.1])  # Begrens y-aksen
fig.show()

# Sjekk statistikken
print(avg_helpful_by_words)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



  word_group   helpful
0       0-10  0.174670
1      11-20  0.279919
2      21-30  0.322981
3      31-50  0.580220
4      51-75  0.810484
5     76-100  1.109375
