# 1. Rating Products
# - 1.1 Average
# - 1.2 Time-Based Weighted Average
# - 1.3 User-Based Weighted Average
# - 1.4 Weighted Rating

In [5]:
import pandas as pd
import math
import scipy.stats as st
from sklearn.preprocessing import MinMaxScaler

df = pd.read_csv("file/")
df.head()

Unnamed: 0,Rating,Timestamp,Enrolled,Progress,Questions Asked,Questions Answered
0,5.0,2021-02-05 07:45:55,2021-01-25 15:12:08,5.0,0.0,0.0
1,5.0,2021-02-04 21:05:32,2021-02-04 20:43:40,1.0,0.0,0.0
2,4.5,2021-02-04 20:34:03,2019-07-04 23:23:27,1.0,0.0,0.0
3,5.0,2021-02-04 16:56:28,2021-02-04 14:41:29,10.0,0.0,0.0
4,4.0,2021-02-04 15:00:24,2020-10-13 03:10:07,10.0,0.0,0.0


In [2]:
df.shape

(4323, 6)

In [3]:
df["Rating"].value_counts()

Rating
5.0    3267
4.5     475
4.0     383
3.5      96
3.0      62
1.0      15
2.0      12
2.5      11
1.5       2
Name: count, dtype: int64

In [4]:
df["Questions Asked"].value_counts()

Questions Asked
0.0     3867
1.0      276
2.0       80
3.0       43
4.0       15
5.0       13
6.0        9
8.0        5
9.0        3
14.0       2
11.0       2
7.0        2
10.0       2
15.0       2
22.0       1
12.0       1
Name: count, dtype: int64

In [6]:
# Rating by users asked questions
df.groupby("Questions Asked").agg({"Questions Asked" : "count",
                          "Rating" : "mean"})

Unnamed: 0_level_0,Questions Asked,Rating
Questions Asked,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,3867,4.765193
1.0,276,4.740942
2.0,80,4.80625
3.0,43,4.744186
4.0,15,4.833333
5.0,13,4.653846
6.0,9,5.0
7.0,2,4.75
8.0,5,4.9
9.0,3,5.0


## 1.1 Average

In [8]:
df.head()

Unnamed: 0,Rating,Timestamp,Enrolled,Progress,Questions Asked,Questions Answered
0,5.0,2021-02-05 07:45:55,2021-01-25 15:12:08,5.0,0.0,0.0
1,5.0,2021-02-04 21:05:32,2021-02-04 20:43:40,1.0,0.0,0.0
2,4.5,2021-02-04 20:34:03,2019-07-04 23:23:27,1.0,0.0,0.0
3,5.0,2021-02-04 16:56:28,2021-02-04 14:41:29,10.0,0.0,0.0
4,4.0,2021-02-04 15:00:24,2020-10-13 03:10:07,10.0,0.0,0.0


In [9]:
df["Rating"].mean()

4.764284061993986

#### In Average, we only averaged Rating scores, but we need to include the concept of time because customers may be happy in the first month but not in the other months.


## 1.2 Time-Based Weight Average

In [10]:
df.info()
df["Timestamp"] = pd.to_datetime(df["Timestamp"]) # converting object to datetime64 for "Timestamp"
current_date = pd.to_datetime("2021-02-10 0:0:0")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4323 entries, 0 to 4322
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rating              4323 non-null   float64
 1   Timestamp           4323 non-null   object 
 2   Enrolled            4323 non-null   object 
 3   Progress            4323 non-null   float64
 4   Questions Asked     4323 non-null   float64
 5   Questions Answered  4323 non-null   float64
dtypes: float64(4), object(2)
memory usage: 202.8+ KB


In [11]:
# 1.2.1 Creating "days" columns 
df["Days"] = (current_date - df["Timestamp"]).dt.days
df.head()

Unnamed: 0,Rating,Timestamp,Enrolled,Progress,Questions Asked,Questions Answered,Days
0,5.0,2021-02-05 07:45:55,2021-01-25 15:12:08,5.0,0.0,0.0,4
1,5.0,2021-02-04 21:05:32,2021-02-04 20:43:40,1.0,0.0,0.0,5
2,4.5,2021-02-04 20:34:03,2019-07-04 23:23:27,1.0,0.0,0.0,5
3,5.0,2021-02-04 16:56:28,2021-02-04 14:41:29,10.0,0.0,0.0,5
4,4.0,2021-02-04 15:00:24,2020-10-13 03:10:07,10.0,0.0,0.0,5


#### We're going to create Time-Based Weight Average according to Rating and Days

In [16]:
df.loc[df["Days"] <= 30, "Rating"].mean() * 28 / 100 + \
df.loc[(df["Days"] > 30) & (df["Days"] <= 90), "Rating"].mean() * 26 / 100 + \
df.loc[(df["Days"] > 90) & (df["Days"] <= 180), "Rating"].mean() * 24 / 100 + \
df.loc[df["Days"] < 180, "Rating"].mean() * 22 / 100

4.763547811848779

In [17]:
# Let's create a function 
def time_based_weighted_average (dataframe, w1 = 28, w2 = 26, w3 = 24, w4 = 22) :
    return  df.loc[df["Days"] <= 30, "Rating"].mean() * w1 / 100 + \
            df.loc[(df["Days"] > 30) & (df["Days"] <= 90), "Rating"].mean() * w2 / 100 + \
            df.loc[(df["Days"] > 90) & (df["Days"] <= 180), "Rating"].mean() * w3 / 100 + \
            df.loc[df["Days"] < 180, "Rating"].mean() * w4 / 100
time_based_weighted_average(df,28,26,24,22)

4.763547811848779

## 1.3 User - Based Weighted Average

In [18]:
# Should all users have the same score? NO
df.groupby("Progress").agg({"Rating" : "mean"})

Unnamed: 0_level_0,Rating
Progress,Unnamed: 1_level_1
0.0,4.673913
1.0,4.642691
2.0,4.654762
3.0,4.663551
4.0,4.777328
...,...
94.0,5.000000
95.0,4.794118
97.0,5.000000
98.0,5.000000


#### We're going to create function of User-Based Weight Average according to Course Progress and Rating


In [19]:
def user_based_weighted_average(dataframe, w1 = 22, w2 = 24, w3 = 26, w4 = 28):
    return  df.loc[df["Progress"] <= 10, "Rating"].mean() * w1 / 100 + \
            df.loc[(df["Progress"] > 10) & (df["Progress"] <= 45), "Rating"].mean() * w2 / 100 + \
            df.loc[(df["Progress"] > 45) & (df["Progress"] <= 75), "Rating"].mean() * w3 / 100 + \
            df.loc[(df["Progress"] > 75), "Rating"].mean() * w4 / 100

user_based_weighted_average(df,22,24,26,28)

4.800257704672543

## 1.4 Weighted Rating

#### Let's put these two weightings together and weight them again.


In [20]:
def course_weighted_rating(dataframe, time_w = 50, user_w = 50):
    return time_based_weighted_average(dataframe) * time_w / 100 + user_based_weighted_average(dataframe) * user_w / 100

course_weighted_rating(df,40,60)

4.785573747543037

### Calculates the overall rating of a course by combining two different evaluation criteria:
#### - Time-based evaluation: Based on how recently users rated the course.
#### - User-based evaluation: Based on how much of the course users have completed.

### This approach creates a more balanced and reliable course rating score.