In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.request import urlretrieve

# Locations of the three ml-1m data files (movies, ratings, users) on GitHub.
movies_url = 'https://raw.githubusercontent.com/kyriakossk2000/MasterThesis/main/Dataset%20Analysis/ml-1m/movies.dat'
ratings_url = 'https://raw.githubusercontent.com/kyriakossk2000/MasterThesis/main/Dataset%20Analysis/ml-1m/ratings.dat'
users_url = 'https://raw.githubusercontent.com/kyriakossk2000/MasterThesis/main/Dataset%20Analysis/ml-1m/users.dat'

# Download every file into the working directory (overwrites any existing copy).
for source_url, local_name in (
    (movies_url, 'movies.dat'),
    (ratings_url, 'ratings.dat'),
    (users_url, 'users.dat'),
):
    urlretrieve(source_url, local_name)


def read_dat(path, columns):
    """Read a header-less '::'-delimited file into a DataFrame with the given column names."""
    # A multi-character separator needs the python parsing engine.
    return pd.read_csv(path, sep='::', engine='python', names=columns, encoding='ISO-8859-1')


movies = read_dat('movies.dat', ['MovieID', 'Title', 'Genres'])
ratings = read_dat('ratings.dat', ['UserID', 'MovieID', 'Rating', 'Timestamp'])
users = read_dat('users.dat', ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])

# Preview the first rows of each table.
display(movies.head(), ratings.head(), users.head())

In [None]:
# Build one row per rating, enriched first with the rater's attributes and then
# with the movie's attributes. The join keys are spelled out; they are the only
# column names the frames have in common, so this equals the default merge.
ratings_with_users = ratings.merge(users, on='UserID')
merged_data = ratings_with_users.merge(movies, on='MovieID')
display(merged_data.head())

In [None]:
# Bar chart: how many ratings were given at each rating value.
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(x='Rating', data=merged_data, ax=ax)
ax.set_title('Distribution of Ratings')
plt.show()

In [None]:
# Bar chart of the 'Age' value attached to each rating row. Counts are per
# rating, so a user contributes once for every rating they made.
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(x='Age', data=merged_data, ax=ax)
ax.set_title('Distribution of User Ages')
plt.show()

In [None]:
# Number of ratings received by each title, most-rated first.
ratings_by_title = merged_data.groupby('Title')['Rating']
ratings_per_movie = ratings_by_title.count().sort_values(ascending=False)
# top 10 movies with the most ratings
display(ratings_per_movie.iloc[:10])

In [None]:
# Mean rating of each title, best first. No minimum number of ratings is applied
# here, so a title with very few ratings can top the list.
ratings_by_title = merged_data.groupby('Title')['Rating']
average_rating = ratings_by_title.mean().sort_values(ascending=False)
# top 10 movies with the highest average rating
display(average_rating.iloc[:10])

In [None]:
# Per-title summary table: how many ratings it has and their mean.
movie_stats = (
    merged_data.groupby('Title')['Rating']
    .agg(['count', 'mean'])
    .rename(columns={'count': 'Number of Ratings', 'mean': 'Average Rating'})
)

# Keep only titles with more than 1000 ratings, then rank them by mean rating.
has_many_ratings = movie_stats['Number of Ratings'] > 1000
popular_movies = movie_stats.loc[has_many_ratings].sort_values(by='Average Rating', ascending=False)
# top 10 popular movies with the highest average rating
display(popular_movies.iloc[:10])

In [None]:
# merged_data holds one row per rating, so summing the genre flags over every
# row would count ratings rather than movies. Keep a single row per movie so the
# totals match the 'Number of Movies' axis label. Only movies present in
# merged_data (i.e. with at least one rating) are counted.
rated_movies = merged_data.drop_duplicates(subset='MovieID')
# 'Genres' is a '|'-separated list; get_dummies yields one 0/1 column per genre.
genres = rated_movies['Genres'].str.get_dummies(sep='|').sum().sort_values(ascending=False)
# distribution of genres
plt.figure(figsize=(10, 4))
sns.barplot(x=genres.values, y=genres.index)
plt.title('Distribution of Genres')
plt.xlabel('Number of Movies')
plt.ylabel('Genre')
plt.show()

In [None]:
# One 0/1 indicator column per genre for every rating row, plus the rating itself.
genres_df = merged_data['Genres'].str.get_dummies(sep='|')
genres_df['Rating'] = merged_data['Rating']

# Average rating of a genre = (sum of ratings on rows tagged with the genre)
#                             / (number of rows tagged with the genre).
# Grouping by 'Rating' and averaging the flags would instead give the share of
# each genre within each star level, which is not what the chart is labelled as.
genre_flags = genres_df.drop(columns='Rating')
rating_totals = genre_flags.mul(genres_df['Rating'], axis=0).sum()
rating_counts = genre_flags.sum()
average_rating_per_genre = (rating_totals / rating_counts).sort_values(ascending=False)

# average rating for each genre
average_rating_per_genre.plot(kind='bar', figsize=(10, 4))
plt.title('Average Rating per Genre')
plt.xlabel('Genre')
plt.ylabel('Average Rating')
plt.show()

In [None]:
# Interpret 'Timestamp' as seconds since the Unix epoch and add two derived
# columns: the full timestamp and its calendar date.
rating_moment = pd.to_datetime(merged_data['Timestamp'], unit='s')
merged_data['Datetime'] = rating_moment
merged_data['Date'] = rating_moment.dt.date
display(merged_data.head())

In [None]:
# Count of ratings on each calendar date (uses the 'Date' column of merged_data).
ratings_per_day = merged_data.groupby('Date')['Rating'].count()

# number of ratings per day
fig, ax = plt.subplots(figsize=(10, 4))
ratings_per_day.plot(ax=ax)
ax.set_title('Number of Ratings per Day')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Ratings')
plt.show()

In [None]:
# How many ratings each user submitted, most active users first.
ratings_by_user = merged_data.groupby('UserID')['Rating']
ratings_per_user = ratings_by_user.count().sort_values(ascending=False)

# distribution of the number of ratings per user
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(ratings_per_user, bins=50, kde=False, ax=ax)
ax.set_title('Distribution of the Number of Ratings per User')
ax.set_xlabel('Number of Ratings')
ax.set_ylabel('Number of Users')
plt.show()

In [None]:
# How many ratings each title received, most-rated titles first.
ratings_by_title = merged_data.groupby('Title')['Rating']
ratings_per_movie = ratings_by_title.count().sort_values(ascending=False)

# distribution of the number of ratings per movie
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(ratings_per_movie, bins=50, kde=False, ax=ax)
ax.set_title('Distribution of the Number of Ratings per Movie')
ax.set_xlabel('Number of Ratings')
ax.set_ylabel('Number of Movies')
plt.show()

In [None]:
# Mean rating of every title, highest first.
ratings_by_title = merged_data.groupby('Title')['Rating']
average_rating_per_movie = ratings_by_title.mean().sort_values(ascending=False)
# distribution of the average rating per movie
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(average_rating_per_movie, bins=50, kde=False, ax=ax)
ax.set_title('Distribution of the Average Rating per Movie')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Number of Movies')
plt.show()

In [None]:
# number of ratings per day of the week
# Weekday index of every rating: 0 is Monday through 6 for Sunday, matching the
# tick labels set below.
merged_data['Day of Week'] = merged_data['Datetime'].dt.weekday
ratings_per_day_of_week = merged_data.groupby('Day of Week')['Rating'].count()

weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

fig, ax = plt.subplots(figsize=(10, 4))
ratings_per_day_of_week.plot(ax=ax)
ax.set_title('Number of Ratings per Day of the Week')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Number of Ratings')
# Replace the numeric weekday ticks with their names.
ax.set_xticks(range(7))
ax.set_xticklabels(weekday_names)
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Reload the raw ratings from the local 'ratings.dat' file so the cells from here
# on work on a ratings-only frame, and attach a proper timestamp column
# ('Timestamp' is interpreted as seconds since the Unix epoch).
ratings = pd.read_csv('ratings.dat', sep='::', engine='python', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], encoding='ISO-8859-1')
ratings['Datetime'] = pd.to_datetime(ratings['Timestamp'], unit='s')

# scatter plot of user activity over time
plt.figure(figsize=(10, 4))
# plt.plot_date is deprecated in recent Matplotlib; plt.plot handles datetime
# values directly. The 'o' format reproduces plot_date's default marker-only style.
plt.plot(ratings['Datetime'], ratings['UserID'], 'o', alpha=0.1, markersize=2)
plt.title('User Activity Over Time')
plt.xlabel('Time')
plt.ylabel('User ID')
plt.show()

In [None]:
# First ten distinct user ids, in order of first appearance in `ratings`.
selected_users = ratings['UserID'].unique()[:10]

# One figure per selected user: the rating value given at each point in time.
for user in selected_users:
    user_ratings = ratings[ratings['UserID'] == user]
    plt.figure(figsize=(10, 4))
    # plt.plot_date is deprecated in recent Matplotlib; plt.plot accepts datetime
    # values directly and 'o' keeps the marker-only look plot_date used by default.
    plt.plot(user_ratings['Datetime'], user_ratings['Rating'], 'o', alpha=0.5, markersize=5)
    plt.title(f'User {user} Activity Over Time')
    plt.xlabel('Time')
    plt.ylabel('Rating')
    plt.show()

In [None]:
# Calendar date (time of day dropped) of every rating.
ratings['Date'] = ratings['Datetime'].dt.date


def _days_between_first_and_last(dates):
    """Return the whole days separating the earliest and latest date in `dates`."""
    return (dates.max() - dates.min()).days


# Per user: days elapsed between their first and last rating date
# (0 when all of a user's ratings fall on the same date).
sequence_lengths = ratings.groupby('UserID')['Date'].apply(_days_between_first_and_last)
average_sequence_length = sequence_lengths.mean()
average_sequence_length

In [None]:
# Headline statistics of the ratings table.
actions_by_user = ratings.groupby('UserID').size()  # one count per user

num_users = ratings['UserID'].nunique()        # distinct users
num_items = ratings['MovieID'].nunique()       # distinct movies that were rated
avg_actions_per_user = actions_by_user.mean()  # mean ratings per user
num_actions = ratings.shape[0]                 # total number of rating rows

(num_users, num_items, avg_actions_per_user, num_actions)

In [None]:
# Seconds elapsed between each rating and the same user's previous rating.
# diff() compares consecutive rows, so the rows must be in chronological order
# within each user first; the file row order gives no such guarantee and can
# produce negative or meaningless gaps. Assigning the result back aligns on the
# index, so the row order of `ratings` itself is left untouched.
chronological = ratings.sort_values(['UserID', 'Datetime'], kind='stable')
ratings['TimeDiff'] = chronological.groupby('UserID')['Datetime'].diff().dt.total_seconds()

# Drop the undefined gap of each user's first rating (NaN) and zero-length gaps
# (ratings that share a timestamp).
time_diffs = ratings['TimeDiff'].dropna()
time_diffs = time_diffs[time_diffs != 0]

# distribution of time intervals
plt.figure(figsize=(10, 4))
plt.hist(time_diffs, bins=100, alpha=0.75)
plt.title('Time Interval Distribution')
plt.xlabel('Time Interval (seconds)')
plt.ylabel('Number')
plt.show()

In [None]:
# First ten distinct user ids, in order of first appearance in `ratings`.
selected_users = ratings['UserID'].unique()[:10]

# scatter plot of user activity over time for each selected user
for user in selected_users:
    user_ratings = ratings[ratings['UserID'] == user]
    # Number of ratings the user made in each consecutive 7-day bin.
    user_ratings = user_ratings.set_index('Datetime').resample('7D').count()['UserID']
    plt.figure(figsize=(10, 4))
    # plt.plot_date is deprecated in recent Matplotlib; plt.plot accepts a
    # datetime index directly and 'o' keeps plot_date's default marker-only look.
    plt.plot(user_ratings.index, user_ratings, 'o', alpha=0.5, markersize=5)
    plt.title(f'User {user} Activity Over 7-Day Time Window')
    plt.xlabel('Time')
    plt.ylabel('Number of Ratings')
    plt.show()

In [None]:
# number of 7-day windows for each user: whole days between the user's first
# and last rating, divided into 7-day steps, plus one for the starting window
rating_times_by_user = ratings.groupby('UserID')['Datetime']
active_span = rating_times_by_user.max() - rating_times_by_user.min()
num_windows_per_user = active_span.dt.days // 7 + 1

# average number of 7-day windows per user
avg_windows_per_user = num_windows_per_user.mean()

# minimum and maximum number of 7-day windows per user
min_windows_per_user = num_windows_per_user.min()
max_windows_per_user = num_windows_per_user.max()

# number of users with only one 7-day window
num_users_with_one_window = num_windows_per_user.eq(1).sum()

(avg_windows_per_user, min_windows_per_user, max_windows_per_user, num_users_with_one_window)