In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.request import urlretrieve

# Remote locations of the three ml-1m data files
movies_url = 'https://raw.githubusercontent.com/kyriakossk2000/MasterThesis/main/Dataset%20Analysis/ml-1m/movies.dat'
ratings_url = 'https://raw.githubusercontent.com/kyriakossk2000/MasterThesis/main/Dataset%20Analysis/ml-1m/ratings.dat'
users_url = 'https://raw.githubusercontent.com/kyriakossk2000/MasterThesis/main/Dataset%20Analysis/ml-1m/users.dat'

# Fetch every file into the working directory (same order as listed above)
for remote_url, local_name in (
    (movies_url, 'movies.dat'),
    (ratings_url, 'ratings.dat'),
    (users_url, 'users.dat'),
):
    urlretrieve(remote_url, local_name)

# All three files share the same '::' delimiter and encoding; column names are supplied explicitly
dat_options = dict(sep='::', engine='python', encoding='ISO-8859-1')
movies = pd.read_csv('movies.dat', names=['MovieID', 'Title', 'Genres'], **dat_options)
ratings = pd.read_csv('ratings.dat', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], **dat_options)
users = pd.read_csv('users.dat', names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], **dat_options)

# Preview the head of each table
for frame in (movies, ratings, users):
    display(frame.head())

In [None]:
# Combine the three tables: ratings joined to users on UserID, then to movies on MovieID.
# These are the only columns the tables have in common, so naming them spells out the
# join keys that the default merge would pick.
merged_data = ratings.merge(users, on='UserID').merge(movies, on='MovieID')

# Preview the combined table
display(merged_data.head())

In [None]:
# Bar chart of how many rows of merged_data carry each rating value
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(data=merged_data, x='Rating', ax=ax)
ax.set_title('Distribution of Ratings')
plt.show()

In [None]:
# Analyze the distribution of user ages.
# merged_data is built from the ratings table, so a user appears once per rating they gave.
# Counting its rows directly would weight every user by their activity; keep a single row
# per UserID so that each user contributes exactly once to the age distribution.
unique_users = merged_data.drop_duplicates(subset='UserID')

plt.figure(figsize=(10, 4))
sns.countplot(x='Age', data=unique_users)
plt.title('Distribution of User Ages')
plt.show()

In [None]:
# Count the ratings recorded under each title, then order the titles from most to least rated
rating_counts_by_title = merged_data.groupby('Title')['Rating'].count()
ratings_per_movie = rating_counts_by_title.sort_values(ascending=False)

# Show the ten most-rated titles
display(ratings_per_movie.head(10))

In [None]:
# Mean rating of every title, best-rated first
average_rating = (
    merged_data
    .groupby('Title')['Rating']
    .mean()
    .sort_values(ascending=False)
)

# Show the ten titles with the highest mean rating
display(average_rating.head(10))

In [None]:
# Per-title summary: how many ratings the title received and their mean
movie_stats = (
    merged_data
    .groupby('Title')['Rating']
    .agg(['count', 'mean'])
    .rename(columns={'count': 'Number of Ratings', 'mean': 'Average Rating'})
)

# Keep only titles with more than 1000 ratings and rank them by mean rating
has_many_ratings = movie_stats['Number of Ratings'] > 1000
popular_movies = movie_stats.loc[has_many_ratings].sort_values('Average Rating', ascending=False)

# Show the ten best-rated titles among the popular ones
display(popular_movies.head(10))

In [None]:
# Analyze the distribution of genres across movies.
# merged_data repeats a movie's genre string once for every rating of that movie, so summing
# the indicator columns over all rows would give ratings per genre while the axis below is
# labelled 'Number of Movies'. Keep one row per MovieID so each movie is counted once.
movie_genres = merged_data.drop_duplicates(subset='MovieID')['Genres']

# A movie can list several '|'-separated genres; it is counted under each of them.
genres = movie_genres.str.get_dummies(sep='|').sum().sort_values(ascending=False)

# Plot the distribution of genres
plt.figure(figsize=(10, 4))
sns.barplot(x=genres.values, y=genres.index)
plt.title('Distribution of Genres')
plt.xlabel('Number of Movies')
plt.ylabel('Genre')
plt.show()

In [None]:
# One 0/1 indicator column per genre, one row per row of merged_data
genre_flags = merged_data['Genres'].str.get_dummies(sep='|')

# Keep the combined genres + rating frame available under its original name
genres_df = genre_flags.copy()
genres_df['Rating'] = merged_data['Rating']

# Average rating for each genre: the sum of the ratings on rows tagged with the genre divided
# by the number of such rows. (Grouping by 'Rating' and averaging the indicators instead
# yields the share of each genre within each rating value, which is not an average rating.)
rating_sum_per_genre = genre_flags.mul(merged_data['Rating'], axis=0).sum()
rating_count_per_genre = genre_flags.sum()
average_rating_per_genre = rating_sum_per_genre / rating_count_per_genre

# Plot the average rating for each genre
average_rating_per_genre.plot(kind='bar', figsize=(10, 4))
plt.title('Average Rating per Genre')
plt.xlabel('Genre')
plt.ylabel('Average Rating')
plt.show()

In [None]:
# Interpret the Timestamp column as seconds since the epoch
rated_at = pd.to_datetime(merged_data['Timestamp'], unit='s')

# Store both the full datetime and its calendar-date part on the merged table
merged_data['Datetime'] = rated_at
merged_data['Date'] = rated_at.dt.date

# Preview the table with the new columns
display(merged_data.head())

In [None]:
# Number of ratings recorded on each calendar date
ratings_per_day = merged_data.groupby('Date')['Rating'].count()

# Line chart of the daily rating volume
fig, ax = plt.subplots(figsize=(10, 4))
ratings_per_day.plot(ax=ax)
ax.set_title('Number of Ratings per Day')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Ratings')
plt.show()

In [None]:
# How many ratings each user submitted, most active users first
ratings_per_user = (
    merged_data
    .groupby('UserID')['Rating']
    .count()
    .sort_values(ascending=False)
)

# Histogram of per-user rating counts
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(ratings_per_user, bins=50, kde=False, ax=ax)
ax.set_title('Distribution of the Number of Ratings per User')
ax.set_xlabel('Number of Ratings')
ax.set_ylabel('Number of Users')
plt.show()

In [None]:
# How many ratings each title received, most-rated titles first
counts_by_title = merged_data.groupby('Title')['Rating'].count()
ratings_per_movie = counts_by_title.sort_values(ascending=False)

# Histogram of per-title rating counts
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(ratings_per_movie, bins=50, kde=False, ax=ax)
ax.set_title('Distribution of the Number of Ratings per Movie')
ax.set_xlabel('Number of Ratings')
ax.set_ylabel('Number of Movies')
plt.show()

In [None]:
# Mean rating of each title, highest first
mean_by_title = merged_data.groupby('Title')['Rating'].mean()
average_rating_per_movie = mean_by_title.sort_values(ascending=False)

# Histogram of the per-title mean ratings
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(average_rating_per_movie, bins=50, kde=False, ax=ax)
ax.set_title('Distribution of the Average Rating per Movie')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Number of Movies')
plt.show()

In [None]:
# Tag every row with its weekday number (pandas dayofweek: 0 = Monday ... 6 = Sunday)
merged_data['Day of Week'] = merged_data['Datetime'].dt.dayofweek

# Count the ratings that fall on each weekday
ratings_per_day_of_week = merged_data.groupby('Day of Week')['Rating'].count()

# Line chart of rating volume by weekday, with the numeric ticks relabelled as day names
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
fig, ax = plt.subplots(figsize=(10, 4))
ratings_per_day_of_week.plot(ax=ax)
ax.set_title('Number of Ratings per Day of the Week')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Number of Ratings')
ax.set_xticks(range(7))
ax.set_xticklabels(weekday_names)
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Reload the ratings file into a fresh dataframe (expects 'ratings.dat' in the working directory)
ratings = pd.read_csv('ratings.dat', sep='::', engine='python', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], encoding='ISO-8859-1')

# Convert the timestamp (seconds since the epoch) to datetime
ratings['Datetime'] = pd.to_datetime(ratings['Timestamp'], unit='s')

# Create a scatter plot of user activity over time: one marker per rating, x = time, y = UserID.
# plt.plot_date is deprecated in recent Matplotlib releases; plt.plot with the 'o' format draws
# the same marker-only series and handles datetime x-values directly.
plt.figure(figsize=(10, 4))
plt.plot(ratings['Datetime'], ratings['UserID'], 'o', alpha=0.1, markersize=2)
plt.title('User Activity Over Time')
plt.xlabel('Time')
plt.ylabel('User ID')
plt.show()

In [None]:
# Select the first 10 distinct user IDs in order of appearance in the ratings table
selected_users = ratings['UserID'].unique()[:10]

# Create one scatter plot per selected user: x = time of the rating, y = rating value.
# plt.plot with the 'o' format replaces the deprecated plt.plot_date and draws the same
# marker-only series on a date axis.
for user in selected_users:
    user_ratings = ratings[ratings['UserID'] == user]
    plt.figure(figsize=(10, 4))
    plt.plot(user_ratings['Datetime'], user_ratings['Rating'], 'o', alpha=0.5, markersize=5)
    plt.title(f'User {user} Activity Over Time')
    plt.xlabel('Time')
    plt.ylabel('Rating')
    plt.show()

In [None]:
# Calendar date of every rating
ratings['Date'] = ratings['Datetime'].dt.date


def _active_span_days(dates):
    """Return the number of days between the earliest and latest date in `dates`."""
    return (dates.max() - dates.min()).days


# Per-user span in days between first and last rating date, then the mean over all users
sequence_lengths = ratings.groupby('UserID')['Date'].apply(_active_span_days)
average_sequence_length = sequence_lengths.mean()
average_sequence_length

In [None]:
# Headline statistics of the ratings table

# Total number of actions: one row per rating
num_actions = len(ratings)

# Distinct users and distinct items (movies)
num_users = ratings['UserID'].nunique()
num_items = ratings['MovieID'].nunique()

# Mean number of rows per user
actions_by_user = ratings.groupby('UserID').size()
avg_actions_per_user = actions_by_user.mean()

(num_users, num_items, avg_actions_per_user, num_actions)

In [None]:
# Calculate the time intervals (in seconds) between consecutive actions of each user.
# diff() compares every row with the row before it inside the same group, so it only measures
# "consecutive actions" when each user's rows are in chronological order. Sort explicitly
# instead of relying on file order; the assignment aligns the result back on the original index.
ratings['TimeDiff'] = (
    ratings
    .sort_values(['UserID', 'Datetime'])
    .groupby('UserID')['Datetime']
    .diff()
    .dt.total_seconds()
)

# Exclude time intervals of zero, and the NaN that diff() produces for each user's first action
has_interval = ratings['TimeDiff'].notna() & (ratings['TimeDiff'] != 0)
time_diffs = ratings.loc[has_interval, 'TimeDiff']

# Plot the distribution of time intervals
plt.figure(figsize=(10, 4))
plt.hist(time_diffs, bins=100, alpha=0.75)
plt.title('Time Interval Distribution')
plt.xlabel('Time Interval (seconds)')
plt.ylabel('Number')
plt.show()