### test 

This document contains the scripts used to generate the PDFs from the MovieLens CSV dataset.

In [1]:
#### all the imports needed ####
import os
import random
import re
from datetime import datetime, timezone
from math import ceil

import pandas as pd
from fpdf import FPDF

In [2]:
%pip install fpdf

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\farah\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


"Prepping" the data (just merging the CSVs into one)

In [3]:
# Folder holding the semicolon-separated MovieLens CSV files.
path = "data/movielens/"
users = pd.read_csv(path+"users.csv", sep=";")
ratings = pd.read_csv(path+"ratings.csv", sep=";")
# movies.csv is decoded with the Latin-1 codec.
movies = pd.read_csv(path+"movies.csv", sep=";", encoding="latin")

# The data is from the MovieLens dataset, which contains user ratings for movies.
# Preview the first rows of each raw table.
for df in [users, ratings, movies]:
    print(df.head())
    print("--------------------------------")

# Join ratings -> users -> movies on explicit keys so the join columns are
# visible to the reader, then drop the empty 'Unnamed: 3' column.
# errors="ignore" keeps the cell working when the CSVs no longer carry that column.
data = (
    ratings
    .merge(users, on="userId")
    .merge(movies, on="movieId")
    .drop(columns="Unnamed: 3", errors="ignore")
)

data.head()

   userId gender  age  occupation zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7     2460
4       5      M   25          20    55455
--------------------------------
   userId  movieId  rating  timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291
--------------------------------
   movieId                               title                        genres  \
0        1                    Toy Story (1995)   Animation|Children's|Comedy   
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy   
2        3             Grumpier Old Men (1995)                Comedy|Romance   
3        4            Waiting to Exhale (1995)                  Comedy|Drama   
4        5  Father of the Bride Part II (1995)  

Unnamed: 0,userId,movieId,rating,timestamp,gender,age,occupation,zip-code,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [None]:
# Persist the merged table (without the index column) as a CSV for later use.
data.to_csv(path_or_buf="data/MLdata_merged.csv", index=False)

In [5]:
# Directory that receives the generated PDFs; it is created if it does not exist yet.
output_dir = "pdf_exports_groupedby_movie"
os.makedirs(name=output_dir, exist_ok=True)

In [6]:
def format_timestamp(ts):
    """Format a Unix timestamp as a long-form UTC calendar date.

    Args:
        ts: Seconds since the Unix epoch; anything accepted by ``int()``
            (int, float, numeric string).

    Returns:
        The UTC date rendered with ``'%B %d, %Y'``, e.g. ``"December 31, 2000"``.
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; an aware
    # UTC datetime yields the same calendar date without the deprecation.
    return datetime.fromtimestamp(int(ts), tz=timezone.utc).strftime('%B %d, %Y')

In [7]:
# Age code (as stored in the "age" column) -> phrase used in generated sentences.
age_dict = {
    1: "under 18",
    18: "between 18 and 24",
    25: "between 25 and 34",
    35: "between 35 and 44",
    45: "between 45 and 49",
    50: "between 50 and 55",
    56: "56 or older",
}

# Occupation code (as stored in the "occupation" column) -> noun phrase
# used in generated sentences.
occupation_dict = {
    0: "unspecified occupation",
    1: "academic or educator",
    2: "artist",
    3: "clerical or administrative worker",
    4: "college or graduate student",
    5: "customer service representative",
    6: "healthcare professional",
    7: "executive or managerial worker",
    8: "farmer",
    9: "homemaker",
    10: "K-12 student",
    11: "lawyer",
    12: "programmer",
    13: "retired individual",
    14: "sales or marketing professional",
    15: "scientist",
    16: "self-employed worker",
    17: "technician or engineer",
    18: "tradesperson or craftsman",
    19: "unemployed individual",
    20: "writer",
}

In [8]:
def sanitize_filename(title):
    """Turn a movie title into a string usable as a file name stem.

    Every character that is not a word character, whitespace or a hyphen is
    removed, then each space character is replaced by an underscore.
    """
    without_punctuation = re.sub(r'[^\w\s-]', '', title)
    return '_'.join(without_punctuation.split(' '))

In [9]:

def _indefinite_article(phrase):
    """Return "an" if ``phrase`` starts with a vowel letter, otherwise "a".

    This is a spelling-based heuristic; it is sufficient for the occupation
    phrases used in this notebook.
    """
    return "an" if phrase[:1].lower() in ("a", "e", "i", "o", "u") else "a"


def make_sentence(row):
    """Build one randomly phrased English sentence describing a single rating.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            "timestamp", "gender", "age", "occupation" and "rating".

    Returns:
        One of four sentence templates, picked with ``random.choice``. Three of
        them mention the occupation; the last one does not. The zip code is
        deliberately left out of the text.
    """
    date_str = format_timestamp(row["timestamp"])
    # Any gender value other than "M" is reported as female.
    gender_str = "male" if row["gender"] == "M" else "female"
    age_range = age_dict.get(int(row["age"]), "unknown age range")
    occupation = occupation_dict.get(int(row["occupation"]), "unspecified occupation")
    # Avoid "a academic or educator" / "a unspecified occupation".
    article = _indefinite_article(occupation)
    rating = row["rating"]

    templates = [
        f"On {date_str}, a {gender_str} user aged {age_range}, working as {article} {occupation}, gave a rating of {rating}.",
        f"A {gender_str} viewer aged {age_range}, employed as {article} {occupation}, evaluated the film with a rating of {rating} on {date_str}.",
        f"The movie received a rating of {rating} on {date_str} from a {gender_str} user in the {age_range} age group, working as {article} {occupation}.",
        f"On {date_str}, a user identifying as a {gender_str} and categorized under {age_range} age range gave this title a rating of {rating}."
    ]
    return random.choice(templates)


## Generating the pdf files 
This takes around 7 minutes; I suggest you skip it if you have already retrieved the PDFs I processed.

In [10]:
def generate_all_movie_pdfs(df, output_dir):
    """Write one PDF report per movie title into ``output_dir``.

    Each report contains a heading with the title, the movie's genres and one
    bullet sentence (built by ``make_sentence``) per rating row.

    Args:
        df: DataFrame with at least the columns "title" and "genres" plus the
            columns ``make_sentence`` reads from each row.
        output_dir: Destination directory; created if it does not exist.

    Returns:
        None. Files are named ``<sanitize_filename(title)>.pdf``; a summary
        line is printed at the end.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Split the frame once instead of re-filtering every row for every title.
    # sort=False keeps titles in order of first appearance, like Series.unique().
    movies_by_title = df.groupby("title", sort=False)
    written_paths = set()

    for title, movie_df in movies_by_title:
        pdf = FPDF()
        pdf.add_page()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.set_font("Arial", 'B', 16)
        pdf.multi_cell(0, 10, f"Research Report: Ratings for \"{title}\"", align="C")

        pdf.set_font("Arial", '', 12)
        # Genres are taken from the group's first row.
        genres = movie_df["genres"].iloc[0].replace('|', ', ')
        pdf.ln(5)
        pdf.multi_cell(0, 10, f"Genres: {genres}")
        pdf.multi_cell(0, 10, "Below is a collection of individual user ratings drawn from the MovieLens 2000 dataset:\n")

        for _, row in movie_df.iterrows():
            sentence = make_sentence(row)
            pdf.multi_cell(0, 10, f"- {sentence}\n")

        safe_title = sanitize_filename(title)
        pdf_path = os.path.join(output_dir, f"{safe_title}.pdf")

        # Different titles can sanitize to the same file name; the later one
        # replaces the earlier file. File names are left unchanged so existing
        # exports keep matching, but the collision is reported instead of
        # passing silently. normcase() also catches case-only clashes on Windows.
        path_key = os.path.normcase(pdf_path)
        if path_key in written_paths:
            print(f"Warning: \"{title}\" overwrites an earlier report at '{pdf_path}'.")
        written_paths.add(path_key)

        pdf.output(pdf_path)

    print(f"PDFs generated for {movies_by_title.ngroups} movies in '{output_dir}'.")



In [11]:
# Full export over every movie; this run takes roughly 4 minutes.
generate_all_movie_pdfs(df=data, output_dir=output_dir)

PDFs generated for 3688 movies in 'pdf_exports_groupedby_movie'.


## Grouping the data :
Organizing it into pairs of {file_path, csv}

In [12]:
# Bucket the merged ratings by movie title.
grouped = data.groupby(by='title')
print("Number of movies:", grouped.ngroups)
# Preview: the first five rows of every group.
grouped.head(5)

Number of movies: 3688


Unnamed: 0,userId,movieId,rating,timestamp,gender,age,occupation,zip-code,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)",Animation|Children's|Comedy
...,...,...,...,...,...,...,...,...,...,...
993739,6001,138,1,956807238,F,25,7,94117,"Neon Bible, The (1995)",Drama
996467,6016,3245,5,994453507,M,45,1,37209,I Am Cuba (Soy Cuba/Ya Kuba) (1964),Drama
996637,6016,3336,3,995663888,M,45,1,37209,It Happened Here (1961),Drama
997451,6024,3443,4,956749779,M,25,12,53705,Born American (1986),Action|Drama|Thriller


In [13]:
# List all generated PDF files in the output directory.
# os.listdir() returns entries in arbitrary order, so sort for a reproducible result.
output_dir = "pdf_exports_groupedby_movie"
pdf_files = [os.path.join(output_dir, f) for f in sorted(os.listdir(output_dir)) if f.endswith('.pdf')]

# Create a mapping from sanitized movie title to original movie title.
# Titles that sanitize to the same name collapse onto one entry (the last title wins).
sanitized_to_title = {sanitize_filename(title): title for title in data['title'].unique()}

# Split the ratings by title once, so each lookup below fetches a ready-made
# group instead of scanning the whole frame again for every PDF.
ratings_by_title = data.groupby('title', sort=False)

# Match each PDF path with the corresponding movie title DataFrame
matched_pdf_title_dfs = []
for pdf_path in pdf_files:
    pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
    movie_title = sanitized_to_title.get(pdf_filename)
    # Only a missing mapping (None) means "no match".
    if movie_title is not None:
        movie_df = ratings_by_title.get_group(movie_title)
        matched_pdf_title_dfs.append((pdf_path, movie_df))

print(f"Matched {len(matched_pdf_title_dfs)} PDFs with their corresponding movie DataFrames.")

Matched 3687 PDFs with their corresponding movie DataFrames.


In [14]:
# Show the first 15 (PDF path, ratings) pairs as a sanity check.
for pdf_path, movie_df in matched_pdf_title_dfs[:15]:
    print(f"PDF Path: {pdf_path}", movie_df.head(), "-" * 40, sep="\n")

PDF Path: pdf_exports_groupedby_movie\1-900_1994.pdf
        userId  movieId  rating  timestamp gender  age  occupation zip-code  \
490061    3015      889       3  975263628      M   56           6    62707   
626214    3790      889       2  966019187      F   25          17    94618   

               title   genres  
490061  1-900 (1994)  Romance  
626214  1-900 (1994)  Romance  
----------------------------------------
PDF Path: pdf_exports_groupedby_movie\1000000_Duck_1971.pdf
        userId  movieId  rating  timestamp gender  age  occupation zip-code  \
32396      216     2031       2  976867230      M   45          13    52761   
73210      494     2031       5  976215651      F   35           0    17870   
109519     714     2031       4  975782711      M   18           4    76013   
134719     869     2031       1  999376619      M   18          20    92026   
162426    1034     2031       3  975093319      F   35           1    82601   

                         title       