In [1]:
import pandas as pd
from datetime import datetime
import numpy as np

In [2]:
# Load the raw export once and leave it untouched. Every cleaned column below
# is derived from `letterboxd_raw`, so this cell can be re-run any number of
# times and always produces the same `df` (no repeated splitting, no doubled
# ", " separators).
letterboxd_raw = pd.read_csv("letterboxd.csv")

df = (
    letterboxd_raw
    .assign(
        # Keep the second "-"-separated piece of the raw rating string.
        rating=lambda frame: frame["rating"].str.split("-", expand=True)[1],
        # Last 11 characters, then the first 10 of those, parsed as Y/m/d.
        date_rated=lambda frame: pd.to_datetime(
            frame["date_rated"].str[-11:].str[0:10], format="%Y/%m/%d"
        ),
        # Keep the first whitespace-separated token of the duration string.
        duration=lambda frame: frame["duration"].str.split(expand=True)[0],
        # Title-case genres and put a space after each comma (literal replace).
        genre=lambda frame: (
            frame["genre"].str.title().str.replace(",", ", ", regex=False)
        ),
        director=lambda frame: frame["director"].str.replace(",", ", ", regex=False),
        # First three digits of the year + "0s", e.g. 1974 -> "1970s".
        decade=lambda frame: frame["year"].astype(str).str[:3] + "0s",
    )
    # Raises if any rating/duration is missing or non-numeric.
    .astype({"rating": int, "duration": int})
    # Rows without a director are dropped before any export or aggregation.
    .loc[lambda frame: frame["director"].notnull()]
)

In [12]:
# Export the cleaned frame. index=True is the pandas default, spelled out here
# because it writes the row index as an unnamed first column of the CSV.
# NOTE(review): confirm the consumer of this file actually wants that column.
df.to_csv("../all_letterboxd.csv", index=True)

# Directors

In [13]:
# Number of rated films per director. A film can list several comma-separated
# directors, so each name is split out, trimmed and counted on its own.
TOP_N_DIRECTORS = 20

director_names = (
    df["director"]
    .str.split(",")
    .explode()
    .str.strip()
    .dropna()
)

directors = (
    director_names.to_frame("director")
    .groupby("director")
    .size()
    .to_frame("count")
    # Stable sort: directors with equal counts stay in alphabetical order, so
    # the top-N cut-off is deterministic when there are ties.
    .sort_values(by="count", ascending=False, kind="mergesort")
    .head(TOP_N_DIRECTORS)
)

# The index is the director name, so it is written as the first CSV column.
directors.to_csv("../directors.csv")

In [23]:
# One row per (film, director) pair: pick three columns by position, split the
# comma-separated director string into a list, give each name its own row and
# trim the surrounding whitespace.
# NOTE(review): positions 0, 2 and 5 must include "director" and "title" (both
# are used by name later); the third is presumably "rating" - confirm against
# the CSV column order.
dir_bar = (
    df.iloc[:, [0, 2, 5]]
    .assign(director=lambda frame: frame["director"].str.split(","))
    .explode("director")
    .assign(director=lambda frame: frame["director"].str.strip())
)

In [26]:
# Per-director, per-title aggregate of the remaining value column of `dir_bar`,
# in long form. margins=True adds an "All" entry on both axes: the "All" title
# rows (one per director) are kept, the "All" director rows are removed.
# The mean is requested by name ("mean") rather than as the NumPy callable,
# which recent pandas versions deprecate; the resulting column label is the same.
pivot = (
    pd.pivot_table(
        dir_bar,
        index=["director"],
        columns=["title"],
        margins=True,
        aggfunc=["mean", len],
    )
    # Move the title level from the columns into the row index.
    .stack("title")
    .reset_index()
    .loc[lambda table: table["director"] != "All"]
    # Drop the value-column level so the headers are just director/title/len/mean.
    .droplevel(1, axis=1)
)
pivot

Unnamed: 0,director,title,len,mean
0,Abbas Kiarostami,All,3.0,8.000000
1,Abbas Kiarostami,Certified Copy,1.0,8.000000
2,Abbas Kiarostami,Close-Up,1.0,9.000000
3,Abbas Kiarostami,Taste of Cherry,1.0,7.000000
4,Abel Ferrara,All,6.0,6.833333
...,...,...,...,...
1565,Ágnes Hranitzky,Werckmeister Harmonies,1.0,7.000000
1566,Éric Rohmer,All,3.0,8.000000
1567,Éric Rohmer,My Night at Maud's,1.0,8.000000
1568,Éric Rohmer,Pauline at the Beach,1.0,8.000000


In [32]:
# Write both director tables without their row index.
for table, csv_path in ((pivot, "../test.csv"), (dir_bar, "../dir_bar.csv")):
    table.to_csv(csv_path, index=False)

# Decade

In [19]:
# Films per decade and their mean rating, using named aggregation so the output
# columns get their final names directly.
decade = df.groupby("decade").agg(
    count=("title", "size"),
    avg_rating=("rating", "mean"),
)
decade

Unnamed: 0_level_0,count,avg_rating
decade,Unnamed: 1_level_1,Unnamed: 2_level_1
1910s,4,7.25
1920s,23,7.565217
1930s,22,7.272727
1940s,41,7.512195
1950s,78,7.769231
1960s,118,7.618644
1970s,148,7.594595
1980s,118,7.415254
1990s,114,7.517544
2000s,127,7.275591


In [20]:
# The decade label is the index, so it is written as the first CSV column
# (index=True is the pandas default, made explicit here).
decade.to_csv("../decade_breakdown.csv", index=True)

# Release Year

In [36]:
# Films per release year, with every year between the earliest and latest
# release present (years without films get 0).
# NOTE(review): columns are picked by position; "year" must be one of them
# because it is referenced by name below - confirm the other is the title.
release_year = df.iloc[:, [0, 1]]

min_year, max_year = release_year["year"].min(), release_year["year"].max()
year_range = np.arange(min_year, max_year + 1)

release_year_group = (
    release_year.groupby(["year"])
    .count()
    # Reindexing introduces NaN for missing years; they are then filled with 0.
    .reindex(year_range)
    .fillna(0)
)
release_year_group.to_csv("../release_year.csv")

# Watch Date

In [82]:
# Ratings logged per day: group by rating date, count non-null values per
# column and keep only the column at position 1 as a one-column frame.
# NOTE(review): that column is presumably "year" (it is renamed to "count"
# below) - confirm against the column order.
w_date = (
    df.set_index("date_rated")
    .groupby("date_rated")
    .count()
    .iloc[:, [1]]
)

# Daily calendar from a fixed start date to the most recent rating date.
# NOTE(review): any rating dated before the start date is dropped by the
# reindex - confirm that is intended.
first_w_date = pd.Timestamp("2015-04-25")
last_w_date = w_date.index.max()
w_date_range = pd.date_range(start=first_w_date, end=last_w_date)

# Days without ratings become 0.
w_date = w_date.reindex(w_date_range).fillna(0)

# Sum the daily counts into month-start buckets.
monthly = (
    w_date.groupby(pd.Grouper(freq="MS"))
    .sum()
    .rename(columns={"year": "count"})
)
monthly.to_csv("../watch_date.csv", index_label="date")