# Convert Movies' Plots to Vectors

In [1]:
import cohere
import time
import pandas as pd

# get free Trial API Key at https://cohere.ai/
from cred import API_key

# Cohere client authenticated with the API key imported from the local `cred` module.
co = cohere.Client(API_key)

In [2]:
def convert_text_2_vect(co, texts, model="embed-english-light-v2.0"):
    """Convert multiple text strings to embedding vectors in one API call.

    Parameters
    ----------
    co : cohere.Client
        Authenticated client, e.g. ``co = cohere.Client(API_key)``.
    texts : list of strings
        texts = [text1, text2, text3, text4, text5, text6...]
    model : str, optional
        Name of the embedding model, by default "embed-english-light-v2.0".
        The model determines the dimension of the output vectors:
        "embed-english-light-v2.0" = 1024 dim
        "embed-english-v2.0" = 4096 dim

    Returns
    -------
    list of vectors
        One vector per input text, in the same order:
        [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], ... ]
        An empty ``texts`` returns ``[]`` without calling the API.
    """
    # Nothing to embed: skip the request instead of spending a rate-limited call.
    if len(texts) == 0:
        return []
    response = co.embed(model=model, texts=texts)
    return response.embeddings

## Load Data

Original Dataset: https://www.kaggle.com/datasets/gabrieltardochi/wikipedia-movie-plots-with-plot-summaries

To make it easier, I changed the original names of columns.

In [4]:
# Path is relative to the notebook's working directory.
movies_filename = "../../input/wiki_movie_plots/wiki_movie_plots_deduped_with_summaries.csv"
df_full = pd.read_csv(movies_filename)
# Display the column names that were read from the file.
df_full.columns

Index(['year', 'title', 'Origin/Ethnicity', 'director', 'cast', 'genre',
       'wiki_link', 'plot', 'plot_summary'],
      dtype='object')

The names of columns can be changed with Pandas after reading the data from csv file.

```python
df.rename(columns={"Title": "title", "Old_name":"new_name"}, inplace=True)
```

In [5]:
# Preview the first rows of the raw dataset.
df_full.head()

Unnamed: 0,year,title,Origin/Ethnicity,director,cast,genre,wiki_link,plot,plot_summary
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...",Carrie Nation and her followers burst into a s...
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...","The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...","The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,The first shot is set in a wood during winter ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,The earliest known adaptation of the classic f...


## Check Data

First we check the quality of the data for empty/missing data.

By looking at the top frequent items, we can see what repeats.

In [8]:
# Per-column non-null counts and dtypes, used to spot missing values.
df_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34886 entries, 0 to 34885
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   year              34886 non-null  int64 
 1   title             34886 non-null  object
 2   Origin/Ethnicity  34886 non-null  object
 3   director          34886 non-null  object
 4   cast              33464 non-null  object
 5   genre             34886 non-null  object
 6   wiki_link         34886 non-null  object
 7   plot              34886 non-null  object
 8   plot_summary      34886 non-null  object
dtypes: int64(1), object(8)
memory usage: 2.4+ MB


In [9]:
# Most frequent value of each descriptive column (reveals placeholder values).
for column_name in ("director", "genre", "cast"):
    print(df_full[column_name].mode())

0    Unknown
Name: director, dtype: object
0    unknown
Name: genre, dtype: object
0    Tom and Jerry
Name: cast, dtype: object


In [12]:
# Peek at the summarized plot of one well-known movie.
is_dark_knight = df_full["title"] == "The Dark Knight"
text = df_full.loc[is_dark_knight, "plot_summary"].iloc[0]
text

"A gang of criminals rob a Gotham City mob bank, murdering each other until only the mastermind remains: the Joker. Batman, District Attorney Harvey Dent and Lieutenant Jim Gordon form an alliance to rid Gotham of organized crime. Bruce Wayne believes that, with Dent as Gotham's protector, he can retire from being Batman and lead a normal life with Rachel Dawes."

The data is somewhat dated: the latest movie in it was released in 2017.

In [13]:
# Most recent release year present in the dataset.
df_full["year"].max()

2017

## Create a new DataFrame

In [14]:
# Work on an independent copy holding only the columns kept downstream.
selected_columns = [
    "year",
    "title",
    "director",
    "cast",
    "genre",
    "plot",
    "plot_summary",
    "wiki_link",
]
df = df_full[selected_columns].copy()

## Create two new columns to store the vectors

`plot_vector_1024`: the full plot embedded as a 1024-dimensional vector (a list of 1024 floats).

`plot_summary_vector_1024`: the summarized plot embedded as a 1024-dimensional vector (a list of 1024 floats).

In [15]:
# Add both embedding columns, empty and with object dtype, so that each cell
# can later hold a whole list of floats.
for vector_column in ("plot_vector_1024", "plot_summary_vector_1024"):
    df[vector_column] = None
    df[vector_column] = df[vector_column].astype(object)

Now let's convert the plot texts to vectors using the API provided by Cohere.

To make the plots more meaningful, I added title, director, genre before the plot. The format is:
```
The title of Film/Movie is 'Great Gatsby' in 2017 by director Antoine, genre is comedy, romantic. Long before ... < plot >
```

**Note**: The free API is rate-limited. For embed requests, the limit is 100 calls per minute. Therefore, there is a ```time.sleep(1)``` in the loop to slow down the process.

Because the API supports multiple texts to convert at once, I packed them into a batch.


In [76]:
BATCH_SIZE = 100  # number of texts sent per embed request
EMBED_MODEL = "embed-english-light-v2.0"  # model used for both 1024-dim columns


def build_movie_header(row):
    """Return the title/year/director/genre sentence that is prepended to a plot.

    Placeholder values ("Unknown" director, "unknown" genre) are left out so
    they do not add noise to the embedded text.
    """
    header = f"The title of Film/Movie is '{row['title']}' in {row['year']}"
    if row["director"] != "Unknown":
        header += f" by director {row['director']}"
    if row["genre"] != "unknown":
        header += f", genre is {row['genre']}"
    return header + ". "


n = len(df)
for i in range(0, n, BATCH_SIZE):
    print(i, n)  # progress: position of the batch's first row / total rows
    batch = df.iloc[i:i + BATCH_SIZE]

    plot = []
    plot_summary = []
    for _, row in batch.iterrows():
        header = build_movie_header(row)
        plot.append(header + row["plot"])
        plot_summary.append(header + row["plot_summary"])

    # The client must be passed explicitly: convert_text_2_vect(co, texts, model).
    plot_1024 = convert_text_2_vect(co, plot, EMBED_MODEL)
    plot_summary_1024 = convert_text_2_vect(co, plot_summary, EMBED_MODEL)
    if len(plot_1024) != len(batch) or len(plot_summary_1024) != len(batch):
        raise ValueError(
            f"Expected {len(batch)} vectors for the batch starting at row {i}, "
            f"got {len(plot_1024)} and {len(plot_summary_1024)}"
        )

    # Write back by index label so the loop does not rely on a default RangeIndex.
    for label, plot_vec, summary_vec in zip(batch.index, plot_1024, plot_summary_1024):
        df.at[label, "plot_vector_1024"] = plot_vec
        df.at[label, "plot_summary_vector_1024"] = summary_vec

    # Each iteration issues two embed calls; pause so the Cohere API is not
    # called too fast for the trial key's per-minute limit.
    time.sleep(1)

0 34886
100 34886
200 34886
300 34886
400 34886
500 34886
600 34886
700 34886
800 34886
900 34886
1000 34886
1100 34886
1200 34886
1300 34886
1400 34886
1500 34886
1600 34886
1700 34886
1800 34886
1900 34886
2000 34886
2100 34886
2200 34886
2300 34886
2400 34886
2500 34886
2600 34886
2700 34886
2800 34886
2900 34886
3000 34886
3100 34886
3200 34886
3300 34886
3400 34886
3500 34886
3600 34886
3700 34886
3800 34886
3900 34886
4000 34886
4100 34886
4200 34886
4300 34886
4400 34886
4500 34886
4600 34886
4700 34886
4800 34886
4900 34886
5000 34886
5100 34886
5200 34886
5300 34886
5400 34886
5500 34886
5600 34886
5700 34886
5800 34886
5900 34886
6000 34886
6100 34886
6200 34886
6300 34886
6400 34886
6500 34886
6600 34886
6700 34886
6800 34886
6900 34886
7000 34886
7100 34886
7200 34886
7300 34886
7400 34886
7500 34886
7600 34886
7700 34886
7800 34886
7900 34886
8000 34886
8100 34886
8200 34886
8300 34886
8400 34886
8500 34886
8600 34886
8700 34886
8800 34886
8900 34886
9000 34886
9100 34886


## Save processed results to CSV file

In [78]:
new_csv_file = "movies_35K_embedded_vector.csv"
# index=False keeps the DataFrame index out of the file.
df.to_csv(new_csv_file, index=False, encoding='utf-8')

In [28]:
# Read back the file written by the save cell as a round-trip check.
# NOTE: CSV stores each vector as its text representation, so the two vector
# columns come back as strings and must be parsed before any numeric use.
test = pd.read_csv("movies_35K_embedded_vector.csv")

# Save to Cassandra

To do Vector Search, I use the Cassandra service offered by DataStax. It provides 5 GB of free storage on Google Cloud Platform.

To authorize, we need to download the secure connect bundle first. The guide is here https://docs.datastax.com/en/astra-serverless/docs/connect/secure-connect-bundle.html

After that, create Application tokens: https://docs.datastax.com/en/astra-serverless/docs/manage/org/manage-tokens.html

Save this information to `cred.py` following the template in `cred-template.py`.


In [None]:
from cassandra import ConsistencyLevel
from cassandra.auth import PlainTextAuthProvider
from cassandra.cluster import Cluster

from cred import (ASTRA_CLIENT_ID, ASTRA_CLIENT_SECRET,
                  SECURE_CONNECT_BUNDLE_PATH)

In [80]:
KEYSPACE_NAME = "demo"
TABLE_NAME = "movies_35K_vectorized"

# Connect through the downloaded secure connect bundle.
cloud_config = {"secure_connect_bundle": SECURE_CONNECT_BUNDLE_PATH}
# The application token's client id and secret are passed as the auth credentials.
auth_provider = PlainTextAuthProvider(ASTRA_CLIENT_ID, ASTRA_CLIENT_SECRET)
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider, protocol_version=4)
session = cluster.connect()
# Select the keyspace once so later queries can use the bare table name.
session.execute(f"USE {KEYSPACE_NAME};")

## Prepare Table and Indexes

The vector columns must be indexed so that ANN (Approximate Nearest Neighbor) search can be computed quickly.

The input is a point in space whose dimension equals the dimension of the stored vectors. Cassandra calculates the distances from that point to the vectors in the database and returns the nearest neighbors using ANN search.

In [91]:
# Table columns mirror the DataFrame columns, in the same order:
# "year", 'title', 'director', 'cast', "genre", 'plot', "plot_summary", 'wiki_link', plot_vector_1024, plot_summary_vector_1024
# NOTE(review): the primary key is (year, title), so two movies sharing both
# values would overwrite each other on insert — confirm this is acceptable.
table_create_query = f"""
CREATE TABLE IF NOT EXISTS {TABLE_NAME} (
year int,
title text,
director text,
cast text,
genre text,
plot text,
plot_summary text,
wiki_link text,
plot_vector_1024 VECTOR<FLOAT, 1024>, 
plot_summary_vector_1024 VECTOR<FLOAT, 1024>, 
PRIMARY KEY (year, title)
);
"""
session.execute(table_create_query)

# One storage-attached index per vector column (full plot).
create_index_query = f"""
CREATE CUSTOM INDEX IF NOT EXISTS ann_plot_vector_1024 ON 
{TABLE_NAME}(plot_vector_1024) USING 'StorageAttachedIndex';
"""
session.execute(create_index_query)

# Same index definition for the summarized-plot vectors.
create_index_query = f"""
CREATE CUSTOM INDEX IF NOT EXISTS ann_plot_summary_vector_1024 ON 
{TABLE_NAME}(plot_summary_vector_1024) USING 'StorageAttachedIndex';
"""
session.execute(create_index_query)

<cassandra.cluster.ResultSet at 0x24b5f80a100>

## Insert to database

In [105]:
# Columns written to Cassandra; selecting them explicitly keeps the bound
# values aligned with the placeholders even if `df` gains extra columns.
INSERT_COLUMNS = [
    "year", "title", "director", "cast", "genre",
    "plot", "plot_summary", "wiki_link",
    "plot_vector_1024", "plot_summary_vector_1024",
]

# Build the statement text once instead of re-formatting it for every row.
insert_query = (
    f"INSERT INTO {TABLE_NAME} ({', '.join(INSERT_COLUMNS)}) "
    f"VALUES ({', '.join(['%s'] * len(INSERT_COLUMNS))})"
)


def _nan_to_none(value):
    """Map a float NaN (pandas' missing-value marker) to None so it is stored as NULL."""
    # NaN is the only float that is not equal to itself.
    return None if isinstance(value, float) and value != value else value


# Stream the rows rather than materialising every tuple (and its vectors) first.
# Missing `cast` values are NaN and would otherwise be sent as a float literal
# for a text column.
for row in df[INSERT_COLUMNS].itertuples(index=False, name=None):
    session.execute(insert_query, tuple(_nan_to_none(value) for value in row))

The insertion can be sped up by using batch processing or by uploading the CSV file through CQL. A demonstration is available at https://github.com/linhhlp/Big-Data-and-Machine-Learning

In [15]:
# Close the connection to Cassandra once all work with the session is finished.
cluster.shutdown()