In [1]:
import pandas as pd

# Load the movie metadata table from movies.csv (path is relative to the
# notebook's working directory).
df = pd.read_csv('movies.csv')

In [2]:
# Preview the first rows to see which columns the raw file provides.
df.head()

Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,status,tagline,vote_average,vote_count,credits,keywords,poster_path,backdrop_path,recommendations
0,526896,Morbius,Action-Science Fiction-Fantasy,en,Dangerously ill with a rare blood disorder and...,12854.042,Columbia Pictures-Marvel Entertainment,2022-03-30,75000000.0,161000000.0,104.0,Released,A new Marvel legend arrives.,6.3,1105.0,Jared Leto-Matt Smith-Adria Arjona-Jared Harri...,vampire-based on comic-sony's spider-man unive...,/6JjfSchsU6daXk2AKX8EEBjO3Fm.jpg,/gG9fTyDL03fiKnOpf2tr01sncnt.jpg,675353-763285-338953-752623-629542-811596-5456...
1,752623,The Lost City,Action-Adventure-Comedy-Romance,en,A reclusive romance novelist was sure nothing ...,11818.362,Paramount-Fortis Films-3dot Productions-Exhibi...,2022-03-24,74000000.0,164289828.0,112.0,Released,The adventure is real. The heroes are not.,6.8,746.0,Sandra Bullock-Channing Tatum-Daniel Radcliffe...,duringcreditsstinger,/neMZH82Stu91d3iqvLdNQfqPPyl.jpg,/A3bsT0m1um6tvcmlIGxBwx9eAxn.jpg,338953-526896-675353-763285-545611-751237-6399...
2,675353,Sonic the Hedgehog 2,Action-Science Fiction-Comedy-Family-Adventure,en,After settling in Green Hills Sonic is eager t...,8162.111,SEGA-Original Film-Blur Studios-Marza Animatio...,2022-03-30,110000000.0,365000000.0,122.0,Released,Welcome to the next level.,7.8,1606.0,Ben Schwartz-Idris Elba-Colleen O'Shaughnessey...,sequel-based on video game-hedgehog-live actio...,/6DrHO1jr3qVrViUO6s6kFiAGM7.jpg,/egoyMDLqCxzjnSrWOz50uLlJWmD.jpg,526896-338953-676705-763285-629542-752623-4067...
3,639933,The Northman,Action-Adventure-Fantasy-Thriller,en,Prince Amleth is on the verge of becoming a ma...,8011.091,New Regency Pictures-Focus Features-Perfect Wo...,2022-04-07,70000000.0,63542000.0,137.0,Released,Conquer your fate.,7.5,1080.0,Alexander Skarsgård-Nicole Kidman-Claes Bang-E...,sword-father murder-prince-iceland-viking-nort...,/zhLKlUaF1SEpO58ppHIAyENkwgw.jpg,/cqnVuxXe6vA7wfNWubak3x36DKJ.jpg,338953-545611-648579-752623-632617-675353-7399...
4,335787,Uncharted,Action-Adventure,en,A young street-smart Nathan Drake and his wise...,6209.995,Columbia Pictures-Atlas Entertainment-PlayStat...,2022-02-10,120000000.0,400780000.0,116.0,Released,Fortune favors the bold.,7.2,2088.0,Tom Holland-Mark Wahlberg-Sophia Ali-Tati Gabr...,treasure-treasure hunt-based on video game-dlb,/tlZpSxYuBRoVJBOpUrPdQe9FmFq.jpg,/aEGiJJP91HsKVTEPy1HhmN0wRLm.jpg,414906-505026-406759-740460-763285-476669-7365...


In [3]:
# Columns carried forward for the text features; every other column is dropped.
features = [
    'id',
    'title',
    'genres',
    'overview',
    'credits',
    'keywords',
    'popularity',
]

# Explicit label-based column selection (all rows, only the listed columns).
df = df.loc[:, features]

In [4]:
# Confirm that only the selected feature columns remain.
df.head()

Unnamed: 0,id,title,genres,overview,credits,keywords,popularity
0,526896,Morbius,Action-Science Fiction-Fantasy,Dangerously ill with a rare blood disorder and...,Jared Leto-Matt Smith-Adria Arjona-Jared Harri...,vampire-based on comic-sony's spider-man unive...,12854.042
1,752623,The Lost City,Action-Adventure-Comedy-Romance,A reclusive romance novelist was sure nothing ...,Sandra Bullock-Channing Tatum-Daniel Radcliffe...,duringcreditsstinger,11818.362
2,675353,Sonic the Hedgehog 2,Action-Science Fiction-Comedy-Family-Adventure,After settling in Green Hills Sonic is eager t...,Ben Schwartz-Idris Elba-Colleen O'Shaughnessey...,sequel-based on video game-hedgehog-live actio...,8162.111
3,639933,The Northman,Action-Adventure-Fantasy-Thriller,Prince Amleth is on the verge of becoming a ma...,Alexander Skarsgård-Nicole Kidman-Claes Bang-E...,sword-father murder-prince-iceland-viking-nort...,8011.091
4,335787,Uncharted,Action-Adventure,A young street-smart Nathan Drake and his wise...,Tom Holland-Mark Wahlberg-Sophia Ali-Tati Gabr...,treasure-treasure hunt-based on video game-dlb,6209.995


In [5]:
# Count missing values per column to decide how each one should be handled.
df.isna().sum()

id                 0
title              1
genres         88859
overview       51714
credits        81923
keywords      281337
popularity         1
dtype: int64

In [6]:
# Rows without a title or a popularity score are discarded; the remaining gaps
# (genres, overview, credits, keywords) become empty strings so they can be
# split and concatenated as text later on.
df = (
    df
    .dropna(subset=['title', 'popularity'])
    .fillna('')
)

In [7]:
# Re-check: every column should now report zero missing values.
df.isna().sum()

id            0
title         0
genres        0
overview      0
credits       0
keywords      0
popularity    0
dtype: int64

In [8]:
# Order movies from most to least popular, then key the frame by movie id.
df = df.sort_values('popularity', ascending=False)
df = df.set_index('id')

In [9]:
# Placeholder text column; stem() fills it in for every row.
df['word_concat'] = ''

In [10]:
from nltk.stem.snowball import SnowballStemmer


def stem(d, stemmer=None):
    """Fill ``d['word_concat']`` with the stemmed text of every movie.

    For each row the stemmed words of ``title``, ``overview``, ``genres``,
    ``credits`` and ``keywords`` are joined, in that order, with single
    spaces. The last three columns are hyphen-separated lists, so their
    hyphens are treated as word breaks before stemming.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame with string columns ``title``, ``overview``, ``genres``,
        ``credits`` and ``keywords``. It is modified in place.
    stemmer : object, optional
        Any object exposing ``stem(word) -> str``. Defaults to an English
        ``SnowballStemmer``.

    Returns
    -------
    pandas.DataFrame
        The same object ``d``, with ``word_concat`` set for every row.
    """
    # Build the stemmer once per call instead of once per row.
    if stemmer is None:
        stemmer = SnowballStemmer(language='english')

    def _stem_text(text, hyphenated=False):
        # Stem each whitespace-separated word; optionally break on '-' first.
        if hyphenated:
            text = text.replace('-', ' ')
        return ' '.join(stemmer.stem(word) for word in text.split())

    columns = zip(d['title'], d['overview'], d['genres'], d['credits'], d['keywords'])
    # Assign the whole column positionally. Writing row by row through the
    # index label would give every row sharing an id the text of the last one.
    d['word_concat'] = [
        ' '.join((
            _stem_text(title),
            _stem_text(overview),
            _stem_text(genres, hyphenated=True),
            _stem_text(cast, hyphenated=True),
            _stem_text(keywords, hyphenated=True),
        ))
        for title, overview, genres, cast, keywords in columns
    ]
    return d

In [11]:
import multiprocessing as mp
import numpy as np

# Leave one core free for the notebook itself, but never drop below one worker:
# on a single-core machine cpu_count() - 1 is 0, which both np.array_split and
# mp.Pool reject with a ValueError.
num_processes = max(1, mp.cpu_count() - 1)
# Split row positions rather than the DataFrame itself, then slice with iloc.
# The chunks are the same contiguous row ranges, but this avoids handing a
# DataFrame to numpy (which warns on some recent pandas versions).
chunks = [
    df.iloc[rows]
    for rows in np.array_split(np.arange(len(df)), num_processes)
]

In [12]:
# Stem every chunk in a separate worker process and stitch the results back
# together in their original order. The with-block shuts the worker processes
# down once the map has finished, so they do not linger after this cell.
with mp.Pool(processes=num_processes) as pool:
    df = pd.concat(pool.map(stem, chunks))

In [13]:
# Display the frame (pandas truncates the middle rows) to spot-check word_concat.
df

Unnamed: 0_level_0,title,genres,overview,credits,keywords,popularity,word_concat
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
526896,Morbius,Action-Science Fiction-Fantasy,Dangerously ill with a rare blood disorder and...,Jared Leto-Matt Smith-Adria Arjona-Jared Harri...,vampire-based on comic-sony's spider-man unive...,12854.042,morbius danger ill with a rare blood disord an...
752623,The Lost City,Action-Adventure-Comedy-Romance,A reclusive romance novelist was sure nothing ...,Sandra Bullock-Channing Tatum-Daniel Radcliffe...,duringcreditsstinger,11818.362,the lost citi a reclus romanc novelist was sur...
675353,Sonic the Hedgehog 2,Action-Science Fiction-Comedy-Family-Adventure,After settling in Green Hills Sonic is eager t...,Ben Schwartz-Idris Elba-Colleen O'Shaughnessey...,sequel-based on video game-hedgehog-live actio...,8162.111,sonic the hedgehog 2 after settl in green hill...
639933,The Northman,Action-Adventure-Fantasy-Thriller,Prince Amleth is on the verge of becoming a ma...,Alexander Skarsgård-Nicole Kidman-Claes Bang-E...,sword-father murder-prince-iceland-viking-nort...,8011.091,the northman princ amleth is on the verg of be...
335787,Uncharted,Action-Adventure,A young street-smart Nathan Drake and his wise...,Tom Holland-Mark Wahlberg-Sophia Ali-Tati Gabr...,treasure-treasure hunt-based on video game-dlb,6209.995,unchart a young street-smart nathan drake and ...
...,...,...,...,...,...,...,...
608462,Theatre Piece,,,,,0.600,theatr piec
600268,Racial Integration,Documentary,A comprehensive view of the situation of diffe...,,,0.600,racial integr a comprehens view of the situat ...
592620,If the Dancer Dances,Documentary,Filmmaker Maia Wechsler follows choreographer ...,,dance-modern dance,0.600,if the dancer danc filmmak maia wechsler follo...
599012,Dear My Love,Drama,Too shy to speak out the word of appreciation ...,Masatoshi Nakamura-Mieko Harada-Keiko Toda-Iss...,,0.600,dear my love too shi to speak out the word of ...


In [16]:
# Persist the processed table to movies_new.csv; with to_csv's defaults the
# id index is written as the first column.
df.to_csv('movies_new.csv')