# Cleaning IMDb dataset for NETS 1500 project
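The following code cleans the relatively small dataset and unwinds each movie's comma-separated actor list into atomic records (one row per movie–actor pair). It also drops movies whose gross is unknown and converts the gross to a numeric value in millions of dollars.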

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive so files stored there are reachable from this runtime.
from google.colab import drive

prefix = '/content/drive'
drive.mount(prefix, force_remount=True)

Mounted at /content/drive


In [None]:
# Build the dataset path from the mount point (`prefix`) so the two cannot drift apart.
imdb_path = f'{prefix}/My Drive/imdb_dataset.csv'

In [None]:
# Load the raw IMDb dataset from the CSV at `imdb_path` into a DataFrame.
imdb_df = pd.read_csv(imdb_path)

In [None]:
# Preview the first rows to confirm the load and see the column layout.
imdb_df.head()

Unnamed: 0,Movie_Title,Year,Director,Actors,Rating,Runtime(Mins),Censor,Total_Gross,main_genre,side_genre
0,Kantara,2022,Rishab Shetty,"Rishab Shetty, Sapthami Gowda, Kishore Kumar G...",9.3,148,UA,Gross Unkown,Action,"Adventure, Drama"
1,The Dark Knight,2008,Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",9.0,152,UA,$534.86M,Action,"Crime, Drama"
2,The Lord of the Rings: The Return of the King,2003,Peter Jackson,"Elijah Wood, Viggo Mortensen, Ian McKellen, Or...",9.0,201,U,$377.85M,Action,"Adventure, Drama"
3,Inception,2010,Christopher Nolan,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ellio...",8.8,148,UA,$292.58M,Action,"Adventure, Sci-Fi"
4,The Lord of the Rings: The Two Towers,2002,Peter Jackson,"Elijah Wood, Ian McKellen, Viggo Mortensen, Or...",8.8,179,UA,$342.55M,Action,"Adventure, Drama"


In [None]:
# Turn each comma-separated actor string into a Python list of names
imdb_df = imdb_df.assign(Actors=imdb_df['Actors'].str.split(", "))
imdb_df

Unnamed: 0,Movie_Title,Year,Director,Actors,Rating,Runtime(Mins),Censor,Total_Gross,main_genre,side_genre
0,Kantara,2022,Rishab Shetty,"[Rishab Shetty, Sapthami Gowda, Kishore Kumar ...",9.3,148,UA,Gross Unkown,Action,"Adventure, Drama"
1,The Dark Knight,2008,Christopher Nolan,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",9.0,152,UA,$534.86M,Action,"Crime, Drama"
2,The Lord of the Rings: The Return of the King,2003,Peter Jackson,"[Elijah Wood, Viggo Mortensen, Ian McKellen, O...",9.0,201,U,$377.85M,Action,"Adventure, Drama"
3,Inception,2010,Christopher Nolan,"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elli...",8.8,148,UA,$292.58M,Action,"Adventure, Sci-Fi"
4,The Lord of the Rings: The Two Towers,2002,Peter Jackson,"[Elijah Wood, Ian McKellen, Viggo Mortensen, O...",8.8,179,UA,$342.55M,Action,"Adventure, Drama"
...,...,...,...,...,...,...,...,...,...,...
5557,Disaster Movie,2008,"Directors:Jason Friedberg, Aaron Seltzer","[Carmen Electra, Vanessa Lachey, Nicole Parker...",1.9,87,PG-13,$14.19M,Comedy,Sci-Fi
5558,The Hottie & the Nottie,2008,Tom Putnam,"[Paris Hilton, Joel David Moore, Christine Lak...",1.9,91,PG-13,$0.03M,Comedy,Romance
5559,From Justin to Kelly,2003,Robert Iscove,"[Kelly Clarkson, Justin Guarini, Katherine Bai...",1.9,81,PG,$4.92M,Comedy,"Musical, Romance"
5560,Superbabies: Baby Geniuses 2,2004,Bob Clark,"[Jon Voight, Scott Baio, Vanessa Angel, Skyler...",1.5,88,PG,$9.11M,Comedy,"Family, Sci-Fi"


In [None]:
# One row per (movie, actor): unpack the actor lists and renumber rows from 0
imdb_df = imdb_df.explode('Actors', ignore_index=True)
imdb_df

Unnamed: 0,Movie_Title,Year,Director,Actors,Rating,Runtime(Mins),Censor,Total_Gross,main_genre,side_genre
0,Kantara,2022,Rishab Shetty,Rishab Shetty,9.3,148,UA,Gross Unkown,Action,"Adventure, Drama"
1,Kantara,2022,Rishab Shetty,Sapthami Gowda,9.3,148,UA,Gross Unkown,Action,"Adventure, Drama"
2,Kantara,2022,Rishab Shetty,Kishore Kumar G.,9.3,148,UA,Gross Unkown,Action,"Adventure, Drama"
3,Kantara,2022,Rishab Shetty,Achyuth Kumar,9.3,148,UA,Gross Unkown,Action,"Adventure, Drama"
4,The Dark Knight,2008,Christopher Nolan,Christian Bale,9.0,152,UA,$534.86M,Action,"Crime, Drama"
...,...,...,...,...,...,...,...,...,...,...
22231,Superbabies: Baby Geniuses 2,2004,Bob Clark,Skyler Shaye,1.5,88,PG,$9.11M,Comedy,"Family, Sci-Fi"
22232,Cumali Ceber: Allah Seni Alsin,2017,Gökhan Gök,Halil Söyletmez,1.0,100,Not Rated,Gross Unkown,Comedy,Comedy
22233,Cumali Ceber: Allah Seni Alsin,2017,Gökhan Gök,Doga Konakoglu,1.0,100,Not Rated,Gross Unkown,Comedy,Comedy
22234,Cumali Ceber: Allah Seni Alsin,2017,Gökhan Gök,Emre Keskin,1.0,100,Not Rated,Gross Unkown,Comedy,Comedy


In [None]:
# How many rows carry the "Not Rated" censor label? (curiosity check)
not_rated_mask = imdb_df['Censor'].eq('Not Rated')
imdb_df.loc[not_rated_mask].count()

Movie_Title      1980
Year             1980
Director         1980
Actors           1980
Rating           1980
Runtime(Mins)    1980
Censor           1980
Total_Gross      1980
main_genre       1980
side_genre       1980
dtype: int64

In [None]:
# Rows whose gross is missing (curiosity check); note the dataset itself
# spells the marker 'Gross Unkown'
gross_missing_mask = imdb_df['Total_Gross'].eq('Gross Unkown')
imdb_df.loc[gross_missing_mask]

Unnamed: 0,Movie_Title,Year,Director,Actors,Rating,Runtime(Mins),Censor,Total_Gross,main_genre,side_genre
0,Kantara,2022,Rishab Shetty,Rishab Shetty,9.3,148,UA,Gross Unkown,Action,"Adventure, Drama"
1,Kantara,2022,Rishab Shetty,Sapthami Gowda,9.3,148,UA,Gross Unkown,Action,"Adventure, Drama"
2,Kantara,2022,Rishab Shetty,Kishore Kumar G.,9.3,148,UA,Gross Unkown,Action,"Adventure, Drama"
3,Kantara,2022,Rishab Shetty,Achyuth Kumar,9.3,148,UA,Gross Unkown,Action,"Adventure, Drama"
40,Seppuku,1962,Masaki Kobayashi,Tatsuya Nakadai,8.6,133,Not Rated,Gross Unkown,Action,"Drama, Mystery"
...,...,...,...,...,...,...,...,...,...,...
22215,Enes Batur Hayal mi Gerçek mi?,2018,Kamil Cetin,Fatih Yasin,2.0,110,Not Rated,Gross Unkown,Comedy,Comedy
22232,Cumali Ceber: Allah Seni Alsin,2017,Gökhan Gök,Halil Söyletmez,1.0,100,Not Rated,Gross Unkown,Comedy,Comedy
22233,Cumali Ceber: Allah Seni Alsin,2017,Gökhan Gök,Doga Konakoglu,1.0,100,Not Rated,Gross Unkown,Comedy,Comedy
22234,Cumali Ceber: Allah Seni Alsin,2017,Gökhan Gök,Emre Keskin,1.0,100,Not Rated,Gross Unkown,Comedy,Comedy


In [None]:
# Keep only rows with a known gross ('Gross Unkown' is the dataset's own
# spelling of the missing marker). `.copy()` makes the result an independent
# frame, so later column assignments do not raise SettingWithCopyWarning.
imdb_df = imdb_df[imdb_df['Total_Gross'] != 'Gross Unkown'].copy()
imdb_df

Unnamed: 0,Movie_Title,Year,Director,Actors,Rating,Runtime(Mins),Censor,Total_Gross,main_genre,side_genre
4,The Dark Knight,2008,Christopher Nolan,Christian Bale,9.0,152,UA,$534.86M,Action,"Crime, Drama"
5,The Dark Knight,2008,Christopher Nolan,Heath Ledger,9.0,152,UA,$534.86M,Action,"Crime, Drama"
6,The Dark Knight,2008,Christopher Nolan,Aaron Eckhart,9.0,152,UA,$534.86M,Action,"Crime, Drama"
7,The Dark Knight,2008,Christopher Nolan,Michael Caine,9.0,152,UA,$534.86M,Action,"Crime, Drama"
8,The Lord of the Rings: The Return of the King,2003,Peter Jackson,Elijah Wood,9.0,201,U,$377.85M,Action,"Adventure, Drama"
...,...,...,...,...,...,...,...,...,...,...
22227,From Justin to Kelly,2003,Robert Iscove,Anika Noni Rose,1.9,81,PG,$4.92M,Comedy,"Musical, Romance"
22228,Superbabies: Baby Geniuses 2,2004,Bob Clark,Jon Voight,1.5,88,PG,$9.11M,Comedy,"Family, Sci-Fi"
22229,Superbabies: Baby Geniuses 2,2004,Bob Clark,Scott Baio,1.5,88,PG,$9.11M,Comedy,"Family, Sci-Fi"
22230,Superbabies: Baby Geniuses 2,2004,Bob Clark,Vanessa Angel,1.5,88,PG,$9.11M,Comedy,"Family, Sci-Fi"


In [None]:
# Check that every remaining gross value is expressed in millions ('M').
# `.all()` evaluates the boolean mask itself; `.count()` only counts non-null
# entries and reports the same number even when some values lack an 'M'.
# `na=False` treats any missing value as "does not contain 'M'".
imdb_df['Total_Gross'].str.contains('M', na=False).all()

18794

In [None]:
# Trial run: strip the '$' sign and the 'M' suffix, leaving the numeric text.
# The raw string keeps `\$` from being parsed as an invalid string escape;
# the regex pattern itself is unchanged.
total_gross = imdb_df['Total_Gross'].str.replace(r'\$|M', '', regex=True)

4        534.86
5        534.86
6        534.86
7        534.86
8        377.85
          ...  
22227      4.92
22228      9.11
22229      9.11
22230      9.11
22231      9.11
Name: Total_Gross, Length: 18794, dtype: object

In [None]:
# Count the non-null entries left after stripping '$' and 'M'.
total_gross.count()

18794

In [None]:
# Strip the '$' sign and the 'M' suffix from Total_Gross.
# `assign` returns a new frame instead of writing into a filtered slice, which
# avoids SettingWithCopyWarning; the raw string avoids the invalid `\$` escape.
imdb_df = imdb_df.assign(
    Total_Gross=imdb_df['Total_Gross'].str.replace(r'\$|M', '', regex=True)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_df['Total_Gross'] = imdb_df['Total_Gross'].str.replace('\$|M', '', regex=True)


In [None]:
# Convert the cleaned gross text to floats. `DataFrame.astype` with a column
# mapping returns a new frame, so nothing is written into a filtered slice and
# no SettingWithCopyWarning is raised.
imdb_df = imdb_df.astype({'Total_Gross': float})
imdb_df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_df['Total_Gross'] = imdb_df['Total_Gross'].astype(float)


Movie_Title       object
Year               int64
Director          object
Actors            object
Rating           float64
Runtime(Mins)      int64
Censor            object
Total_Gross      float64
main_genre        object
side_genre        object
dtype: object

In [None]:
# Display the frame to inspect Total_Gross after the numeric conversion.
imdb_df

Unnamed: 0,Movie_Title,Year,Director,Actors,Rating,Runtime(Mins),Censor,Total_Gross,main_genre,side_genre
4,The Dark Knight,2008,Christopher Nolan,Christian Bale,9.0,152,UA,534.86,Action,"Crime, Drama"
5,The Dark Knight,2008,Christopher Nolan,Heath Ledger,9.0,152,UA,534.86,Action,"Crime, Drama"
6,The Dark Knight,2008,Christopher Nolan,Aaron Eckhart,9.0,152,UA,534.86,Action,"Crime, Drama"
7,The Dark Knight,2008,Christopher Nolan,Michael Caine,9.0,152,UA,534.86,Action,"Crime, Drama"
8,The Lord of the Rings: The Return of the King,2003,Peter Jackson,Elijah Wood,9.0,201,U,377.85,Action,"Adventure, Drama"
...,...,...,...,...,...,...,...,...,...,...
22227,From Justin to Kelly,2003,Robert Iscove,Anika Noni Rose,1.9,81,PG,4.92,Comedy,"Musical, Romance"
22228,Superbabies: Baby Geniuses 2,2004,Bob Clark,Jon Voight,1.5,88,PG,9.11,Comedy,"Family, Sci-Fi"
22229,Superbabies: Baby Geniuses 2,2004,Bob Clark,Scott Baio,1.5,88,PG,9.11,Comedy,"Family, Sci-Fi"
22230,Superbabies: Baby Geniuses 2,2004,Bob Clark,Vanessa Angel,1.5,88,PG,9.11,Comedy,"Family, Sci-Fi"


In [None]:
# List the distinct values found in the Censor column.
imdb_df['Censor'].unique()

array(['UA', 'U', 'A', 'R', 'Not Rated', 'PG', 'PG-13', 'U/A', '7', '16',
       '18', '(Banned)', '13', '12+', 'UA 16+', '15+', 'UA 13+',
       'Unrated', 'G', 'M/PG', 'All', 'UA 7+', 'NC-17'], dtype=object)

In [None]:
# Renumber rows from 0; filtering out unknown-gross rows left gaps in the index.
imdb_df = imdb_df.reset_index(drop=True)

In [None]:
# Write the cleaned, one-row-per-actor table to a CSV in the working directory.
# NOTE(review): with the default index=True the row index is also written as an
# unnamed first column — confirm downstream readers expect that column.
imdb_df.to_csv('imdb_expanded.csv')