Import libraries

In [106]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

Dataset exploration:

In [113]:
# Load the IMDB movies CSV (relative path, resolved against the working directory)
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=r'imdb_movies.csv')
df.head(n=5)

Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,03/02/2023,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,English,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,12/15/2022,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Avatar: The Way of Water,Released,English,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,04/05/2023,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",The Super Mario Bros. Movie,Released,English,100000000.0,724459000.0,AU
3,Mummies,01/05/2023,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",Momias,Released,"Spanish, Castilian",12300000.0,34200000.0,AU
4,Supercell,03/17/2023,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Supercell,Released,English,77000000.0,340942000.0,US


There's no ID value, so I'll add the index as a column.

In [109]:
# Expose the row index as an explicit 'ID' column so every movie carries an identifier.
df = df.assign(ID=df.index)

I also noticed that the movie "Moses, Prince of Egypt" doesn't have a genre, so I will add it manually.

In [110]:
# Fill in the missing genre for "Moses, Prince of Egypt".
# The row is selected by title rather than by a hard-coded ID (2020), so the fix does
# not depend on the row order of the CSV or on the 'ID' column having been created.
# The isna() guard leaves a genre that is already present untouched.
is_moses = df['names'] == "Moses, Prince of Egypt"
df.loc[is_moses & df['genre'].isna(), 'genre'] = "Animation"

# Last expression of the cell, so the patched row is actually displayed.
df.loc[is_moses, ['names', 'genre']]

Now I'll drop every movie without a genre.

In [126]:
# Keep only the movies whose genre is present; rows with a missing genre are discarded.
df = df[df['genre'].notna()]

Duplicates

In [127]:
# Drop rows that are exact copies of an earlier row.
# The 'ID' column is a copy of the index and therefore unique per row; including it in
# the comparison would make every row look distinct and nothing would be dropped.
# Compare on the remaining columns instead (this also works when 'ID' does not exist).
content_columns = [col for col in df.columns if col != 'ID']
df = df.drop_duplicates(subset=content_columns)

Movies that share the same overview must be the same movie, so I'll keep only the first occurrence of each:

In [128]:
# Rows with an identical overview are treated as the same movie: keep the first
# occurrence and discard the later ones.
# NOTE(review): pandas appears to count missing overviews as equal to each other, so
# rows with a NaN overview would collapse into one — confirm that is intended.
df = df.loc[~df.duplicated(subset=['overview'], keep='first')]

In [125]:
# Show every row whose title occurs more than once, ordered by title, for inspection.
df[df['names'].duplicated(keep=False)].sort_values(by='names')

Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
2072,12 Angry Men,10/10/1957,85.0,Drama,The defense and the prosecution have rested an...,"Martin Balsam, Juror 1, John Fiedler, Juror 2,...",12 Angry Men,Released,English,340000.0,379.0,AU
9741,12 Angry Men,08/17/1997,77.0,"Crime, Drama, TV Movie",During the trial of a man accused of his fathe...,"Courtney B. Vance, Foreman, Ossie Davis, Juror...",12 Angry Men,Released,English,340000.0,379.0,US
3028,A Christmas Carol,11/05/2009,69.0,"Animation, Family, Drama, Fantasy",Miser Ebenezer Scrooge is awakened on Christma...,"Jim Carrey, Scrooge / Ghosts of Christmas (voi...",A Christmas Carol,Released,English,200000000.0,325286646.0,AU
7213,A Christmas Carol,10/09/1984,74.0,"Drama, Fantasy, TV Movie, Family",A bitter old miser who makes excuses for his u...,"George C. Scott, Ebenezer Scrooge, Roger Rees,...",A Christmas Carol,Released,English,104200000.0,379238515.8,GB
2373,A Nightmare on Elm Street,08/01/1985,73.0,Horror,Teenagers in a small town are dropping like fl...,"John Saxon, Lieutenant Donald Thompson, Ronee ...",A Nightmare on Elm Street,Released,English,35000000.0,117729618.0,AU
...,...,...,...,...,...,...,...,...,...,...,...,...
924,Wrong Turn,07/31/2003,63.0,"Horror, Thriller",Chris crashes into a carload of other young pe...,"Eliza Dushku, Jessie Burlingame, Emmanuelle Ch...",Wrong Turn,Released,English,10000000.0,28649556.0,AU
8858,Wuthering Heights,10/16/1992,67.0,"Drama, Romance",Young orphan Heathcliff is adopted by the weal...,"Juliette Binoche, Catherine Linton, Ralph Fien...",Wuthering Heights,Released,English,8000000.0,2721534.0,US
9749,Wuthering Heights,10/11/2012,59.0,"Drama, Romance","Yorkshire moorlands, northern England, in the ...","Kaya Scodelario, Older Cathy, James Howson, Ol...",Wuthering Heights,Released,English,8000000.0,2721534.0,AU
4326,X,11/23/2011,65.0,"Action, Thriller, Romance",A veteran call girl and a runaway prostitute w...,"Viva Bianca, Holly Rowe, Hanna Mangan Lawrence...",X,Released,English,1000000.0,15113105.0,AU


There are still rows that share the same name, but they can correspond to different movies (for example, the 1957 and 1997 versions of "12 Angry Men").

Now I can change the dates to datetime format:

In [135]:
# Convert the release date strings (MM/DD/YYYY, possibly padded with whitespace) to
# datetimes; values that do not match the format become NaT because of errors='coerce'.
df['date_x'] = (
    df['date_x']
    .str.strip()
    .pipe(pd.to_datetime, format='%m/%d/%Y', errors='coerce')
)

In [136]:
# Inspect the frame after the date conversion; pandas abbreviates the display of a long frame.
df

Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,2023-03-02,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,English,75000000.0,2.716167e+08,AU
1,Avatar: The Way of Water,2022-12-15,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Avatar: The Way of Water,Released,English,460000000.0,2.316795e+09,AU
2,The Super Mario Bros. Movie,2023-04-05,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",The Super Mario Bros. Movie,Released,English,100000000.0,7.244590e+08,AU
3,Mummies,2023-01-05,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",Momias,Released,"Spanish, Castilian",12300000.0,3.420000e+07,AU
4,Supercell,2023-03-17,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Supercell,Released,English,77000000.0,3.409420e+08,US
...,...,...,...,...,...,...,...,...,...,...,...,...
10173,20th Century Women,2016-12-28,73.0,Drama,"In 1979 Santa Barbara, California, Dorothea Fi...","Annette Bening, Dorothea Fields, Lucas Jade Zu...",20th Century Women,Released,English,7000000.0,9.353729e+06,US
10174,Delta Force 2: The Colombian Connection,1990-08-24,54.0,Action,When DEA agents are taken captive by a ruthles...,"Chuck Norris, Col. Scott McCoy, Billy Drago, R...",Delta Force 2: The Colombian Connection,Released,English,9145817.8,6.698361e+06,US
10175,The Russia House,1990-12-21,61.0,"Drama, Thriller, Romance","Barley Scott Blair, a Lisbon-based editor of R...","Sean Connery, Bartholomew 'Barley' Scott Blair...",The Russia House,Released,English,21800000.0,2.299799e+07,US
10176,Darkman II: The Return of Durant,1995-07-11,55.0,"Action, Adventure, Science Fiction, Thriller, ...",Darkman and Durant return and they hate each o...,"Larry Drake, Robert G. Durant, Arnold Vosloo, ...",Darkman II: The Return of Durant,Released,English,116000000.0,4.756613e+08,US


In [138]:
# Parse the sample Goodreads book XML into a DataFrame and display it.
# The file is referenced by a relative path (like the movies CSV) instead of an absolute,
# user-specific Windows path, so the cell also runs on other machines; place
# sample_book.xml in the working directory of the notebook.
SAMPLE_BOOK_XML = "sample_book.xml"
pd.read_xml(SAMPLE_BOOK_XML)

Unnamed: 0,authentication,key,method,id,title,isbn,isbn13,asin,kindle_asin,marketplace_id,...,text_reviews_count,url,link,authors,reviews_widget,popular_shelves,book_links,buy_links,series_works,similar_books
0,True,all_men_must_die,book_show,,,,,,,,...,,,,,,,,,,
1,,,,205330.0,There Was an Old Lady Who Swallowed a Fly,670869392.0,9780671000000.0,,,,...,716.0,https://www.goodreads.com/book/show/205330.The...,https://www.goodreads.com/book/show/205330.The...,,<style>\n #goodreads-widget {\n font-famil...,,,,,
