# Data Cleaning File

## Imports the data and explores it
Steps will be:
- Ingest the datasets
- View the datasets
- Get an overview of the info
- Describe the numeric and non-numeric data

In [3]:
import sqlite3
import pandas as pd

### Resets the Data View

In [5]:
# TO SEE EVERY ROW AND COLUMN IN THE DATASET
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [6]:
#RUN THIS WHEN DONE SEEING EVERYTHING PLS
pd.reset_option('all')

: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



### Ingests the datasets

In [41]:
# Ingests the datasets
bom_raw = pd.read_csv("../data/bom.movie_gross.csv.gz")
conn = sqlite3.connect("../data/im.db/im.db")
rt_movie_info_raw = pd.read_csv("../data/rt.movie_info.tsv.gz", delimiter="\t")
rt_reviews_raw = pd.read_csv("../data/rt.reviews.tsv.gz", delimiter="\t", encoding='cp1252')
tmdb_movies_raw = pd.read_csv("../data/tmdb.movies.csv.gz", index_col=0)
tn_movie_budgets_raw = pd.read_csv("../data/tn.movie_budgets.csv.gz")

In [11]:
# rt_review_data_raw initially doesn't load due to an encoding error. Here we detect the encoding type before inserting it into the codeblock above
with open("../data/rt.reviews.tsv.gz") as rt_reviews:
    print(rt_reviews)

<_io.TextIOWrapper name='../data/rt.reviews.tsv.gz' mode='r' encoding='cp1252'>


### Checks out the Datasets

#### Box Office Mojo Dataset

In [47]:
# Views out the dataset
bom_raw.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [33]:
# Provides an overview of the dataset
bom_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


In [14]:
# Describes the numeric and non-numeric data in the dataset
print(bom_raw.describe(),"\n")
print(bom_raw.describe(include=object))

       domestic_gross         year
count    3.359000e+03  3387.000000
mean     2.874585e+07  2013.958075
std      6.698250e+07     2.478141
min      1.000000e+02  2010.000000
25%      1.200000e+05  2012.000000
50%      1.400000e+06  2014.000000
75%      2.790000e+07  2016.000000
max      9.367000e+08  2018.000000 

            title studio foreign_gross
count        3387   3382          2037
unique       3386    257          1204
top     Bluebeard    IFC       1200000
freq            2    166            23


#### Imdb Dataset


In [48]:
# Views the Tables in the database
imdb_raw = pd.read_sql(
    """
    SELECT *
    FROM sqlite_master
    """
, conn
)
imdb_raw

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,movie_basics,movie_basics,2,"CREATE TABLE ""movie_basics"" (\n""movie_id"" TEXT..."
1,table,directors,directors,3,"CREATE TABLE ""directors"" (\n""movie_id"" TEXT,\n..."
2,table,known_for,known_for,4,"CREATE TABLE ""known_for"" (\n""person_id"" TEXT,\..."
3,table,movie_akas,movie_akas,5,"CREATE TABLE ""movie_akas"" (\n""movie_id"" TEXT,\..."
4,table,movie_ratings,movie_ratings,6,"CREATE TABLE ""movie_ratings"" (\n""movie_id"" TEX..."
5,table,persons,persons,7,"CREATE TABLE ""persons"" (\n""person_id"" TEXT,\n ..."
6,table,principals,principals,8,"CREATE TABLE ""principals"" (\n""movie_id"" TEXT,\..."
7,table,writers,writers,9,"CREATE TABLE ""writers"" (\n""movie_id"" TEXT,\n ..."


In [13]:
# Get a list of all tables in the database
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
imdb_tables = cursor.fetchall()

# Loop through each table and display an overview using pandas
for table_name in imdb_tables:
    df = pd.read_sql_query(f"SELECT * from {table_name[0]}", conn)
    print(f"Table: {table_name[0]}")
    print(df.head(2),"\n")

Table: movie_basics
    movie_id                    primary_title   original_title  start_year  \
0  tt0063540                        Sunghursh        Sunghursh        2013   
1  tt0066787  One Day Before the Rainy Season  Ashad Ka Ek Din        2019   

   runtime_minutes              genres  
0            175.0  Action,Crime,Drama  
1            114.0     Biography,Drama   

Table: directors
    movie_id  person_id
0  tt0285252  nm0899854
1  tt0462036  nm1940585 

Table: known_for
   person_id   movie_id
0  nm0061671  tt0837562
1  nm0061671  tt2398241 

Table: movie_akas
    movie_id  ordering              title region language        types  \
0  tt0369610        10      Джурасик свят     BG       bg         None   
1  tt0369610        11  Jurashikku warudo     JP     None  imdbDisplay   

  attributes  is_original_title  
0       None                0.0  
1       None                0.0   

Table: movie_ratings
     movie_id  averagerating  numvotes
0  tt10356526            8.3     

In [16]:
# Loop through each table and describes it numeric and non-numeric data
for table_name in imdb_tables:
    df = pd.read_sql_query(f"SELECT * from {table_name[0]}", conn)
    print(f"Table: {table_name[0]}")
    print(df.describe(),"\n")
    print(df.describe(include=object),"\n")

Table: movie_basics
          start_year  runtime_minutes
count  146144.000000    114405.000000
mean     2014.621798        86.187247
std         2.733583       166.360590
min      2010.000000         1.000000
25%      2012.000000        70.000000
50%      2015.000000        87.000000
75%      2017.000000        99.000000
max      2115.000000     51420.000000 

         movie_id primary_title original_title       genres
count      146144        146144         146123       140736
unique     146144        136071         137773         1085
top     tt0063540          Home         Broken  Documentary
freq            1            24             19        32185 

Table: directors
         movie_id  person_id
count      291174     291174
unique     140417     109253
top     tt4050462  nm6935209
freq         3818        238 

         movie_id  person_id
count      291174     291174
unique     140417     109253
top     tt4050462  nm6935209
freq         3818        238 

Table: known_for
      

#### Rotten Tomatoes Info and Reviews Datasets

In [49]:
# Views the dataset
rt_movie_info_raw.head()

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000.0,108 minutes,Entertainment One
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,,116 minutes,
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,,128 minutes,
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,,,200 minutes,


In [34]:
# Provides an overview of the dataset
rt_movie_info_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            1560 non-null   int64 
 1   synopsis      1498 non-null   object
 2   rating        1557 non-null   object
 3   genre         1552 non-null   object
 4   director      1361 non-null   object
 5   writer        1111 non-null   object
 6   theater_date  1201 non-null   object
 7   dvd_date      1201 non-null   object
 8   currency      340 non-null    object
 9   box_office    340 non-null    object
 10  runtime       1530 non-null   object
 11  studio        494 non-null    object
dtypes: int64(1), object(11)
memory usage: 146.4+ KB


In [18]:
# Describes the numeric and non-numeric data in the dataset
print(rt_movie_info_raw.describe(),"\n")
print(rt_movie_info_raw.describe(include=object))

                id
count  1560.000000
mean   1007.303846
std     579.164527
min       1.000000
25%     504.750000
50%    1007.500000
75%    1503.250000
max    2000.000000 

                                                 synopsis rating  genre  \
count                                                1498   1557   1552   
unique                                               1497      6    299   
top     A group of air crash survivors are stranded in...      R  Drama   
freq                                                    2    521    151   

                director       writer theater_date     dvd_date currency  \
count               1361         1111         1201         1201      340   
unique              1125         1069         1025          717        1   
top     Steven Spielberg  Woody Allen  Jan 1, 1987  Jun 1, 2004        $   
freq                  10            4            8           11      340   

       box_office     runtime              studio  
count         340 

In [178]:
# Checks out the dataset
rt_reviews_raw.head()

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"
2,3,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018"
3,3,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017"
4,3,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017"


In [35]:
# Provides an overview of the dataset
rt_reviews_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54432 entries, 0 to 54431
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          54432 non-null  int64 
 1   review      48869 non-null  object
 2   rating      40915 non-null  object
 3   fresh       54432 non-null  object
 4   critic      51710 non-null  object
 5   top_critic  54432 non-null  int64 
 6   publisher   54123 non-null  object
 7   date        54432 non-null  object
dtypes: int64(2), object(6)
memory usage: 3.3+ MB


In [19]:
# Describes the numeric and non-numeric data in the dataset
print(rt_reviews_raw.describe(),"\n")
print(rt_reviews_raw.describe(include=object))

                 id    top_critic
count  54432.000000  54432.000000
mean    1045.706882      0.240594
std      586.657046      0.427448
min        3.000000      0.000000
25%      542.000000      0.000000
50%     1083.000000      0.000000
75%     1541.000000      0.000000
max     2000.000000      1.000000 

                         review rating  fresh        critic        publisher  \
count                     48869  40915  54432         51710            54123   
unique                    48682    186      2          3496             1281   
top     Parental Content Review    3/5  fresh  Emanuel Levy  eFilmCritic.com   
freq                         24   4327  33035           595              673   

                   date  
count             54432  
unique             5963  
top     January 1, 2000  
freq               4303  


#### The Movie Database (TMDB) Dataset

In [50]:
# Views the dataset
tmdb_movies_raw.head()

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [33]:
# Provides an overview of the dataset
tmdb_movies_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26517 entries, 0 to 26516
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   genre_ids          26517 non-null  object 
 1   id                 26517 non-null  int64  
 2   original_language  26517 non-null  object 
 3   original_title     26517 non-null  object 
 4   popularity         26517 non-null  float64
 5   release_date       26517 non-null  object 
 6   title              26517 non-null  object 
 7   vote_average       26517 non-null  float64
 8   vote_count         26517 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 2.0+ MB


In [34]:
# Describes the numeric and non-numeric data in the dataset
print(tmdb_movies_raw.describe(),"\n")
print(tmdb_movies_raw.describe(include=object))

                  id    popularity  vote_average    vote_count
count   26517.000000  26517.000000  26517.000000  26517.000000
mean   295050.153260      3.130912      5.991281    194.224837
std    153661.615648      4.355229      1.852946    960.961095
min        27.000000      0.600000      0.000000      1.000000
25%    157851.000000      0.600000      5.000000      2.000000
50%    309581.000000      1.374000      6.000000      5.000000
75%    419542.000000      3.694000      7.000000     28.000000
max    608444.000000     80.773000     10.000000  22186.000000 

       genre_ids original_language original_title release_date  title
count      26517             26517          26517        26517  26517
unique      2477                76          24835         3433  24688
top         [99]                en           Eden   2010-01-01   Eden
freq        3700             23291              7          269      7


#### The Numbers Dataset

In [51]:
# Views the dataset
tn_movie_budgets_raw.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [44]:
# Provides an overview of the dataset
tn_movie_budgets_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


In [46]:
# Describes the numeric and non-numeric data in the dataset
print(tn_movie_budgets_raw.describe(),"\n")
print(tn_movie_budgets_raw.describe(include=object))

                id
count  5782.000000
mean     50.372363
std      28.821076
min       1.000000
25%      25.000000
50%      50.000000
75%      75.000000
max     100.000000 

        release_date      movie production_budget domestic_gross  \
count           5782       5782              5782           5782   
unique          2418       5698               509           5164   
top     Dec 31, 2014  Halloween       $20,000,000             $0   
freq              24          3               231            548   

       worldwide_gross  
count             5782  
unique            5356  
top                 $0  
freq               367  


## Cleans the Datasets

#### Box Office Mojo Dataset Cleaning

#### Imdb Dataset Cleaning


#### Rotten Tomatoes Info and Reviews Datasets

##### Begin Cleaning RT Reviews

***Observations***
* date needs to be a date dtype
* what is top critic?
* value counts of fresh column
* maybe change rating to a different dtype/format
* skim review for key words?
* I wonder how many ID's there are (1,135)

In [None]:
#This is here to check cleaning work done instead of re-writing repeatedly
rt_reviews_raw.head()

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"
2,3,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018"
3,3,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017"
4,3,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017"


In [None]:
rt_reviews_raw['date'] = pd.to_datetime(rt_reviews_raw['date'])

In [None]:
#Making a df for top critics
top_critics = rt_reviews_raw.loc[rt_reviews_raw['top_critic'] == 1]

In [None]:
top_critics.critic.value_counts()

Roger Ebert           461
James Berardinelli    348
Owen Gleiberman       210
Peter Travers         193
Mick LaSalle          166
                     ... 
Esther Zuckerman        1
Katherine Vu            1
Karen Heller            1
Sandy Cohen             1
Eric Brace              1
Name: critic, Length: 855, dtype: int64

There are 855 "top critics". I'm not sure this will be very helpful.

In [None]:
# making new column for the new ratings
rt_reviews_raw['new_rating'] = rt_reviews_raw['rating']

In [None]:
#removing + and - on letters
rt_reviews_raw.new_rating = rt_reviews_raw.new_rating.str.strip().str.replace('+', '').str.replace('-', '')

In [None]:
#replacing letters with consistent x/y format
rt_reviews_raw.new_rating = rt_reviews_raw.new_rating.str.replace("A", "5/5").str.replace("B", "4/5").str.replace("C", "3/5").str.replace("D", "2/5").str.replace("F", "1/5")

In [None]:
temp = rt_reviews_raw['new_rating'].str.split('/', expand=True)
temp[1].fillna(10, inplace=True)

In [None]:
#changing x/y to decimal numbers
temp[0].fillna(0, inplace=True)
temp[0] = temp[0].str.replace('N', '0')
temp[0] = temp[0].str.replace('R', '0')
temp[0] = temp[0].str.replace('T', '0')
temp[0] = temp[0].str.replace(' ', '.')
temp[0] = temp[0].astype(float)
temp[1] = temp[1].astype(float)
temp[2] = temp[0]/temp[1]
rt_reviews_raw['new_rating'] = temp[2]

In [None]:
rt_reviews_raw.new_rating.value_counts()

0.800000    7147
0.600000    6832
0.500000    4127
0.400000    4072
0.750000    3664
            ... 
1.550000       1
0.960000       1
0.433333       1
1.250000       1
0.310000       1
Name: new_rating, Length: 85, dtype: int64

? D: there are numbers greater than 1

In [None]:
count_table = rt_reviews_raw.new_rating.value_counts()

wrong_rating = count_table.index > 1

wrong_rating.sum()

5

In [None]:
count_table.loc[wrong_rating]

1.05    9
1.10    4
1.50    2
1.55    1
1.25    1
Name: new_rating, dtype: int64

In [None]:
#creating new df cleaned for naming
rt_reviews_cleaned = rt_reviews_raw.loc[rt_reviews_raw['new_rating'] <= 1]

In [None]:
rt_reviews_cleaned.to_csv(' .csv', index=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40898 entries, 0 to 54431
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          40898 non-null  int64         
 1   review      35365 non-null  object        
 2   rating      40898 non-null  object        
 3   fresh       40898 non-null  object        
 4   critic      38919 non-null  object        
 5   top_critic  40898 non-null  int64         
 6   publisher   40671 non-null  object        
 7   date        40898 non-null  datetime64[ns]
 8   new_rating  40898 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(2), object(5)
memory usage: 3.1+ MB


Rating col is filled in~!

In [None]:
rt_ratings = rt_reviews_cleaned.groupby(['id'], as_index=False)['new_rating'].mean()
rt_ratings
#checking rotten tomatoes rating grouped by id mean

Unnamed: 0,id,new_rating
0,3,0.628097
1,5,0.685000
2,6,0.592683
3,8,0.703750
4,10,0.579918
...,...,...
1110,1996,0.636304
1111,1997,0.498913
1112,1998,0.600000
1113,1999,0.588710


#### The Movie Database (TMDB) Dataset Cleaning

#### The Numbers Dataset Dataset Cleaning

##### The Numbers Database Cleaning

In [None]:
rot_tom.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1115 entries, 0 to 1114
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            1115 non-null   int64  
 1   new_rating    1115 non-null   float64
 2   synopsis      1099 non-null   object 
 3   rated         1114 non-null   object 
 4   genre         1114 non-null   object 
 5   director      997 non-null    object 
 6   writer        879 non-null    object 
 7   theater_date  986 non-null    object 
 8   dvd_date      986 non-null    object 
 9   currency      299 non-null    object 
 10  box_office    299 non-null    object 
 11  runtime       1115 non-null   float64
 12  studio        412 non-null    object 
dtypes: float64(2), int64(1), object(10)
memory usage: 122.0+ KB


In [None]:
rot_tom.groupby('id').first()
#checking the group by


Unnamed: 0_level_0,new_rating,synopsis,rated,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3,0.628097,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000,108.0,Entertainment One
5,0.685000,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,,116.0,
6,0.592683,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,,128.0,
8,0.703750,The year is 1942. As the Allies unite overseas...,PG,Drama|Kids and Family,Jay Russell,Gail Gilchriest,"Mar 3, 2000","Jul 11, 2000",,,95.0,Warner Bros. Pictures
10,0.579918,Some cast and crew from NBC's highly acclaimed...,PG-13,Comedy,Jake Kasdan,Mike White,"Jan 11, 2002","Jun 18, 2002",$,41032915,82.0,Paramount Pictures
...,...,...,...,...,...,...,...,...,...,...,...,...
1996,0.636304,Forget terrorists or hijackers -- there's a ha...,R,Action and Adventure|Horror|Mystery and Suspense,,,"Aug 18, 2006","Jan 2, 2007",$,33886034,106.0,New Line Cinema
1997,0.498913,The popular Saturday Night Live sketch was exp...,PG,Comedy|Science Fiction and Fantasy,Steve Barron,Terry Turner|Tom Davis|Dan Aykroyd|Bonnie Turner,"Jul 23, 1993","Apr 17, 2001",,,88.0,Paramount Vantage
1998,0.600000,"Based on a novel by Richard Powell, when the l...",G,Classics|Comedy|Drama|Musical and Performing Arts,Gordon Douglas,,"Jan 1, 1962","May 11, 2004",,,111.0,
1999,0.588710,The Sandlot is a coming-of-age story about a g...,PG,Comedy|Drama|Kids and Family|Sports and Fitness,David Mickey Evans,David Mickey Evans|Robert Gunter,"Apr 1, 1993","Jan 29, 2002",,,101.0,


***Observations***
* there are no nulls!
* release_date needs to be a date type
* Are there duplicate movie names?
* The production budget, domestic_gross, and worldwide_gross should be converted to integers

In [None]:
#checking work
tn_movie_budgets_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


In [None]:
tn_movie_budgets_raw.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [None]:
# change release_date to datetime
tn_movie_budgets_raw['release_date'] = pd.to_datetime(tn_movie_budgets_raw['release_date'])

In [None]:
# taking out $ and , in numbers
tn_movie_budgets_raw['production_budget'] = tn_movie_budgets_raw['production_budget'].str.replace('$', '').str.replace(',', '')
tn_movie_budgets_raw['domestic_gross'] = tn_movie_budgets_raw['domestic_gross'].str.replace('$', '').str.replace(',', '')
tn_movie_budgets_raw['worldwide_gross'] = tn_movie_budgets_raw['worldwide_gross'].str.replace('$', '').str.replace(',', '')

In [None]:
# casting production_budget as integer
tn_movie_budgets_raw['production_budget'] = tn_movie_budgets_raw['production_budget'].astype(float)
tn_movie_budgets_raw['domestic_gross'] = tn_movie_budgets_raw['domestic_gross'].astype(float)
tn_movie_budgets_raw['worldwide_gross'] = tn_movie_budgets_raw['worldwide_gross'].astype(float)

In [None]:
# creating new df called clean for consistency
tn_movie_budgets_cleaned = tn_movie_budgets_raw