In [2]:
import numpy as np
import pandas as pd
import imdb

In [3]:
Movies_ = pd.read_csv('/Users/bader/Desktop/Movies_2000_fixedCast.csv')

## Fixing certificates

In [4]:
Movies_.columns

Index(['year', 'name', 'Movie_ID', 'runtime', 'cast', 'genre', 'country_codes',
       'language_codes', 'certificates', 'Budget', 'OWUS', 'CWG', 'rating',
       'votes', 'original_air_date', 'title', 'director', 'writer1', 'writer2',
       'producer1', 'producer2', 'production_company1', 'production_company2',
       'composers', 'editors', 'cinematographers', 'set_decorators',
       'assistant_directors', 'visual_effects'],
      dtype='object')

A G rating stands for “General Audiences,” and means the film contains nothing that should offend parents or children. A PG rating stands for “Parental Guidance,” and is a gentle warning that there may be some content that parents might not want younger kids to see. PG-13 is a stronger caution, warning parents that children under 13 may be too young to deal with some of the content, and strongly advising parents to use their discretion. An “R” rating stands for “Restricted,” and participating movie theaters will not admit children under the age of 17 without a parent or guardian. “R” movies tend to contain more graphic violence, adult situations, and graphic language, and the MPAA's rating is intended to strongly urge parents to avoid bringing children to see this movie given its mature content. Finally, an “NC-17” rating stands for “No Children Under 17,” and is the strongest warning the MPAA currently issues. Participating theaters will not admit anyone under 17, regardless of whether accompanied by a parent or not. These films are considered far too graphic for children and, in fact, may offend some adults, as well.


- **R — R is a film rating level (from the MPAA). It is below NC-17, and above PG-13. ... R and TV-MA are fairly similar to how they're rated. The elements include language, violence, substance abuse, nudity, and sexual content.**


- **unrated — In new movies, a DVD that says "unrated" means scenes have been added to the film that were not seen in the "rated" version. The new scenes, having not been rated, could contain anything from violence to nudity to language to boring dialogue that was cut from the theatrical version.**


- **TV–G — Suitable for all ages; little or no violence, strong language. or sexual situations.**


- **TV–PG — Parental guidance suggested; may be unsuitable for younger children.**


- **TV–14 — Parents strongly cautioned; may be unsuitable for children under 14.**

- **TV-Y — This rating means that programming is appropriate for children of all ages. TV-Y7 indicates that programming is designed for chil- dren ages 7 and older.**

- **The definition of N-17 is a movie or film rating indicating that the movie contains adult content and should not be seen by people under the age of 17.**



#### ( R  [TV-MA ,  ]
#### (G, [TV–G , TV-Y ,  
#### (PG , [TV-Y7]
#### (PG-13, [TV-14 , ]
#### (Unrated [Not Rated
#### (Approved


In [5]:
print(Movies_['certificates'].shape[0])
Movies_['certificates'].isnull().sum()

418


131

In [6]:
Movies_['certificates'].value_counts()

R                             114
PG-13                          58
Not Rated                      32
PG                             17
G                              10
Unrated                         7
TV-14                           7
TV-14::(DLSV, TV Rating.)       4
TV-PG                           4
TV-14::(TV rating)              4
TV-MA::(cable rating)           3
TV-14::(DLS, TV Rating.)        2
Approved                        2
TV-MA::(TV rating)              2
TV-PG::(DL, TV Rating.)         2
R::(certificate # 37343)        1
TV-PG::(DLS, TV rating)         1
R::(Approved No. 37152)         1
TV-G::(Nickelodeon)             1
R::(certificate no. 37918)      1
R::(NO. 37533)                  1
PG::(No. 37187)                 1
R::(No. 37609)                  1
R::(#37352)                     1
TV-MA::(cable television)       1
TV-14::(LSV, TV Rating.)        1
TV-MA                           1
TV-PG::(LV)                     1
TV-PG::(Cartoon Network)        1
TV-Y          

In [7]:
Movies_['certificates'].value_counts()

# R,PG-13,Unrated,PG,G,

Movies_['Rating'] = Movies_['certificates'].apply(lambda x: x.split(':')[0] if len(str(x)) > 10 else x)


In [8]:
Movies_['Rating'].value_counts()

R            121
PG-13         59
Not Rated     32
PG            19
TV-14         18
G             10
TV-PG          9
TV-MA          7
Unrated        7
Approved       2
TV-Y7          1
TV-Y           1
TV-G           1
Name: Rating, dtype: int64

In [9]:
#main_features = ['R','G','PG','PG-13','Unrated']
cert_map = {'R':'R','TV-MA':'R','G':'G','TV–G':'G','TV-Y':'G','PG':'PG',
            'TV-Y7':'PG','PG-13':'PG-13','TV-14':'PG-13','Unrated':'Unrated','Not Rated':'Unrated'}

Movies_['Rating'] = Movies_['Rating'].map(cert_map)

In [10]:
Movies_['Rating'].value_counts()

R          128
PG-13       77
Unrated     39
PG          20
G           11
Name: Rating, dtype: int64

In [11]:
Movies_.head()

Unnamed: 0,year,name,Movie_ID,runtime,cast,genre,country_codes,language_codes,certificates,Budget,...,producer2,production_company1,production_company2,composers,editors,cinematographers,set_decorators,assistant_directors,visual_effects,Rating
0,2000,Charlie's Angels,5033998,118.0,"['Kristen Stewart', 'Naomi Scott', 'Ella Balin...","['Action', 'Adventure', 'Comedy']",['us'],['en'],PG-13,48000000.0,...,Drew Barrymore,Columbia Pictures,2.0 Entertainment,Brian Tyler,Alan Baumgarten,Bill Pope,Mark Rosinski,Simon Adegbenro,Lukas Abraham,PG-13
1,2000,Gladiator,172495,155.0,"['Russell Crowe', 'Joaquin Phoenix', 'Connie N...","['Action', 'Adventure', 'Drama']","['us', 'gb', 'mt', 'ma']",['en'],TV-14,103000000.0,...,Branko Lustig,DreamWorks,Universal Pictures,Lisa Gerrard,Pietro Scalia,John Mathieson,Crispian Sallis,Ali Cherkaoui,Rob Allman,PG-13
2,2000,How the Grinch Stole Christmas,2709692,85.0,"['Benedict Cumberbatch', 'Cameron Seely', 'Ras...","['Animation', 'Family', 'Fantasy']","['fr', 'jp', 'us']",['en'],PG,75000000.0,...,Janet Healy,Universal Pictures,Universal Animation Studios,Danny Elfman,Chris Cartagena,,,,Richard Adenot,PG
3,2000,Requiem for a Dream,180093,102.0,"['Ellen Burstyn', 'Jared Leto', 'Jennifer Conn...",['Drama'],['us'],['en'],Unrated,4500000.0,...,Beau Flynn,Artisan Entertainment,Thousand Words,Clint Mansell,Jay Rabinowitz,Matthew Libatique,Ondine Karady,Timothy Bird,Jeremy Dawson,Unrated
4,2000,Remember the Titans,210945,113.0,"['Denzel Washington', 'Will Patton', 'Wood Har...","['Biography', 'Drama', 'Sport']",['us'],['en'],PG,30000000.0,...,Michael Flynn,Jerry Bruckheimer Films,Run It Up Productions Inc.,Trevor Rabin,Michael Tronick,Philippe Rousselot,Anne Kuljian,Matthew Feitshans,Andy Davis,PG


## Fixing Original Air_Date

In [20]:
Movies_.columns

Index(['year', 'name', 'Movie_ID', 'runtime', 'cast', 'genre', 'country_codes',
       'language_codes', 'certificates', 'Budget', 'OWUS', 'CWG', 'rating',
       'votes', 'original_air_date', 'title', 'director', 'writer1', 'writer2',
       'producer1', 'producer2', 'production_company1', 'production_company2',
       'composers', 'editors', 'cinematographers', 'set_decorators',
       'assistant_directors', 'visual_effects', 'Rating'],
      dtype='object')

In [25]:
Movies_[['year','original_air_date']]

Unnamed: 0,year,original_air_date
0,2000,14 Nov 2019 (Saudi Arabia)
1,2000,04 May 2000 (Australia)
2,2000,08 Nov 2018 (Saudi Arabia)
3,2000,03 Nov 2000 (Canada)
4,2000,29 Sep 2000 (USA)
...,...,...
413,2000,28 Sep 2000 (Japan)
414,2000,28 Oct 2000 (Japan)
415,2000,02 Feb 2015 (Iran)
416,2000,


In [69]:
Movies_['Date'] = Movies_['original_air_date'].apply(lambda x : x.split('(')[0] if type(x) != float else x)
Movies_['Date'] = pd.to_datetime(Movies_['Date'])
Movies_['Air_location'] = Movies_['original_air_date'].apply(lambda x : x.split('(')[1].strip(')') if type(x) != float else x)


In [70]:
Movies_['Date'] = Movies_['Date'].apply(lambda x : x.date())

In [89]:
Movies_['name'][0]

"Charlie's Angels"

In [107]:
not_equil = []
equil = []

for i in range(0,Movies_['name'].shape[0]):
    if Movies_['Date'][i].year == Movies_['year'][i]:
        equil.append(Movies_['name'][i])
    else:
        not_equil.append(Movies_['name'][i])
        

In [94]:
not_equil

["Charlie's Angels",
 'How the Grinch Stole Christmas',
 'X-Men',
 'Scary Movie',
 'Chocolat',
 'The Family Man',
 'Traffic',
 'Shaft',
 'Sexy Beast',
 'Ginger Snaps',
 'The Gift',
 'The Kid',
 'Red Planet',
 'Sous le sable',
 'Rules of Engagement',
 'Finding Forrester',
 'The Watcher',
 'Tigerland',
 'Pollock',
 'Bounce',
 'Waking the Dead',
 'Before Night Falls',
 'Werckmeister harmóniák',
 'Saving Grace',
 'Hamlet',
 'The Weight of Water',
 'Bait',
 'Psycho Beach Party',
 'Faust: Love of the Damned',
 'O Fantasma',
 'Dancing at the Blue Iguana',
 'The Claim',
 'Cut',
 'Citizen Toxie: The Toxic Avenger IV',
 'State and Main',
 'Chinese Coffee',
 "The Monkey's Mask",
 'Skipped Parts',
 'Sunset Strip',
 "Harrison's Flowers",
 'Love, Honour and Obey',
 'Screwed',
 'Brother',
 'New York Beat Movie',
 'The Specials',
 'The Rowdy Girls',
 'Maybe Baby',
 'Beautiful',
 'Sordid Lives',
 'Poor White Trash',
 'Born Romantic',
 'Beautiful Creatures',
 'Gun Shy',
 'Happy Accidents',
 'Nostradamus

In [99]:
test = Movies_[['Date','name','year','Movie_ID']]
test['Time_Equal'] = test['name'].apply(lambda x: 0 if x in not_equil else 1)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [105]:
test[test['Time_Equal'] == 0]

Unnamed: 0,Date,name,year,Movie_ID,Time_Equal
0,2019-11-14,Charlie's Angels,2000,5033998,0
2,2018-11-08,How the Grinch Stole Christmas,2000,2709692,0
10,NaT,X-Men,2000,103584,0
12,1996-12-20,Scary Movie,2000,117571,0
22,2001-01-19,Chocolat,2000,241303,0
...,...,...,...,...,...
411,NaT,Maguna,2000,2083257,0
412,NaT,Timeless,2000,5511582,0
415,2015-02-02,Wars and Treasure,2000,7201596,0
416,NaT,National Japanese American Memorial: Groundbre...,2000,8290150,0


In [93]:
len(equil)

271

In [45]:
Movies_['Air_location']

0      Saudi Arabia
1         Australia
2      Saudi Arabia
3            Canada
4               USA
           ...     
413           Japan
414           Japan
415            Iran
416             NaN
417           India
Name: Air_location, Length: 418, dtype: object

In [47]:
for i in Movies_['Air_location']:
    print(i)

Saudi Arabia
Australia
Saudi Arabia
Canada
USA
USA
UK
USA
France
USA
nan
USA
USA
Italy
USA
Canada
Malta
USA
Romania
UK
France
China
USA
nan
USA
USA
Canada
New Zealand
USA
Hong Kong
USA
Canada
Hong Kong
Canada
Italy
Canada
USA
Malaysia
USA
USA
Canada
USA
Japan
Denmark
Canada
Romania
USA
Malaysia
Brazil
USA
Canada
USA
Malaysia
UK
Australia
USA
Canada
Lithuania
Mexico
USA
USA
USA
USA
USA
USA
USA
USA
USA
USA
USA
USA
USA
Japan
USA
USA
USA
USA
Canada
Italy
USA
Australia
USA
Malaysia
Romania
Germany
USA
nan
Denmark
USA
France
USA
Belgium
Belgium
USA
USA
India
USA
Canada
USA
USA
nan
USA
USA
Singapore
Sweden
South Korea
USA
Australia
USA
Slovenia
USA
USA
USA
USA
USA
Spain
USA
UK
France
USA
France
Sweden
USA
France
Italy
USA
India
nan
Puerto Rico
USA
nan
Sweden
Canada
Hungary
Italy
nan
South Korea
Greece
USA
USA
Argentina
UK
USA
France
Belgium
Italy
Hong Kong
USA
USA
France
USA
USA
Italy
France
Italy
USA
Spain
USA
India
USA
Australia
Italy
USA
France
USA
USA
UK
Denmark
Aruba
Germany
Israel
Portu

In [59]:
Movies_['Date'] = pd.to_datetime(Movies_['Date'])

In [60]:
Movies_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 32 columns):
year                   418 non-null int64
name                   418 non-null object
Movie_ID               418 non-null int64
runtime                388 non-null float64
cast                   408 non-null object
genre                  418 non-null object
country_codes          418 non-null object
language_codes         407 non-null object
certificates           287 non-null object
Budget                 184 non-null float64
OWUS                   138 non-null object
CWG                    81 non-null float64
rating                 376 non-null float64
votes                  376 non-null float64
original_air_date      356 non-null object
title                  418 non-null object
director               398 non-null object
writer1                380 non-null object
writer2                74 non-null object
producer1              372 non-null object
producer2              340 non-nul